From bb403657a5f5c7d50e84244b97e69ece022d798a Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 7 Jun 2018 11:26:16 +0100 Subject: [PATCH 01/80] acuda stubs --- Makefile | 28 +++++--- src/cuda_kernels/GpuAligner.cu | 121 +++++++++++++++++++++++++++++++++ src/cuda_kernels/GpuAligner.h | 61 +++++++++++++++++ 3 files changed, 201 insertions(+), 9 deletions(-) create mode 100644 src/cuda_kernels/GpuAligner.cu create mode 100644 src/cuda_kernels/GpuAligner.h diff --git a/Makefile b/Makefile index 9b484626..afe84ea1 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # # Sub directories containing source code, except for the main programs -SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/alignment src/pore_model +SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/alignment src/pore_model src/cuda_kernels # # Set libraries, paths, flags and options @@ -11,9 +11,12 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali LIBS=-lz CXXFLAGS ?= -g -O3 CXXFLAGS += -std=c++11 -fopenmp -fsigned-char -CFLAGS ?= -O3 -std=c99 +CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc +NVCC = nvcc +NVCCFLAGS ?= -std=c++11 -I. 
-I/usr/local/cuda-9.0/include -g +CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code HDF5?=install @@ -102,20 +105,24 @@ eigen/INSTALL: # Find the source files by searching subdirectories CPP_SRC := $(foreach dir, $(SUBDIRS), $(wildcard $(dir)/*.cpp)) +CU_SRC := $(foreach dir, $(SUBDIRS), $(wildcard $(dir)/*.cu)) C_SRC := $(foreach dir, $(SUBDIRS), $(wildcard $(dir)/*.c)) EXE_SRC=src/main/nanopolish.cpp src/test/nanopolish_test.cpp # Automatically generated object names CPP_OBJ=$(CPP_SRC:.cpp=.o) C_OBJ=$(C_SRC:.c=.o) +CU_OBJ=$(CU_SRC:.cu=.o) + +.SUFFIXES: .cu # Generate dependencies PHONY=depend depend: .depend -.depend: $(CPP_SRC) $(C_SRC) $(EXE_SRC) $(H5_LIB) $(EIGEN_CHECK) +.depend: $(CPP_SRC) $(C_SRC) $(CU_SRC) $(EXE_SRC) $(H5_LIB) $(EIGEN_CHECK) rm -f ./.depend - $(CXX) $(CXXFLAGS) $(CPPFLAGS) -MM $(CPP_SRC) $(C_SRC) > ./.depend; + $(CXX) $(CXXFLAGS) $(CPPFLAGS) $(NVCCFLAGS) $(NVCC) -MM $(CPP_SRC) $(C_SRC) $(CU_SRC) > ./.depend; include .depend @@ -126,16 +133,19 @@ include .depend .c.o: $(CC) -o $@ -c $(CFLAGS) $(CPPFLAGS) $(H5_INCLUDE) -fPIC $< +.cu.o: + $(NVCC) -o $@ -c $(NVCCFLAGS) $(CPPFLAGS) $< + # Link main executable -$(PROGRAM): src/main/nanopolish.o $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(EIGEN_CHECK) - $(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $< $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(LIBS) $(LDFLAGS) +$(PROGRAM): src/main/nanopolish.o $(CU_OBJ) $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(EIGEN_CHECK) + $(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $< $(CPP_OBJ) $(CU_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(LIBS) $(LDFLAGS) $(CURTFLAGS) # Link test executable -$(TEST_PROGRAM): src/test/nanopolish_test.o $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) - $(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $< $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(LIBS) $(LDFLAGS) +$(TEST_PROGRAM): src/test/nanopolish_test.o $(CPP_OBJ) $(CU_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) + $(CXX) 
-o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $< $(CPP_OBJ) $(CU_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(LIBS) $(LDFLAGS) $(CURTFLAGS) test: $(TEST_PROGRAM) ./$(TEST_PROGRAM) clean: - rm -f $(PROGRAM) $(TEST_PROGRAM) $(CPP_OBJ) $(C_OBJ) src/main/nanopolish.o src/test/nanopolish_test.o + rm -f $(PROGRAM) $(TEST_PROGRAM) $(CPP_OBJ) $(CU_OBJ) $(C_OBJ) src/main/nanopolish.o src/test/nanopolish_test.o diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu new file mode 100644 index 00000000..1f9ae48d --- /dev/null +++ b/src/cuda_kernels/GpuAligner.cu @@ -0,0 +1,121 @@ +#include +#include +#include "GpuAligner.h" +#include + +__global__ void findSumToN(int *n, int limit) +{ + int tId = threadIdx.x; + + for (int i=0; i<=(int)log2((double)limit); i++) + { + if (tId%(int)(pow(2.0,(double)(i+1))) == 0){ + if (tId+(int)pow(2.0, (double)i) >= limit) break; + n[tId] += n[tId+(int)pow(2.0, (double)i)]; + } + __syncthreads(); + } +} + +GpuAligner::GpuAligner() +{ + y = 20; + asize = y*sizeof(int); + for (int i=0; i>>(n_d, y); + cudaMemcpy(n, n_d, asize, cudaMemcpyDeviceToHost); + cudaFree (n_d); + return n[0]; +} + +void GpuAligner::setY(int newVal) +{ + y = newVal; + asize = y*sizeof(int); + for (int i=0; i sequences, + std::vector event_sequences, + uint32_t alignment_flags){ + + assert(!sequences.empty()); + assert(std::string(sequences[0].get_alphabet()->get_name()) == "nucleotide"); + for (auto e: event_sequences) { + assert(std::string(e.pore_model->pmalphabet->get_name()) == "nucleotide"); + assert(e.read->pore_type == PT_R9); + } + + size_t num_models = sequences.size(); + double num_model_penalty = log(num_models); + + assert(num_models == 1); //this is temporary + + // start preparing the data for the CUDA Kernel + + + + return 0.210964; +} + +std::vector GpuAligner::variantScoresThresholded(std::vector input_variants, + Haplotype base_haplotype, + std::vector event_sequences, + uint32_t alignment_flags, + int screen_score_threshold, + std::vector 
methylation_types) { + int numVariants = input_variants.size(); + + std::vector out_variants = input_variants; + std::vector variant_haplotypes(numVariants, base_haplotype); + + //loop over the vector, applying the variants to the haplotypes + for (int i = 0; i base_sequences = generate_methylated_alternatives(base_haplotype.get_sequence(), + methylation_types); + + assert(base_sequences.size() == 1); + + std::vector> methylatedVariantSequences; + for(auto variant: variant_haplotypes) { + std::vector variant_sequences = generate_methylated_alternatives( + variant.get_sequence(), methylation_types); + methylatedVariantSequences.push_back(variant_sequences); + + } + + //For now let's not worry about methylation + assert(methylatedVariantSequences.size() == numVariants); + for (auto m: methylatedVariantSequences) { + assert(m.size() == 1); + } + //Next we need to get the scores. + + // return the sum of the score for the base sequences over all the event sequences + double base_score = scoreKernel(base_sequences, event_sequences, alignment_flags); + + std::vector v; + v.push_back(base_score); + return v; +} \ No newline at end of file diff --git a/src/cuda_kernels/GpuAligner.h b/src/cuda_kernels/GpuAligner.h new file mode 100644 index 00000000..b6a8bbe1 --- /dev/null +++ b/src/cuda_kernels/GpuAligner.h @@ -0,0 +1,61 @@ +// +// Created by mike on 05/06/18. 
+// +#include +#include "nanopolish_variant.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "htslib/faidx.h" +#include "nanopolish_poremodel.h" +#include "nanopolish_transition_parameters.h" +#include "nanopolish_matrix.h" +#include "nanopolish_klcs.h" +#include "nanopolish_profile_hmm.h" +#include "nanopolish_alignment_db.h" +#include "nanopolish_anchor.h" +#include "nanopolish_variant.h" +#include "nanopolish_haplotype.h" +#include "nanopolish_pore_model_set.h" +#include "nanopolish_duration_model.h" +#include "nanopolish_variant_db.h" +#include "profiler.h" +#include "progress.h" +#include "stdaln.h" +#include + +#ifndef GPU_ALIGNER_H +#define GPU_ALIGNER_H1 + +class GpuAligner +{ +public: + int n[20]; + int y; + int asize; + + GpuAligner(); + int calculateSum(); + void setY(int); + + std::vector + variantScoresThresholded(std::vector tmp_variants, Haplotype haplotype, std::vector event_sequences, + uint32_t alignment_flags, int screen_score_threshold, std::vector methylation_types);// { + //return std::vector(); + //} +}; +#endif // GPU_ALIGNER_H \ No newline at end of file From 26f042d2eb8691be9f07eb3f7ae25e1dfca501d0 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 7 Jun 2018 17:31:36 +0100 Subject: [PATCH 02/80] Sending event means to device --- src/common/nanopolish_variant.cpp | 2 +- src/cuda_kernels/GpuAligner.cu | 88 ++++++++++++++++++++++++++- src/hmm/nanopolish_profile_hmm_r9.cpp | 2 +- src/hmm/nanopolish_profile_hmm_r9.inl | 2 +- src/main/nanopolish.cpp | 11 ++-- src/nanopolish_call_variants.cpp | 60 ++++++++++++++++-- 6 files changed, 152 insertions(+), 13 deletions(-) diff --git a/src/common/nanopolish_variant.cpp b/src/common/nanopolish_variant.cpp index 725a62ab..b73a6b2b 100644 --- a/src/common/nanopolish_variant.cpp +++ b/src/common/nanopolish_variant.cpp @@ -664,7 +664,7 @@ std::vector 
multi_call(VariantGroup& variant_group, // Variant score_variant_thresholded(const Variant& input_variant, Haplotype base_haplotype, - const std::vector& input, + const std::vector& input, // raw reads (I think) const uint32_t alignment_flags, const uint32_t score_threshold, const std::vector& methylation_types) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 1f9ae48d..122195e7 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -2,9 +2,12 @@ #include #include "GpuAligner.h" #include +#include "nanopolish_profile_hmm_r9.h" + __global__ void findSumToN(int *n, int limit) { + //printf("HELLO FROM SUM\n"); int tId = threadIdx.x; for (int i=0; i<=(int)log2((double)limit); i++) @@ -17,6 +20,20 @@ __global__ void findSumToN(int *n, int limit) } } + +__global__ void getScores(float * eventData, float * returnValues) +{ + int tId = threadIdx.x; + if (tId == 0) { + printf("data: %f\n", eventData[0]); + printf("data: %f\n", eventData[1]); + printf("data: %f\n", eventData[2]); + } + returnValues[0] = 0.356; + //__syncthreads(); +} + + GpuAligner::GpuAligner() { y = 20; @@ -53,11 +70,13 @@ double scoreKernel(std::vector sequences, std::vector event_sequences, uint32_t alignment_flags){ + // These asserts are here during the development phase assert(!sequences.empty()); assert(std::string(sequences[0].get_alphabet()->get_name()) == "nucleotide"); for (auto e: event_sequences) { assert(std::string(e.pore_model->pmalphabet->get_name()) == "nucleotide"); assert(e.read->pore_type == PT_R9); + assert( (e.rc && e.event_stride == -1) || (!e.rc && e.event_stride == 1)); } size_t num_models = sequences.size(); @@ -65,11 +84,76 @@ double scoreKernel(std::vector sequences, assert(num_models == 1); //this is temporary - // start preparing the data for the CUDA Kernel + auto sequence = sequences[0]; // temporary. We are only going to score one sequence against a set of events for now. 
+ + const uint32_t k = event_sequences[0].pore_model->k; //k is the kmerity + uint32_t n_kmers = sequence.length() - k + 1; //number of kmers in the sequence + + uint32_t n_states = PSR9_NUM_STATES * (n_kmers + 2); // + 2 for explicit terminal states + + std::vector n_rows; //number of rows in the DP table (n_events + 1) + std::vector e_starts; //event starts + + for(auto e: event_sequences){ + uint32_t e_start = e.event_start_idx; + e_starts.push_back(e_start); + uint32_t e_end = e.event_stop_idx; + uint32_t n_events = 0; + if(e_end > e_start) + n_events = e_end - e_start + 1; + else + n_events = e_start - e_end + 1; + + n_rows.push_back(n_events + 1); + } + + + // Prepare raw data and send it over to the score calculator kernel + + // Buffer 1: Raw event data and associated starts and stops + + size_t numEventsTotal = 0; + //1. Count the total number of events across all reads + std::vector eventLengths; + for (auto e: event_sequences){ + size_t numEvents = e.read->events->size(); + + eventLengths.push_back(numEvents); + numEventsTotal += numEvents; + } + + float * eventMeans; + //Allocate a host buffer to store the event means + size_t eventMeansSize = numEventsTotal * sizeof(float); + cudaHostAlloc(&eventMeans, eventMeansSize , cudaHostAllocDefault); + + size_t offset = 0; + for (auto ev: event_sequences){ + size_t num_events = ev.read->events->size(); + for (int i=0;ievents[0][i].mean; //taking the first element. Not sure what the second one is.. 
+ } + offset += num_events; + } + + + float* devicePtr; + cudaMalloc( (void**)&devicePtr, eventMeansSize); + cudaMemcpy( devicePtr, eventMeans, eventMeansSize, cudaMemcpyHostToDevice ); + + dim3 dimBlock( 1, 1 ); + dim3 dimGrid( 1, 1 ); + + float * returnValues; + cudaMalloc((void **) &returnValues, sizeof(float) * num_models); //one score per read + float * returnedValues; + getScores<<>>(devicePtr, returnValues); + cudaMemcpy(returnedValues, returnValues, num_models *sizeof(float), cudaMemcpyDeviceToHost ); - return 0.210964; + auto r = returnedValues[0]; + return r; } std::vector GpuAligner::variantScoresThresholded(std::vector input_variants, diff --git a/src/hmm/nanopolish_profile_hmm_r9.cpp b/src/hmm/nanopolish_profile_hmm_r9.cpp index 773394a7..1f365ebe 100644 --- a/src/hmm/nanopolish_profile_hmm_r9.cpp +++ b/src/hmm/nanopolish_profile_hmm_r9.cpp @@ -46,7 +46,7 @@ float profile_hmm_score_r9(const HMMInputSequence& sequence, const HMMInputData& FloatMatrix fm; allocate_matrix(fm, n_rows, n_states); - profile_hmm_forward_initialize_r9(fm); + profile_hmm_forward_initialize_r9(fm); // what does this do? 
ProfileHMMForwardOutputR9 output(&fm); diff --git a/src/hmm/nanopolish_profile_hmm_r9.inl b/src/hmm/nanopolish_profile_hmm_r9.inl index 71d52aba..76de768f 100644 --- a/src/hmm/nanopolish_profile_hmm_r9.inl +++ b/src/hmm/nanopolish_profile_hmm_r9.inl @@ -285,7 +285,7 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, // Calculate number of blocks // A block of the HMM is a set of states for one kmer - uint32_t num_blocks = output.get_num_columns() / PSR9_NUM_STATES; + uint32_t num_blocks = output.get_num_columns() / PSR9_NUM_STATES; // num_columns is the number of kmers uint32_t last_event_row_idx = output.get_num_rows() - 1; // Precompute the transition probabilites for each kmer block diff --git a/src/main/nanopolish.cpp b/src/main/nanopolish.cpp index 054f5063..cc6fcab7 100644 --- a/src/main/nanopolish.cpp +++ b/src/main/nanopolish.cpp @@ -64,6 +64,7 @@ int main(int argc, char** argv) { // Turn off HDF's exception printing, which is generally unhelpful for users H5Eset_auto(0, NULL, NULL); + std::cout << "CHECKPOINT 1\n"; int ret = 0; if(argc <= 1) { @@ -73,9 +74,11 @@ int main(int argc, char** argv) } else { std::string command(argv[1]); auto iter = programs.find(command); - if (iter != programs.end()) - ret = iter->second( argc - 1, argv + 1); - else + if (iter != programs.end()) { + std::cout << "CHECKPOINT 2: " << iter->first <second(argc - 1, argv + 1); + } + else ret = print_usage( argc - 1, argv + 1); } @@ -88,7 +91,7 @@ int main(int argc, char** argv) extern int g_failed_alignment_reads; extern int g_bad_fast5_file; if(g_total_reads > 0) { - fprintf(stderr, "[post-run summary] total reads: %d, unparseable: %d, qc fail: %d, could not calibrate: %d, no alignment: %d, bad fast5: %d\n", + fprintf(stderr, "[post-run summaryz] total reads: %d, unparseable: %d, qc fail: %d, could not calibrate: %d, no alignment: %d, bad fast5: %d\n", g_total_reads, g_unparseable_reads, g_qc_fail_reads, g_failed_calibration_reads, 
g_failed_alignment_reads, g_bad_fast5_file); } return ret; diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 053dff15..34d46ddc 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -38,6 +38,9 @@ #include "profiler.h" #include "progress.h" #include "stdaln.h" +#include +#include + // Macros #define max3(x,y,z) std::max(std::max(x,y), z) @@ -277,11 +280,18 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali int region_end, uint32_t alignment_flags) { + std::cout << "CHECKPOINT 13" << std::endl; + auto start = std::chrono::high_resolution_clock::now(); + std::vector out_variants; std::string contig = alignments.get_region_contig(); // Add all positively-scoring single-base changes into the candidate set + + + auto scoring = std::chrono::high_resolution_clock::now() - std::chrono::high_resolution_clock::now(); + for(size_t i = region_start; i < region_end; ++i) { int calling_start = i - opt::screen_flanking_sequence; @@ -335,15 +345,44 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali calling_start, alignments.get_reference_substring(contig, calling_start, calling_end)); + GpuAligner aligner; + aligner.setY(15); + std::cout << aligner.calculateSum() < scores = aligner.variantScoresThresholded(tmp_variants, test_haplotype, event_sequences, + alignment_flags, opt::screen_score_threshold, + opt::methylation_types); + for(const Variant& v : tmp_variants) { - Variant scored_variant = score_variant_thresholded(v, test_haplotype, event_sequences, alignment_flags, opt::screen_score_threshold, opt::methylation_types); + auto t0 = std::chrono::high_resolution_clock::now(); + Variant scored_variant = score_variant_thresholded(v, + test_haplotype, + event_sequences, + alignment_flags, + opt::screen_score_threshold, + opt::methylation_types); + auto t1 = std::chrono::high_resolution_clock::now(); + scoring += t1-t0; scored_variant.info = ""; 
if(scored_variant.quality > 0) { out_variants.push_back(scored_variant); } } - } + + std::cout << "CHECKPOINT 14 - Region end - start ength= " << region_end - region_start << std::endl; + + auto end = std::chrono::high_resolution_clock::now(); + + auto duration = std::chrono::duration_cast( end - start ).count(); + + auto screening = std::chrono::duration_cast(scoring).count(); + + std::cout << "FUNCTION TOOK " << duration << "ms" << std::endl; + std::cout << "SCREENING COMPONENT TOOK " << screening << "ms" << std::endl; + + + return out_variants; } @@ -894,7 +933,7 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, alignments.get_region_start(), alignments.get_reference()); */ - + std::cout<<"CHECKPOINT 8 - Data loaded"< candidate_variants; if(opt::candidates_file.empty()) { @@ -903,13 +942,16 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, candidate_variants = read_variants_for_region(opt::candidates_file, contig, region_start, region_end); } + std::cout<<"CHECKPOINT 9 - Candidate variants generated"< single_base_edits = generate_candidate_single_base_edits(alignments, region_start, region_end, alignment_flags); - + std::cout<<"CHECKPOINT 11 - Single base edits generated"< dedup_set(candidate_variants.begin(), candidate_variants.end()); @@ -918,6 +960,8 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, std::sort(candidate_variants.begin(), candidate_variants.end(), sortByPosition); } + std::cout<<"CHECKPOINT 10 - Additional candidate variants generated"< tag_fields; @@ -1187,10 +1235,14 @@ int call_variants_main(int argc, char** argv) Variant::make_vcf_tag_string("FORMAT", "GT", 1, "String", "Genotype")); + std::cout << "Checkpoint 5" << std::endl; + Variant::write_vcf_header(out_fp, tag_fields); Haplotype haplotype = call_variants_for_region(contig, start_base, end_base, out_fp); + std::cout << "Checkpoint 6" << std::endl; + if(out_fp != stdout) { fclose(out_fp); 
} From 381c3c5b7ea36707fa2dba23786b5e3bbf2f5a9e Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Mon, 11 Jun 2018 15:12:47 +0100 Subject: [PATCH 03/80] estimating emission probabilities --- Makefile | 4 +- src/cuda_kernels/GpuAligner.cu | 239 +++++++++++++++++++++++--- src/hmm/nanopolish_profile_hmm_r9.inl | 2 +- 3 files changed, 222 insertions(+), 23 deletions(-) diff --git a/Makefile b/Makefile index afe84ea1..27030a55 100644 --- a/Makefile +++ b/Makefile @@ -9,9 +9,9 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz -CXXFLAGS ?= -g -O3 +CXXFLAGS ?= -g #-O3 CXXFLAGS += -std=c++11 -fopenmp -fsigned-char -CFLAGS ?= -std=c99 -O3 +CFLAGS ?= -std=c99 #-O3 CXX ?= g++ CC ?= gcc NVCC = nvcc diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 122195e7..c0b5b798 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -4,7 +4,6 @@ #include #include "nanopolish_profile_hmm_r9.h" - __global__ void findSumToN(int *n, int limit) { //printf("HELLO FROM SUM\n"); @@ -20,16 +19,126 @@ __global__ void findSumToN(int *n, int limit) } } +//TODO: Implement, inc pore model +__device__ float lp_match_r9(int rank, + float mean, + float * poreModelLevelLogStdv, + float * poreModelLevelStdv, + float * poreModelLevelMean){ + float log_inv_sqrt_2pi = log(0.3989422804014327); // no need to calculate this every time. better solutions available.. + + // STEP 1: GET DRIFT-SCALED LEVEL: + float level = mean; //TODO: Do actual drift scaling. 
this is a cheat + // TODO: STEP 2: Get *scaled* Gaussian from pore model + //these can just be pulled from the model + //float gaussian_mean = 0.0; + //float gaussian_stdv = 0.0; + //float gaussian_log_level_stdv = 0.0; + float gaussian_mean = poreModelLevelMean[rank]; + float gaussian_stdv = poreModelLevelStdv[rank]; + float gaussian_log_level_stdv = poreModelLevelLogStdv[rank]; + // Step 3: calculate log-normal PDF + float a = (level - gaussian_mean) / gaussian_stdv; // g is the gaussian parameters + return log_inv_sqrt_2pi - gaussian_log_level_stdv + (-0.5f * a * a); // log_inv_sqrt_2pi is defined in a comment above + + return 0.1973; +} -__global__ void getScores(float * eventData, float * returnValues) +__global__ void getScores(float * eventData, + float * readEventsPerBase, + int * numRowsPerRead, + int * eventStarts, + int * eventStrides, + int * kmer_ranks, + int * kmer_ranks_rc, + int * eventOffsets, // Offset to use for getting an event IDX for a specific read (read obtained by block IDX) + float * poreModelLevelLogStdv, + float * poreModelLevelStdv, + float * poreModelLevelMean, + float * returnValues) { - int tId = threadIdx.x; - if (tId == 0) { - printf("data: %f\n", eventData[0]); - printf("data: %f\n", eventData[1]); - printf("data: %f\n", eventData[2]); + printf("Entered\n"); + //float log_inv_sqrt_2pi = log(0.3989422804014327); + + //Step 1: calculate transitions. For now we are going to use external params. + int readIdx = blockIdx.x; + float read_events_per_base = readEventsPerBase[readIdx]; + int numRows = numRowsPerRead[readIdx]; // Number of rows in this DP table. 
+ int e_start = eventStarts[readIdx]; // Event start for read + int e_stride = eventStrides[readIdx]; + int e_offset = eventOffsets[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event + //int kmer_ranks = kmerRanks[readIdx.x]; // TODO: Use RC for RC reads + + int kmerIdx = threadIdx.x; + + float p_stay = 1 - (1 / read_events_per_base); + + //printf("Events per base: %f \n", read_events_per_base); + float p_skip = 0.0025; + float p_bad = 0.001; + float p_bad_self = p_bad; + float p_skip_self = 0.3; + + float p_mk = p_skip; // probability of not observing an event at all + float p_mb = p_bad; // probabilty of observing a bad event + float p_mm_self = p_stay; // probability of observing additional events from this k-mer + float p_mm_next = 1.0f - p_mm_self - p_mk - p_mb; // normal movement from state to state + + // transitions from event split state in previous block + float p_bb = p_bad_self; + float p_bk, p_bm_next, p_bm_self; + p_bk = p_bm_next = p_bm_self = (1.0f - p_bb) / 3; + + // transitions from kmer skip state in previous block + float p_kk = p_skip_self; + float p_km = 1.0f - p_kk; + + // We assign some transition probabilities. I believe this is correct and they don't vary by location in the sequence (why would they) + float lp_mk = log(p_mk); + float lp_mb = log(p_mb); + float lp_mm_self = log(p_mm_self); + float lp_mm_next = log(p_mm_next); + float lp_bb = log(p_bb); + float lp_bk = log(p_bk); + float lp_bm_next = log(p_bm_next); + float lp_bm_self = log(p_bm_self); + float lp_kk = log(p_kk); + float lp_km = log(p_km); + + + // Start filling out the "DP table" + // Each thread is going to work on an individual P-HMM Block + // WRONG - need to use threadIdx & think carefully. we have one thread per block/kmer. each block has 3 states tho. + //int kmerIdx = blockIdx.x; + int curBlockIdx = kmerIdx + 1; // Accounts for fact that we are not working with start block. 
+ int prevBlockIdx = curBlockIdx -1; + int prevBlockOffset = PSR9_NUM_STATES * prevBlockIdx; + int curBlockOffset = PSR9_NUM_STATES * curBlockIdx; + + // the penalty is controlled by the transition probability + float BAD_EVENT_PENALTY = 0.0f; + + for(int row=1; row sequences, std::vector event_sequences, uint32_t alignment_flags){ + // Extract the pore model. + //Let's assume that every event sequence has the same pore model + //event_sequences[0].pore_model. + // These asserts are here during the development phase assert(!sequences.empty()); assert(std::string(sequences[0].get_alphabet()->get_name()) == "nucleotide"); @@ -93,10 +206,15 @@ double scoreKernel(std::vector sequences, std::vector n_rows; //number of rows in the DP table (n_events + 1) std::vector e_starts; //event starts + std::vector event_strides; for(auto e: event_sequences){ uint32_t e_start = e.event_start_idx; e_starts.push_back(e_start); + + uint32_t e_stride = e.event_stride; + event_strides.push_back(e_stride); + uint32_t e_end = e.event_stop_idx; uint32_t n_events = 0; if(e_end > e_start) @@ -107,6 +225,12 @@ double scoreKernel(std::vector sequences, n_rows.push_back(n_events + 1); } + std::vector kmer_ranks(n_kmers); + std::vector kmer_ranks_rc(n_kmers); + for(size_t ki = 0; ki < n_kmers; ++ki) { + kmer_ranks[ki] = sequences[0].get_kmer_rank(ki, k, false); + kmer_ranks_rc[ki] = sequences[0].get_kmer_rank(ki, k, true); + } // Prepare raw data and send it over to the score calculator kernel @@ -115,10 +239,14 @@ double scoreKernel(std::vector sequences, size_t numEventsTotal = 0; //1. 
Count the total number of events across all reads std::vector eventLengths; + std::vector eventsPerBase; for (auto e: event_sequences){ size_t numEvents = e.read->events->size(); + float readEventsPerBase = e.read->events_per_base[e.strand]; eventLengths.push_back(numEvents); + eventsPerBase.push_back(readEventsPerBase); + numEventsTotal += numEvents; } @@ -127,8 +255,10 @@ double scoreKernel(std::vector sequences, size_t eventMeansSize = numEventsTotal * sizeof(float); cudaHostAlloc(&eventMeans, eventMeansSize , cudaHostAllocDefault); + std::vector eventOffsets; size_t offset = 0; for (auto ev: event_sequences){ + eventOffsets.push_back(offset); size_t num_events = ev.read->events->size(); for (int i=0;ievents[0][i].mean; //taking the first element. Not sure what the second one is.. @@ -136,24 +266,93 @@ double scoreKernel(std::vector sequences, offset += num_events; } + int num_states = event_sequences[0].pore_model->states.size(); + std::vector pore_model_level_log_stdv(num_states); + std::vector pore_model_level_mean(num_states); + std::vector pore_model_level_stdv(num_states); + + for(int st=0; ststates[0]; //let's just initially get the params for AAAAAA + pore_model_level_log_stdv[st] = params.level_log_stdv; + pore_model_level_mean[st] = params.level_mean; + pore_model_level_stdv[st] = params.level_stdv; + } - float* devicePtr; - cudaMalloc( (void**)&devicePtr, eventMeansSize); - cudaMemcpy( devicePtr, eventMeans, eventMeansSize, cudaMemcpyHostToDevice ); - dim3 dimBlock( 1, 1 ); - dim3 dimGrid( 1, 1 ); + float* poreModelLevelLogStdvDev; + cudaMalloc( (void**)&poreModelLevelLogStdvDev, pore_model_level_log_stdv.size() * sizeof(float)); + cudaMemcpy( poreModelLevelLogStdvDev, pore_model_level_log_stdv.data(), pore_model_level_log_stdv.size() * sizeof(float), cudaMemcpyHostToDevice ); + + float* poreModelLevelMeanDev; + cudaMalloc( (void**)&poreModelLevelMeanDev, pore_model_level_mean.size() * sizeof(float)); + cudaMemcpy( poreModelLevelMeanDev, 
pore_model_level_mean.data(), pore_model_level_mean.size() * sizeof(float), cudaMemcpyHostToDevice ); + + float* poreModelLevelStdvDev; + cudaMalloc( (void**)&poreModelLevelStdvDev, pore_model_level_stdv.size() * sizeof(float)); + cudaMemcpy( poreModelLevelStdvDev, pore_model_level_stdv.data(), pore_model_level_stdv.size() * sizeof(float), cudaMemcpyHostToDevice ); + + + float* eventsPerBaseDev; + cudaMalloc( (void**)&eventsPerBaseDev, eventsPerBase.size() * sizeof(float)); + cudaMemcpy( eventsPerBaseDev, eventsPerBase.data(), eventsPerBase.size() * sizeof(float), cudaMemcpyHostToDevice ); + + float* eventMeansDev; + cudaMalloc( (void**)&eventMeansDev, eventMeansSize); + cudaMemcpy( eventMeansDev, eventMeans, eventMeansSize, cudaMemcpyHostToDevice ); + + int* numRowsDev; + cudaMalloc( (void**)&numRowsDev, n_rows.size() * sizeof(int)); + cudaMemcpy( numRowsDev, n_rows.data(), n_rows.size() * sizeof(int), cudaMemcpyHostToDevice ); + + int* kmerRanksDev; + int* kmerRanksRCDev; + cudaMalloc( (void**)&kmerRanksDev, kmer_ranks.size() * sizeof(int)); + cudaMalloc( (void**)&kmerRanksRCDev, kmer_ranks_rc.size() * sizeof(int)); + cudaMemcpy( kmerRanksDev, kmer_ranks.data(), kmer_ranks.size() * sizeof(int), cudaMemcpyHostToDevice ); + cudaMemcpy( kmerRanksRCDev, kmer_ranks_rc.data(), kmer_ranks_rc.size() * sizeof(int), cudaMemcpyHostToDevice ); + + int* eventStartsDev; + cudaMalloc( (void**)&eventStartsDev, e_starts.size() * sizeof(int)); + cudaMemcpy( eventStartsDev, e_starts.data(), e_starts.size() * sizeof(int), cudaMemcpyHostToDevice ); + + int* eventStridesDev; + cudaMalloc( (void**)&eventStridesDev, event_strides.size() * sizeof(int)); + cudaMemcpy( eventStridesDev, event_strides.data(), event_strides.size() * sizeof(int), cudaMemcpyHostToDevice ); + + int* eventOffsetsDev; + cudaMalloc( (void**)&eventOffsetsDev, eventOffsets.size() * sizeof(int)); + cudaMemcpy( eventOffsetsDev, eventOffsets.data(), eventOffsets.size() * sizeof(int), cudaMemcpyHostToDevice ); + + + dim3 
dimBlock(num_models); + + int num_blocks = n_states / PSR9_NUM_STATES; + uint32_t num_kmers = num_blocks - 2; // two terminal blocks + + + dim3 dimGrid(num_blocks - 2); // One thread per state, not including Start and Terminal state. float * returnValues; cudaMalloc((void **) &returnValues, sizeof(float) * num_models); //one score per read float * returnedValues; - getScores<<>>(devicePtr, returnValues); - - cudaMemcpy(returnedValues, returnValues, num_models *sizeof(float), cudaMemcpyDeviceToHost ); - - auto r = returnedValues[0]; - return r; + getScores<<>>(eventMeansDev, + eventsPerBaseDev, + numRowsDev, + eventStartsDev, + eventStridesDev, + kmerRanksDev, + kmerRanksRCDev, + eventOffsetsDev, + poreModelLevelLogStdvDev, + poreModelLevelStdvDev, + poreModelLevelMeanDev, + returnValues); + + //cudaMemcpy(returnedValues, returnValues, num_models *sizeof(float), cudaMemcpyDeviceToHost ); + + //auto r = returnedValues[0]; + return 0.0; } std::vector GpuAligner::variantScoresThresholded(std::vector input_variants, @@ -202,4 +401,4 @@ std::vector GpuAligner::variantScoresThresholded(std::vector in std::vector v; v.push_back(base_score); return v; -} \ No newline at end of file +} diff --git a/src/hmm/nanopolish_profile_hmm_r9.inl b/src/hmm/nanopolish_profile_hmm_r9.inl index 76de768f..bd8ce1e6 100644 --- a/src/hmm/nanopolish_profile_hmm_r9.inl +++ b/src/hmm/nanopolish_profile_hmm_r9.inl @@ -285,7 +285,7 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, // Calculate number of blocks // A block of the HMM is a set of states for one kmer - uint32_t num_blocks = output.get_num_columns() / PSR9_NUM_STATES; // num_columns is the number of kmers + uint32_t num_blocks = output.get_num_columns() / PSR9_NUM_STATES; // num_columns is the number of HMM STATES uint32_t last_event_row_idx = output.get_num_rows() - 1; // Precompute the transition probabilites for each kmer block From 8b74a57fcbec1a06f4a9ddb1fc9e0aff605d99dd Mon Sep 17 00:00:00 2001 From: 
Mike Vella Date: Mon, 11 Jun 2018 16:21:05 +0100 Subject: [PATCH 04/80] estimating emission probabilities --- src/cuda_kernels/GpuAligner.cu | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index c0b5b798..819511c1 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -25,15 +25,13 @@ __device__ float lp_match_r9(int rank, float * poreModelLevelLogStdv, float * poreModelLevelStdv, float * poreModelLevelMean){ + float log_inv_sqrt_2pi = log(0.3989422804014327); // no need to calculate this every time. better solutions available.. // STEP 1: GET DRIFT-SCALED LEVEL: float level = mean; //TODO: Do actual drift scaling. this is a cheat - // TODO: STEP 2: Get *scaled* Gaussian from pore model + // TODO: Apply scaling to these 3 model values as is done in the CPP implementation //these can just be pulled from the model - //float gaussian_mean = 0.0; - //float gaussian_stdv = 0.0; - //float gaussian_log_level_stdv = 0.0; float gaussian_mean = poreModelLevelMean[rank]; float gaussian_stdv = poreModelLevelStdv[rank]; float gaussian_log_level_stdv = poreModelLevelLogStdv[rank]; @@ -57,7 +55,13 @@ __global__ void getScores(float * eventData, float * poreModelLevelMean, float * returnValues) { - printf("Entered\n"); + int MAX_STATES=1024; + // kmer probabilities will be stored here + __shared__ float prevProbabilities[MAX_STATES]; + for (int i =0;i Date: Mon, 11 Jun 2018 17:56:08 +0100 Subject: [PATCH 05/80] kermel Executing to completion but incomplete -WIP --- Makefile | 4 +- src/cuda_kernels/GpuAligner.cu | 89 +++++++++++++-------------- src/hmm/nanopolish_profile_hmm_r9.inl | 1 + src/nanopolish_call_variants.cpp | 12 ++-- 4 files changed, 53 insertions(+), 53 deletions(-) diff --git a/Makefile b/Makefile index 27030a55..afe84ea1 100644 --- a/Makefile +++ b/Makefile @@ -9,9 +9,9 @@ SUBDIRS := src src/hmm src/thirdparty 
src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz -CXXFLAGS ?= -g #-O3 +CXXFLAGS ?= -g -O3 CXXFLAGS += -std=c++11 -fopenmp -fsigned-char -CFLAGS ?= -std=c99 #-O3 +CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 819511c1..d369d030 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -4,19 +4,10 @@ #include #include "nanopolish_profile_hmm_r9.h" -__global__ void findSumToN(int *n, int limit) -{ - //printf("HELLO FROM SUM\n"); - int tId = threadIdx.x; - - for (int i=0; i<=(int)log2((double)limit); i++) - { - if (tId%(int)(pow(2.0,(double)(i+1))) == 0){ - if (tId+(int)pow(2.0, (double)i) >= limit) break; - n[tId] += n[tId+(int)pow(2.0, (double)i)]; - } - __syncthreads(); - } +#define MAX_STATES 1024 + +__device__ float logsumexpf(float x, float y){ + return fmax(x, y) + log1pf(expf(-fabsf(y-x))); } //TODO: Implement, inc pore model @@ -38,8 +29,6 @@ __device__ float lp_match_r9(int rank, // Step 3: calculate log-normal PDF float a = (level - gaussian_mean) / gaussian_stdv; // g is the gaussian parameters return log_inv_sqrt_2pi - gaussian_log_level_stdv + (-0.5f * a * a); // log_inv_sqrt_2pi is defined in a comment above - - return 0.1973; } __global__ void getScores(float * eventData, @@ -55,7 +44,6 @@ __global__ void getScores(float * eventData, float * poreModelLevelMean, float * returnValues) { - int MAX_STATES=1024; // kmer probabilities will be stored here __shared__ float prevProbabilities[MAX_STATES]; for (int i =0;i>>(n_d, y); - cudaMemcpy(n, n_d, asize, cudaMemcpyDeviceToHost); - cudaFree (n_d); - return n[0]; -} - -void GpuAligner::setY(int newVal) -{ - y = newVal; - asize = y*sizeof(int); - for (int i=0; i sequences, std::vector event_sequences, uint32_t alignment_flags){ @@ -184,6 +171,7 @@ double scoreKernel(std::vector sequences, //Let's assume that every event sequence has the same pore model 
//event_sequences[0].pore_model. + int num_reads = event_sequences.size(); // These asserts are here during the development phase assert(!sequences.empty()); assert(std::string(sequences[0].get_alphabet()->get_name()) == "nucleotide"); @@ -325,7 +313,7 @@ double scoreKernel(std::vector sequences, cudaMemcpy( eventOffsetsDev, eventOffsets.data(), eventOffsets.size() * sizeof(int), cudaMemcpyHostToDevice ); - dim3 dimBlock(num_models); + dim3 dimBlock(num_reads); int num_blocks = n_states / PSR9_NUM_STATES; uint32_t num_kmers = num_blocks - 2; // two terminal blocks @@ -334,9 +322,11 @@ double scoreKernel(std::vector sequences, dim3 dimGrid(num_blocks - 2); // One thread per state, not including Start and Terminal state. float * returnValues; - cudaMalloc((void **) &returnValues, sizeof(float) * num_models); //one score per read + cudaMalloc((void **) &returnValues, sizeof(float) * num_reads); //one score per read + + //TODO: this should be a cuda memalloc + float* returnedValues = new float[num_reads]; - float * returnedValues; getScores<<>>(eventMeansDev, eventsPerBaseDev, numRowsDev, @@ -350,10 +340,15 @@ double scoreKernel(std::vector sequences, poreModelLevelMeanDev, returnValues); - //cudaMemcpy(returnedValues, returnValues, num_models *sizeof(float), cudaMemcpyDeviceToHost ); + cudaMemcpy(returnedValues, returnValues, num_reads *sizeof(float), cudaMemcpyDeviceToHost); + + float r = 0.0; + for(int i=0; i GpuAligner::variantScoresThresholded(std::vector input_variants, diff --git a/src/hmm/nanopolish_profile_hmm_r9.inl b/src/hmm/nanopolish_profile_hmm_r9.inl index bd8ce1e6..6b06e633 100644 --- a/src/hmm/nanopolish_profile_hmm_r9.inl +++ b/src/hmm/nanopolish_profile_hmm_r9.inl @@ -369,6 +369,7 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, scores.x[HMT_FROM_SOFT] = -INFINITY; output.update_cell(row, curr_block_offset + PSR9_BAD_EVENT, scores, lp_emission_b); + // in cu this is where the shared memory sync on prev states would go. 
// state PSR9_KMER_SKIP scores.x[HMT_FROM_SAME_M] = -INFINITY; scores.x[HMT_FROM_PREV_M] = bt.lp_mk + output.get(row, prev_block_offset + PSR9_MATCH); diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 34d46ddc..dd01261a 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -291,6 +291,7 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali auto scoring = std::chrono::high_resolution_clock::now() - std::chrono::high_resolution_clock::now(); + auto gpu_exec = std::chrono::high_resolution_clock::now() - std::chrono::high_resolution_clock::now(); for(size_t i = region_start; i < region_end; ++i) { @@ -346,12 +347,12 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali alignments.get_reference_substring(contig, calling_start, calling_end)); GpuAligner aligner; - aligner.setY(15); - std::cout << aligner.calculateSum() < scores = aligner.variantScoresThresholded(tmp_variants, test_haplotype, event_sequences, alignment_flags, opt::screen_score_threshold, opt::methylation_types); + auto tf_gpu = std::chrono::high_resolution_clock::now(); + gpu_exec = tf_gpu - t0_gpu; for(const Variant& v : tmp_variants) { auto t0 = std::chrono::high_resolution_clock::now(); @@ -378,8 +379,11 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali auto screening = std::chrono::duration_cast(scoring).count(); + auto gpu_screening = std::chrono::duration_cast(gpu_exec).count(); + std::cout << "FUNCTION TOOK " << duration << "ms" << std::endl; - std::cout << "SCREENING COMPONENT TOOK " << screening << "ms" << std::endl; + std::cout << "SCREENING (CPU) COMPONENT TOOK " << screening << "ms" << std::endl; + std::cout << "SCREENING (GPU) COMPONENT TOOK " << gpu_screening << "ms" << std::endl; From 2fb2b0b415a1bfc78df2186b2bc9169f0fa3bb09 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Tue, 12 Jun 2018 15:47:31 +0100 Subject: [PATCH 06/80] Sending correct sequences to 
GPU --- src/cuda_kernels/GpuAligner.cu | 81 ++++++++++++++++++-------------- src/nanopolish_call_variants.cpp | 3 +- 2 files changed, 48 insertions(+), 36 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index d369d030..7cfb5c11 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -239,8 +239,8 @@ double scoreKernel(std::vector sequences, numEventsTotal += numEvents; } - float * eventMeans; //Allocate a host buffer to store the event means + float * eventMeans; size_t eventMeansSize = numEventsTotal * sizeof(float); cudaHostAlloc(&eventMeans, eventMeansSize , cudaHostAllocDefault); @@ -256,6 +256,7 @@ double scoreKernel(std::vector sequences, } int num_states = event_sequences[0].pore_model->states.size(); + std::vector pore_model_level_log_stdv(num_states); std::vector pore_model_level_mean(num_states); std::vector pore_model_level_stdv(num_states); @@ -270,47 +271,47 @@ double scoreKernel(std::vector sequences, float* poreModelLevelLogStdvDev; cudaMalloc( (void**)&poreModelLevelLogStdvDev, pore_model_level_log_stdv.size() * sizeof(float)); - cudaMemcpy( poreModelLevelLogStdvDev, pore_model_level_log_stdv.data(), pore_model_level_log_stdv.size() * sizeof(float), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( poreModelLevelLogStdvDev, pore_model_level_log_stdv.data(), pore_model_level_log_stdv.size() * sizeof(float), cudaMemcpyHostToDevice ); float* poreModelLevelMeanDev; cudaMalloc( (void**)&poreModelLevelMeanDev, pore_model_level_mean.size() * sizeof(float)); - cudaMemcpy( poreModelLevelMeanDev, pore_model_level_mean.data(), pore_model_level_mean.size() * sizeof(float), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( poreModelLevelMeanDev, pore_model_level_mean.data(), pore_model_level_mean.size() * sizeof(float), cudaMemcpyHostToDevice ); float* poreModelLevelStdvDev; cudaMalloc( (void**)&poreModelLevelStdvDev, pore_model_level_stdv.size() * sizeof(float)); - cudaMemcpy( poreModelLevelStdvDev, 
pore_model_level_stdv.data(), pore_model_level_stdv.size() * sizeof(float), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( poreModelLevelStdvDev, pore_model_level_stdv.data(), pore_model_level_stdv.size() * sizeof(float), cudaMemcpyHostToDevice ); float* eventsPerBaseDev; cudaMalloc( (void**)&eventsPerBaseDev, eventsPerBase.size() * sizeof(float)); - cudaMemcpy( eventsPerBaseDev, eventsPerBase.data(), eventsPerBase.size() * sizeof(float), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( eventsPerBaseDev, eventsPerBase.data(), eventsPerBase.size() * sizeof(float), cudaMemcpyHostToDevice ); float* eventMeansDev; cudaMalloc( (void**)&eventMeansDev, eventMeansSize); - cudaMemcpy( eventMeansDev, eventMeans, eventMeansSize, cudaMemcpyHostToDevice ); + cudaMemcpyAsync( eventMeansDev, eventMeans, eventMeansSize, cudaMemcpyHostToDevice ); //malloc is taking 300us int* numRowsDev; cudaMalloc( (void**)&numRowsDev, n_rows.size() * sizeof(int)); - cudaMemcpy( numRowsDev, n_rows.data(), n_rows.size() * sizeof(int), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( numRowsDev, n_rows.data(), n_rows.size() * sizeof(int), cudaMemcpyHostToDevice ); int* kmerRanksDev; int* kmerRanksRCDev; cudaMalloc( (void**)&kmerRanksDev, kmer_ranks.size() * sizeof(int)); cudaMalloc( (void**)&kmerRanksRCDev, kmer_ranks_rc.size() * sizeof(int)); - cudaMemcpy( kmerRanksDev, kmer_ranks.data(), kmer_ranks.size() * sizeof(int), cudaMemcpyHostToDevice ); - cudaMemcpy( kmerRanksRCDev, kmer_ranks_rc.data(), kmer_ranks_rc.size() * sizeof(int), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( kmerRanksDev, kmer_ranks.data(), kmer_ranks.size() * sizeof(int), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( kmerRanksRCDev, kmer_ranks_rc.data(), kmer_ranks_rc.size() * sizeof(int), cudaMemcpyHostToDevice ); int* eventStartsDev; cudaMalloc( (void**)&eventStartsDev, e_starts.size() * sizeof(int)); - cudaMemcpy( eventStartsDev, e_starts.data(), e_starts.size() * sizeof(int), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( eventStartsDev, 
e_starts.data(), e_starts.size() * sizeof(int), cudaMemcpyHostToDevice ); int* eventStridesDev; cudaMalloc( (void**)&eventStridesDev, event_strides.size() * sizeof(int)); - cudaMemcpy( eventStridesDev, event_strides.data(), event_strides.size() * sizeof(int), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( eventStridesDev, event_strides.data(), event_strides.size() * sizeof(int), cudaMemcpyHostToDevice ); int* eventOffsetsDev; cudaMalloc( (void**)&eventOffsetsDev, eventOffsets.size() * sizeof(int)); - cudaMemcpy( eventOffsetsDev, eventOffsets.data(), eventOffsets.size() * sizeof(int), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( eventOffsetsDev, eventOffsets.data(), eventOffsets.size() * sizeof(int), cudaMemcpyHostToDevice ); dim3 dimBlock(num_reads); @@ -325,7 +326,9 @@ double scoreKernel(std::vector sequences, cudaMalloc((void **) &returnValues, sizeof(float) * num_reads); //one score per read //TODO: this should be a cuda memalloc - float* returnedValues = new float[num_reads]; + float* returnedValues;// = new float[num_reads]; + //size_t eventMeansSize = numEventsTotal * sizeof(float); + cudaHostAlloc(&returnedValues, num_reads * sizeof(float) , cudaHostAllocDefault); getScores<<>>(eventMeansDev, eventsPerBaseDev, @@ -340,14 +343,31 @@ double scoreKernel(std::vector sequences, poreModelLevelMeanDev, returnValues); - cudaMemcpy(returnedValues, returnValues, num_reads *sizeof(float), cudaMemcpyDeviceToHost); + //cudaDeviceSynchronize(); + cudaMemcpyAsync(returnedValues, returnValues, num_reads *sizeof(float), cudaMemcpyDeviceToHost); + + // Free device memory + cudaFree(eventMeansDev); + cudaFree(eventsPerBaseDev); + cudaFree(numRowsDev); + cudaFree(eventStartsDev); + cudaFree(eventStridesDev); + cudaFree(kmerRanksDev); + cudaFree(kmerRanksRCDev); + cudaFree(eventOffsetsDev); + cudaFree(poreModelLevelLogStdvDev); + cudaFree(poreModelLevelStdvDev); + cudaFree(poreModelLevelMeanDev); + + + //Free host memory + cudaFreeHost(eventMeans); float r = 0.0; for(int i=0; i 
GpuAligner::variantScoresThresholded(std::vector in variant_haplotypes[i].apply_variant(input_variants[i]); } - - //variant_haplotype.apply_variant(input_variant); - // Make methylated versions of each input sequence. Once for the base haplotype and once each for each variant std::vector base_sequences = generate_methylated_alternatives(base_haplotype.get_sequence(), methylation_types); - - assert(base_sequences.size() == 1); - - std::vector> methylatedVariantSequences; - for(auto variant: variant_haplotypes) { - std::vector variant_sequences = generate_methylated_alternatives( - variant.get_sequence(), methylation_types); - methylatedVariantSequences.push_back(variant_sequences); - + std::vector> variant_sequences; + for (auto v: variant_haplotypes){ + auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types); + variant_sequences.push_back(variant_sequence); } - //For now let's not worry about methylation - assert(methylatedVariantSequences.size() == numVariants); - for (auto m: methylatedVariantSequences) { - assert(m.size() == 1); - } - //Next we need to get the scores. 
+ assert(base_sequences.size() == 1); // return the sum of the score for the base sequences over all the event sequences double base_score = scoreKernel(base_sequences, event_sequences, alignment_flags); - std::vector v; - v.push_back(base_score); + std::vector v(variant_sequences.size()); + for (int i=0; i generate_candidate_single_base_edits(const AlignmentDB& ali auto start = std::chrono::high_resolution_clock::now(); std::vector out_variants; + std::vector out_variants_gpu; std::string contig = alignments.get_region_contig(); @@ -352,7 +353,7 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali alignment_flags, opt::screen_score_threshold, opt::methylation_types); auto tf_gpu = std::chrono::high_resolution_clock::now(); - gpu_exec = tf_gpu - t0_gpu; + gpu_exec += tf_gpu - t0_gpu; for(const Variant& v : tmp_variants) { auto t0 = std::chrono::high_resolution_clock::now(); From b614313a7a66b68cbf1e2021c87f51b02835e894 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Tue, 12 Jun 2018 16:00:57 +0100 Subject: [PATCH 07/80] Correct grid size --- src/cuda_kernels/GpuAligner.cu | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 7cfb5c11..50400979 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -313,19 +313,15 @@ double scoreKernel(std::vector sequences, cudaMalloc( (void**)&eventOffsetsDev, eventOffsets.size() * sizeof(int)); cudaMemcpyAsync( eventOffsetsDev, eventOffsets.data(), eventOffsets.size() * sizeof(int), cudaMemcpyHostToDevice ); - - dim3 dimBlock(num_reads); - int num_blocks = n_states / PSR9_NUM_STATES; uint32_t num_kmers = num_blocks - 2; // two terminal blocks - - dim3 dimGrid(num_blocks - 2); // One thread per state, not including Start and Terminal state. + dim3 dimBlock(num_blocks - 2); + dim3 dimGrid(1); // One thread per state, not including Start and Terminal state. 
float * returnValues; cudaMalloc((void **) &returnValues, sizeof(float) * num_reads); //one score per read - //TODO: this should be a cuda memalloc float* returnedValues;// = new float[num_reads]; //size_t eventMeansSize = numEventsTotal * sizeof(float); cudaHostAlloc(&returnedValues, num_reads * sizeof(float) , cudaHostAllocDefault); @@ -359,7 +355,6 @@ double scoreKernel(std::vector sequences, cudaFree(poreModelLevelStdvDev); cudaFree(poreModelLevelMeanDev); - //Free host memory cudaFreeHost(eventMeans); From aa43bc92a755e6124327e7aec34f32444e943604 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Fri, 15 Jun 2018 10:14:40 +0100 Subject: [PATCH 08/80] Match state almost working --- Makefile | 4 +- src/cuda_kernels/GpuAligner.cu | 122 +++++++++++++++++++++----- src/hmm/nanopolish_emissions.h | 11 ++- src/hmm/nanopolish_profile_hmm_r9.inl | 41 +++++++-- 4 files changed, 149 insertions(+), 29 deletions(-) diff --git a/Makefile b/Makefile index afe84ea1..199f5d1d 100644 --- a/Makefile +++ b/Makefile @@ -9,9 +9,9 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz -CXXFLAGS ?= -g -O3 +CXXFLAGS ?= -g -Og CXXFLAGS += -std=c++11 -fopenmp -fsigned-char -CFLAGS ?= -std=c99 -O3 +CFLAGS ?= -std=c99 #-O3 CXX ?= g++ CC ?= gcc NVCC = nvcc diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 50400979..4c851014 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -7,7 +7,11 @@ #define MAX_STATES 1024 __device__ float logsumexpf(float x, float y){ - return fmax(x, y) + log1pf(expf(-fabsf(y-x))); + if(x == -INFINITY && y == -INFINITY){ + return -INFINITY; + } + float result = fmax(x, y) + log1pf(expf(-fabsf(y - x))); + return result; } //TODO: Implement, inc pore model @@ -15,7 +19,8 @@ __device__ float lp_match_r9(int rank, float mean, float * poreModelLevelLogStdv, float * poreModelLevelStdv, - float * poreModelLevelMean){ + float * 
poreModelLevelMean, + bool debug = false){ float log_inv_sqrt_2pi = log(0.3989422804014327); // no need to calculate this every time. better solutions available.. @@ -28,7 +33,23 @@ __device__ float lp_match_r9(int rank, float gaussian_log_level_stdv = poreModelLevelLogStdv[rank]; // Step 3: calculate log-normal PDF float a = (level - gaussian_mean) / gaussian_stdv; // g is the gaussian parameters - return log_inv_sqrt_2pi - gaussian_log_level_stdv + (-0.5f * a * a); // log_inv_sqrt_2pi is defined in a comment above + + float emission = log_inv_sqrt_2pi - gaussian_log_level_stdv + (-0.5f * a * a); // log_inv_sqrt_2pi is defined in a comment above + + if (debug == true) { + if (threadIdx.x == 0) { + printf(">GPU: kmer rank is %i\n", rank); + printf(">GPU: level %f\n", level); + printf(">GPU: gaussian mean %f\n", gaussian_mean); + printf(">GPU: gaussian stdv %f\n", gaussian_stdv); + printf(">GPU: gaussian log level stdv %f\n", gaussian_log_level_stdv); + printf(">GPU a: %f\n", a); + printf(">GPU emission: %f\n", emission); + } + } + + return emission; // log_inv_sqrt_2pi is defined in a comment above + } __global__ void getScores(float * eventData, @@ -42,16 +63,16 @@ __global__ void getScores(float * eventData, float * poreModelLevelLogStdv, float * poreModelLevelStdv, float * poreModelLevelMean, - float * returnValues) -{ - // kmer probabilities will be stored here + float * returnValues) { + + // Initialise the prev probability row, which is the row of the DP table + + int n_states = blockDim.x * PSR9_NUM_STATES + 2 * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. 
__shared__ float prevProbabilities[MAX_STATES]; - for (int i =0;iGPU e_start %i\n", e_start); + } int kmerIdx = threadIdx.x; + uint32_t rank = kmer_ranks[kmerIdx]; // lexical rank of a kmer + printf("Kmer idx %i, Rank: %i\n", kmerIdx, rank); float p_stay = 1 - (1 / read_events_per_base); - - //printf("Events per base: %f \n", read_events_per_base); float p_skip = 0.0025; float p_bad = 0.001; float p_bad_self = p_bad; @@ -97,6 +125,8 @@ __global__ void getScores(float * eventData, float lp_kk = log(p_kk); float lp_km = log(p_km); + float lp_sm, lp_ms; + lp_sm = lp_ms = 0.0f; // Start filling out the "DP table" // Each thread is going to work on an individual P-HMM Block @@ -113,13 +143,20 @@ __global__ void getScores(float * eventData, for(int row=1; rowGPU score HMT_FROM_SAME_M is %f\n", HMT_FROM_SAME_M); + printf(">GPU score HMT_FROM_PREV_M is %f\n", HMT_FROM_PREV_M); + printf(">GPU score HMT_FROM_SAME_B is %f\n", HMT_FROM_SAME_B); + printf(">GPU score HMT_FROM_PREV_B is %f\n", HMT_FROM_PREV_B); + printf(">GPU score HMT_FROM_PREV_K is %f\n", HMT_FROM_PREV_K); + } + // m_s is the probability of going from the start state // to this kmer. The start state is (currently) only // allowed to go to the first kmer. If ALLOW_PRE_CLIP @@ -136,17 +195,40 @@ __global__ void getScores(float * eventData, // with a penalty; // TODO: Implemnet the HMT_FROM_SOFT score. this appears needed but I don't yet understand it. 
- // NOW calculate the score + // calculate the score float sum = HMT_FROM_SAME_M; + + sum = logsumexpf(sum, HMT_FROM_SOFT); + if (debug == true){ + printf("Sum1 is : %f\n", sum); + } sum = logsumexpf(sum, HMT_FROM_PREV_M); + if (debug == true){ + printf("Sum2 is : %f\n", sum); + } + sum = logsumexpf(sum, HMT_FROM_SAME_B); sum = logsumexpf(sum, HMT_FROM_PREV_B); + if (debug == true){ + printf("Sum3 is : %f\n", sum); + } + sum = logsumexpf(sum, HMT_FROM_PREV_K); sum += lp_emission_m; + if (debug == true){ + printf("Sum4 is : %f\n", sum); + } __syncthreads(); - prevProbabilities[curBlockIdx + PSR9_MATCH] = sum; + prevProbabilities[curBlockOffset + PSR9_MATCH] = sum; __syncthreads(); + + if ((threadIdx.x == 0) && (row == 1)) { + printf("Number of states is %i\n", n_states); + for (int c = 0; c < n_states; c++) { + printf("GPU> Value for row 1 and col %i is %f\n", c, prevProbabilities[c]); + } + } } @@ -386,10 +468,10 @@ std::vector GpuAligner::variantScoresThresholded(std::vector in std::vector base_sequences = generate_methylated_alternatives(base_haplotype.get_sequence(), methylation_types); std::vector> variant_sequences; - for (auto v: variant_haplotypes){ - auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types); - variant_sequences.push_back(variant_sequence); - } + //for (auto v: variant_haplotypes){ + // auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types); + // variant_sequences.push_back(variant_sequence); + //} assert(base_sequences.size() == 1); diff --git a/src/hmm/nanopolish_emissions.h b/src/hmm/nanopolish_emissions.h index f9e85142..599a24fb 100644 --- a/src/hmm/nanopolish_emissions.h +++ b/src/hmm/nanopolish_emissions.h @@ -58,11 +58,20 @@ inline float log_probability_match_r9(const SquiggleRead& read, const PoreModel& pore_model, uint32_t kmer_rank, uint32_t event_idx, - uint8_t strand) + uint8_t strand, + bool debug = false) { // event level mean, scaled with the drift 
value float level = read.get_drift_scaled_level(event_idx, strand); + GaussianParameters gp = read.get_scaled_gaussian_from_pore_model_state(pore_model, strand, kmer_rank); + if (debug == true) { + printf(">CPU kmer_rank is: %i\n", kmer_rank); + printf(">CPU level is: %f\n", level); + printf(">CPU gaussian mean: %f\n", gp.mean); + printf(">CPU gaussian stdv: %f\n", gp.stdv); + printf(">CPU gaussian log_level_stdv: %f\n", gp.log_stdv); + } float lp = log_normal_pdf(level, gp); return lp; } diff --git a/src/hmm/nanopolish_profile_hmm_r9.inl b/src/hmm/nanopolish_profile_hmm_r9.inl index 6b06e633..d15161fe 100644 --- a/src/hmm/nanopolish_profile_hmm_r9.inl +++ b/src/hmm/nanopolish_profile_hmm_r9.inl @@ -216,7 +216,7 @@ inline std::vector make_pre_flanking(const HMMInputData& data, pre_flank[i] = log(TRANS_CLIP_SELF) + log_probability_background(*data.read, event_idx, data.strand) + // emit from background pre_flank[i - 1]; // this accounts for the transition from the start & to the silent pre - + } return pre_flank; @@ -261,7 +261,7 @@ inline std::vector make_post_flanking(const HMMInputData& data, template inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, const HMMInputData& _data, - const uint32_t, + const uint32_t, //e_start apparently not used by this function uint32_t flags, ProfileHMMOutput& output) { @@ -282,7 +282,8 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, #endif uint32_t e_start = data.event_start_idx; - + + printf(">CPU e_start: %i\n", e_start); // Calculate number of blocks // A block of the HMM is a set of states for one kmer uint32_t num_blocks = output.get_num_columns() / PSR9_NUM_STATES; // num_columns is the number of HMM STATES @@ -301,8 +302,11 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, assert( data.pore_model->states.size() == sequence.get_num_kmer_ranks(k) ); std::vector kmer_ranks(num_kmers); - for(size_t ki = 0; ki < num_kmers; ++ki) - 
kmer_ranks[ki] = sequence.get_kmer_rank(ki, k, data.rc); + for(size_t ki = 0; ki < num_kmers; ++ki) { + int kr = sequence.get_kmer_rank(ki, k, data.rc); + printf("Kmer rank: %i\n", kr); + kmer_ranks[ki] = kr; + } size_t num_events = output.get_num_rows() - 1; @@ -337,7 +341,7 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, // Emission probabilities uint32_t event_idx = e_start + (row - 1) * data.event_stride; uint32_t rank = kmer_ranks[kmer_idx]; - float lp_emission_m = log_probability_match_r9(*data.read, *data.pore_model, rank, event_idx, data.strand); + float lp_emission_m = log_probability_match_r9(*data.read, *data.pore_model, rank, event_idx, data.strand, true); float lp_emission_b = BAD_EVENT_PENALTY; HMMUpdateScores scores; @@ -360,6 +364,20 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, output.update_cell(row, curr_block_offset + PSR9_MATCH, scores, lp_emission_m); + printf("======\n"); + //diagnostics - after match has been applied + if (row == 1) { + auto nc = output.get_num_columns(); + //for (int i = 0; i < nc; i++) { + // printf("CPU> Value for row 0 col %i is %f\n", i, output.get(0, i)); + //} + for (int i = 0; i < nc; i++) { + printf("CPU> Value for row 1 col %i is %f\n", i, output.get(1, i)); + } + } + + + // state PSR9_BAD_EVENT scores.x[HMT_FROM_SAME_M] = bt.lp_mb + output.get(row - 1, curr_block_offset + PSR9_MATCH); scores.x[HMT_FROM_PREV_M] = -INFINITY; // not allowed @@ -369,6 +387,16 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, scores.x[HMT_FROM_SOFT] = -INFINITY; output.update_cell(row, curr_block_offset + PSR9_BAD_EVENT, scores, lp_emission_b); + if ((block == 1) && (row == 1)){ //blcok 1 corresponds to threadIdx 0 on GPU + printf("lp_emission_m is %f\n", lp_emission_m); + printf("PSR9_MATCH is %i\n", PSR9_MATCH); + printf(">CPU score HMT_FROM_SAME_M is %f\n", scores.x[HMT_FROM_SAME_M]); + printf(">CPU score HMT_FROM_PREV_M is %f\n", 
scores.x[HMT_FROM_PREV_M]); + printf(">CPU score HMT_FROM_SAME_B is %f\n", scores.x[HMT_FROM_SAME_B]); + printf(">CPU score HMT_FROM_PREV_B is %f\n", scores.x[HMT_FROM_PREV_B]); + printf(">CPU score HMT_FROM_PREV_K is %f\n", scores.x[HMT_FROM_PREV_K]); + } + // in cu this is where the shared memory sync on prev states would go. // state PSR9_KMER_SKIP scores.x[HMT_FROM_SAME_M] = -INFINITY; @@ -425,6 +453,7 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, #endif } } + return output.get_end(); } From a4dbf437da740ed0ee490833ba95aaea35873c20 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Fri, 15 Jun 2018 17:03:32 +0100 Subject: [PATCH 09/80] All states now being updated, but no terminal kmer or scaling --- src/cuda_kernels/GpuAligner.cu | 107 ++++++++++++++++++++------ src/hmm/nanopolish_profile_hmm_r9.inl | 48 ++++++------ 2 files changed, 105 insertions(+), 50 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 4c851014..3bc3bcdf 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -173,28 +173,6 @@ __global__ void getScores(float * eventData, // with a penalty; float HMT_FROM_SOFT = (kmerIdx == 0 && (event_idx == e_start)) ? lp_sm : -INFINITY; // TODO: Add the pre-flank to this calculation. 
Also flags and HAF_ALLOW_PRE_CLIP - if ((threadIdx.x == 0) && (row == 1)){ - printf("rank %i\n", rank); - printf("event mean %f\n", event_mean); - printf("poreModelLevelLogStdv %f\n", poreModelLevelLogStdv); - printf("poreModelLevelStdv %f\n", poreModelLevelStdv); - printf("poreModelLevelMean %f\n", poreModelLevelMean); - printf("lp_emission_m is %f\n", lp_emission_m); - printf("PSR9_MATCH is %i\n", PSR9_MATCH); - printf(">GPU score HMT_FROM_SAME_M is %f\n", HMT_FROM_SAME_M); - printf(">GPU score HMT_FROM_PREV_M is %f\n", HMT_FROM_PREV_M); - printf(">GPU score HMT_FROM_SAME_B is %f\n", HMT_FROM_SAME_B); - printf(">GPU score HMT_FROM_PREV_B is %f\n", HMT_FROM_PREV_B); - printf(">GPU score HMT_FROM_PREV_K is %f\n", HMT_FROM_PREV_K); - } - - // m_s is the probability of going from the start state - // to this kmer. The start state is (currently) only - // allowed to go to the first kmer. If ALLOW_PRE_CLIP - // is defined, we allow all events before this one to be skipped, - // with a penalty; - // TODO: Implemnet the HMT_FROM_SOFT score. this appears needed but I don't yet understand it. 
- // calculate the score float sum = HMT_FROM_SAME_M; @@ -219,14 +197,93 @@ __global__ void getScores(float * eventData, printf("Sum4 is : %f\n", sum); } + float newMatchScore = sum; + // Here need to calculate the bad event score + + // state PSR9_BAD_EVENT + HMT_FROM_SAME_M = lp_mb + prevProbabilities[curBlockOffset + PSR9_MATCH]; + HMT_FROM_PREV_M = -INFINITY; // not allowed + HMT_FROM_SAME_B = lp_bb + prevProbabilities[curBlockOffset + PSR9_BAD_EVENT]; + HMT_FROM_PREV_B = -INFINITY; + HMT_FROM_PREV_K = -INFINITY; + HMT_FROM_SOFT = -INFINITY; + + sum = HMT_FROM_SAME_M; + sum = logsumexpf(sum, HMT_FROM_PREV_M); + sum = logsumexpf(sum, HMT_FROM_SAME_B); + sum = logsumexpf(sum, HMT_FROM_PREV_B); + sum = logsumexpf(sum, HMT_FROM_PREV_K); + sum = logsumexpf(sum, HMT_FROM_SOFT); + sum += lp_emission_b; + + float newBadEventScore = sum; + + // Write row out + prevProbabilities[curBlockOffset + PSR9_MATCH] = newMatchScore; + prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] = newBadEventScore; __syncthreads(); - prevProbabilities[curBlockOffset + PSR9_MATCH] = sum; + + // state PSR9_KMER_SKIP + HMT_FROM_SAME_M = -INFINITY; + HMT_FROM_PREV_M = lp_mk + prevProbabilities[prevBlockOffset + PSR9_MATCH]; + HMT_FROM_SAME_B = -INFINITY; + HMT_FROM_PREV_B = lp_bk + prevProbabilities[prevBlockOffset + PSR9_BAD_EVENT]; + + HMT_FROM_SOFT = -INFINITY; + + sum = HMT_FROM_SAME_M; + sum = logsumexpf(sum, HMT_FROM_PREV_M); + sum = logsumexpf(sum, HMT_FROM_SAME_B); + sum = logsumexpf(sum, HMT_FROM_PREV_B); + sum = logsumexpf(sum, HMT_FROM_PREV_K); + sum = logsumexpf(sum, HMT_FROM_SOFT); + sum += 0.0;//No emission. redundant. + + float newSkipScore = sum; + + prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] = newSkipScore; __syncthreads(); - if ((threadIdx.x == 0) && (row == 1)) { + //Now need to do the skip-skip transition, which is serial. 
+ if (threadIdx.x == 0){ + for (int blkidx = 2;blkidx <= blockDim.x; blkidx++){ + //calculate the skipscore using the previous + //Current skip score for block blkidx: + float curSkipScore = prevProbabilities[blkidx * PSR9_NUM_STATES + PSR9_KMER_SKIP]; + printf("Current skip score for block %i is %f",blkidx, curSkipScore); + //new score to add - TODO: use the correct lp_kk score + + HMT_FROM_PREV_K = lp_kk + newSkipScore; + newSkipScore = logsumexpf(curSkipScore, HMT_FROM_PREV_K); + //add it + prevProbabilities[blkidx * PSR9_NUM_STATES + PSR9_KMER_SKIP] = newSkipScore; + } + } + + // Now do the end state + __syncthreads(); + + if ((threadIdx.x == 1) && (row == 1)){ + printf("rank %i\n", rank); + printf("event mean %f\n", event_mean); + printf("poreModelLevelLogStdv %f\n", poreModelLevelLogStdv); + printf("poreModelLevelStdv %f\n", poreModelLevelStdv); + printf("poreModelLevelMean %f\n", poreModelLevelMean); + printf("lp_emission_m is %f\n", lp_emission_m); + printf("PSR9_MATCH is %i\n", PSR9_MATCH); + printf(">GPU score HMT_FROM_SAME_M is %f\n", HMT_FROM_SAME_M); + printf(">GPU score HMT_FROM_PREV_M is %f\n", HMT_FROM_PREV_M); + printf(">GPU score HMT_FROM_SAME_B is %f\n", HMT_FROM_SAME_B); + printf(">GPU score HMT_FROM_PREV_B is %f\n", HMT_FROM_PREV_B); + printf(">GPU score HMT_FROM_PREV_K is %f\n", HMT_FROM_PREV_K); + printf(">GPU newSkipScore is %f\n", newSkipScore); + } + + + if ((threadIdx.x == 0) && (row == 3)) { printf("Number of states is %i\n", n_states); for (int c = 0; c < n_states; c++) { - printf("GPU> Value for row 1 and col %i is %f\n", c, prevProbabilities[c]); + printf("GPU> Value for row 3 and col %i is %f\n", c, prevProbabilities[c]); } } } diff --git a/src/hmm/nanopolish_profile_hmm_r9.inl b/src/hmm/nanopolish_profile_hmm_r9.inl index d15161fe..d8738101 100644 --- a/src/hmm/nanopolish_profile_hmm_r9.inl +++ b/src/hmm/nanopolish_profile_hmm_r9.inl @@ -326,6 +326,16 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, // 
Fill in matrix for(uint32_t row = 1; row < output.get_num_rows(); row++) { + printf("======\n"); + //diagnostics - after match and bad event have been applied + if (row == 4) { // row 1 has been computed so we can have a peek + auto nc = output.get_num_columns(); + int rw = 3; + for (int i = 0; i < nc; i++) { + printf("CPU> Value for row %i col %i is %f\n", rw, i, output.get(rw, i)); + } + } + // Skip the first block which is the start state, it was initialized above // Similarily skip the last block, which is calculated in the terminate() function for(uint32_t block = 1; block < num_blocks - 1; block++) { @@ -364,38 +374,16 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, output.update_cell(row, curr_block_offset + PSR9_MATCH, scores, lp_emission_m); - printf("======\n"); - //diagnostics - after match has been applied - if (row == 1) { - auto nc = output.get_num_columns(); - //for (int i = 0; i < nc; i++) { - // printf("CPU> Value for row 0 col %i is %f\n", i, output.get(0, i)); - //} - for (int i = 0; i < nc; i++) { - printf("CPU> Value for row 1 col %i is %f\n", i, output.get(1, i)); - } - } - - - - // state PSR9_BAD_EVENT + // state PSR9_BAD_EVENT scores.x[HMT_FROM_SAME_M] = bt.lp_mb + output.get(row - 1, curr_block_offset + PSR9_MATCH); scores.x[HMT_FROM_PREV_M] = -INFINITY; // not allowed scores.x[HMT_FROM_SAME_B] = bt.lp_bb + output.get(row - 1, curr_block_offset + PSR9_BAD_EVENT); scores.x[HMT_FROM_PREV_B] = -INFINITY; scores.x[HMT_FROM_PREV_K] = -INFINITY; scores.x[HMT_FROM_SOFT] = -INFINITY; + printf("before: %f:\n", output.get(row, curr_block_offset + PSR9_BAD_EVENT)); output.update_cell(row, curr_block_offset + PSR9_BAD_EVENT, scores, lp_emission_b); - - if ((block == 1) && (row == 1)){ //blcok 1 corresponds to threadIdx 0 on GPU - printf("lp_emission_m is %f\n", lp_emission_m); - printf("PSR9_MATCH is %i\n", PSR9_MATCH); - printf(">CPU score HMT_FROM_SAME_M is %f\n", scores.x[HMT_FROM_SAME_M]); - printf(">CPU score 
HMT_FROM_PREV_M is %f\n", scores.x[HMT_FROM_PREV_M]); - printf(">CPU score HMT_FROM_SAME_B is %f\n", scores.x[HMT_FROM_SAME_B]); - printf(">CPU score HMT_FROM_PREV_B is %f\n", scores.x[HMT_FROM_PREV_B]); - printf(">CPU score HMT_FROM_PREV_K is %f\n", scores.x[HMT_FROM_PREV_K]); - } + printf("after: %f:\n", output.get(row, curr_block_offset + PSR9_BAD_EVENT)); // in cu this is where the shared memory sync on prev states would go. // state PSR9_KMER_SKIP @@ -407,6 +395,16 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, scores.x[HMT_FROM_SOFT] = -INFINITY; output.update_cell(row, curr_block_offset + PSR9_KMER_SKIP, scores, 0.0f); // no emission + if ((block == 1) && (row == 1)){ //blcok 1 corresponds to threadIdx 0 on GPU + printf("lp_emission_m is %f\n", lp_emission_m); + printf("PSR9_MATCH is %i\n", PSR9_MATCH); + printf(">CPU score HMT_FROM_SAME_M is %f\n", scores.x[HMT_FROM_SAME_M]); + printf(">CPU score HMT_FROM_PREV_M is %f\n", scores.x[HMT_FROM_PREV_M]); + printf(">CPU score HMT_FROM_SAME_B is %f\n", scores.x[HMT_FROM_SAME_B]); + printf(">CPU score HMT_FROM_PREV_B is %f\n", scores.x[HMT_FROM_PREV_B]); + printf(">CPU score HMT_FROM_PREV_K is %f\n", scores.x[HMT_FROM_PREV_K]); + } + // If POST_CLIP is enabled we allow the last kmer to transition directly // to the end after any event. Otherwise we only allow it from the // last kmer/event match. 
From 46e6ead916c8293ff0f93e197996360d6cf6c125 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Mon, 25 Jun 2018 17:34:34 +0100 Subject: [PATCH 10/80] diagnosing issue --- src/cuda_kernels/GpuAligner.cu | 95 ++++++++++++++++++++++----- src/hmm/nanopolish_emissions.h | 5 +- src/hmm/nanopolish_profile_hmm_r9.inl | 27 ++++---- src/nanopolish_call_variants.cpp | 3 + 4 files changed, 101 insertions(+), 29 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 3bc3bcdf..3cef2919 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -20,17 +20,23 @@ __device__ float lp_match_r9(int rank, float * poreModelLevelLogStdv, float * poreModelLevelStdv, float * poreModelLevelMean, + float scale, + float shift, + float var, + float logVar, bool debug = false){ float log_inv_sqrt_2pi = log(0.3989422804014327); // no need to calculate this every time. better solutions available.. // STEP 1: GET DRIFT-SCALED LEVEL: - float level = mean; //TODO: Do actual drift scaling. 
this is a cheat + float level = mean; // TODO: Apply scaling to these 3 model values as is done in the CPP implementation //these can just be pulled from the model - float gaussian_mean = poreModelLevelMean[rank]; - float gaussian_stdv = poreModelLevelStdv[rank]; - float gaussian_log_level_stdv = poreModelLevelLogStdv[rank]; + + float gaussian_mean = scale * poreModelLevelMean[rank] + shift; + float gaussian_stdv = poreModelLevelStdv[rank] * var; + float gaussian_log_level_stdv = poreModelLevelLogStdv[rank] + logVar; + // Step 3: calculate log-normal PDF float a = (level - gaussian_mean) / gaussian_stdv; // g is the gaussian parameters @@ -63,6 +69,10 @@ __global__ void getScores(float * eventData, float * poreModelLevelLogStdv, float * poreModelLevelStdv, float * poreModelLevelMean, + float * scaleDev, + float * shiftDev, + float * varDev, + float * logVarDev, float * returnValues) { // Initialise the prev probability row, which is the row of the DP table @@ -140,11 +150,18 @@ __global__ void getScores(float * eventData, // the penalty is controlled by the transition probability float BAD_EVENT_PENALTY = 0.0f; + float scale = scaleDev[readIdx]; + float shift = shiftDev[readIdx]; + float var = varDev[readIdx]; + float logVar = logVarDev[readIdx]; + for(int row=1; row lp_emission_m %f\n", lp_emission_m); + printf("GPU> level being used to calculate emission: %f\n", event_mean); + } float lp_emission_b = BAD_EVENT_PENALTY; // Get all the scores for a match @@ -250,7 +276,7 @@ __global__ void getScores(float * eventData, //calculate the skipscore using the previous //Current skip score for block blkidx: float curSkipScore = prevProbabilities[blkidx * PSR9_NUM_STATES + PSR9_KMER_SKIP]; - printf("Current skip score for block %i is %f",blkidx, curSkipScore); + //printf("Current skip score for block %i is %f",blkidx, curSkipScore); //new score to add - TODO: use the correct lp_kk score HMT_FROM_PREV_K = lp_kk + newSkipScore; @@ -266,9 +292,9 @@ __global__ void 
getScores(float * eventData, if ((threadIdx.x == 1) && (row == 1)){ printf("rank %i\n", rank); printf("event mean %f\n", event_mean); - printf("poreModelLevelLogStdv %f\n", poreModelLevelLogStdv); - printf("poreModelLevelStdv %f\n", poreModelLevelStdv); - printf("poreModelLevelMean %f\n", poreModelLevelMean); + printf("poreModelLevelLogStdv %f\n", poreModelLevelLogStdv[0]); + printf("poreModelLevelStdv %f\n", poreModelLevelStdv[0]); + printf("poreModelLevelMean %f\n", poreModelLevelMean[0]); printf("lp_emission_m is %f\n", lp_emission_m); printf("PSR9_MATCH is %i\n", PSR9_MATCH); printf(">GPU score HMT_FROM_SAME_M is %f\n", HMT_FROM_SAME_M); @@ -280,10 +306,10 @@ __global__ void getScores(float * eventData, } - if ((threadIdx.x == 0) && (row == 3)) { + if ((threadIdx.x == 0) && (row == 1)) { printf("Number of states is %i\n", n_states); for (int c = 0; c < n_states; c++) { - printf("GPU> Value for row 3 and col %i is %f\n", c, prevProbabilities[c]); + printf("GPU> Value for row %i and col %i is %f\n",row, c, prevProbabilities[c]); } } } @@ -389,7 +415,9 @@ double scoreKernel(std::vector sequences, eventOffsets.push_back(offset); size_t num_events = ev.read->events->size(); for (int i=0;ievents[0][i].mean; //taking the first element. Not sure what the second one is.. + auto scaled = ev.read->get_drift_scaled_level(i, ev.strand); // send the data in drift scaled + //auto unscaled = ev.read->events[0][i].mean; //taking the first element. Not sure what the second one is.. + eventMeans[offset + i] = scaled; } offset += num_events; } @@ -400,13 +428,41 @@ double scoreKernel(std::vector sequences, std::vector pore_model_level_mean(num_states); std::vector pore_model_level_stdv(num_states); + //TODO: Fix this. 
for(int st=0; ststates[0]; //let's just initially get the params for AAAAAA + auto params = event_sequences[0].pore_model->states[st]; //let's just initially get the params for AAAAAA pore_model_level_log_stdv[st] = params.level_log_stdv; pore_model_level_mean[st] = params.level_mean; pore_model_level_stdv[st] = params.level_stdv; } + std::vector scale(num_reads); + std::vector shift(num_reads); + std::vector var(num_reads); + std::vector log_var(num_reads); + + for (int i=0;iscalings->scale; + shift[i] = event_sequences[i].read->scalings->shift; + var[i] = event_sequences[i].read->scalings->var; + log_var[i] = event_sequences[i].read->scalings->log_var; + } + + float* scaleDev; + float* shiftDev; + float* varDev; + float* logVarDev; + + cudaMalloc( (void**)&scaleDev, scale.size() * sizeof(float)); + cudaMalloc( (void**)&shiftDev, shift.size() * sizeof(float)); + cudaMalloc( (void**)&varDev, var.size() * sizeof(float)); + cudaMalloc( (void**)&logVarDev, log_var.size() * sizeof(float)); + + cudaMemcpyAsync( scaleDev, scale.data(), scale.size() * sizeof(float), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( shiftDev, shift.data(), shift.size() * sizeof(float), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( varDev, var.data(), var.size() * sizeof(float), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( logVarDev, log_var.data(), log_var.size() * sizeof(float), cudaMemcpyHostToDevice ); + float* poreModelLevelLogStdvDev; cudaMalloc( (void**)&poreModelLevelLogStdvDev, pore_model_level_log_stdv.size() * sizeof(float)); @@ -455,8 +511,8 @@ double scoreKernel(std::vector sequences, int num_blocks = n_states / PSR9_NUM_STATES; uint32_t num_kmers = num_blocks - 2; // two terminal blocks - dim3 dimBlock(num_blocks - 2); - dim3 dimGrid(1); // One thread per state, not including Start and Terminal state. + dim3 dimBlock(num_blocks - 2); // One thread per state, not including Start and Terminal state. 
+ dim3 dimGrid(1); // Only looking at first event at the moment float * returnValues; cudaMalloc((void **) &returnValues, sizeof(float) * num_reads); //one score per read @@ -476,6 +532,10 @@ double scoreKernel(std::vector sequences, poreModelLevelLogStdvDev, poreModelLevelStdvDev, poreModelLevelMeanDev, + scaleDev, + shiftDev, + varDev, + logVarDev, returnValues); //cudaDeviceSynchronize(); @@ -493,6 +553,10 @@ double scoreKernel(std::vector sequences, cudaFree(poreModelLevelLogStdvDev); cudaFree(poreModelLevelStdvDev); cudaFree(poreModelLevelMeanDev); + cudaFree(scaleDev); + cudaFree(shiftDev); + cudaFree(varDev); + cudaFree(logVarDev); //Free host memory cudaFreeHost(eventMeans); @@ -525,6 +589,7 @@ std::vector GpuAligner::variantScoresThresholded(std::vector in std::vector base_sequences = generate_methylated_alternatives(base_haplotype.get_sequence(), methylation_types); std::vector> variant_sequences; + //for (auto v: variant_haplotypes){ // auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types); // variant_sequences.push_back(variant_sequence); diff --git a/src/hmm/nanopolish_emissions.h b/src/hmm/nanopolish_emissions.h index 599a24fb..7b5c8108 100644 --- a/src/hmm/nanopolish_emissions.h +++ b/src/hmm/nanopolish_emissions.h @@ -63,9 +63,12 @@ inline float log_probability_match_r9(const SquiggleRead& read, { // event level mean, scaled with the drift value float level = read.get_drift_scaled_level(event_idx, strand); - + if (debug == true){ + printf("Level being used to calculate emission: %f\n", level); + } GaussianParameters gp = read.get_scaled_gaussian_from_pore_model_state(pore_model, strand, kmer_rank); if (debug == true) { + printf(">CPU Strand is: %i\n", strand); printf(">CPU kmer_rank is: %i\n", kmer_rank); printf(">CPU level is: %f\n", level); printf(">CPU gaussian mean: %f\n", gp.mean); diff --git a/src/hmm/nanopolish_profile_hmm_r9.inl b/src/hmm/nanopolish_profile_hmm_r9.inl index d8738101..bc0235c6 100644 
--- a/src/hmm/nanopolish_profile_hmm_r9.inl +++ b/src/hmm/nanopolish_profile_hmm_r9.inl @@ -326,11 +326,11 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, // Fill in matrix for(uint32_t row = 1; row < output.get_num_rows(); row++) { - printf("======\n"); + //printf("======\n"); //diagnostics - after match and bad event have been applied if (row == 4) { // row 1 has been computed so we can have a peek auto nc = output.get_num_columns(); - int rw = 3; + int rw = 1; for (int i = 0; i < nc; i++) { printf("CPU> Value for row %i col %i is %f\n", rw, i, output.get(rw, i)); } @@ -352,6 +352,7 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, uint32_t event_idx = e_start + (row - 1) * data.event_stride; uint32_t rank = kmer_ranks[kmer_idx]; float lp_emission_m = log_probability_match_r9(*data.read, *data.pore_model, rank, event_idx, data.strand, true); + printf("CPU> lp_emission_m %f\n", lp_emission_m); float lp_emission_b = BAD_EVENT_PENALTY; HMMUpdateScores scores; @@ -381,9 +382,9 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, scores.x[HMT_FROM_PREV_B] = -INFINITY; scores.x[HMT_FROM_PREV_K] = -INFINITY; scores.x[HMT_FROM_SOFT] = -INFINITY; - printf("before: %f:\n", output.get(row, curr_block_offset + PSR9_BAD_EVENT)); + //printf("before: %f:\n", output.get(row, curr_block_offset + PSR9_BAD_EVENT)); output.update_cell(row, curr_block_offset + PSR9_BAD_EVENT, scores, lp_emission_b); - printf("after: %f:\n", output.get(row, curr_block_offset + PSR9_BAD_EVENT)); + //printf("after: %f:\n", output.get(row, curr_block_offset + PSR9_BAD_EVENT)); // in cu this is where the shared memory sync on prev states would go. 
// state PSR9_KMER_SKIP @@ -395,15 +396,15 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, scores.x[HMT_FROM_SOFT] = -INFINITY; output.update_cell(row, curr_block_offset + PSR9_KMER_SKIP, scores, 0.0f); // no emission - if ((block == 1) && (row == 1)){ //blcok 1 corresponds to threadIdx 0 on GPU - printf("lp_emission_m is %f\n", lp_emission_m); - printf("PSR9_MATCH is %i\n", PSR9_MATCH); - printf(">CPU score HMT_FROM_SAME_M is %f\n", scores.x[HMT_FROM_SAME_M]); - printf(">CPU score HMT_FROM_PREV_M is %f\n", scores.x[HMT_FROM_PREV_M]); - printf(">CPU score HMT_FROM_SAME_B is %f\n", scores.x[HMT_FROM_SAME_B]); - printf(">CPU score HMT_FROM_PREV_B is %f\n", scores.x[HMT_FROM_PREV_B]); - printf(">CPU score HMT_FROM_PREV_K is %f\n", scores.x[HMT_FROM_PREV_K]); - } + //if ((block == 1) && (row == 1)){ //blcok 1 corresponds to threadIdx 0 on GPU + // printf("lp_emission_m is %f\n", lp_emission_m); + // printf("PSR9_MATCH is %i\n", PSR9_MATCH); + // printf(">CPU score HMT_FROM_SAME_M is %f\n", scores.x[HMT_FROM_SAME_M]); + // printf(">CPU score HMT_FROM_PREV_M is %f\n", scores.x[HMT_FROM_PREV_M]); + // printf(">CPU score HMT_FROM_SAME_B is %f\n", scores.x[HMT_FROM_SAME_B]); + // printf(">CPU score HMT_FROM_PREV_B is %f\n", scores.x[HMT_FROM_PREV_B]); + // printf(">CPU score HMT_FROM_PREV_K is %f\n", scores.x[HMT_FROM_PREV_K]); + //} // If POST_CLIP is enabled we allow the last kmer to transition directly // to the end after any event. Otherwise we only allow it from the diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 97319c76..ec289603 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -349,9 +349,12 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali GpuAligner aligner; auto t0_gpu = std::chrono::high_resolution_clock::now(); + // get the scaled levels. 
+ std::vector scores = aligner.variantScoresThresholded(tmp_variants, test_haplotype, event_sequences, alignment_flags, opt::screen_score_threshold, opt::methylation_types); + auto tf_gpu = std::chrono::high_resolution_clock::now(); gpu_exec += tf_gpu - t0_gpu; From e148c87f54c145cfcfddc2d4d44f4679be98ab94 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Wed, 27 Jun 2018 13:22:24 +0100 Subject: [PATCH 11/80] Dynamic Programming Table the same for GPU and CPU except end --- src/cuda_kernels/GpuAligner.cu | 88 +++++++++++++++++++-------- src/hmm/nanopolish_emissions.h | 1 + src/hmm/nanopolish_profile_hmm_r9.inl | 14 +---- 3 files changed, 66 insertions(+), 37 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 3cef2919..204078a4 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -73,6 +73,7 @@ __global__ void getScores(float * eventData, float * shiftDev, float * varDev, float * logVarDev, + float * preFlankingDev, float * returnValues) { // Initialise the prev probability row, which is the row of the DP table @@ -91,9 +92,6 @@ __global__ void getScores(float * eventData, int e_stride = eventStrides[readIdx]; int e_offset = eventOffsets[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event - //float levelLogStdv = poreModelLevelLogStdv[e_offset]; - //float levelStdv = poreModelLevelStdv[e_offset]; - //float levelMean = poreModelLevelMean[e_offset]; if (threadIdx.x == 0){ printf(">GPU e_start %i\n", e_start); @@ -158,12 +156,12 @@ __global__ void getScores(float * eventData, for(int row=1; row lp_emission_m %f\n", lp_emission_m); - printf("GPU> level being used to calculate emission: %f\n", event_mean); - } + float lp_emission_b = BAD_EVENT_PENALTY; // Get all the scores for a match @@ -192,12 +186,18 @@ __global__ void getScores(float * eventData, float HMT_FROM_PREV_B = lp_bm_next + prevProbabilities[prevBlockOffset + PSR9_BAD_EVENT]; float 
HMT_FROM_PREV_K = lp_km + prevProbabilities[prevBlockOffset + PSR9_KMER_SKIP]; + + // m_s is the probability of going from the start state // to this kmer. The start state is (currently) only // allowed to go to the first kmer. If ALLOW_PRE_CLIP // is defined, we allow all events before this one to be skipped, // with a penalty; - float HMT_FROM_SOFT = (kmerIdx == 0 && (event_idx == e_start)) ? lp_sm : -INFINITY; // TODO: Add the pre-flank to this calculation. Also flags and HAF_ALLOW_PRE_CLIP + float HMT_FROM_SOFT = (kmerIdx == 0 && + (event_idx == e_start || + (HAF_ALLOW_PRE_CLIP))) ? lp_sm + preFlank : -INFINITY; // TEST! TODO: Add the pre-flank to this calculation. Also flags and HAF_ALLOW_PRE_CLIP. For now this is left out and should not have a big effect + + // calculate the score float sum = HMT_FROM_SAME_M; @@ -226,6 +226,18 @@ __global__ void getScores(float * eventData, float newMatchScore = sum; // Here need to calculate the bad event score + if (debug==true){ + printf("GPU> lp_emission_m for row %i and thread %i %f\n", row, threadIdx.x, lp_emission_m); + printf("GPU> level being used to calculate emission for thread 0: %f\n", event_mean); + printf("GPU> match score for row %i and thread %i %f\\n\", row, threadIdx.x", newMatchScore); + printf("GPU> HMT_FROM_SAME_M: %f\n", HMT_FROM_SAME_M); + printf("GPU> HMT_FROM_PREV_M: %f\n", HMT_FROM_PREV_M); + printf("GPU> HMT_FROM_SAME_B: %f\n", HMT_FROM_SAME_B); + printf("GPU> HMT_FROM_PREV_B: %f\n", HMT_FROM_PREV_B); + printf("GPU> HMT_FROM_PREV_K: %f\n", HMT_FROM_PREV_K); + printf("GPU> HMT_FROM_SOFT: %f\n", HMT_FROM_SOFT); + + } // state PSR9_BAD_EVENT HMT_FROM_SAME_M = lp_mb + prevProbabilities[curBlockOffset + PSR9_MATCH]; HMT_FROM_PREV_M = -INFINITY; // not allowed @@ -244,7 +256,7 @@ __global__ void getScores(float * eventData, float newBadEventScore = sum; - // Write row out + // Write row out. prevProbabilities now becomes "current probabilities" for evaluating skips. 
prevProbabilities[curBlockOffset + PSR9_MATCH] = newMatchScore; prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] = newBadEventScore; __syncthreads(); @@ -254,7 +266,6 @@ __global__ void getScores(float * eventData, HMT_FROM_PREV_M = lp_mk + prevProbabilities[prevBlockOffset + PSR9_MATCH]; HMT_FROM_SAME_B = -INFINITY; HMT_FROM_PREV_B = lp_bk + prevProbabilities[prevBlockOffset + PSR9_BAD_EVENT]; - HMT_FROM_SOFT = -INFINITY; sum = HMT_FROM_SAME_M; @@ -273,23 +284,26 @@ __global__ void getScores(float * eventData, //Now need to do the skip-skip transition, which is serial. if (threadIdx.x == 0){ for (int blkidx = 2;blkidx <= blockDim.x; blkidx++){ + auto skipIdx = blkidx * PSR9_NUM_STATES + PSR9_KMER_SKIP; //calculate the skipscore using the previous //Current skip score for block blkidx: - float curSkipScore = prevProbabilities[blkidx * PSR9_NUM_STATES + PSR9_KMER_SKIP]; + float prevSkipScore = prevProbabilities[skipIdx - PSR9_NUM_STATES]; + float curSkipScore = prevProbabilities[skipIdx]; //printf("Current skip score for block %i is %f",blkidx, curSkipScore); //new score to add - TODO: use the correct lp_kk score - HMT_FROM_PREV_K = lp_kk + newSkipScore; + HMT_FROM_PREV_K = lp_kk + prevSkipScore; newSkipScore = logsumexpf(curSkipScore, HMT_FROM_PREV_K); //add it - prevProbabilities[blkidx * PSR9_NUM_STATES + PSR9_KMER_SKIP] = newSkipScore; + prevProbabilities[skipIdx] = newSkipScore; + __syncthreads(); } } // Now do the end state __syncthreads(); - if ((threadIdx.x == 1) && (row == 1)){ + if ((threadIdx.x == 0) && (row == 3)){ printf("rank %i\n", rank); printf("event mean %f\n", event_mean); printf("poreModelLevelLogStdv %f\n", poreModelLevelLogStdv[0]); @@ -306,7 +320,7 @@ __global__ void getScores(float * eventData, } - if ((threadIdx.x == 0) && (row == 1)) { + if ((threadIdx.x == 0) && (row == 3)) { printf("Number of states is %i\n", n_states); for (int c = 0; c < n_states; c++) { printf("GPU> Value for row %i and col %i is %f\n",row, c, prevProbabilities[c]); 
@@ -360,7 +374,10 @@ double scoreKernel(std::vector sequences, std::vector n_rows; //number of rows in the DP table (n_events + 1) std::vector e_starts; //event starts - std::vector event_strides; + std::vector event_strides; + + std::vector> pre_flanks; + std::vector> post_flanks; for(auto e: event_sequences){ uint32_t e_start = e.event_start_idx; @@ -377,6 +394,12 @@ double scoreKernel(std::vector sequences, n_events = e_start - e_end + 1; n_rows.push_back(n_events + 1); + + std::vector pre_flank = make_pre_flanking(e, e_start, n_events); + std::vector post_flank = make_post_flanking(e, e_start, n_events); + + pre_flanks.push_back(pre_flank); + post_flanks.push_back(post_flank); } std::vector kmer_ranks(n_kmers); @@ -404,20 +427,28 @@ double scoreKernel(std::vector sequences, numEventsTotal += numEvents; } - //Allocate a host buffer to store the event means + + //Allocate a host buffer to store the event means, pre and post-flank data float * eventMeans; size_t eventMeansSize = numEventsTotal * sizeof(float); cudaHostAlloc(&eventMeans, eventMeansSize , cudaHostAllocDefault); + //Allocate a host buffer to store the event means, pre and post-flank data + float * preFlankingHost; + cudaHostAlloc(&preFlankingHost, numEventsTotal * sizeof(float) , cudaHostAllocDefault); + std::vector eventOffsets; size_t offset = 0; - for (auto ev: event_sequences){ + for(int j=0;jevents->size(); for (int i=0;iget_drift_scaled_level(i, ev.strand); // send the data in drift scaled + auto event_idx = e_starts[j] + i * event_strides[0]; + auto scaled = ev.read->get_drift_scaled_level(event_idx, ev.strand); // send the data in drift scaled //auto unscaled = ev.read->events[0][i].mean; //taking the first element. Not sure what the second one is.. 
eventMeans[offset + i] = scaled; + preFlankingHost[offset + i] = pre_flanks[j][i]; //also copy over the pre-flanking data, since it has a 1-1 correspondence with events } offset += num_events; } @@ -485,6 +516,10 @@ double scoreKernel(std::vector sequences, cudaMalloc( (void**)&eventMeansDev, eventMeansSize); cudaMemcpyAsync( eventMeansDev, eventMeans, eventMeansSize, cudaMemcpyHostToDevice ); //malloc is taking 300us + float* preFlankingDev; + cudaMalloc( (void**)&preFlankingDev, eventMeansSize); + cudaMemcpyAsync( preFlankingDev, preFlankingHost, eventMeansSize, cudaMemcpyHostToDevice ); //malloc is taking 300us + int* numRowsDev; cudaMalloc( (void**)&numRowsDev, n_rows.size() * sizeof(int)); cudaMemcpyAsync( numRowsDev, n_rows.data(), n_rows.size() * sizeof(int), cudaMemcpyHostToDevice ); @@ -536,6 +571,7 @@ double scoreKernel(std::vector sequences, shiftDev, varDev, logVarDev, + preFlankingDev, returnValues); //cudaDeviceSynchronize(); @@ -557,6 +593,8 @@ double scoreKernel(std::vector sequences, cudaFree(shiftDev); cudaFree(varDev); cudaFree(logVarDev); + cudaFree(preFlankingDev); + //Free host memory cudaFreeHost(eventMeans); diff --git a/src/hmm/nanopolish_emissions.h b/src/hmm/nanopolish_emissions.h index 7b5c8108..6069ac81 100644 --- a/src/hmm/nanopolish_emissions.h +++ b/src/hmm/nanopolish_emissions.h @@ -68,6 +68,7 @@ inline float log_probability_match_r9(const SquiggleRead& read, } GaussianParameters gp = read.get_scaled_gaussian_from_pore_model_state(pore_model, strand, kmer_rank); if (debug == true) { + printf(">Event IDX is: %i\n", event_idx); printf(">CPU Strand is: %i\n", strand); printf(">CPU kmer_rank is: %i\n", kmer_rank); printf(">CPU level is: %f\n", level); diff --git a/src/hmm/nanopolish_profile_hmm_r9.inl b/src/hmm/nanopolish_profile_hmm_r9.inl index bc0235c6..4728b680 100644 --- a/src/hmm/nanopolish_profile_hmm_r9.inl +++ b/src/hmm/nanopolish_profile_hmm_r9.inl @@ -330,7 +330,7 @@ inline float profile_hmm_fill_generic_r9(const 
HMMInputSequence& _sequence, //diagnostics - after match and bad event have been applied if (row == 4) { // row 1 has been computed so we can have a peek auto nc = output.get_num_columns(); - int rw = 1; + int rw = 3; for (int i = 0; i < nc; i++) { printf("CPU> Value for row %i col %i is %f\n", rw, i, output.get(rw, i)); } @@ -372,7 +372,7 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, scores.x[HMT_FROM_SOFT] = (kmer_idx == 0 && (event_idx == e_start || (flags & HAF_ALLOW_PRE_CLIP))) ? lp_sm + pre_flank[row - 1] : -INFINITY; - + printf("lp_emission_m is %f\n", lp_emission_m); output.update_cell(row, curr_block_offset + PSR9_MATCH, scores, lp_emission_m); // state PSR9_BAD_EVENT @@ -396,16 +396,6 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, scores.x[HMT_FROM_SOFT] = -INFINITY; output.update_cell(row, curr_block_offset + PSR9_KMER_SKIP, scores, 0.0f); // no emission - //if ((block == 1) && (row == 1)){ //blcok 1 corresponds to threadIdx 0 on GPU - // printf("lp_emission_m is %f\n", lp_emission_m); - // printf("PSR9_MATCH is %i\n", PSR9_MATCH); - // printf(">CPU score HMT_FROM_SAME_M is %f\n", scores.x[HMT_FROM_SAME_M]); - // printf(">CPU score HMT_FROM_PREV_M is %f\n", scores.x[HMT_FROM_PREV_M]); - // printf(">CPU score HMT_FROM_SAME_B is %f\n", scores.x[HMT_FROM_SAME_B]); - // printf(">CPU score HMT_FROM_PREV_B is %f\n", scores.x[HMT_FROM_PREV_B]); - // printf(">CPU score HMT_FROM_PREV_K is %f\n", scores.x[HMT_FROM_PREV_K]); - //} - // If POST_CLIP is enabled we allow the last kmer to transition directly // to the end after any event. Otherwise we only allow it from the // last kmer/event match. 
From 6dddd4853439d1e99b7d7de6fcf2f56fef9a8491 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Mon, 2 Jul 2018 14:30:57 +0100 Subject: [PATCH 12/80] first two base scores correct, bug for other ones --- src/cuda_kernels/GpuAligner.cu | 174 +++++++++++++------------- src/hmm/nanopolish_profile_hmm_r9.inl | 12 +- src/nanopolish_call_variants.cpp | 4 +- 3 files changed, 98 insertions(+), 92 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 204078a4..aa59da03 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -14,7 +14,6 @@ __device__ float logsumexpf(float x, float y){ return result; } -//TODO: Implement, inc pore model __device__ float lp_match_r9(int rank, float mean, float * poreModelLevelLogStdv, @@ -42,18 +41,6 @@ __device__ float lp_match_r9(int rank, float emission = log_inv_sqrt_2pi - gaussian_log_level_stdv + (-0.5f * a * a); // log_inv_sqrt_2pi is defined in a comment above - if (debug == true) { - if (threadIdx.x == 0) { - printf(">GPU: kmer rank is %i\n", rank); - printf(">GPU: level %f\n", level); - printf(">GPU: gaussian mean %f\n", gaussian_mean); - printf(">GPU: gaussian stdv %f\n", gaussian_stdv); - printf(">GPU: gaussian log level stdv %f\n", gaussian_log_level_stdv); - printf(">GPU a: %f\n", a); - printf(">GPU emission: %f\n", emission); - } - } - return emission; // log_inv_sqrt_2pi is defined in a comment above } @@ -74,15 +61,25 @@ __global__ void getScores(float * eventData, float * varDev, float * logVarDev, float * preFlankingDev, + float * postFlankingDev, float * returnValues) { // Initialise the prev probability row, which is the row of the DP table + int n_kmers = blockDim.x; // Question: How does this deal with the case where the block is bigger than the sequence, such as if one variant is a deletion? + int n_states = n_kmers * PSR9_NUM_STATES + 2 * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. 
+ + //initialise the return value + returnValues[blockIdx.x] = -INFINITY; - int n_states = blockDim.x * PSR9_NUM_STATES + 2 * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. __shared__ float prevProbabilities[MAX_STATES]; - for (int i = 0; i < n_states; i++) { + + // Initialise the previous probabilities + for (int i = 0; i < n_states - PSR9_NUM_STATES; i++) { prevProbabilities[i] = -INFINITY; } + for (int i = n_states - PSR9_NUM_STATES; i < n_states; i++) { + prevProbabilities[i] = 0; // Is this correct? + } //Step 1: calculate transitions. For now we are going to use external params. int readIdx = blockIdx.x; @@ -92,14 +89,8 @@ __global__ void getScores(float * eventData, int e_stride = eventStrides[readIdx]; int e_offset = eventOffsets[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event - - if (threadIdx.x == 0){ - printf(">GPU e_start %i\n", e_start); - } - int kmerIdx = threadIdx.x; uint32_t rank = kmer_ranks[kmerIdx]; // lexical rank of a kmer - printf("Kmer idx %i, Rank: %i\n", kmerIdx, rank); float p_stay = 1 - (1 / read_events_per_base); float p_skip = 0.0025; @@ -158,10 +149,11 @@ __global__ void getScores(float * eventData, int event_idx = e_start + (row - 1) * e_stride; float event_mean = eventData[e_offset + row - 1]; float preFlank = preFlankingDev[e_offset + row - 1]; + float postFlank = postFlankingDev[e_offset + row - 1]; bool debug = false; - if (threadIdx.x == 0 && row == 3){ + if (threadIdx.x == 0 && (row == numRows -1) && blockIdx.x == 0){ debug = true; } @@ -195,49 +187,22 @@ __global__ void getScores(float * eventData, // with a penalty; float HMT_FROM_SOFT = (kmerIdx == 0 && (event_idx == e_start || - (HAF_ALLOW_PRE_CLIP))) ? lp_sm + preFlank : -INFINITY; // TEST! TODO: Add the pre-flank to this calculation. Also flags and HAF_ALLOW_PRE_CLIP. For now this is left out and should not have a big effect - - + (HAF_ALLOW_PRE_CLIP))) ? 
lp_sm + preFlank : -INFINITY; // TODO: Add flag for HAF ALLOW_PRE_CLIP // calculate the score float sum = HMT_FROM_SAME_M; - sum = logsumexpf(sum, HMT_FROM_SOFT); - if (debug == true){ - printf("Sum1 is : %f\n", sum); - } sum = logsumexpf(sum, HMT_FROM_PREV_M); - if (debug == true){ - printf("Sum2 is : %f\n", sum); - } - sum = logsumexpf(sum, HMT_FROM_SAME_B); sum = logsumexpf(sum, HMT_FROM_PREV_B); - if (debug == true){ - printf("Sum3 is : %f\n", sum); - } - sum = logsumexpf(sum, HMT_FROM_PREV_K); sum += lp_emission_m; - if (debug == true){ - printf("Sum4 is : %f\n", sum); - } + float newMatchScore = sum; // Here need to calculate the bad event score - if (debug==true){ - printf("GPU> lp_emission_m for row %i and thread %i %f\n", row, threadIdx.x, lp_emission_m); - printf("GPU> level being used to calculate emission for thread 0: %f\n", event_mean); - printf("GPU> match score for row %i and thread %i %f\\n\", row, threadIdx.x", newMatchScore); - printf("GPU> HMT_FROM_SAME_M: %f\n", HMT_FROM_SAME_M); - printf("GPU> HMT_FROM_PREV_M: %f\n", HMT_FROM_PREV_M); - printf("GPU> HMT_FROM_SAME_B: %f\n", HMT_FROM_SAME_B); - printf("GPU> HMT_FROM_PREV_B: %f\n", HMT_FROM_PREV_B); - printf("GPU> HMT_FROM_PREV_K: %f\n", HMT_FROM_PREV_K); - printf("GPU> HMT_FROM_SOFT: %f\n", HMT_FROM_SOFT); - } // state PSR9_BAD_EVENT HMT_FROM_SAME_M = lp_mb + prevProbabilities[curBlockOffset + PSR9_MATCH]; HMT_FROM_PREV_M = -INFINITY; // not allowed @@ -281,29 +246,52 @@ __global__ void getScores(float * eventData, prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] = newSkipScore; __syncthreads(); - //Now need to do the skip-skip transition, which is serial. + //Now need to do the skip-skip transition, which is serial so for now letting one thread execute it. 
if (threadIdx.x == 0){ for (int blkidx = 2;blkidx <= blockDim.x; blkidx++){ auto skipIdx = blkidx * PSR9_NUM_STATES + PSR9_KMER_SKIP; - //calculate the skipscore using the previous - //Current skip score for block blkidx: float prevSkipScore = prevProbabilities[skipIdx - PSR9_NUM_STATES]; float curSkipScore = prevProbabilities[skipIdx]; - //printf("Current skip score for block %i is %f",blkidx, curSkipScore); - //new score to add - TODO: use the correct lp_kk score - HMT_FROM_PREV_K = lp_kk + prevSkipScore; newSkipScore = logsumexpf(curSkipScore, HMT_FROM_PREV_K); - //add it prevProbabilities[skipIdx] = newSkipScore; __syncthreads(); } } + __syncthreads(); + + int lastKmerIdx = n_kmers -1; + int lastRowIdx = numRows -1; + float end; + // Now do the post-clip transition + if(kmerIdx == lastKmerIdx && ( (HAF_ALLOW_POST_CLIP) || row == lastRowIdx)) { + float lp1 = lp_ms + prevProbabilities[curBlockOffset + PSR9_MATCH] + postFlank; + float lp2 = lp_ms + prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] + postFlank; + float lp3 = lp_ms + prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] + postFlank; + + printf(">GPU Post-clip transition on row %i, read %i, threadIdx is %i\n" + "LP1=%f\n" + "LP2=%f\n" + "LP3=%f\n", + row, + blockIdx.x, + threadIdx.x, + lp1, + lp2, + lp3); + + end = returnValues[blockIdx.x]; + end = logsumexpf(end, lp1); + end = logsumexpf(end, lp2); + end = logsumexpf(end, lp3); + returnValues[blockIdx.x] = end; + } // Now do the end state __syncthreads(); - if ((threadIdx.x == 0) && (row == 3)){ + // DIAGNOSTIC + if (debug == true){ printf("rank %i\n", rank); printf("event mean %f\n", event_mean); printf("poreModelLevelLogStdv %f\n", poreModelLevelLogStdv[0]); @@ -317,10 +305,6 @@ __global__ void getScores(float * eventData, printf(">GPU score HMT_FROM_PREV_B is %f\n", HMT_FROM_PREV_B); printf(">GPU score HMT_FROM_PREV_K is %f\n", HMT_FROM_PREV_K); printf(">GPU newSkipScore is %f\n", newSkipScore); - } - - - if ((threadIdx.x == 0) && (row == 3)) { 
printf("Number of states is %i\n", n_states); for (int c = 0; c < n_states; c++) { printf("GPU> Value for row %i and col %i is %f\n",row, c, prevProbabilities[c]); @@ -328,8 +312,6 @@ __global__ void getScores(float * eventData, } } - - returnValues[blockIdx.x] = 0.356; __syncthreads(); } @@ -342,9 +324,9 @@ GpuAligner::GpuAligner() n[i] = i; } -double scoreKernel(std::vector sequences, - std::vector event_sequences, - uint32_t alignment_flags){ +std::vector scoreKernel(std::vector sequences, + std::vector event_sequences, + uint32_t alignment_flags){ // Extract the pore model. //Let's assume that every event sequence has the same pore model @@ -435,7 +417,9 @@ double scoreKernel(std::vector sequences, //Allocate a host buffer to store the event means, pre and post-flank data float * preFlankingHost; + float * postFlankingHost; cudaHostAlloc(&preFlankingHost, numEventsTotal * sizeof(float) , cudaHostAllocDefault); + cudaHostAlloc(&postFlankingHost, numEventsTotal * sizeof(float) , cudaHostAllocDefault); std::vector eventOffsets; size_t offset = 0; @@ -449,6 +433,7 @@ double scoreKernel(std::vector sequences, //auto unscaled = ev.read->events[0][i].mean; //taking the first element. Not sure what the second one is.. eventMeans[offset + i] = scaled; preFlankingHost[offset + i] = pre_flanks[j][i]; //also copy over the pre-flanking data, since it has a 1-1 correspondence with events + postFlankingHost[offset + i] = post_flanks[j][i]; //also copy over the pre-flanking data, since it has a 1-1 correspondence with events } offset += num_events; } @@ -461,7 +446,7 @@ double scoreKernel(std::vector sequences, //TODO: Fix this. for(int st=0; ststates[st]; //let's just initially get the params for AAAAAA + auto params = event_sequences[0].pore_model->states[st]; //TODO: Is this OK? 
pore_model_level_log_stdv[st] = params.level_log_stdv; pore_model_level_mean[st] = params.level_mean; pore_model_level_stdv[st] = params.level_stdv; @@ -520,6 +505,10 @@ double scoreKernel(std::vector sequences, cudaMalloc( (void**)&preFlankingDev, eventMeansSize); cudaMemcpyAsync( preFlankingDev, preFlankingHost, eventMeansSize, cudaMemcpyHostToDevice ); //malloc is taking 300us + float* postFlankingDev; + cudaMalloc( (void**)&postFlankingDev, eventMeansSize); + cudaMemcpyAsync( postFlankingDev, postFlankingHost, eventMeansSize, cudaMemcpyHostToDevice ); //malloc is taking 300us + int* numRowsDev; cudaMalloc( (void**)&numRowsDev, n_rows.size() * sizeof(int)); cudaMemcpyAsync( numRowsDev, n_rows.data(), n_rows.size() * sizeof(int), cudaMemcpyHostToDevice ); @@ -544,18 +533,18 @@ double scoreKernel(std::vector sequences, cudaMemcpyAsync( eventOffsetsDev, eventOffsets.data(), eventOffsets.size() * sizeof(int), cudaMemcpyHostToDevice ); int num_blocks = n_states / PSR9_NUM_STATES; - uint32_t num_kmers = num_blocks - 2; // two terminal blocks + uint32_t num_kmers = num_blocks - 2; // two terminal blocks. Not currently used but left here for now. dim3 dimBlock(num_blocks - 2); // One thread per state, not including Start and Terminal state. 
- dim3 dimGrid(1); // Only looking at first event at the moment + dim3 dimGrid(num_reads); // let's look at only the first read - float * returnValues; - cudaMalloc((void **) &returnValues, sizeof(float) * num_reads); //one score per read + float * returnValuesDev; + cudaMalloc((void **) &returnValuesDev, sizeof(float) * num_reads); //one score per read - float* returnedValues;// = new float[num_reads]; - //size_t eventMeansSize = numEventsTotal * sizeof(float); + float* returnedValues; cudaHostAlloc(&returnedValues, num_reads * sizeof(float) , cudaHostAllocDefault); + printf("About to run getscores...\n"); getScores<<>>(eventMeansDev, eventsPerBaseDev, numRowsDev, @@ -572,10 +561,11 @@ double scoreKernel(std::vector sequences, varDev, logVarDev, preFlankingDev, - returnValues); + postFlankingDev, + returnValuesDev); - //cudaDeviceSynchronize(); - cudaMemcpyAsync(returnedValues, returnValues, num_reads *sizeof(float), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + cudaMemcpyAsync(returnedValues, returnValuesDev, num_reads *sizeof(float), cudaMemcpyDeviceToHost); // Free device memory cudaFree(eventMeansDev); @@ -594,14 +584,15 @@ double scoreKernel(std::vector sequences, cudaFree(varDev); cudaFree(logVarDev); cudaFree(preFlankingDev); - + cudaFree(postFlankingDev); //Free host memory cudaFreeHost(eventMeans); - float r = 0.0; + //Send all the scores back + std::vector r(num_reads); for(int i=0; i GpuAligner::variantScoresThresholded(std::vector in methylation_types); std::vector> variant_sequences; - //for (auto v: variant_haplotypes){ - // auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types); - // variant_sequences.push_back(variant_sequence); - //} + for (auto v: variant_haplotypes){ + auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types); + variant_sequences.push_back(variant_sequence); + } assert(base_sequences.size() == 1); // return the sum of the score for the base 
sequences over all the event sequences - double base_score = scoreKernel(base_sequences, event_sequences, alignment_flags); + auto base_scores = scoreKernel(base_sequences, event_sequences, alignment_flags); std::vector v(variant_sequences.size()); for (int i=0; i Value for row %i col %i is %f\n", rw, i, output.get(rw, i)); } @@ -399,7 +399,9 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, // If POST_CLIP is enabled we allow the last kmer to transition directly // to the end after any event. Otherwise we only allow it from the // last kmer/event match. + if(kmer_idx == last_kmer_idx && ( (flags & HAF_ALLOW_POST_CLIP) || row == last_event_row_idx)) { + printf(">CPU Post-clip transition on row %i\n", row); float lp1 = lp_ms + output.get(row, curr_block_offset + PSR9_MATCH) + post_flank[row - 1]; float lp2 = lp_ms + output.get(row, curr_block_offset + PSR9_BAD_EVENT) + post_flank[row - 1]; float lp3 = lp_ms + output.get(row, curr_block_offset + PSR9_KMER_SKIP) + post_flank[row - 1]; @@ -407,6 +409,12 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, output.update_end(lp1, row, curr_block_offset + PSR9_MATCH); output.update_end(lp2, row, curr_block_offset + PSR9_BAD_EVENT); output.update_end(lp3, row, curr_block_offset + PSR9_KMER_SKIP); + + printf(">LP1 %f\n", lp1); + printf(">LP2 %f\n", lp2); + printf(">LP3 %f\n", lp3); + printf(">end %f\n", output.get_end()); + } #ifdef DEBUG_LOCAL_ALIGNMENT diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index ec289603..864a8c59 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -352,7 +352,7 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali // get the scaled levels. 
std::vector scores = aligner.variantScoresThresholded(tmp_variants, test_haplotype, event_sequences, - alignment_flags, opt::screen_score_threshold, + alignment_flags, 10,//opt::screen_score_threshold, opt::methylation_types); auto tf_gpu = std::chrono::high_resolution_clock::now(); @@ -364,7 +364,7 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali test_haplotype, event_sequences, alignment_flags, - opt::screen_score_threshold, + 10,//opt::screen_score_threshold, opt::methylation_types); auto t1 = std::chrono::high_resolution_clock::now(); scoring += t1-t0; From 03ffaba469a0c2f9c3803963f289c71cfcdc1a8c Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Tue, 3 Jul 2018 14:36:29 +0100 Subject: [PATCH 13/80] GPU and CPU versions now giving same results --- src/common/nanopolish_variant.cpp | 1 + src/cuda_kernels/GpuAligner.cu | 116 ++++++++++++++++---------- src/hmm/nanopolish_emissions.h | 24 +++--- src/hmm/nanopolish_profile_hmm.cpp | 1 + src/hmm/nanopolish_profile_hmm_r9.inl | 54 +++++++----- src/nanopolish_call_variants.cpp | 5 +- 6 files changed, 122 insertions(+), 79 deletions(-) diff --git a/src/common/nanopolish_variant.cpp b/src/common/nanopolish_variant.cpp index b73a6b2b..bbc5933b 100644 --- a/src/common/nanopolish_variant.cpp +++ b/src/common/nanopolish_variant.cpp @@ -686,6 +686,7 @@ Variant score_variant_thresholded(const Variant& input_variant, if(fabs(total_score) < score_threshold) { // Calculate scores using the base nucleotide model + printf("Working with input %i\n", j); double base_score = profile_hmm_score_set(base_sequences, input[j], alignment_flags); double variant_score = profile_hmm_score_set(variant_sequences, input[j], alignment_flags); diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index aa59da03..ef52b68c 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -87,10 +87,25 @@ __global__ void getScores(float * eventData, int numRows = numRowsPerRead[readIdx]; 
// Number of rows in this DP table. int e_start = eventStarts[readIdx]; // Event start for read int e_stride = eventStrides[readIdx]; + bool rc = false; + if (e_stride == -1){ + rc = true; + } int e_offset = eventOffsets[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event + if(blockIdx.x==2){ // read 2 is an RC read + printf("Block IDX is %i and stride is %i\n", blockIdx.x, e_stride); + } + int kmerIdx = threadIdx.x; - uint32_t rank = kmer_ranks[kmerIdx]; // lexical rank of a kmer + uint32_t rank; + + if (rc == true) { + rank = kmer_ranks_rc[kmerIdx]; + //printf("Using an RC rank of %i\n", rank); + }else{ + rank = kmer_ranks[kmerIdx]; + } float p_stay = 1 - (1 / read_events_per_base); float p_skip = 0.0025; @@ -112,7 +127,7 @@ __global__ void getScores(float * eventData, float p_kk = p_skip_self; float p_km = 1.0f - p_kk; - // We assign some transition probabilities. I believe this is correct and they don't vary by location in the sequence (why would they) + // We assign some transition probabilities. I believe this is correct and they don't vary by location in the sequence float lp_mk = log(p_mk); float lp_mb = log(p_mb); float lp_mm_self = log(p_mm_self); @@ -153,7 +168,7 @@ __global__ void getScores(float * eventData, bool debug = false; - if (threadIdx.x == 0 && (row == numRows -1) && blockIdx.x == 0){ + if (threadIdx.x == 0 && (row == numRows -1) && blockIdx.x == 2){ debug = true; } @@ -189,6 +204,12 @@ __global__ void getScores(float * eventData, (event_idx == e_start || (HAF_ALLOW_PRE_CLIP))) ? lp_sm + preFlank : -INFINITY; // TODO: Add flag for HAF ALLOW_PRE_CLIP + if (blockIdx.x == 2 && threadIdx.x == 0 && row == 2){ + printf("HMT_FROM_SOFT should be (?) 
-5.99 but is in fact %f\n", HMT_FROM_SOFT); + printf("event IDX is %i\n", event_idx); + printf("e_start is %i\n", e_start); + } + // calculate the score float sum = HMT_FROM_SAME_M; sum = logsumexpf(sum, HMT_FROM_SOFT); @@ -269,17 +290,17 @@ __global__ void getScores(float * eventData, float lp1 = lp_ms + prevProbabilities[curBlockOffset + PSR9_MATCH] + postFlank; float lp2 = lp_ms + prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] + postFlank; float lp3 = lp_ms + prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] + postFlank; - - printf(">GPU Post-clip transition on row %i, read %i, threadIdx is %i\n" - "LP1=%f\n" - "LP2=%f\n" - "LP3=%f\n", - row, - blockIdx.x, - threadIdx.x, - lp1, - lp2, - lp3); +// +// printf(">GPU Post-clip transition on row %i, read %i, threadIdx is %i\n" +// "LP1=%f\n" +// "LP2=%f\n" +// "LP3=%f\n", +// row, +// blockIdx.x, +// threadIdx.x, +// lp1, +// lp2, +// lp3); end = returnValues[blockIdx.x]; end = logsumexpf(end, lp1); @@ -290,29 +311,39 @@ __global__ void getScores(float * eventData, // Now do the end state __syncthreads(); - // DIAGNOSTIC - if (debug == true){ - printf("rank %i\n", rank); - printf("event mean %f\n", event_mean); - printf("poreModelLevelLogStdv %f\n", poreModelLevelLogStdv[0]); - printf("poreModelLevelStdv %f\n", poreModelLevelStdv[0]); - printf("poreModelLevelMean %f\n", poreModelLevelMean[0]); - printf("lp_emission_m is %f\n", lp_emission_m); - printf("PSR9_MATCH is %i\n", PSR9_MATCH); - printf(">GPU score HMT_FROM_SAME_M is %f\n", HMT_FROM_SAME_M); - printf(">GPU score HMT_FROM_PREV_M is %f\n", HMT_FROM_PREV_M); - printf(">GPU score HMT_FROM_SAME_B is %f\n", HMT_FROM_SAME_B); - printf(">GPU score HMT_FROM_PREV_B is %f\n", HMT_FROM_PREV_B); - printf(">GPU score HMT_FROM_PREV_K is %f\n", HMT_FROM_PREV_K); - printf(">GPU newSkipScore is %f\n", newSkipScore); - printf("Number of states is %i\n", n_states); - for (int c = 0; c < n_states; c++) { - printf("GPU> Value for row %i and col %i is %f\n",row, c, 
prevProbabilities[c]); - } - } - } + if ((blockIdx.x == 2) && (threadIdx.x == 0)){ +// printf("rank %i\n", rank); +// printf("event mean %f\n", event_mean); +// printf("poreModelLevelLogStdv %f\n", poreModelLevelLogStdv[0]); +// printf("poreModelLevelStdv %f\n", poreModelLevelStdv[0]); +// printf("poreModelLevelMean %f\n", poreModelLevelMean[0]); +// printf("lp_emission_m is %f\n", lp_emission_m); +// printf("PSR9_MATCH is %i\n", PSR9_MATCH); +// printf(">GPU score HMT_FROM_SAME_M is %f\n", HMT_FROM_SAME_M); +// printf(">GPU score HMT_FROM_PREV_M is %f\n", HMT_FROM_PREV_M); +// printf(">GPU score HMT_FROM_SAME_B is %f\n", HMT_FROM_SAME_B); +// printf(">GPU score HMT_FROM_PREV_B is %f\n", HMT_FROM_PREV_B); +// printf(">GPU score HMT_FROM_PREV_K is %f\n", HMT_FROM_PREV_K); +// printf(">GPU newSkipScore is %f\n", newSkipScore); +// printf("Number of states is %i\n", n_states); + for (int c = 0; c < n_states; c++) { + printf("GPU> Value for row %i and col %i is %f\n", row, c, prevProbabilities[c]); + } + printf("HMT_FROM_SOFT = %f\n", HMT_FROM_SOFT); + printf("lp_mk = %f\n", lp_mk); + printf("lp_mb = %f\n", lp_mb); + printf("lp_mm_self = %f\n", lp_mm_self); + printf("lp_mm_next = %f\n", lp_mm_next); + printf("lp_bb = %f\n", lp_bb); + printf("lp_bk = %f\n", lp_bk); + printf("lp_bm_next = %f\n", lp_bm_next); + printf("lp_bm_self = %f\n", lp_bm_self); + printf("lp_kk = %f\n", lp_kk); + printf("lp_km = %f\n", lp_km); - __syncthreads(); + } + } + __syncthreads(); } @@ -426,9 +457,9 @@ std::vector scoreKernel(std::vector sequences, for(int j=0;jevents->size(); + size_t num_events = 100;//TODO: FIX! ev.read->events->size(); for (int i=0;iget_drift_scaled_level(event_idx, ev.strand); // send the data in drift scaled //auto unscaled = ev.read->events[0][i].mean; //taking the first element. Not sure what the second one is.. 
eventMeans[offset + i] = scaled; @@ -458,10 +489,11 @@ std::vector scoreKernel(std::vector sequences, std::vector log_var(num_reads); for (int i=0;iscalings->scale; - shift[i] = event_sequences[i].read->scalings->shift; - var[i] = event_sequences[i].read->scalings->var; - log_var[i] = event_sequences[i].read->scalings->log_var; + auto read = event_sequences[i]; + scale[i] = event_sequences[i].read->scalings[read.strand].scale; + shift[i] = event_sequences[i].read->scalings[read.strand].shift; + var[i] = event_sequences[i].read->scalings[read.strand].var; + log_var[i] = event_sequences[i].read->scalings[read.strand].log_var; } float* scaleDev; diff --git a/src/hmm/nanopolish_emissions.h b/src/hmm/nanopolish_emissions.h index 6069ac81..5f99a410 100644 --- a/src/hmm/nanopolish_emissions.h +++ b/src/hmm/nanopolish_emissions.h @@ -63,19 +63,19 @@ inline float log_probability_match_r9(const SquiggleRead& read, { // event level mean, scaled with the drift value float level = read.get_drift_scaled_level(event_idx, strand); - if (debug == true){ - printf("Level being used to calculate emission: %f\n", level); - } + //if (debug == true){ + // printf("Level being used to calculate emission: %f\n", level); + //} GaussianParameters gp = read.get_scaled_gaussian_from_pore_model_state(pore_model, strand, kmer_rank); - if (debug == true) { - printf(">Event IDX is: %i\n", event_idx); - printf(">CPU Strand is: %i\n", strand); - printf(">CPU kmer_rank is: %i\n", kmer_rank); - printf(">CPU level is: %f\n", level); - printf(">CPU gaussian mean: %f\n", gp.mean); - printf(">CPU gaussian stdv: %f\n", gp.stdv); - printf(">CPU gaussian log_level_stdv: %f\n", gp.log_stdv); - } +// if (debug == true) { +// printf(">Event IDX is: %i\n", event_idx); +// printf(">CPU Strand is: %i\n", strand); +// printf(">CPU kmer_rank is: %i\n", kmer_rank); +// printf(">CPU level is: %f\n", level); +// printf(">CPU gaussian mean: %f\n", gp.mean); +// printf(">CPU gaussian stdv: %f\n", gp.stdv); +// 
printf(">CPU gaussian log_level_stdv: %f\n", gp.log_stdv); +// } float lp = log_normal_pdf(level, gp); return lp; } diff --git a/src/hmm/nanopolish_profile_hmm.cpp b/src/hmm/nanopolish_profile_hmm.cpp index 6d5d0f37..d82ec344 100644 --- a/src/hmm/nanopolish_profile_hmm.cpp +++ b/src/hmm/nanopolish_profile_hmm.cpp @@ -31,6 +31,7 @@ float profile_hmm_score(const HMMInputSequence& sequence, const HMMInputData& da float profile_hmm_score_set(const std::vector& sequences, const HMMInputData& data, const uint32_t flags) { + printf("In profile_hmm_score set function...\n"); assert(!sequences.empty()); assert(std::string(sequences[0].get_alphabet()->get_name()) == "nucleotide"); assert(std::string(data.pore_model->pmalphabet->get_name()) == "nucleotide"); diff --git a/src/hmm/nanopolish_profile_hmm_r9.inl b/src/hmm/nanopolish_profile_hmm_r9.inl index 315618a3..f402fd1c 100644 --- a/src/hmm/nanopolish_profile_hmm_r9.inl +++ b/src/hmm/nanopolish_profile_hmm_r9.inl @@ -283,7 +283,7 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, uint32_t e_start = data.event_start_idx; - printf(">CPU e_start: %i\n", e_start); + //printf(">CPU e_start: %i\n", e_start); // Calculate number of blocks // A block of the HMM is a set of states for one kmer uint32_t num_blocks = output.get_num_columns() / PSR9_NUM_STATES; // num_columns is the number of HMM STATES @@ -303,8 +303,8 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, std::vector kmer_ranks(num_kmers); for(size_t ki = 0; ki < num_kmers; ++ki) { - int kr = sequence.get_kmer_rank(ki, k, data.rc); - printf("Kmer rank: %i\n", kr); + int kr = sequence.get_kmer_rank(ki, k, data.rc); // can * -1 here to see if 3rd is correct + printf(">CPU Kmer rank: %i\n", kr); kmer_ranks[ki] = kr; } @@ -326,16 +326,6 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, // Fill in matrix for(uint32_t row = 1; row < output.get_num_rows(); row++) { - //printf("======\n"); - 
//diagnostics - after match and bad event have been applied - if (row == 29) { // row 1 has been computed so we can have a peek - auto nc = output.get_num_columns(); - int rw = 28; - for (int i = 0; i < nc; i++) { - printf("CPU> Value for row %i col %i is %f\n", rw, i, output.get(rw, i)); - } - } - // Skip the first block which is the start state, it was initialized above // Similarily skip the last block, which is calculated in the terminate() function for(uint32_t block = 1; block < num_blocks - 1; block++) { @@ -352,7 +342,7 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, uint32_t event_idx = e_start + (row - 1) * data.event_stride; uint32_t rank = kmer_ranks[kmer_idx]; float lp_emission_m = log_probability_match_r9(*data.read, *data.pore_model, rank, event_idx, data.strand, true); - printf("CPU> lp_emission_m %f\n", lp_emission_m); + //printf("CPU> lp_emission_m %f\n", lp_emission_m); float lp_emission_b = BAD_EVENT_PENALTY; HMMUpdateScores scores; @@ -364,6 +354,8 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, scores.x[HMT_FROM_PREV_B] = bt.lp_bm_next + output.get(row - 1, prev_block_offset + PSR9_BAD_EVENT); scores.x[HMT_FROM_PREV_K] = bt.lp_km + output.get(row - 1, prev_block_offset + PSR9_KMER_SKIP); + scores.x[HMT_FROM_PREV_B] = bt.lp_bm_next + output.get(row - 1, prev_block_offset + PSR9_BAD_EVENT); + // m_s is the probability of going from the start state // to this kmer. The start state is (currently) only // allowed to go to the first kmer. If ALLOW_PRE_CLIP @@ -372,7 +364,18 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, scores.x[HMT_FROM_SOFT] = (kmer_idx == 0 && (event_idx == e_start || (flags & HAF_ALLOW_PRE_CLIP))) ? 
lp_sm + pre_flank[row - 1] : -INFINITY; - printf("lp_emission_m is %f\n", lp_emission_m); + + if (row == 2) { + printf("Working with matches in row 2\n"); + printf("HMT_FROM_SOFT IS %f\n", scores.x[HMT_FROM_SOFT]); + printf("Strand is %i\n", data.strand); + printf("bt.lp_mm_self %f\n", bt.lp_mm_self); + printf("bt.lp_mm_next %f\n", bt.lp_mm_next); + printf("bt.lp_bm_self %f\n", bt.lp_bm_self); + printf("bt.lp_bm_next %f\n", bt.lp_bm_next); + printf("bt.lp_km %f\n", bt.lp_km); + } + output.update_cell(row, curr_block_offset + PSR9_MATCH, scores, lp_emission_m); // state PSR9_BAD_EVENT @@ -401,7 +404,7 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, // last kmer/event match. if(kmer_idx == last_kmer_idx && ( (flags & HAF_ALLOW_POST_CLIP) || row == last_event_row_idx)) { - printf(">CPU Post-clip transition on row %i\n", row); + //printf(">CPU Post-clip transition on row %i\n", row); float lp1 = lp_ms + output.get(row, curr_block_offset + PSR9_MATCH) + post_flank[row - 1]; float lp2 = lp_ms + output.get(row, curr_block_offset + PSR9_BAD_EVENT) + post_flank[row - 1]; float lp3 = lp_ms + output.get(row, curr_block_offset + PSR9_KMER_SKIP) + post_flank[row - 1]; @@ -410,13 +413,14 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, output.update_end(lp2, row, curr_block_offset + PSR9_BAD_EVENT); output.update_end(lp3, row, curr_block_offset + PSR9_KMER_SKIP); - printf(">LP1 %f\n", lp1); - printf(">LP2 %f\n", lp2); - printf(">LP3 %f\n", lp3); - printf(">end %f\n", output.get_end()); + //printf(">LP1 %f\n", lp1); + //printf(">LP2 %f\n", lp2); + //printf(">LP3 %f\n", lp3); + //printf(">end %f\n", output.get_end()); } + #ifdef DEBUG_LOCAL_ALIGNMENT printf("[%d %d] start: %.2lf pre: %.2lf fm: %.2lf\n", event_idx, kmer_idx, m_s + lp_emission_m, pre_flank[row - 1], output.get(row, curr_block_offset + PSR9_MATCH)); printf("[%d %d] end: %.2lf post: %.2lf\n", event_idx, kmer_idx, lp_end, post_flank[row - 1]); @@ -451,7 
+455,13 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, } } - - return output.get_end(); + for(uint32_t row = 1; row < output.get_num_rows(); row++) { + //for (int col=0; col Value for row %i and col %i is %f\n", row, col, output.get(row, col)); + // } + } + + + return output.get_end(); } diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 864a8c59..037a4a60 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -352,9 +352,8 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali // get the scaled levels. std::vector scores = aligner.variantScoresThresholded(tmp_variants, test_haplotype, event_sequences, - alignment_flags, 10,//opt::screen_score_threshold, + alignment_flags, opt::screen_score_threshold, opt::methylation_types); - auto tf_gpu = std::chrono::high_resolution_clock::now(); gpu_exec += tf_gpu - t0_gpu; @@ -364,7 +363,7 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali test_haplotype, event_sequences, alignment_flags, - 10,//opt::screen_score_threshold, + opt::screen_score_threshold, opt::methylation_types); auto t1 = std::chrono::high_resolution_clock::now(); scoring += t1-t0; From ac82456d3f8fe3e8998c2a62d1fcec8e20742ec3 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Tue, 3 Jul 2018 15:54:02 +0100 Subject: [PATCH 14/80] Removed print statements --- Makefile | 6 +- src/common/nanopolish_variant.cpp | 2 +- src/cuda_kernels/GpuAligner.cu | 84 +++++++++++++-------------- src/hmm/nanopolish_profile_hmm.cpp | 2 +- src/hmm/nanopolish_profile_hmm_r9.inl | 22 +++---- src/nanopolish_call_variants.cpp | 4 +- 6 files changed, 60 insertions(+), 60 deletions(-) diff --git a/Makefile b/Makefile index 199f5d1d..38ff3360 100644 --- a/Makefile +++ b/Makefile @@ -9,13 +9,13 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz -CXXFLAGS ?= -g -Og +CXXFLAGS ?= 
-O3 #-g CXXFLAGS += -std=c++11 -fopenmp -fsigned-char -CFLAGS ?= -std=c99 #-O3 +CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include -g +NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include #-g CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code diff --git a/src/common/nanopolish_variant.cpp b/src/common/nanopolish_variant.cpp index bbc5933b..357c7fae 100644 --- a/src/common/nanopolish_variant.cpp +++ b/src/common/nanopolish_variant.cpp @@ -686,7 +686,7 @@ Variant score_variant_thresholded(const Variant& input_variant, if(fabs(total_score) < score_threshold) { // Calculate scores using the base nucleotide model - printf("Working with input %i\n", j); + //printf("Working with input %i\n", j); double base_score = profile_hmm_score_set(base_sequences, input[j], alignment_flags); double variant_score = profile_hmm_score_set(variant_sequences, input[j], alignment_flags); diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index ef52b68c..75947314 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -93,9 +93,9 @@ __global__ void getScores(float * eventData, } int e_offset = eventOffsets[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event - if(blockIdx.x==2){ // read 2 is an RC read - printf("Block IDX is %i and stride is %i\n", blockIdx.x, e_stride); - } + //if(blockIdx.x==2){ // read 2 is an RC read + // printf("Block IDX is %i and stride is %i\n", blockIdx.x, e_stride); + //} int kmerIdx = threadIdx.x; uint32_t rank; @@ -204,11 +204,11 @@ __global__ void getScores(float * eventData, (event_idx == e_start || (HAF_ALLOW_PRE_CLIP))) ? lp_sm + preFlank : -INFINITY; // TODO: Add flag for HAF ALLOW_PRE_CLIP - if (blockIdx.x == 2 && threadIdx.x == 0 && row == 2){ - printf("HMT_FROM_SOFT should be (?) 
-5.99 but is in fact %f\n", HMT_FROM_SOFT); - printf("event IDX is %i\n", event_idx); - printf("e_start is %i\n", e_start); - } + //if (blockIdx.x == 2 && threadIdx.x == 0 && row == 2){ + // printf("HMT_FROM_SOFT should be (?) -5.99 but is in fact %f\n", HMT_FROM_SOFT); + // printf("event IDX is %i\n", event_idx); + // printf("e_start is %i\n", e_start); + //} // calculate the score float sum = HMT_FROM_SAME_M; @@ -311,37 +311,37 @@ __global__ void getScores(float * eventData, // Now do the end state __syncthreads(); - if ((blockIdx.x == 2) && (threadIdx.x == 0)){ -// printf("rank %i\n", rank); -// printf("event mean %f\n", event_mean); -// printf("poreModelLevelLogStdv %f\n", poreModelLevelLogStdv[0]); -// printf("poreModelLevelStdv %f\n", poreModelLevelStdv[0]); -// printf("poreModelLevelMean %f\n", poreModelLevelMean[0]); -// printf("lp_emission_m is %f\n", lp_emission_m); -// printf("PSR9_MATCH is %i\n", PSR9_MATCH); -// printf(">GPU score HMT_FROM_SAME_M is %f\n", HMT_FROM_SAME_M); -// printf(">GPU score HMT_FROM_PREV_M is %f\n", HMT_FROM_PREV_M); -// printf(">GPU score HMT_FROM_SAME_B is %f\n", HMT_FROM_SAME_B); -// printf(">GPU score HMT_FROM_PREV_B is %f\n", HMT_FROM_PREV_B); -// printf(">GPU score HMT_FROM_PREV_K is %f\n", HMT_FROM_PREV_K); -// printf(">GPU newSkipScore is %f\n", newSkipScore); -// printf("Number of states is %i\n", n_states); - for (int c = 0; c < n_states; c++) { - printf("GPU> Value for row %i and col %i is %f\n", row, c, prevProbabilities[c]); - } - printf("HMT_FROM_SOFT = %f\n", HMT_FROM_SOFT); - printf("lp_mk = %f\n", lp_mk); - printf("lp_mb = %f\n", lp_mb); - printf("lp_mm_self = %f\n", lp_mm_self); - printf("lp_mm_next = %f\n", lp_mm_next); - printf("lp_bb = %f\n", lp_bb); - printf("lp_bk = %f\n", lp_bk); - printf("lp_bm_next = %f\n", lp_bm_next); - printf("lp_bm_self = %f\n", lp_bm_self); - printf("lp_kk = %f\n", lp_kk); - printf("lp_km = %f\n", lp_km); - - } +// if ((blockIdx.x == 2) && (threadIdx.x == 0)){ +//// printf("rank 
%i\n", rank); +//// printf("event mean %f\n", event_mean); +//// printf("poreModelLevelLogStdv %f\n", poreModelLevelLogStdv[0]); +//// printf("poreModelLevelStdv %f\n", poreModelLevelStdv[0]); +//// printf("poreModelLevelMean %f\n", poreModelLevelMean[0]); +//// printf("lp_emission_m is %f\n", lp_emission_m); +//// printf("PSR9_MATCH is %i\n", PSR9_MATCH); +//// printf(">GPU score HMT_FROM_SAME_M is %f\n", HMT_FROM_SAME_M); +//// printf(">GPU score HMT_FROM_PREV_M is %f\n", HMT_FROM_PREV_M); +//// printf(">GPU score HMT_FROM_SAME_B is %f\n", HMT_FROM_SAME_B); +//// printf(">GPU score HMT_FROM_PREV_B is %f\n", HMT_FROM_PREV_B); +//// printf(">GPU score HMT_FROM_PREV_K is %f\n", HMT_FROM_PREV_K); +//// printf(">GPU newSkipScore is %f\n", newSkipScore); +//// printf("Number of states is %i\n", n_states); +// for (int c = 0; c < n_states; c++) { +// printf("GPU> Value for row %i and col %i is %f\n", row, c, prevProbabilities[c]); +// } +// printf("HMT_FROM_SOFT = %f\n", HMT_FROM_SOFT); +// printf("lp_mk = %f\n", lp_mk); +// printf("lp_mb = %f\n", lp_mb); +// printf("lp_mm_self = %f\n", lp_mm_self); +// printf("lp_mm_next = %f\n", lp_mm_next); +// printf("lp_bb = %f\n", lp_bb); +// printf("lp_bk = %f\n", lp_bk); +// printf("lp_bm_next = %f\n", lp_bm_next); +// printf("lp_bm_self = %f\n", lp_bm_self); +// printf("lp_kk = %f\n", lp_kk); +// printf("lp_km = %f\n", lp_km); +// +// } } __syncthreads(); } @@ -576,7 +576,7 @@ std::vector scoreKernel(std::vector sequences, float* returnedValues; cudaHostAlloc(&returnedValues, num_reads * sizeof(float) , cudaHostAllocDefault); - printf("About to run getscores...\n"); + //printf("About to run getscores...\n"); getScores<<>>(eventMeansDev, eventsPerBaseDev, numRowsDev, @@ -663,11 +663,11 @@ std::vector GpuAligner::variantScoresThresholded(std::vector in std::vector v(variant_sequences.size()); for (int i=0; i& sequences, const HMMInputData& data, const uint32_t flags) { - printf("In profile_hmm_score set function...\n"); + 
//printf("In profile_hmm_score set function...\n"); assert(!sequences.empty()); assert(std::string(sequences[0].get_alphabet()->get_name()) == "nucleotide"); assert(std::string(data.pore_model->pmalphabet->get_name()) == "nucleotide"); diff --git a/src/hmm/nanopolish_profile_hmm_r9.inl b/src/hmm/nanopolish_profile_hmm_r9.inl index f402fd1c..001e44f4 100644 --- a/src/hmm/nanopolish_profile_hmm_r9.inl +++ b/src/hmm/nanopolish_profile_hmm_r9.inl @@ -304,7 +304,7 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, std::vector kmer_ranks(num_kmers); for(size_t ki = 0; ki < num_kmers; ++ki) { int kr = sequence.get_kmer_rank(ki, k, data.rc); // can * -1 here to see if 3rd is correct - printf(">CPU Kmer rank: %i\n", kr); + //printf(">CPU Kmer rank: %i\n", kr); kmer_ranks[ki] = kr; } @@ -365,16 +365,16 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, (event_idx == e_start || (flags & HAF_ALLOW_PRE_CLIP))) ? lp_sm + pre_flank[row - 1] : -INFINITY; - if (row == 2) { - printf("Working with matches in row 2\n"); - printf("HMT_FROM_SOFT IS %f\n", scores.x[HMT_FROM_SOFT]); - printf("Strand is %i\n", data.strand); - printf("bt.lp_mm_self %f\n", bt.lp_mm_self); - printf("bt.lp_mm_next %f\n", bt.lp_mm_next); - printf("bt.lp_bm_self %f\n", bt.lp_bm_self); - printf("bt.lp_bm_next %f\n", bt.lp_bm_next); - printf("bt.lp_km %f\n", bt.lp_km); - } + //if (row == 2) { + // printf("Working with matches in row 2\n"); + // printf("HMT_FROM_SOFT IS %f\n", scores.x[HMT_FROM_SOFT]); + // printf("Strand is %i\n", data.strand); + // printf("bt.lp_mm_self %f\n", bt.lp_mm_self); + // printf("bt.lp_mm_next %f\n", bt.lp_mm_next); + // printf("bt.lp_bm_self %f\n", bt.lp_bm_self); + // printf("bt.lp_bm_next %f\n", bt.lp_bm_next); + // printf("bt.lp_km %f\n", bt.lp_km); + //} output.update_cell(row, curr_block_offset + PSR9_MATCH, scores, lp_emission_m); diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 
037a4a60..ee3e6d05 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -352,7 +352,7 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali // get the scaled levels. std::vector scores = aligner.variantScoresThresholded(tmp_variants, test_haplotype, event_sequences, - alignment_flags, opt::screen_score_threshold, + alignment_flags, 100000,//opt::screen_score_threshold, opt::methylation_types); auto tf_gpu = std::chrono::high_resolution_clock::now(); gpu_exec += tf_gpu - t0_gpu; @@ -363,7 +363,7 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali test_haplotype, event_sequences, alignment_flags, - opt::screen_score_threshold, + 100000,//opt::screen_score_threshold, opt::methylation_types); auto t1 = std::chrono::high_resolution_clock::now(); scoring += t1-t0; From 10db85ad60ed15c7af54fd828f4e00a27b4dc6c9 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Wed, 4 Jul 2018 11:33:35 +0100 Subject: [PATCH 15/80] Fixed bug with overly-large host allocations --- Makefile | 4 ++-- src/cuda_kernels/GpuAligner.cu | 14 ++++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index 38ff3360..68fb07a9 100644 --- a/Makefile +++ b/Makefile @@ -9,13 +9,13 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz -CXXFLAGS ?= -O3 #-g +CXXFLAGS ?= -O3 -g CXXFLAGS += -std=c++11 -fopenmp -fsigned-char CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include #-g +NVCCFLAGS ?= -std=c++11 -I. 
-I/usr/local/cuda-9.0/include -g CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 75947314..b375f2e8 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -392,6 +392,7 @@ std::vector scoreKernel(std::vector sequences, std::vector> pre_flanks; std::vector> post_flanks; + int numEventsTotal = 0; for(auto e: event_sequences){ uint32_t e_start = e.event_start_idx; e_starts.push_back(e_start); @@ -407,6 +408,7 @@ std::vector scoreKernel(std::vector sequences, n_events = e_start - e_end + 1; n_rows.push_back(n_events + 1); + numEventsTotal += n_events + 1; // TODO: is +1 necessary? std::vector pre_flank = make_pre_flanking(e, e_start, n_events); std::vector post_flank = make_post_flanking(e, e_start, n_events); @@ -426,18 +428,18 @@ std::vector scoreKernel(std::vector sequences, // Buffer 1: Raw event data and associated starts and stops - size_t numEventsTotal = 0; + // size_t numEventsTotal; //1. Count the total number of events across all reads - std::vector eventLengths; + //std::vector eventLengths; std::vector eventsPerBase; for (auto e: event_sequences){ size_t numEvents = e.read->events->size(); float readEventsPerBase = e.read->events_per_base[e.strand]; - eventLengths.push_back(numEvents); + //eventLengths.push_back(numEvents); eventsPerBase.push_back(readEventsPerBase); - numEventsTotal += numEvents; + //numEventsTotal += numEvents; } @@ -457,7 +459,7 @@ std::vector scoreKernel(std::vector sequences, for(int j=0;jevents->size(); + size_t num_events = n_rows[j];//TODO: is this sometimes causing a segfault? is it correct? 
for (int i=0;iget_drift_scaled_level(event_idx, ev.strand); // send the data in drift scaled @@ -513,7 +515,7 @@ std::vector scoreKernel(std::vector sequences, float* poreModelLevelLogStdvDev; - cudaMalloc( (void**)&poreModelLevelLogStdvDev, pore_model_level_log_stdv.size() * sizeof(float)); + cudaMalloc( (void**)&poreModelLevelLogStdvDev, pore_model_level_log_stdv.size() * sizeof(float)); // for some reason this malloc is slow cudaMemcpyAsync( poreModelLevelLogStdvDev, pore_model_level_log_stdv.data(), pore_model_level_log_stdv.size() * sizeof(float), cudaMemcpyHostToDevice ); float* poreModelLevelMeanDev; From f5c0b4a6d46a73eade0fe01ec04a60c1b16280ed Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Wed, 4 Jul 2018 11:53:32 +0100 Subject: [PATCH 16/80] removed some print statements --- Makefile | 4 +-- src/cuda_kernels/GpuAligner.cu | 52 ---------------------------------- 2 files changed, 2 insertions(+), 54 deletions(-) diff --git a/Makefile b/Makefile index 68fb07a9..ed4eccff 100644 --- a/Makefile +++ b/Makefile @@ -9,13 +9,13 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz -CXXFLAGS ?= -O3 -g +CXXFLAGS ?= -O3 #-g CXXFLAGS += -std=c++11 -fopenmp -fsigned-char CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include -g +NVCCFLAGS ?= -std=c++11 -I. 
-I/usr/local/cuda-9.0/include -O3 #-g CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index b375f2e8..247e7e35 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -93,10 +93,6 @@ __global__ void getScores(float * eventData, } int e_offset = eventOffsets[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event - //if(blockIdx.x==2){ // read 2 is an RC read - // printf("Block IDX is %i and stride is %i\n", blockIdx.x, e_stride); - //} - int kmerIdx = threadIdx.x; uint32_t rank; @@ -204,12 +200,6 @@ __global__ void getScores(float * eventData, (event_idx == e_start || (HAF_ALLOW_PRE_CLIP))) ? lp_sm + preFlank : -INFINITY; // TODO: Add flag for HAF ALLOW_PRE_CLIP - //if (blockIdx.x == 2 && threadIdx.x == 0 && row == 2){ - // printf("HMT_FROM_SOFT should be (?) 
-5.99 but is in fact %f\n", HMT_FROM_SOFT); - // printf("event IDX is %i\n", event_idx); - // printf("e_start is %i\n", e_start); - //} - // calculate the score float sum = HMT_FROM_SAME_M; sum = logsumexpf(sum, HMT_FROM_SOFT); @@ -290,17 +280,6 @@ __global__ void getScores(float * eventData, float lp1 = lp_ms + prevProbabilities[curBlockOffset + PSR9_MATCH] + postFlank; float lp2 = lp_ms + prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] + postFlank; float lp3 = lp_ms + prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] + postFlank; -// -// printf(">GPU Post-clip transition on row %i, read %i, threadIdx is %i\n" -// "LP1=%f\n" -// "LP2=%f\n" -// "LP3=%f\n", -// row, -// blockIdx.x, -// threadIdx.x, -// lp1, -// lp2, -// lp3); end = returnValues[blockIdx.x]; end = logsumexpf(end, lp1); @@ -311,37 +290,6 @@ __global__ void getScores(float * eventData, // Now do the end state __syncthreads(); -// if ((blockIdx.x == 2) && (threadIdx.x == 0)){ -//// printf("rank %i\n", rank); -//// printf("event mean %f\n", event_mean); -//// printf("poreModelLevelLogStdv %f\n", poreModelLevelLogStdv[0]); -//// printf("poreModelLevelStdv %f\n", poreModelLevelStdv[0]); -//// printf("poreModelLevelMean %f\n", poreModelLevelMean[0]); -//// printf("lp_emission_m is %f\n", lp_emission_m); -//// printf("PSR9_MATCH is %i\n", PSR9_MATCH); -//// printf(">GPU score HMT_FROM_SAME_M is %f\n", HMT_FROM_SAME_M); -//// printf(">GPU score HMT_FROM_PREV_M is %f\n", HMT_FROM_PREV_M); -//// printf(">GPU score HMT_FROM_SAME_B is %f\n", HMT_FROM_SAME_B); -//// printf(">GPU score HMT_FROM_PREV_B is %f\n", HMT_FROM_PREV_B); -//// printf(">GPU score HMT_FROM_PREV_K is %f\n", HMT_FROM_PREV_K); -//// printf(">GPU newSkipScore is %f\n", newSkipScore); -//// printf("Number of states is %i\n", n_states); -// for (int c = 0; c < n_states; c++) { -// printf("GPU> Value for row %i and col %i is %f\n", row, c, prevProbabilities[c]); -// } -// printf("HMT_FROM_SOFT = %f\n", HMT_FROM_SOFT); -// printf("lp_mk = %f\n", 
lp_mk); -// printf("lp_mb = %f\n", lp_mb); -// printf("lp_mm_self = %f\n", lp_mm_self); -// printf("lp_mm_next = %f\n", lp_mm_next); -// printf("lp_bb = %f\n", lp_bb); -// printf("lp_bk = %f\n", lp_bk); -// printf("lp_bm_next = %f\n", lp_bm_next); -// printf("lp_bm_self = %f\n", lp_bm_self); -// printf("lp_kk = %f\n", lp_kk); -// printf("lp_km = %f\n", lp_km); -// -// } } __syncthreads(); } From 5a203a40cbb5c3a65cf7123195ca7fa42cfdd4ad Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Wed, 4 Jul 2018 12:57:35 +0100 Subject: [PATCH 17/80] removed some print statements --- src/cuda_kernels/GpuAligner.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 247e7e35..fd5e3706 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -526,7 +526,6 @@ std::vector scoreKernel(std::vector sequences, float* returnedValues; cudaHostAlloc(&returnedValues, num_reads * sizeof(float) , cudaHostAllocDefault); - //printf("About to run getscores...\n"); getScores<<>>(eventMeansDev, eventsPerBaseDev, numRowsDev, From 458a84cc775c43b0f0f2cc06712af8df2e507063 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Wed, 4 Jul 2018 15:11:06 +0100 Subject: [PATCH 18/80] Sharing a lot more memory --- Makefile | 4 +- src/cuda_kernels/GpuAligner.cu | 172 +++++++++++++++++++-------------- 2 files changed, 99 insertions(+), 77 deletions(-) diff --git a/Makefile b/Makefile index ed4eccff..0c08f211 100644 --- a/Makefile +++ b/Makefile @@ -9,13 +9,13 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz -CXXFLAGS ?= -O3 #-g +CXXFLAGS ?= -O3# -g CXXFLAGS += -std=c++11 -fopenmp -fsigned-char CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include -O3 #-g +NVCCFLAGS ?= -std=c++11 -I. 
-I/usr/local/cuda-9.0/include -O3# -g CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index fd5e3706..2be67114 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -303,9 +303,9 @@ GpuAligner::GpuAligner() n[i] = i; } -std::vector scoreKernel(std::vector sequences, - std::vector event_sequences, - uint32_t alignment_flags){ +std::vector> scoreKernel(std::vector sequences, + std::vector event_sequences, + uint32_t alignment_flags){ // Extract the pore model. //Let's assume that every event sequence has the same pore model @@ -324,14 +324,11 @@ std::vector scoreKernel(std::vector sequences, size_t num_models = sequences.size(); double num_model_penalty = log(num_models); - assert(num_models == 1); //this is temporary + assert(num_models != 1); //this is temporary - auto sequence = sequences[0]; // temporary. We are only going to score one sequence against a set of events for now. + //auto sequence = sequences[0]; // temporary. We are only going to score one sequence against a set of events for now. 
const uint32_t k = event_sequences[0].pore_model->k; //k is the kmerity - uint32_t n_kmers = sequence.length() - k + 1; //number of kmers in the sequence - - uint32_t n_states = PSR9_NUM_STATES * (n_kmers + 2); // + 2 for explicit terminal states std::vector n_rows; //number of rows in the DP table (n_events + 1) std::vector e_starts; //event starts @@ -365,13 +362,6 @@ std::vector scoreKernel(std::vector sequences, post_flanks.push_back(post_flank); } - std::vector kmer_ranks(n_kmers); - std::vector kmer_ranks_rc(n_kmers); - for(size_t ki = 0; ki < n_kmers; ++ki) { - kmer_ranks[ki] = sequences[0].get_kmer_rank(ki, k, false); - kmer_ranks_rc[ki] = sequences[0].get_kmer_rank(ki, k, true); - } - // Prepare raw data and send it over to the score calculator kernel // Buffer 1: Raw event data and associated starts and stops @@ -383,11 +373,8 @@ std::vector scoreKernel(std::vector sequences, for (auto e: event_sequences){ size_t numEvents = e.read->events->size(); float readEventsPerBase = e.read->events_per_base[e.strand]; - //eventLengths.push_back(numEvents); eventsPerBase.push_back(readEventsPerBase); - - //numEventsTotal += numEvents; } @@ -461,7 +448,6 @@ std::vector scoreKernel(std::vector sequences, cudaMemcpyAsync( varDev, var.data(), var.size() * sizeof(float), cudaMemcpyHostToDevice ); cudaMemcpyAsync( logVarDev, log_var.data(), log_var.size() * sizeof(float), cudaMemcpyHostToDevice ); - float* poreModelLevelLogStdvDev; cudaMalloc( (void**)&poreModelLevelLogStdvDev, pore_model_level_log_stdv.size() * sizeof(float)); // for some reason this malloc is slow cudaMemcpyAsync( poreModelLevelLogStdvDev, pore_model_level_log_stdv.data(), pore_model_level_log_stdv.size() * sizeof(float), cudaMemcpyHostToDevice ); @@ -495,13 +481,6 @@ std::vector scoreKernel(std::vector sequences, cudaMalloc( (void**)&numRowsDev, n_rows.size() * sizeof(int)); cudaMemcpyAsync( numRowsDev, n_rows.data(), n_rows.size() * sizeof(int), cudaMemcpyHostToDevice ); - int* kmerRanksDev; - int* 
kmerRanksRCDev; - cudaMalloc( (void**)&kmerRanksDev, kmer_ranks.size() * sizeof(int)); - cudaMalloc( (void**)&kmerRanksRCDev, kmer_ranks_rc.size() * sizeof(int)); - cudaMemcpyAsync( kmerRanksDev, kmer_ranks.data(), kmer_ranks.size() * sizeof(int), cudaMemcpyHostToDevice ); - cudaMemcpyAsync( kmerRanksRCDev, kmer_ranks_rc.data(), kmer_ranks_rc.size() * sizeof(int), cudaMemcpyHostToDevice ); - int* eventStartsDev; cudaMalloc( (void**)&eventStartsDev, e_starts.size() * sizeof(int)); cudaMemcpyAsync( eventStartsDev, e_starts.data(), e_starts.size() * sizeof(int), cudaMemcpyHostToDevice ); @@ -514,39 +493,78 @@ std::vector scoreKernel(std::vector sequences, cudaMalloc( (void**)&eventOffsetsDev, eventOffsets.size() * sizeof(int)); cudaMemcpyAsync( eventOffsetsDev, eventOffsets.data(), eventOffsets.size() * sizeof(int), cudaMemcpyHostToDevice ); - int num_blocks = n_states / PSR9_NUM_STATES; - uint32_t num_kmers = num_blocks - 2; // two terminal blocks. Not currently used but left here for now. - - dim3 dimBlock(num_blocks - 2); // One thread per state, not including Start and Terminal state. 
- dim3 dimGrid(num_reads); // let's look at only the first read - float * returnValuesDev; cudaMalloc((void **) &returnValuesDev, sizeof(float) * num_reads); //one score per read float* returnedValues; cudaHostAlloc(&returnedValues, num_reads * sizeof(float) , cudaHostAllocDefault); - getScores<<>>(eventMeansDev, - eventsPerBaseDev, - numRowsDev, - eventStartsDev, - eventStridesDev, - kmerRanksDev, - kmerRanksRCDev, - eventOffsetsDev, - poreModelLevelLogStdvDev, - poreModelLevelStdvDev, - poreModelLevelMeanDev, - scaleDev, - shiftDev, - varDev, - logVarDev, - preFlankingDev, - postFlankingDev, - returnValuesDev); - - cudaDeviceSynchronize(); - cudaMemcpyAsync(returnedValues, returnValuesDev, num_reads *sizeof(float), cudaMemcpyDeviceToHost); + uint8_t num_streams = sequences.size(); + cudaStream_t streams[num_streams]; + //float *data[num_streams]; + + + std::vector> results(sequences.size()); + for (int i =0; i kmer_ranks(n_kmers); + std::vector kmer_ranks_rc(n_kmers); + + for(size_t ki = 0; ki < n_kmers; ++ki) { + kmer_ranks[ki] = sequence.get_kmer_rank(ki, k, false); + kmer_ranks_rc[ki] = sequence.get_kmer_rank(ki, k, true); + } + + int num_blocks = n_states / PSR9_NUM_STATES; + uint32_t num_kmers = num_blocks - 2; // two terminal blocks. Not currently used but left here for now. + + dim3 dimBlock(num_blocks - 2); // One thread per state, not including Start and Terminal state. 
+ dim3 dimGrid(num_reads); // let's look at only the first read + + int *kmerRanksDev; + int *kmerRanksRCDev; + cudaMalloc((void **) &kmerRanksDev, kmer_ranks.size() * sizeof(int)); + cudaMalloc((void **) &kmerRanksRCDev, kmer_ranks_rc.size() * sizeof(int)); + cudaMemcpyAsync(kmerRanksDev, kmer_ranks.data(), kmer_ranks.size() * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpyAsync(kmerRanksRCDev, kmer_ranks_rc.data(), kmer_ranks_rc.size() * sizeof(int), + cudaMemcpyHostToDevice); + + getScores <<< dimGrid, dimBlock, 0>>> (eventMeansDev, + eventsPerBaseDev, + numRowsDev, + eventStartsDev, + eventStridesDev, + kmerRanksDev, + kmerRanksRCDev, + eventOffsetsDev, + poreModelLevelLogStdvDev, + poreModelLevelStdvDev, + poreModelLevelMeanDev, + scaleDev, + shiftDev, + varDev, + logVarDev, + preFlankingDev, + postFlankingDev, + returnValuesDev); + + cudaDeviceSynchronize(); + cudaMemcpyAsync(returnedValues, returnValuesDev, num_reads *sizeof(float), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + + cudaFree(kmerRanksDev); + cudaFree(kmerRanksRCDev); + + //Send all the scores back + //std::vector r(num_reads); + results[i].resize(num_reads); + for(int readIdx=0; readIdx scoreKernel(std::vector sequences, cudaFree(numRowsDev); cudaFree(eventStartsDev); cudaFree(eventStridesDev); - cudaFree(kmerRanksDev); - cudaFree(kmerRanksRCDev); cudaFree(eventOffsetsDev); cudaFree(poreModelLevelLogStdvDev); cudaFree(poreModelLevelStdvDev); @@ -569,14 +585,11 @@ std::vector scoreKernel(std::vector sequences, //Free host memory cudaFreeHost(eventMeans); + cudaFreeHost(preFlankingHost); + cudaFreeHost(postFlankingHost); + cudaFreeHost(returnedValues); - //Send all the scores back - std::vector r(num_reads); - for(int i=0; i GpuAligner::variantScoresThresholded(std::vector input_variants, @@ -596,31 +609,40 @@ std::vector GpuAligner::variantScoresThresholded(std::vector in } // Make methylated versions of each input sequence. 
Once for the base haplotype and once each for each variant - std::vector base_sequences = generate_methylated_alternatives(base_haplotype.get_sequence(), - methylation_types); - std::vector> variant_sequences; + + std::vector sequences; + + HMMInputSequence base_sequence = generate_methylated_alternatives(base_haplotype.get_sequence(), + methylation_types)[0]; //TODO: always 0? + + sequences.push_back(base_sequence); + + //std::vector> variant_sequences; for (auto v: variant_haplotypes){ - auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types); - variant_sequences.push_back(variant_sequence); + auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types)[0]; + sequences.push_back(variant_sequence); } - assert(base_sequences.size() == 1); + //assert(base_sequences.size() == 1); // return the sum of the score for the base sequences over all the event sequences - auto base_scores = scoreKernel(base_sequences, event_sequences, alignment_flags); + //auto base_scores = scoreKernel(base_sequences, event_sequences, alignment_flags); + + std::vector> scores = scoreKernel(sequences, event_sequences, alignment_flags); - std::vector v(variant_sequences.size()); - for (int i=0; i v(numVariants); // Thresholded score for each //(variant_sequences.size()); //TODO: Fix - temporary + uint32_t numScores = scores[0].size(); + for (int variantIndex=0; variantIndex Date: Thu, 5 Jul 2018 17:55:35 +0100 Subject: [PATCH 19/80] Kernel now fast but some numerical errors remain --- Makefile | 8 +- src/cuda_kernels/GpuAligner.cu | 309 +++++++++++++++---------------- src/cuda_kernels/GpuAligner.h | 44 ++++- src/nanopolish_call_variants.cpp | 9 +- 4 files changed, 192 insertions(+), 178 deletions(-) diff --git a/Makefile b/Makefile index 0c08f211..9494ceb5 100644 --- a/Makefile +++ b/Makefile @@ -9,13 +9,13 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali #Basic flags every build needs 
LIBS=-lz -CXXFLAGS ?= -O3# -g +CXXFLAGS ?= -O3 -g CXXFLAGS += -std=c++11 -fopenmp -fsigned-char CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include -O3# -g +NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include -O3 --default-stream per-thread -g -G CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code @@ -69,8 +69,10 @@ EIGEN_INCLUDE=-I./eigen/ # Include the src subdirectories NP_INCLUDE=$(addprefix -I./, $(SUBDIRS)) +CUDA_INCLUDE=-I/usr/local/cuda-9.0/include + # Add include flags -CPPFLAGS += $(H5_INCLUDE) $(HTS_INCLUDE) $(FAST5_INCLUDE) $(NP_INCLUDE) $(EIGEN_INCLUDE) +CPPFLAGS += $(H5_INCLUDE) $(HTS_INCLUDE) $(FAST5_INCLUDE) $(NP_INCLUDE) $(EIGEN_INCLUDE) $(CUDA_INCLUDE) # Main programs to build PROGRAM=nanopolish diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 2be67114..225effd9 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -4,7 +4,7 @@ #include #include "nanopolish_profile_hmm_r9.h" -#define MAX_STATES 1024 +#define MAX_STATES 128 __device__ float logsumexpf(float x, float y){ if(x == -INFINITY && y == -INFINITY){ @@ -65,7 +65,7 @@ __global__ void getScores(float * eventData, float * returnValues) { // Initialise the prev probability row, which is the row of the DP table - int n_kmers = blockDim.x; // Question: How does this deal with the case where the block is bigger than the sequence, such as if one variant is a deletion? + int n_kmers = blockDim.x; int n_states = n_kmers * PSR9_NUM_STATES + 2 * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. 
//initialise the return value @@ -98,7 +98,6 @@ __global__ void getScores(float * eventData, if (rc == true) { rank = kmer_ranks_rc[kmerIdx]; - //printf("Using an RC rank of %i\n", rank); }else{ rank = kmer_ranks[kmerIdx]; } @@ -231,7 +230,7 @@ __global__ void getScores(float * eventData, sum += lp_emission_b; float newBadEventScore = sum; - + __syncthreads(); // Write row out. prevProbabilities now becomes "current probabilities" for evaluating skips. prevProbabilities[curBlockOffset + PSR9_MATCH] = newMatchScore; prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] = newBadEventScore; @@ -295,24 +294,111 @@ __global__ void getScores(float * eventData, } +//Default constructor GpuAligner::GpuAligner() { - y = 20; - asize = y*sizeof(int); - for (int i=0; i> scoreKernel(std::vector sequences, - std::vector event_sequences, - uint32_t alignment_flags){ +//Destructor +GpuAligner::~GpuAligner() { + cudaFree(poreModelLevelMeanDev); + cudaFree(scaleDev); + cudaFree(shiftDev); + cudaFree(varDev); + cudaFree(logVarDev); + cudaFree(eventMeansDev); + cudaFree(eventsPerBaseDev); + cudaFree(numRowsDev); + cudaFree(eventStartsDev); + cudaFree(eventStridesDev); + cudaFree(eventOffsetsDev); + cudaFree(poreModelLevelLogStdvDev); + cudaFree(poreModelLevelStdvDev); + cudaFree(preFlankingDev); + cudaFree(postFlankingDev); - // Extract the pore model. - //Let's assume that every event sequence has the same pore model - //event_sequences[0].pore_model. 
+ cudaFreeHost(eventMeans); + cudaFreeHost(preFlankingHost); + cudaFreeHost(postFlankingHost); - int num_reads = event_sequences.size(); - // These asserts are here during the development phase + int max_num_sequences = 8; // should be a private variable + // Free device and host memory + for (int i =0; i> GpuAligner::scoreKernel(std::vector sequences, + std::vector event_sequences, + uint32_t alignment_flags){ + // pre-running asserts assert(!sequences.empty()); assert(std::string(sequences[0].get_alphabet()->get_name()) == "nucleotide"); for (auto e: event_sequences) { @@ -321,22 +407,21 @@ std::vector> scoreKernel(std::vector seque assert( (e.rc && e.event_stride == -1) || (!e.rc && e.event_stride == 1)); } - size_t num_models = sequences.size(); - double num_model_penalty = log(num_models); - - assert(num_models != 1); //this is temporary - - //auto sequence = sequences[0]; // temporary. We are only going to score one sequence against a set of events for now. - - const uint32_t k = event_sequences[0].pore_model->k; //k is the kmerity + int num_reads = event_sequences.size(); - std::vector n_rows; //number of rows in the DP table (n_events + 1) - std::vector e_starts; //event starts - std::vector event_strides; + // Extract the pore model. + // Assume that every event sequence has the same pore model + // event_sequences[0].pore_model. 
+ const uint32_t k = event_sequences[0].pore_model->k; //k is the length of a kmer + std::vector n_rows; //number of rows in the DP table (n_events + 1) for each read + std::vector e_starts; //event starts in the read for each read + std::vector event_strides; //event strides for each read std::vector> pre_flanks; std::vector> post_flanks; + std::vector eventsPerBase; + //Populate per-read vectors int numEventsTotal = 0; for(auto e: event_sequences){ uint32_t e_start = e.event_start_idx; @@ -352,53 +437,30 @@ std::vector> scoreKernel(std::vector seque else n_events = e_start - e_end + 1; - n_rows.push_back(n_events + 1); - numEventsTotal += n_events + 1; // TODO: is +1 necessary? + // TODO: is a +1 necessary here? + n_rows.push_back(n_events); + numEventsTotal += n_events; std::vector pre_flank = make_pre_flanking(e, e_start, n_events); std::vector post_flank = make_post_flanking(e, e_start, n_events); pre_flanks.push_back(pre_flank); post_flanks.push_back(post_flank); - } - - // Prepare raw data and send it over to the score calculator kernel - - // Buffer 1: Raw event data and associated starts and stops - // size_t numEventsTotal; - //1. 
Count the total number of events across all reads - //std::vector eventLengths; - std::vector eventsPerBase; - for (auto e: event_sequences){ - size_t numEvents = e.read->events->size(); float readEventsPerBase = e.read->events_per_base[e.strand]; - //eventLengths.push_back(numEvents); eventsPerBase.push_back(readEventsPerBase); } - - //Allocate a host buffer to store the event means, pre and post-flank data - float * eventMeans; - size_t eventMeansSize = numEventsTotal * sizeof(float); - cudaHostAlloc(&eventMeans, eventMeansSize , cudaHostAllocDefault); - - //Allocate a host buffer to store the event means, pre and post-flank data - float * preFlankingHost; - float * postFlankingHost; - cudaHostAlloc(&preFlankingHost, numEventsTotal * sizeof(float) , cudaHostAllocDefault); - cudaHostAlloc(&postFlankingHost, numEventsTotal * sizeof(float) , cudaHostAllocDefault); - + //Populate buffers for flanks and scaled means data std::vector eventOffsets; size_t offset = 0; - for(int j=0;jget_drift_scaled_level(event_idx, ev.strand); // send the data in drift scaled - //auto unscaled = ev.read->events[0][i].mean; //taking the first element. Not sure what the second one is.. + auto scaled = e.read->get_drift_scaled_level(event_idx, e.strand); // send the data in drift scaled eventMeans[offset + i] = scaled; preFlankingHost[offset + i] = pre_flanks[j][i]; //also copy over the pre-flanking data, since it has a 1-1 correspondence with events postFlankingHost[offset + i] = post_flanks[j][i]; //also copy over the pre-flanking data, since it has a 1-1 correspondence with events @@ -408,23 +470,22 @@ std::vector> scoreKernel(std::vector seque int num_states = event_sequences[0].pore_model->states.size(); + // Populate pore model buffers std::vector pore_model_level_log_stdv(num_states); std::vector pore_model_level_mean(num_states); std::vector pore_model_level_stdv(num_states); - - //TODO: Fix this. for(int st=0; ststates[st]; //TODO: Is this OK? 
+ auto params = event_sequences[0].pore_model->states[st]; pore_model_level_log_stdv[st] = params.level_log_stdv; pore_model_level_mean[st] = params.level_mean; pore_model_level_stdv[st] = params.level_stdv; } + //Populating read-statistics buffers std::vector scale(num_reads); std::vector shift(num_reads); std::vector var(num_reads); std::vector log_var(num_reads); - for (int i=0;iscalings[read.strand].scale; @@ -433,79 +494,31 @@ std::vector> scoreKernel(std::vector seque log_var[i] = event_sequences[i].read->scalings[read.strand].log_var; } - float* scaleDev; - float* shiftDev; - float* varDev; - float* logVarDev; - - cudaMalloc( (void**)&scaleDev, scale.size() * sizeof(float)); - cudaMalloc( (void**)&shiftDev, shift.size() * sizeof(float)); - cudaMalloc( (void**)&varDev, var.size() * sizeof(float)); - cudaMalloc( (void**)&logVarDev, log_var.size() * sizeof(float)); - + // Copy to the device all buffers shared across kmer sequences. cudaMemcpyAsync( scaleDev, scale.data(), scale.size() * sizeof(float), cudaMemcpyHostToDevice ); cudaMemcpyAsync( shiftDev, shift.data(), shift.size() * sizeof(float), cudaMemcpyHostToDevice ); cudaMemcpyAsync( varDev, var.data(), var.size() * sizeof(float), cudaMemcpyHostToDevice ); cudaMemcpyAsync( logVarDev, log_var.data(), log_var.size() * sizeof(float), cudaMemcpyHostToDevice ); - - float* poreModelLevelLogStdvDev; - cudaMalloc( (void**)&poreModelLevelLogStdvDev, pore_model_level_log_stdv.size() * sizeof(float)); // for some reason this malloc is slow cudaMemcpyAsync( poreModelLevelLogStdvDev, pore_model_level_log_stdv.data(), pore_model_level_log_stdv.size() * sizeof(float), cudaMemcpyHostToDevice ); - - float* poreModelLevelMeanDev; - cudaMalloc( (void**)&poreModelLevelMeanDev, pore_model_level_mean.size() * sizeof(float)); cudaMemcpyAsync( poreModelLevelMeanDev, pore_model_level_mean.data(), pore_model_level_mean.size() * sizeof(float), cudaMemcpyHostToDevice ); - - float* poreModelLevelStdvDev; - cudaMalloc( 
(void**)&poreModelLevelStdvDev, pore_model_level_stdv.size() * sizeof(float)); cudaMemcpyAsync( poreModelLevelStdvDev, pore_model_level_stdv.data(), pore_model_level_stdv.size() * sizeof(float), cudaMemcpyHostToDevice ); - - - float* eventsPerBaseDev; - cudaMalloc( (void**)&eventsPerBaseDev, eventsPerBase.size() * sizeof(float)); cudaMemcpyAsync( eventsPerBaseDev, eventsPerBase.data(), eventsPerBase.size() * sizeof(float), cudaMemcpyHostToDevice ); - - float* eventMeansDev; - cudaMalloc( (void**)&eventMeansDev, eventMeansSize); - cudaMemcpyAsync( eventMeansDev, eventMeans, eventMeansSize, cudaMemcpyHostToDevice ); //malloc is taking 300us - - float* preFlankingDev; - cudaMalloc( (void**)&preFlankingDev, eventMeansSize); - cudaMemcpyAsync( preFlankingDev, preFlankingHost, eventMeansSize, cudaMemcpyHostToDevice ); //malloc is taking 300us - - float* postFlankingDev; - cudaMalloc( (void**)&postFlankingDev, eventMeansSize); - cudaMemcpyAsync( postFlankingDev, postFlankingHost, eventMeansSize, cudaMemcpyHostToDevice ); //malloc is taking 300us - - int* numRowsDev; - cudaMalloc( (void**)&numRowsDev, n_rows.size() * sizeof(int)); + cudaMemcpyAsync( eventMeansDev, eventMeans, numEventsTotal * sizeof(float), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( preFlankingDev, preFlankingHost, numEventsTotal * sizeof(float), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( postFlankingDev, postFlankingHost, numEventsTotal * sizeof(float), cudaMemcpyHostToDevice ); cudaMemcpyAsync( numRowsDev, n_rows.data(), n_rows.size() * sizeof(int), cudaMemcpyHostToDevice ); - - int* eventStartsDev; - cudaMalloc( (void**)&eventStartsDev, e_starts.size() * sizeof(int)); cudaMemcpyAsync( eventStartsDev, e_starts.data(), e_starts.size() * sizeof(int), cudaMemcpyHostToDevice ); - - int* eventStridesDev; - cudaMalloc( (void**)&eventStridesDev, event_strides.size() * sizeof(int)); cudaMemcpyAsync( eventStridesDev, event_strides.data(), event_strides.size() * sizeof(int), cudaMemcpyHostToDevice ); - - int* 
eventOffsetsDev; - cudaMalloc( (void**)&eventOffsetsDev, eventOffsets.size() * sizeof(int)); cudaMemcpyAsync( eventOffsetsDev, eventOffsets.data(), eventOffsets.size() * sizeof(int), cudaMemcpyHostToDevice ); - float * returnValuesDev; - cudaMalloc((void **) &returnValuesDev, sizeof(float) * num_reads); //one score per read - - float* returnedValues; - cudaHostAlloc(&returnedValues, num_reads * sizeof(float) , cudaHostAllocDefault); + uint8_t MAX_NUM_KMERS = 100; - uint8_t num_streams = sequences.size(); - cudaStream_t streams[num_streams]; - //float *data[num_streams]; + for (int i =0; i> results(sequences.size()); - for (int i =0; i> scoreKernel(std::vector seque kmer_ranks_rc[ki] = sequence.get_kmer_rank(ki, k, true); } + assert(kmer_ranks.size() < MAX_NUM_KMERS); + cudaMemcpyAsync(kmerRanksDev, kmer_ranks.data(), kmer_ranks.size() * sizeof(int), + cudaMemcpyHostToDevice); + cudaMemcpyAsync(kmerRanksRCDev, kmer_ranks_rc.data(), kmer_ranks_rc.size() * sizeof(int), + cudaMemcpyHostToDevice); + int num_blocks = n_states / PSR9_NUM_STATES; uint32_t num_kmers = num_blocks - 2; // two terminal blocks. Not currently used but left here for now. dim3 dimBlock(num_blocks - 2); // One thread per state, not including Start and Terminal state. 
dim3 dimGrid(num_reads); // let's look at only the first read - int *kmerRanksDev; - int *kmerRanksRCDev; - cudaMalloc((void **) &kmerRanksDev, kmer_ranks.size() * sizeof(int)); - cudaMalloc((void **) &kmerRanksRCDev, kmer_ranks_rc.size() * sizeof(int)); - cudaMemcpyAsync(kmerRanksDev, kmer_ranks.data(), kmer_ranks.size() * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpyAsync(kmerRanksRCDev, kmer_ranks_rc.data(), kmer_ranks_rc.size() * sizeof(int), - cudaMemcpyHostToDevice); - - getScores <<< dimGrid, dimBlock, 0>>> (eventMeansDev, + getScores <<< dimGrid, dimBlock, 0, streams[i]>>> (eventMeansDev, eventsPerBaseDev, numRowsDev, eventStartsDev, @@ -550,45 +561,19 @@ std::vector> scoreKernel(std::vector seque preFlankingDev, postFlankingDev, returnValuesDev); + } - cudaDeviceSynchronize(); - cudaMemcpyAsync(returnedValues, returnValuesDev, num_reads *sizeof(float), cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); - - cudaFree(kmerRanksDev); - cudaFree(kmerRanksRCDev); + cudaDeviceSynchronize(); - //Send all the scores back - //std::vector r(num_reads); - results[i].resize(num_reads); - for(int readIdx=0; readIdx> results(sequences.size()); + for (int i =0; i +#include +#include #ifndef GPU_ALIGNER_H #define GPU_ALIGNER_H1 @@ -44,18 +46,42 @@ class GpuAligner { public: - int n[20]; - int y; - int asize; - GpuAligner(); - int calculateSum(); - void setY(int); + ~GpuAligner(); std::vector variantScoresThresholded(std::vector tmp_variants, Haplotype haplotype, std::vector event_sequences, - uint32_t alignment_flags, int screen_score_threshold, std::vector methylation_types);// { - //return std::vector(); - //} + uint32_t alignment_flags, int screen_score_threshold, std::vector methylation_types); + + std::vector> scoreKernel(std::vector sequences, + std::vector event_sequences, + uint32_t alignment_flags); +private: + float* poreModelLevelMeanDev; + float* scaleDev; + float* shiftDev; + float* varDev; + float* logVarDev; + float * eventMeans; + float * 
preFlankingHost; + float * postFlankingHost; + int* eventOffsetsDev; + int* eventStridesDev; + int* eventStartsDev; + int* numRowsDev; + float* postFlankingDev; + float* preFlankingDev; + float* eventMeansDev; + float* eventsPerBaseDev; + float* poreModelLevelStdvDev; + float* poreModelLevelLogStdvDev; + // Allocate arrays for storing results, kmerRanksDev and kmerRanksRCDev + + std::vector kmerRanksDevPointers; + std::vector kmerRanksRCDevPointers; + std::vector returnValuesDevResultsPointers; + std::vector returnValuesHostResultsPointers; + + cudaStream_t streams[8]; // TODO 8 should not be hardcoded here }; #endif // GPU_ALIGNER_H \ No newline at end of file diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index ee3e6d05..93f16d9b 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -130,7 +130,7 @@ namespace opt static int min_flanking_sequence = 30; static int max_haplotypes = 1000; static int max_rounds = 50; - static int screen_score_threshold = 100; + static int screen_score_threshold = 1000; static int screen_flanking_sequence = 10; static int debug_alignments = 0; static std::vector methylation_types; @@ -294,6 +294,8 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali auto scoring = std::chrono::high_resolution_clock::now() - std::chrono::high_resolution_clock::now(); auto gpu_exec = std::chrono::high_resolution_clock::now() - std::chrono::high_resolution_clock::now(); + GpuAligner aligner; + for(size_t i = region_start; i < region_end; ++i) { int calling_start = i - opt::screen_flanking_sequence; @@ -347,12 +349,11 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali calling_start, alignments.get_reference_substring(contig, calling_start, calling_end)); - GpuAligner aligner; auto t0_gpu = std::chrono::high_resolution_clock::now(); // get the scaled levels. 
std::vector scores = aligner.variantScoresThresholded(tmp_variants, test_haplotype, event_sequences, - alignment_flags, 100000,//opt::screen_score_threshold, + alignment_flags, opt::screen_score_threshold, opt::methylation_types); auto tf_gpu = std::chrono::high_resolution_clock::now(); gpu_exec += tf_gpu - t0_gpu; @@ -363,7 +364,7 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali test_haplotype, event_sequences, alignment_flags, - 100000,//opt::screen_score_threshold, + opt::screen_score_threshold, opt::methylation_types); auto t1 = std::chrono::high_resolution_clock::now(); scoring += t1-t0; From 348fcf01717c78492e443b51c5e33e9b17210604 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Fri, 6 Jul 2018 15:43:35 +0100 Subject: [PATCH 20/80] Fixed bug which was causing incorrect forward strand results --- Makefile | 4 +- src/cuda_kernels/GpuAligner.cu | 96 ++++++++++++++++++++------- src/hmm/nanopolish_profile_hmm_r9.inl | 11 ++- 3 files changed, 83 insertions(+), 28 deletions(-) diff --git a/Makefile b/Makefile index 9494ceb5..aaf8cbc2 100644 --- a/Makefile +++ b/Makefile @@ -9,13 +9,13 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz -CXXFLAGS ?= -O3 -g +CXXFLAGS ?= -O0 -g CXXFLAGS += -std=c++11 -fopenmp -fsigned-char CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include -O3 --default-stream per-thread -g -G +NVCCFLAGS ?= -std=c++11 -I. 
-I/usr/local/cuda-9.0/include -O0 --default-stream per-thread -g -G CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 225effd9..8f9577b0 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -64,21 +64,37 @@ __global__ void getScores(float * eventData, float * postFlankingDev, float * returnValues) { + bool debug = false; + if(threadIdx.x==0 && blockIdx.x==0){ + debug=true; + } // Initialise the prev probability row, which is the row of the DP table + int n_kmers = blockDim.x; int n_states = n_kmers * PSR9_NUM_STATES + 2 * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. - //initialise the return value + //initialise the return value// Better to do this in a register returnValues[blockIdx.x] = -INFINITY; + __syncthreads(); __shared__ float prevProbabilities[MAX_STATES]; - // Initialise the previous probabilities + // Initialise the previous probabilities - this may not be quite correct as the intialization is different to the C++ version but I don't think it matter for (int i = 0; i < n_states - PSR9_NUM_STATES; i++) { prevProbabilities[i] = -INFINITY; } for (int i = n_states - PSR9_NUM_STATES; i < n_states; i++) { - prevProbabilities[i] = 0; // Is this correct? + prevProbabilities[i] = 0.0f; // Is this correct? + } + + if(debug==true){ + printf("Number of kmers is: %i\n", n_kmers); + printf("n_states is: %i\n", n_states); + printf("***\n"); + printf("Prev probabilities row has been intialised to: \n"); + for (int i = 0; i < n_states; i++) { + printf("Element %i = %f\n", i, prevProbabilities[i]); + } } //Step 1: calculate transitions. For now we are going to use external params. @@ -87,11 +103,12 @@ __global__ void getScores(float * eventData, int numRows = numRowsPerRead[readIdx]; // Number of rows in this DP table. 
int e_start = eventStarts[readIdx]; // Event start for read int e_stride = eventStrides[readIdx]; + int e_offset = eventOffsets[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event + bool rc = false; if (e_stride == -1){ rc = true; } - int e_offset = eventOffsets[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event int kmerIdx = threadIdx.x; uint32_t rank; @@ -154,17 +171,31 @@ __global__ void getScores(float * eventData, float var = varDev[readIdx]; float logVar = logVarDev[readIdx]; - for(int row=1; row> GpuAligner::scoreKernel(std::vectork; //k is the length of a kmer - std::vector n_rows; //number of rows in the DP table (n_events + 1) for each read + std::vector n_rows; //number of rows in the DP table (n_events) for each read std::vector e_starts; //event starts in the read for each read std::vector event_strides; //event strides for each read std::vector> pre_flanks; @@ -468,17 +513,17 @@ std::vector> GpuAligner::scoreKernel(std::vectorstates.size(); - // Populate pore model buffers + // Assume that every event sequence has the same pore model + int num_states = event_sequences[0].pore_model->states.size(); std::vector pore_model_level_log_stdv(num_states); std::vector pore_model_level_mean(num_states); std::vector pore_model_level_stdv(num_states); for(int st=0; ststates[st]; - pore_model_level_log_stdv[st] = params.level_log_stdv; - pore_model_level_mean[st] = params.level_mean; + pore_model_level_log_stdv[st] = params.level_log_stdv; //TODO: I am seeing level log stdv and level stdv return the same value. need to investigate this. 
pore_model_level_stdv[st] = params.level_stdv; + pore_model_level_mean[st] = params.level_mean; } //Populating read-statistics buffers @@ -511,12 +556,13 @@ std::vector> GpuAligner::scoreKernel(std::vector lp_emission_m %f\n", lp_emission_m); + float lp_emission_b = BAD_EVENT_PENALTY; HMMUpdateScores scores; @@ -376,6 +380,11 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, // printf("bt.lp_km %f\n", bt.lp_km); //} + if(row==1 && block == 1) { + printf("CPU> lp_emission_m %f\n", lp_emission_m); + printf("Rank is %i\n", rank); + } + output.update_cell(row, curr_block_offset + PSR9_MATCH, scores, lp_emission_m); // state PSR9_BAD_EVENT From 0719a9b61730ead964824f3d4810302ae70ae502 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Fri, 6 Jul 2018 16:05:07 +0100 Subject: [PATCH 21/80] tidyup --- src/common/nanopolish_variant.cpp | 1 - src/cuda_kernels/GpuAligner.cu | 55 +-------------------------- src/hmm/nanopolish_profile_hmm_r9.inl | 39 ------------------- 3 files changed, 2 insertions(+), 93 deletions(-) diff --git a/src/common/nanopolish_variant.cpp b/src/common/nanopolish_variant.cpp index 357c7fae..b73a6b2b 100644 --- a/src/common/nanopolish_variant.cpp +++ b/src/common/nanopolish_variant.cpp @@ -686,7 +686,6 @@ Variant score_variant_thresholded(const Variant& input_variant, if(fabs(total_score) < score_threshold) { // Calculate scores using the base nucleotide model - //printf("Working with input %i\n", j); double base_score = profile_hmm_score_set(base_sequences, input[j], alignment_flags); double variant_score = profile_hmm_score_set(variant_sequences, input[j], alignment_flags); diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 8f9577b0..82581f04 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -87,16 +87,6 @@ __global__ void getScores(float * eventData, prevProbabilities[i] = 0.0f; // Is this correct? 
} - if(debug==true){ - printf("Number of kmers is: %i\n", n_kmers); - printf("n_states is: %i\n", n_states); - printf("***\n"); - printf("Prev probabilities row has been intialised to: \n"); - for (int i = 0; i < n_states; i++) { - printf("Element %i = %f\n", i, prevProbabilities[i]); - } - } - //Step 1: calculate transitions. For now we are going to use external params. int readIdx = blockIdx.x; float read_events_per_base = readEventsPerBase[readIdx]; @@ -171,15 +161,6 @@ __global__ void getScores(float * eventData, float var = varDev[readIdx]; float logVar = logVarDev[readIdx]; - if (debug==true){ - printf("Number of rows is : %i\n", numRows); - printf("Event data offset is : %i\n", e_offset); - printf("Event start is %i\n", e_start); - printf("Stride: %i\n", e_stride); - printf("RC: %d\n", rc); - printf("First Kmer (should be 6 something and *not* 295) %i\n", kmer_ranks[0]); - } - for(int row=1; row> GpuAligner::scoreKernel(std::vectorCPU e_start: %i\n", e_start); // Calculate number of blocks // A block of the HMM is a set of states for one kmer uint32_t num_blocks = output.get_num_columns() / PSR9_NUM_STATES; // num_columns is the number of HMM STATES @@ -304,7 +303,6 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, std::vector kmer_ranks(num_kmers); for(size_t ki = 0; ki < num_kmers; ++ki) { int kr = sequence.get_kmer_rank(ki, k, data.rc); // can * -1 here to see if 3rd is correct - //printf(">CPU Kmer rank: %i\n", kr); kmer_ranks[ki] = kr; } @@ -326,10 +324,6 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, // Fill in matrix for(uint32_t row = 1; row < output.get_num_rows(); row++) { - for (int col = 0; col< output.get_num_columns();col++){ - printf("Row = %i, col = %i, val = %f\n", row - 1, col, output.get(row -1,col)); - } - // Skip the first block which is the start state, it was initialized above // Similarily skip the last block, which is calculated in the terminate() function for(uint32_t 
block = 1; block < num_blocks - 1; block++) { @@ -369,22 +363,6 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, (event_idx == e_start || (flags & HAF_ALLOW_PRE_CLIP))) ? lp_sm + pre_flank[row - 1] : -INFINITY; - //if (row == 2) { - // printf("Working with matches in row 2\n"); - // printf("HMT_FROM_SOFT IS %f\n", scores.x[HMT_FROM_SOFT]); - // printf("Strand is %i\n", data.strand); - // printf("bt.lp_mm_self %f\n", bt.lp_mm_self); - // printf("bt.lp_mm_next %f\n", bt.lp_mm_next); - // printf("bt.lp_bm_self %f\n", bt.lp_bm_self); - // printf("bt.lp_bm_next %f\n", bt.lp_bm_next); - // printf("bt.lp_km %f\n", bt.lp_km); - //} - - if(row==1 && block == 1) { - printf("CPU> lp_emission_m %f\n", lp_emission_m); - printf("Rank is %i\n", rank); - } - output.update_cell(row, curr_block_offset + PSR9_MATCH, scores, lp_emission_m); // state PSR9_BAD_EVENT @@ -394,11 +372,8 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, scores.x[HMT_FROM_PREV_B] = -INFINITY; scores.x[HMT_FROM_PREV_K] = -INFINITY; scores.x[HMT_FROM_SOFT] = -INFINITY; - //printf("before: %f:\n", output.get(row, curr_block_offset + PSR9_BAD_EVENT)); output.update_cell(row, curr_block_offset + PSR9_BAD_EVENT, scores, lp_emission_b); - //printf("after: %f:\n", output.get(row, curr_block_offset + PSR9_BAD_EVENT)); - // in cu this is where the shared memory sync on prev states would go. // state PSR9_KMER_SKIP scores.x[HMT_FROM_SAME_M] = -INFINITY; scores.x[HMT_FROM_PREV_M] = bt.lp_mk + output.get(row, prev_block_offset + PSR9_MATCH); @@ -413,7 +388,6 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, // last kmer/event match. 
if(kmer_idx == last_kmer_idx && ( (flags & HAF_ALLOW_POST_CLIP) || row == last_event_row_idx)) { - //printf(">CPU Post-clip transition on row %i\n", row); float lp1 = lp_ms + output.get(row, curr_block_offset + PSR9_MATCH) + post_flank[row - 1]; float lp2 = lp_ms + output.get(row, curr_block_offset + PSR9_BAD_EVENT) + post_flank[row - 1]; float lp3 = lp_ms + output.get(row, curr_block_offset + PSR9_KMER_SKIP) + post_flank[row - 1]; @@ -421,12 +395,6 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, output.update_end(lp1, row, curr_block_offset + PSR9_MATCH); output.update_end(lp2, row, curr_block_offset + PSR9_BAD_EVENT); output.update_end(lp3, row, curr_block_offset + PSR9_KMER_SKIP); - - //printf(">LP1 %f\n", lp1); - //printf(">LP2 %f\n", lp2); - //printf(">LP3 %f\n", lp3); - //printf(">end %f\n", output.get_end()); - } @@ -464,13 +432,6 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, } } - for(uint32_t row = 1; row < output.get_num_rows(); row++) { - //for (int col=0; col Value for row %i and col %i is %f\n", row, col, output.get(row, col)); - // } - } - - return output.get_end(); } From 712e0685e1cc33472f223f6a120fd8c6f7b7dd01 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Fri, 6 Jul 2018 17:38:51 +0100 Subject: [PATCH 22/80] some performance improvments --- Makefile | 4 +-- src/cuda_kernels/GpuAligner.cu | 46 ++++++++++++++++++-------------- src/cuda_kernels/GpuAligner.h | 2 ++ src/nanopolish_call_variants.cpp | 3 --- 4 files changed, 30 insertions(+), 25 deletions(-) diff --git a/Makefile b/Makefile index aaf8cbc2..060645e9 100644 --- a/Makefile +++ b/Makefile @@ -9,13 +9,13 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz -CXXFLAGS ?= -O0 -g +CXXFLAGS ?= -O3 # -g CXXFLAGS += -std=c++11 -fopenmp -fsigned-char CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. 
-I/usr/local/cuda-9.0/include -O0 --default-stream per-thread -g -G +NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include -O3 -use_fast_math --default-stream per-thread #-g -G CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 82581f04..0dcabfd0 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -299,6 +299,8 @@ GpuAligner::GpuAligner() int maxEventsPerBase = 100; int totalEvents = maxEventsPerBase * max_num_reads; + poreModelInitialized = false; + cudaMalloc( (void**)&poreModelLevelMeanDev, numModelElements * sizeof(float)); cudaMalloc( (void**)&poreModelLevelLogStdvDev, numModelElements * sizeof(float)); cudaMalloc( (void**)&poreModelLevelStdvDev, numModelElements * sizeof(float)); @@ -493,9 +495,6 @@ std::vector> GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector> results(sequences.size()); - for (int i =0; i GpuAligner::variantScoresThresholded(std::vector in sequences.push_back(base_sequence); - //std::vector> variant_sequences; - for (auto v: variant_haplotypes){ auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types)[0]; sequences.push_back(variant_sequence); } - //assert(base_sequences.size() == 1); - - // return the sum of the score for the base sequences over all the event sequences - //auto base_scores = scoreKernel(base_sequences, event_sequences, alignment_flags); - std::vector> scores = scoreKernel(sequences, event_sequences, alignment_flags); - std::vector v(numVariants); // Thresholded score for each //(variant_sequences.size()); //TODO: Fix - temporary + std::vector v(numVariants); uint32_t numScores = scores[0].size(); - for (int variantIndex=0; variantIndex kmerRanksDevPointers; diff --git 
a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 93f16d9b..ad24b5be 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -280,7 +280,6 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali int region_end, uint32_t alignment_flags) { - std::cout << "CHECKPOINT 13" << std::endl; auto start = std::chrono::high_resolution_clock::now(); std::vector out_variants; @@ -375,8 +374,6 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali } } - std::cout << "CHECKPOINT 14 - Region end - start ength= " << region_end - region_start << std::endl; - auto end = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast( end - start ).count(); From d6be1c617c6a2034120fd3bd35efaf2ae79f4abb Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Mon, 9 Jul 2018 17:18:14 +0100 Subject: [PATCH 23/80] Fix error and tidy up --- Makefile | 4 +- src/common/nanopolish_variant.cpp | 2 +- src/cuda_kernels/GpuAligner.cu | 36 +++++++-------- src/cuda_kernels/GpuAligner.h | 2 +- src/main/nanopolish.cpp | 2 - src/nanopolish_call_variants.cpp | 73 ++++++++++++------------------- 6 files changed, 51 insertions(+), 68 deletions(-) diff --git a/Makefile b/Makefile index 060645e9..5adcd6eb 100644 --- a/Makefile +++ b/Makefile @@ -9,13 +9,13 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz -CXXFLAGS ?= -O3 # -g +CXXFLAGS ?= -O3 CXXFLAGS += -std=c++11 -fopenmp -fsigned-char CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include -O3 -use_fast_math --default-stream per-thread #-g -G +NVCCFLAGS ?= -std=c++11 -I. 
-I/usr/local/cuda-9.0/include -O3 -use_fast_math --default-stream per-thread CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code diff --git a/src/common/nanopolish_variant.cpp b/src/common/nanopolish_variant.cpp index b73a6b2b..725a62ab 100644 --- a/src/common/nanopolish_variant.cpp +++ b/src/common/nanopolish_variant.cpp @@ -664,7 +664,7 @@ std::vector multi_call(VariantGroup& variant_group, // Variant score_variant_thresholded(const Variant& input_variant, Haplotype base_haplotype, - const std::vector& input, // raw reads (I think) + const std::vector& input, const uint32_t alignment_flags, const uint32_t score_threshold, const std::vector& methylation_types) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 0dcabfd0..7489ec7c 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -29,9 +29,6 @@ __device__ float lp_match_r9(int rank, // STEP 1: GET DRIFT-SCALED LEVEL: float level = mean; - // TODO: Apply scaling to these 3 model values as is done in the CPP implementation - //these can just be pulled from the model - float gaussian_mean = scale * poreModelLevelMean[rank] + shift; float gaussian_stdv = poreModelLevelStdv[rank] * var; float gaussian_log_level_stdv = poreModelLevelLogStdv[rank] + logVar; @@ -399,6 +396,7 @@ std::vector> GpuAligner::scoreKernel(std::vectorget_name()) == "nucleotide"); for (auto e: event_sequences) { assert(std::string(e.pore_model->pmalphabet->get_name()) == "nucleotide"); @@ -585,7 +583,7 @@ std::vector> GpuAligner::scoreKernel(std::vector GpuAligner::variantScoresThresholded(std::vector input_variants, +std::vector GpuAligner::variantScoresThresholded(std::vector input_variants, Haplotype base_haplotype, std::vector event_sequences, uint32_t alignment_flags, @@ -606,29 +604,31 @@ std::vector GpuAligner::variantScoresThresholded(std::vector in std::vector sequences; 
HMMInputSequence base_sequence = generate_methylated_alternatives(base_haplotype.get_sequence(), - methylation_types)[0]; //TODO: always 0? + methylation_types)[0]; //TODO: fix for non-zero sequences.push_back(base_sequence); for (auto v: variant_haplotypes){ - auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types)[0]; + auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types)[0]; //TODO: fix for non-zero sequences.push_back(variant_sequence); } - std::vector> scores = scoreKernel(sequences, event_sequences, alignment_flags); - - std::vector v(numVariants); - - uint32_t numScores = scores[0].size(); - for (int variantIndex=0; variantIndex v = input_variants; + + if (!event_sequences.empty()) { + std::vector> scores = scoreKernel(sequences, event_sequences, alignment_flags); + uint32_t numScores = scores[0].size(); + for (int variantIndex = 0; variantIndex < numVariants; variantIndex++) { // index 0 is the base scores + double totalScore = 0.0; + for (int k = 0; k < numScores; k++) { + if (fabs(totalScore) < screen_score_threshold) { + double baseScore = scores[0][k]; + totalScore += (scores[variantIndex + 1][k] - baseScore); + } } + v[variantIndex].quality = totalScore; + v[variantIndex].info = ""; } - v[variantIndex] = totalScore; } return v; diff --git a/src/cuda_kernels/GpuAligner.h b/src/cuda_kernels/GpuAligner.h index 1f804fab..25df67a4 100644 --- a/src/cuda_kernels/GpuAligner.h +++ b/src/cuda_kernels/GpuAligner.h @@ -49,7 +49,7 @@ class GpuAligner GpuAligner(); ~GpuAligner(); - std::vector + std::vector variantScoresThresholded(std::vector tmp_variants, Haplotype haplotype, std::vector event_sequences, uint32_t alignment_flags, int screen_score_threshold, std::vector methylation_types); diff --git a/src/main/nanopolish.cpp b/src/main/nanopolish.cpp index cc6fcab7..d25df269 100644 --- a/src/main/nanopolish.cpp +++ b/src/main/nanopolish.cpp @@ -64,7 +64,6 @@ int main(int argc, char** 
argv) { // Turn off HDF's exception printing, which is generally unhelpful for users H5Eset_auto(0, NULL, NULL); - std::cout << "CHECKPOINT 1\n"; int ret = 0; if(argc <= 1) { @@ -75,7 +74,6 @@ int main(int argc, char** argv) std::string command(argv[1]); auto iter = programs.find(command); if (iter != programs.end()) { - std::cout << "CHECKPOINT 2: " << iter->first <second(argc - 1, argv + 1); } else diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index ad24b5be..1d9f147e 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -134,6 +134,8 @@ namespace opt static int screen_flanking_sequence = 10; static int debug_alignments = 0; static std::vector methylation_types; + static int gpu = 0; + } static const char* shortopts = "r:b:g:t:w:o:e:m:c:d:a:x:q:p:v"; @@ -145,6 +147,7 @@ enum { OPT_HELP = 1, OPT_SNPS_ONLY, OPT_CALC_ALL_SUPPORT, OPT_CONSENSUS, + OPT_GPU, OPT_FIX_HOMOPOLYMERS, OPT_GENOTYPE, OPT_MODELS_FOFN, @@ -181,6 +184,7 @@ static const struct option longopts[] = { { "p-bad", required_argument, NULL, OPT_P_BAD }, { "p-bad-self", required_argument, NULL, OPT_P_BAD_SELF }, { "consensus", required_argument, NULL, OPT_CONSENSUS }, + { "gpu", required_argument, NULL, OPT_GPU }, { "faster", no_argument, NULL, OPT_FASTER }, { "fix-homopolymers", no_argument, NULL, OPT_FIX_HOMOPOLYMERS }, { "calculate-all-support", no_argument, NULL, OPT_CALC_ALL_SUPPORT }, @@ -349,27 +353,30 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali alignments.get_reference_substring(contig, calling_start, calling_end)); auto t0_gpu = std::chrono::high_resolution_clock::now(); - // get the scaled levels. 
- - std::vector scores = aligner.variantScoresThresholded(tmp_variants, test_haplotype, event_sequences, - alignment_flags, opt::screen_score_threshold, - opt::methylation_types); - auto tf_gpu = std::chrono::high_resolution_clock::now(); - gpu_exec += tf_gpu - t0_gpu; - - for(const Variant& v : tmp_variants) { - auto t0 = std::chrono::high_resolution_clock::now(); - Variant scored_variant = score_variant_thresholded(v, - test_haplotype, - event_sequences, - alignment_flags, - opt::screen_score_threshold, - opt::methylation_types); - auto t1 = std::chrono::high_resolution_clock::now(); - scoring += t1-t0; - scored_variant.info = ""; - if(scored_variant.quality > 0) { - out_variants.push_back(scored_variant); + + if (opt::gpu){ + std::vector scoredVariants = aligner.variantScoresThresholded(tmp_variants, test_haplotype, event_sequences, + alignment_flags, opt::screen_score_threshold, + opt::methylation_types); + for (auto variant: scoredVariants){ + if (variant.quality > 0) { + out_variants.push_back(variant); + } + } + } else { + for (const Variant &v : tmp_variants) { + auto t0 = std::chrono::high_resolution_clock::now(); + Variant scored_variant = score_variant_thresholded(v, + test_haplotype, + event_sequences, + alignment_flags, + opt::screen_score_threshold, + opt::methylation_types); + auto t1 = std::chrono::high_resolution_clock::now(); + scored_variant.info = ""; + if (scored_variant.quality > 0) { + out_variants.push_back(scored_variant); + } } } } @@ -382,12 +389,6 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali auto gpu_screening = std::chrono::duration_cast(gpu_exec).count(); - std::cout << "FUNCTION TOOK " << duration << "ms" << std::endl; - std::cout << "SCREENING (CPU) COMPONENT TOOK " << screening << "ms" << std::endl; - std::cout << "SCREENING (GPU) COMPONENT TOOK " << gpu_screening << "ms" << std::endl; - - - return out_variants; } @@ -938,7 +939,6 @@ Haplotype call_variants_for_region(const std::string& contig, int 
region_start, alignments.get_region_start(), alignments.get_reference()); */ - std::cout<<"CHECKPOINT 8 - Data loaded"< candidate_variants; if(opt::candidates_file.empty()) { @@ -947,16 +947,12 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, candidate_variants = read_variants_for_region(opt::candidates_file, contig, region_start, region_end); } - std::cout<<"CHECKPOINT 9 - Candidate variants generated"< single_base_edits = generate_candidate_single_base_edits(alignments, region_start, region_end, alignment_flags); - std::cout<<"CHECKPOINT 11 - Single base edits generated"< dedup_set(candidate_variants.begin(), candidate_variants.end()); @@ -965,17 +961,12 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, std::sort(candidate_variants.begin(), candidate_variants.end(), sortByPosition); } - std::cout<<"CHECKPOINT 10 - Additional candidate variants generated"<> opt::num_threads; break; case 'v': opt::verbose++; break; case OPT_CONSENSUS: arg >> opt::consensus_output; opt::consensus_mode = 1; break; + case OPT_GPU: opt::gpu = 1; break; case OPT_FIX_HOMOPOLYMERS: opt::fix_homopolymers = 1; break; case OPT_EFFORT: arg >> opt::screen_score_threshold; break; case OPT_FASTER: opt::screen_score_threshold = 25; break; @@ -1168,7 +1160,6 @@ int call_variants_main(int argc, char** argv) int end_base; int contig_length = -1; - std::cout << "Checkpoint 3" << std::endl; // If a window has been specified, only call variants/polish in that range if(!opt::window.empty()) { // Parse the window string @@ -1204,8 +1195,6 @@ int call_variants_main(int argc, char** argv) out_fp = stdout; } - std::cout << "Checkpoint 4" << std::endl; - // Build the VCF header std::vector tag_fields; @@ -1240,14 +1229,10 @@ int call_variants_main(int argc, char** argv) Variant::make_vcf_tag_string("FORMAT", "GT", 1, "String", "Genotype")); - std::cout << "Checkpoint 5" << std::endl; - Variant::write_vcf_header(out_fp, tag_fields); Haplotype 
haplotype = call_variants_for_region(contig, start_base, end_base, out_fp); - std::cout << "Checkpoint 6" << std::endl; - if(out_fp != stdout) { fclose(out_fp); } From ca3af6e4b91c4bbeb14b043ce21625d7089db8a9 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Tue, 10 Jul 2018 13:26:42 +0100 Subject: [PATCH 24/80] tidy up --- src/cuda_kernels/GpuAligner.cu | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 7489ec7c..bedff299 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -25,20 +25,18 @@ __device__ float lp_match_r9(int rank, float logVar, bool debug = false){ - float log_inv_sqrt_2pi = log(0.3989422804014327); // no need to calculate this every time. better solutions available.. + float log_inv_sqrt_2pi = log(0.3989422804014327); - // STEP 1: GET DRIFT-SCALED LEVEL: float level = mean; float gaussian_mean = scale * poreModelLevelMean[rank] + shift; float gaussian_stdv = poreModelLevelStdv[rank] * var; float gaussian_log_level_stdv = poreModelLevelLogStdv[rank] + logVar; - // Step 3: calculate log-normal PDF - float a = (level - gaussian_mean) / gaussian_stdv; // g is the gaussian parameters + float a = (level - gaussian_mean) / gaussian_stdv; - float emission = log_inv_sqrt_2pi - gaussian_log_level_stdv + (-0.5f * a * a); // log_inv_sqrt_2pi is defined in a comment above + float emission = log_inv_sqrt_2pi - gaussian_log_level_stdv + (-0.5f * a * a); - return emission; // log_inv_sqrt_2pi is defined in a comment above + return emission; } @@ -288,13 +286,10 @@ __global__ void getScores(float * eventData, } -//Default constructor GpuAligner::GpuAligner() { int numModelElements = 4096; int max_num_reads = 300; - int maxEventsPerBase = 100; - int totalEvents = maxEventsPerBase * max_num_reads; poreModelInitialized = false; @@ -332,9 +327,6 @@ GpuAligner::GpuAligner() returnValuesDevResultsPointers.resize(max_num_sequences); 
returnValuesHostResultsPointers.resize(max_num_sequences); - uint8_t num_streams = max_num_sequences; - - for (int i =0; i> GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector GpuAligner::variantScoresThresholded(std::vector i } return v; -} +} \ No newline at end of file From 27fe62735a67c7e95619ccac2013b3231567f3f6 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Tue, 10 Jul 2018 14:02:14 +0100 Subject: [PATCH 25/80] small performance improvments --- src/cuda_kernels/GpuAligner.cu | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index bedff299..8a2714f7 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -64,13 +64,11 @@ __global__ void getScores(float * eventData, debug=true; } // Initialise the prev probability row, which is the row of the DP table - int n_kmers = blockDim.x; int n_states = n_kmers * PSR9_NUM_STATES + 2 * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. - //initialise the return value// Better to do this in a register - returnValues[blockIdx.x] = -INFINITY; - __syncthreads(); + __shared__ float returnValue; + returnValue = -INFINITY; __shared__ float prevProbabilities[MAX_STATES]; @@ -222,7 +220,7 @@ __global__ void getScores(float * eventData, sum += lp_emission_b; float newBadEventScore = sum; - __syncthreads(); + // Write row out. prevProbabilities now becomes "current probabilities" for evaluating skips. prevProbabilities[curBlockOffset + PSR9_MATCH] = newMatchScore; prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] = newBadEventScore; @@ -241,7 +239,6 @@ __global__ void getScores(float * eventData, sum = logsumexpf(sum, HMT_FROM_PREV_B); sum = logsumexpf(sum, HMT_FROM_PREV_K); sum = logsumexpf(sum, HMT_FROM_SOFT); - sum += 0.0; //No emission. redundant. 
float newSkipScore = sum; @@ -249,20 +246,21 @@ __global__ void getScores(float * eventData, __syncthreads(); //Now need to do the skip-skip transition, which is serial so for now letting one thread execute it. + if (threadIdx.x == 0){ - for (int blkidx=2; blkidx <= blockDim.x; blkidx++){ + int firstBlockIdx = 2; + float prevSkipScore; prevSkipScore = prevProbabilities[(firstBlockIdx - 1) * PSR9_NUM_STATES + PSR9_KMER_SKIP]; + for (int blkidx = firstBlockIdx; blkidx <= blockDim.x; blkidx++){ auto skipIdx = blkidx * PSR9_NUM_STATES + PSR9_KMER_SKIP; - float prevSkipScore = prevProbabilities[skipIdx - PSR9_NUM_STATES]; - float curSkipScore = prevProbabilities[skipIdx]; + float curSkipScore = prevProbabilities[skipIdx + PSR9_KMER_SKIP]; HMT_FROM_PREV_K = lp_kk + prevSkipScore; newSkipScore = logsumexpf(curSkipScore, HMT_FROM_PREV_K); prevProbabilities[skipIdx] = newSkipScore; + prevSkipScore = newSkipScore; __syncthreads(); } } - __syncthreads(); - int lastKmerIdx = n_kmers -1; int lastRowIdx = numRows -1; float end; @@ -272,17 +270,16 @@ __global__ void getScores(float * eventData, float lp2 = lp_ms + prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] + postFlank; float lp3 = lp_ms + prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] + postFlank; - end = returnValues[blockIdx.x]; + end = returnValue; end = logsumexpf(end, lp1); end = logsumexpf(end, lp2); end = logsumexpf(end, lp3); - returnValues[blockIdx.x] = end; + returnValue = end; } - // Now do the end state - __syncthreads(); - } - __syncthreads(); + } + returnValues[blockIdx.x] = returnValue; + __syncthreads(); } From 0e7fdcb68eca7dd2316125abc93dc0913af3e750 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Tue, 10 Jul 2018 14:49:07 +0100 Subject: [PATCH 26/80] tidyup --- src/cuda_kernels/GpuAligner.cu | 74 +++++++++++++++------------------- 1 file changed, 32 insertions(+), 42 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 8a2714f7..c261381f 100644 --- 
a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -22,8 +22,7 @@ __device__ float lp_match_r9(int rank, float scale, float shift, float var, - float logVar, - bool debug = false){ + float logVar){ float log_inv_sqrt_2pi = log(0.3989422804014327); @@ -33,9 +32,7 @@ __device__ float lp_match_r9(int rank, float gaussian_log_level_stdv = poreModelLevelLogStdv[rank] + logVar; float a = (level - gaussian_mean) / gaussian_stdv; - float emission = log_inv_sqrt_2pi - gaussian_log_level_stdv + (-0.5f * a * a); - return emission; } @@ -59,10 +56,6 @@ __global__ void getScores(float * eventData, float * postFlankingDev, float * returnValues) { - bool debug = false; - if(threadIdx.x==0 && blockIdx.x==0){ - debug=true; - } // Initialise the prev probability row, which is the row of the DP table int n_kmers = blockDim.x; int n_states = n_kmers * PSR9_NUM_STATES + 2 * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. @@ -169,8 +162,7 @@ __global__ void getScores(float * eventData, scale, shift, var, - logVar, - debug); + logVar); float lp_emission_b = BAD_EVENT_PENALTY; @@ -189,7 +181,7 @@ __global__ void getScores(float * eventData, // with a penalty; float HMT_FROM_SOFT = (kmerIdx == 0 && (event_idx == e_start || - (HAF_ALLOW_PRE_CLIP))) ? lp_sm + preFlank : -INFINITY; // TODO: Add flag for HAF ALLOW_PRE_CLIP + (HAF_ALLOW_PRE_CLIP))) ? 
lp_sm + preFlank : -INFINITY; // calculate the score float sum = HMT_FROM_SAME_M; @@ -205,19 +197,19 @@ __global__ void getScores(float * eventData, // Calculate the bad event scores // state PSR9_BAD_EVENT HMT_FROM_SAME_M = lp_mb + prevProbabilities[curBlockOffset + PSR9_MATCH]; - HMT_FROM_PREV_M = -INFINITY; // not allowed + HMT_FROM_PREV_M = -INFINITY; HMT_FROM_SAME_B = lp_bb + prevProbabilities[curBlockOffset + PSR9_BAD_EVENT]; HMT_FROM_PREV_B = -INFINITY; HMT_FROM_PREV_K = -INFINITY; HMT_FROM_SOFT = -INFINITY; sum = HMT_FROM_SAME_M; - sum = logsumexpf(sum, HMT_FROM_PREV_M); sum = logsumexpf(sum, HMT_FROM_SAME_B); - sum = logsumexpf(sum, HMT_FROM_PREV_B); - sum = logsumexpf(sum, HMT_FROM_PREV_K); - sum = logsumexpf(sum, HMT_FROM_SOFT); sum += lp_emission_b; + //sum = logsumexpf(sum, HMT_FROM_PREV_B); + //sum = logsumexpf(sum, HMT_FROM_PREV_K); + //sum = logsumexpf(sum, HMT_FROM_SOFT); + //sum = logsumexpf(sum, HMT_FROM_PREV_M); float newBadEventScore = sum; @@ -233,12 +225,12 @@ __global__ void getScores(float * eventData, HMT_FROM_PREV_B = lp_bk + prevProbabilities[prevBlockOffset + PSR9_BAD_EVENT]; HMT_FROM_SOFT = -INFINITY; - sum = HMT_FROM_SAME_M; - sum = logsumexpf(sum, HMT_FROM_PREV_M); - sum = logsumexpf(sum, HMT_FROM_SAME_B); + sum = HMT_FROM_PREV_M; sum = logsumexpf(sum, HMT_FROM_PREV_B); sum = logsumexpf(sum, HMT_FROM_PREV_K); - sum = logsumexpf(sum, HMT_FROM_SOFT); + //sum = logsumexpf(sum, HMT_FROM_SAME_M); + //sum = logsumexpf(sum, HMT_FROM_SAME_B); + //sum = logsumexpf(sum, HMT_FROM_SOFT); float newSkipScore = sum; @@ -246,7 +238,6 @@ __global__ void getScores(float * eventData, __syncthreads(); //Now need to do the skip-skip transition, which is serial so for now letting one thread execute it. 
- if (threadIdx.x == 0){ int firstBlockIdx = 2; float prevSkipScore; prevSkipScore = prevProbabilities[(firstBlockIdx - 1) * PSR9_NUM_STATES + PSR9_KMER_SKIP]; @@ -257,7 +248,6 @@ __global__ void getScores(float * eventData, newSkipScore = logsumexpf(curSkipScore, HMT_FROM_PREV_K); prevProbabilities[skipIdx] = newSkipScore; prevSkipScore = newSkipScore; - __syncthreads(); } } @@ -290,28 +280,28 @@ GpuAligner::GpuAligner() poreModelInitialized = false; - cudaMalloc( (void**)&poreModelLevelMeanDev, numModelElements * sizeof(float)); - cudaMalloc( (void**)&poreModelLevelLogStdvDev, numModelElements * sizeof(float)); - cudaMalloc( (void**)&poreModelLevelStdvDev, numModelElements * sizeof(float)); + cudaMalloc((void**)&poreModelLevelMeanDev, numModelElements * sizeof(float)); + cudaMalloc((void**)&poreModelLevelLogStdvDev, numModelElements * sizeof(float)); + cudaMalloc((void**)&poreModelLevelStdvDev, numModelElements * sizeof(float)); - cudaMalloc( (void**)&scaleDev, max_num_reads * sizeof(float)); - cudaMalloc( (void**)&shiftDev, max_num_reads * sizeof(float)); - cudaMalloc( (void**)&varDev, max_num_reads * sizeof(float)); - cudaMalloc( (void**)&logVarDev, max_num_reads * sizeof(float)); + cudaMalloc((void**)&scaleDev, max_num_reads * sizeof(float)); + cudaMalloc((void**)&shiftDev, max_num_reads * sizeof(float)); + cudaMalloc((void**)&varDev, max_num_reads * sizeof(float)); + cudaMalloc((void**)&logVarDev, max_num_reads * sizeof(float)); cudaMalloc( (void**)&eventsPerBaseDev, max_num_reads * sizeof(float)); int max_n_rows = 100; int maxBuffer = 50000 * sizeof(float); //TODO: allocate more smartly - cudaMalloc( (void**)&numRowsDev, max_n_rows * sizeof(int)); - cudaMalloc( (void**)&eventStartsDev, maxBuffer); - cudaMalloc( (void**)&eventStridesDev, maxBuffer); - cudaMalloc( (void**)&eventOffsetsDev, maxBuffer); + cudaMalloc((void**)&numRowsDev, max_n_rows * sizeof(int)); + cudaMalloc((void**)&eventStartsDev, maxBuffer); + cudaMalloc((void**)&eventStridesDev, maxBuffer); 
+ cudaMalloc((void**)&eventOffsetsDev, maxBuffer); - cudaMalloc( (void**)&eventMeansDev, maxBuffer); - cudaMalloc( (void**)&preFlankingDev, maxBuffer); - cudaMalloc( (void**)&postFlankingDev, maxBuffer); + cudaMalloc((void**)&eventMeansDev, maxBuffer); + cudaMalloc((void**)&preFlankingDev, maxBuffer); + cudaMalloc((void**)&postFlankingDev, maxBuffer); //Allocate a host buffer to store the event means, pre and post-flank data cudaHostAlloc(&eventMeans, maxBuffer , cudaHostAllocDefault); @@ -325,16 +315,16 @@ GpuAligner::GpuAligner() returnValuesHostResultsPointers.resize(max_num_sequences); for (int i =0; i Date: Tue, 10 Jul 2018 15:01:47 +0100 Subject: [PATCH 27/80] Update README.md --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 1821d495..ba7039a6 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,12 @@ # Nanopolish +## GPU acceleration branch - experimental/Work in progress + +This branch adds CUDA-enabled GPU acceleration to the nanopolish consensus improvement algorithm. To try this feature run with the `--gpu` flag e.g: + +../nanopolish variants --consensus polished_gpu.fa -w "tig00000001:200000-230000" -r reads.fasta -b reads.sorted.bam -g draft.fa --threads=8 --gpu=1 + + [![Build Status](https://travis-ci.org/jts/nanopolish.svg?branch=master)](https://travis-ci.org/jts/nanopolish) Software package for signal-level analysis of Oxford Nanopore sequencing data. Nanopolish can calculate an improved consensus sequence for a draft genome assembly, detect base modifications, call SNPs and indels with respect to a reference genome and more (see Nanopolish modules, below). 
From 677c94b2d98159bec20bd53509246288cbf15f23 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Tue, 10 Jul 2018 15:02:00 +0100 Subject: [PATCH 28/80] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ba7039a6..aebd9835 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,9 @@ ## GPU acceleration branch - experimental/Work in progress This branch adds CUDA-enabled GPU acceleration to the nanopolish consensus improvement algorithm. To try this feature run with the `--gpu` flag e.g: - +``` ../nanopolish variants --consensus polished_gpu.fa -w "tig00000001:200000-230000" -r reads.fasta -b reads.sorted.bam -g draft.fa --threads=8 --gpu=1 - +``` [![Build Status](https://travis-ci.org/jts/nanopolish.svg?branch=master)](https://travis-ci.org/jts/nanopolish) From 213b8ebf89898bda16eb7c2a537e57a01085182a Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Tue, 10 Jul 2018 15:02:29 +0100 Subject: [PATCH 29/80] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index aebd9835..9609577b 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ This branch adds CUDA-enabled GPU acceleration to the nanopolish consensus improvement algorithm. 
To try this feature run with the `--gpu` flag e.g: ``` -../nanopolish variants --consensus polished_gpu.fa -w "tig00000001:200000-230000" -r reads.fasta -b reads.sorted.bam -g draft.fa --threads=8 --gpu=1 +nanopolish variants --consensus polished_gpu.fa -w "tig00000001:200000-230000" -r reads.fasta -b reads.sorted.bam -g draft.fa --threads=8 --gpu=1 ``` [![Build Status](https://travis-ci.org/jts/nanopolish.svg?branch=master)](https://travis-ci.org/jts/nanopolish) From 33d3b56419b3c15812b96b570dfa72caf41ba55a Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Tue, 10 Jul 2018 15:09:37 +0100 Subject: [PATCH 30/80] tidup --- src/cuda_kernels/GpuAligner.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index c261381f..e33ea674 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -132,8 +132,6 @@ __global__ void getScores(float * eventData, // Start filling out the "DP table" // Each thread is going to work on an individual P-HMM Block - // WRONG - need to use threadIdx & think carefully. we have one thread per block/kmer. each block has 3 states tho. - //int kmerIdx = blockIdx.x; int curBlockIdx = kmerIdx + 1; // Accounts for fact that we are not working with start block. 
int prevBlockIdx = curBlockIdx -1; int prevBlockOffset = PSR9_NUM_STATES * prevBlockIdx; From dbd79064e498f69eaa480e0e96c4cfb617c86561 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Tue, 10 Jul 2018 15:45:38 +0100 Subject: [PATCH 31/80] typo fix --- src/cuda_kernels/GpuAligner.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cuda_kernels/GpuAligner.h b/src/cuda_kernels/GpuAligner.h index 25df67a4..1a82e492 100644 --- a/src/cuda_kernels/GpuAligner.h +++ b/src/cuda_kernels/GpuAligner.h @@ -41,7 +41,7 @@ #include #ifndef GPU_ALIGNER_H -#define GPU_ALIGNER_H1 +#define GPU_ALIGNER_H class GpuAligner { From 29bf0603ba62279e91f1f4dd5345dcc1e20eaa5e Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Wed, 11 Jul 2018 09:39:49 +0100 Subject: [PATCH 32/80] Storing kmer ranks in one buffer --- src/cuda_kernels/GpuAligner.cu | 63 ++++++++++++++++++++++------------ src/cuda_kernels/GpuAligner.h | 4 ++- 2 files changed, 45 insertions(+), 22 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index e33ea674..6504499f 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -307,11 +307,17 @@ GpuAligner::GpuAligner() cudaHostAlloc(&postFlankingHost, maxBuffer, cudaHostAllocDefault); int max_num_sequences = 8; + int max_sequence_length = 50; kmerRanksDevPointers.resize(max_num_sequences); kmerRanksRCDevPointers.resize(max_num_sequences); returnValuesDevResultsPointers.resize(max_num_sequences); returnValuesHostResultsPointers.resize(max_num_sequences); + // Populate host buffer with kmer ranks + int numKmers = max_sequence_length * max_num_sequences; + cudaHostAlloc(&kmerRanks, numKmers * 2 * sizeof(int), cudaHostAllocDefault); + cudaMalloc((void**)&kmerRanksDev, numKmers * 2 * sizeof(int)); + for (int i =0; i> GpuAligner::scoreKernel(std::vector kmer_ranks(n_kmers); - std::vector kmer_ranks_rc(n_kmers); - - for(size_t ki = 0; ki < n_kmers; ++ki) { - kmer_ranks[ki] = sequence.get_kmer_rank(ki, k, 
false); - kmer_ranks_rc[ki] = sequence.get_kmer_rank(ki, k, true); - } - - assert(kmer_ranks.size() < MAX_NUM_KMERS); - cudaMemcpyAsync(kmerRanksDev, kmer_ranks.data(), kmer_ranks.size() * sizeof(int), - cudaMemcpyHostToDevice, streams[i]); - cudaMemcpyAsync(kmerRanksRCDev, kmer_ranks_rc.data(), kmer_ranks_rc.size() * sizeof(int), - cudaMemcpyHostToDevice, streams[i]); - int num_blocks = n_states / PSR9_NUM_STATES; dim3 dimBlock(num_blocks - 2); // One thread per state, not including Start and Terminal state. @@ -523,8 +543,8 @@ std::vector> GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector> results(sequences.size()); for (size_t i =0; i event_sequences, uint32_t alignment_flags); private: - float* poreModelLevelMeanDev; float* scaleDev; float* shiftDev; float* varDev; @@ -75,6 +74,9 @@ class GpuAligner float* eventsPerBaseDev; float* poreModelLevelStdvDev; float* poreModelLevelLogStdvDev; + float* poreModelLevelMeanDev; + int * kmerRanks; + int * kmerRanksDev; bool poreModelInitialized; // Allocate arrays for storing results, kmerRanksDev and kmerRanksRCDev From 39fec2bfe329011d41483b4af29f579cdd038ae2 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Wed, 11 Jul 2018 11:12:53 +0100 Subject: [PATCH 33/80] fixed a makefile error --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5adcd6eb..5d96b37b 100644 --- a/Makefile +++ b/Makefile @@ -124,7 +124,7 @@ depend: .depend .depend: $(CPP_SRC) $(C_SRC) $(CU_SRC) $(EXE_SRC) $(H5_LIB) $(EIGEN_CHECK) rm -f ./.depend - $(CXX) $(CXXFLAGS) $(CPPFLAGS) $(NVCCFLAGS) $(NVCC) -MM $(CPP_SRC) $(C_SRC) $(CU_SRC) > ./.depend; + $(CXX) $(CXXFLAGS) $(CPPFLAGS) -MM $(CPP_SRC) $(C_SRC) > ./.depend; include .depend From c6414ccd8ac7d78a29558ed4b58868f81db258ab Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 12 Jul 2018 11:38:31 +0100 Subject: [PATCH 34/80] Some simple CUDA API error reporting --- src/cuda_kernels/GpuAligner.cu | 103 
++++++++++++++++----------------- 1 file changed, 50 insertions(+), 53 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 6504499f..b9bb9636 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -6,6 +6,10 @@ #define MAX_STATES 128 +#define EXPAND_TO_STRING(X) #X +#define TO_STRING(X) EXPAND_TO_STRING(X) +#define CU_CHECK_ERR(X) if (X != cudaSuccess){printf("CUDA error: %s at line %s\n", cudaGetErrorString(X), TO_STRING(__LINE__));} + __device__ float logsumexpf(float x, float y){ if(x == -INFINITY && y == -INFINITY){ return -INFINITY; @@ -278,33 +282,28 @@ GpuAligner::GpuAligner() poreModelInitialized = false; - cudaMalloc((void**)&poreModelLevelMeanDev, numModelElements * sizeof(float)); - cudaMalloc((void**)&poreModelLevelLogStdvDev, numModelElements * sizeof(float)); - cudaMalloc((void**)&poreModelLevelStdvDev, numModelElements * sizeof(float)); - - cudaMalloc((void**)&scaleDev, max_num_reads * sizeof(float)); - cudaMalloc((void**)&shiftDev, max_num_reads * sizeof(float)); - cudaMalloc((void**)&varDev, max_num_reads * sizeof(float)); - cudaMalloc((void**)&logVarDev, max_num_reads * sizeof(float)); - - cudaMalloc( (void**)&eventsPerBaseDev, max_num_reads * sizeof(float)); + CU_CHECK_ERR(cudaMalloc((void**)&poreModelLevelMeanDev, numModelElements * sizeof(float))); + CU_CHECK_ERR(cudaMalloc((void**)&poreModelLevelLogStdvDev, numModelElements * sizeof(float))); + CU_CHECK_ERR(cudaMalloc((void**)&poreModelLevelStdvDev, numModelElements * sizeof(float))); + CU_CHECK_ERR(cudaMalloc((void**)&scaleDev, max_num_reads * sizeof(float))); + CU_CHECK_ERR(cudaMalloc((void**)&shiftDev, max_num_reads * sizeof(float))); + CU_CHECK_ERR(cudaMalloc((void**)&varDev, max_num_reads * sizeof(float))); + CU_CHECK_ERR(cudaMalloc((void**)&logVarDev, max_num_reads * sizeof(float))); + CU_CHECK_ERR(cudaMalloc( (void**)&eventsPerBaseDev, max_num_reads * sizeof(float))); int max_n_rows = 100; int maxBuffer = 50000 * 
sizeof(float); //TODO: allocate more smartly - cudaMalloc((void**)&numRowsDev, max_n_rows * sizeof(int)); - cudaMalloc((void**)&eventStartsDev, maxBuffer); - cudaMalloc((void**)&eventStridesDev, maxBuffer); - cudaMalloc((void**)&eventOffsetsDev, maxBuffer); - - cudaMalloc((void**)&eventMeansDev, maxBuffer); - cudaMalloc((void**)&preFlankingDev, maxBuffer); - cudaMalloc((void**)&postFlankingDev, maxBuffer); - - //Allocate a host buffer to store the event means, pre and post-flank data - cudaHostAlloc(&eventMeans, maxBuffer , cudaHostAllocDefault); - cudaHostAlloc(&preFlankingHost, maxBuffer, cudaHostAllocDefault); - cudaHostAlloc(&postFlankingHost, maxBuffer, cudaHostAllocDefault); + CU_CHECK_ERR(cudaMalloc((void**)&numRowsDev, max_n_rows * sizeof(int))); + CU_CHECK_ERR(cudaMalloc((void**)&eventStartsDev, maxBuffer)); + CU_CHECK_ERR(cudaMalloc((void**)&eventStridesDev, maxBuffer)); + CU_CHECK_ERR(cudaMalloc((void**)&eventOffsetsDev, maxBuffer)); + CU_CHECK_ERR(cudaMalloc((void**)&eventMeansDev, maxBuffer)); + CU_CHECK_ERR(cudaMalloc((void**)&preFlankingDev, maxBuffer)); + CU_CHECK_ERR(cudaMalloc((void**)&postFlankingDev, maxBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&eventMeans, maxBuffer , cudaHostAllocDefault)); + CU_CHECK_ERR(cudaHostAlloc(&preFlankingHost, maxBuffer, cudaHostAllocDefault)); + CU_CHECK_ERR(cudaHostAlloc(&postFlankingHost, maxBuffer, cudaHostAllocDefault)); int max_num_sequences = 8; int max_sequence_length = 50; @@ -324,11 +323,10 @@ GpuAligner::GpuAligner() float * returnValuesDev; float * returnedValues; - cudaMalloc((void**)&returnValuesDev, sizeof(float) * max_num_reads); //one score per read - cudaHostAlloc(&returnedValues, max_num_reads * sizeof(float) , cudaHostAllocDefault); - - cudaMalloc((void**)&kmerRanksDev, max_n_rows * sizeof(int)); - cudaMalloc((void**)&kmerRanksRCDev, max_n_rows * sizeof(int)); + CU_CHECK_ERR(cudaMalloc((void**)&returnValuesDev, sizeof(float) * max_num_reads)); //one score per read + 
CU_CHECK_ERR(cudaHostAlloc(&returnedValues, max_num_reads * sizeof(float) , cudaHostAllocDefault)); + CU_CHECK_ERR(cudaMalloc((void**)&kmerRanksDev, max_n_rows * sizeof(int))); + CU_CHECK_ERR(cudaMalloc((void**)&kmerRanksRCDev, max_n_rows * sizeof(int))); kmerRanksDevPointers[i] = kmerRanksDev; kmerRanksRCDevPointers[i] = kmerRanksRCDev; @@ -342,34 +340,33 @@ GpuAligner::GpuAligner() //Destructor GpuAligner::~GpuAligner() { - cudaFree(poreModelLevelMeanDev); - cudaFree(scaleDev); - cudaFree(shiftDev); - cudaFree(varDev); - cudaFree(logVarDev); - cudaFree(eventMeansDev); - cudaFree(eventsPerBaseDev); - cudaFree(numRowsDev); - cudaFree(eventStartsDev); - cudaFree(eventStridesDev); - cudaFree(eventOffsetsDev); - cudaFree(poreModelLevelLogStdvDev); - cudaFree(poreModelLevelStdvDev); - cudaFree(preFlankingDev); - cudaFree(postFlankingDev); - - cudaFreeHost(eventMeans); - cudaFreeHost(preFlankingHost); - cudaFreeHost(postFlankingHost); + CU_CHECK_ERR(cudaFree(poreModelLevelMeanDev)); + CU_CHECK_ERR(cudaFree(scaleDev)); + CU_CHECK_ERR(cudaFree(shiftDev)); + CU_CHECK_ERR(cudaFree(varDev)); + CU_CHECK_ERR(cudaFree(logVarDev)); + CU_CHECK_ERR(cudaFree(eventMeansDev)); + CU_CHECK_ERR(cudaFree(eventsPerBaseDev)); + CU_CHECK_ERR(cudaFree(numRowsDev)); + CU_CHECK_ERR(cudaFree(eventStartsDev)); + CU_CHECK_ERR(cudaFree(eventStridesDev)); + CU_CHECK_ERR(cudaFree(eventOffsetsDev)); + CU_CHECK_ERR(cudaFree(poreModelLevelLogStdvDev)); + CU_CHECK_ERR(cudaFree(poreModelLevelStdvDev)); + CU_CHECK_ERR(cudaFree(preFlankingDev)); + CU_CHECK_ERR(cudaFree(postFlankingDev)); + CU_CHECK_ERR(cudaFree(kmerRanksDev)); + CU_CHECK_ERR(cudaFreeHost(eventMeans)); + CU_CHECK_ERR(cudaFreeHost(preFlankingHost)); + CU_CHECK_ERR(cudaFreeHost(postFlankingHost)); + CU_CHECK_ERR(cudaFreeHost(kmerRanks)); int max_num_sequences = 8; // should be a private variable // Free device and host memory for (int i =0; i GpuAligner::variantScoresThresholded(std::vector i } return v; -} \ No newline at end of file +} From 
188de17f689236dbd198b866c2c970263500a404 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 12 Jul 2018 14:21:46 +0100 Subject: [PATCH 35/80] One buffer for pore model --- src/cuda_kernels/GpuAligner.cu | 66 ++++++++++++++-------------------- src/cuda_kernels/GpuAligner.h | 5 ++- 2 files changed, 31 insertions(+), 40 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index b9bb9636..e12ab8a8 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -20,9 +20,7 @@ __device__ float logsumexpf(float x, float y){ __device__ float lp_match_r9(int rank, float mean, - float * poreModelLevelLogStdv, - float * poreModelLevelStdv, - float * poreModelLevelMean, + float * poreModelDev, float scale, float shift, float var, @@ -31,9 +29,9 @@ __device__ float lp_match_r9(int rank, float log_inv_sqrt_2pi = log(0.3989422804014327); float level = mean; - float gaussian_mean = scale * poreModelLevelMean[rank] + shift; - float gaussian_stdv = poreModelLevelStdv[rank] * var; - float gaussian_log_level_stdv = poreModelLevelLogStdv[rank] + logVar; + float gaussian_mean = scale * poreModelDev[rank * 3] + shift; + float gaussian_stdv = poreModelDev[rank * 3 + 1] * var; + float gaussian_log_level_stdv = poreModelDev[rank * 3 + 2] + logVar; float a = (level - gaussian_mean) / gaussian_stdv; float emission = log_inv_sqrt_2pi - gaussian_log_level_stdv + (-0.5f * a * a); @@ -49,9 +47,7 @@ __global__ void getScores(float * eventData, int * kmer_ranks, int * kmer_ranks_rc, int * eventOffsets, // Offset to use for getting an event IDX for a specific read (read obtained by block IDX) - float * poreModelLevelLogStdv, - float * poreModelLevelStdv, - float * poreModelLevelMean, + float * poreModelDev, float * scaleDev, float * shiftDev, float * varDev, @@ -156,11 +152,9 @@ __global__ void getScores(float * eventData, float preFlank = preFlankingDev[e_offset + row - 1]; float postFlank = postFlankingDev[e_offset + row - 1]; - float 
lp_emission_m = lp_match_r9(rank, + float lp_emission_m = lp_match_r9(rank, event_mean, - poreModelLevelLogStdv, - poreModelLevelStdv, - poreModelLevelMean, + poreModelDev, scale, shift, var, @@ -282,15 +276,15 @@ GpuAligner::GpuAligner() poreModelInitialized = false; - CU_CHECK_ERR(cudaMalloc((void**)&poreModelLevelMeanDev, numModelElements * sizeof(float))); - CU_CHECK_ERR(cudaMalloc((void**)&poreModelLevelLogStdvDev, numModelElements * sizeof(float))); - CU_CHECK_ERR(cudaMalloc((void**)&poreModelLevelStdvDev, numModelElements * sizeof(float))); CU_CHECK_ERR(cudaMalloc((void**)&scaleDev, max_num_reads * sizeof(float))); CU_CHECK_ERR(cudaMalloc((void**)&shiftDev, max_num_reads * sizeof(float))); CU_CHECK_ERR(cudaMalloc((void**)&varDev, max_num_reads * sizeof(float))); CU_CHECK_ERR(cudaMalloc((void**)&logVarDev, max_num_reads * sizeof(float))); CU_CHECK_ERR(cudaMalloc( (void**)&eventsPerBaseDev, max_num_reads * sizeof(float))); + // Allocate Device memory for pore model + CU_CHECK_ERR(cudaMalloc((void**)&poreModelDev, numModelElements * 3 * sizeof(float))); + int max_n_rows = 100; int maxBuffer = 50000 * sizeof(float); //TODO: allocate more smartly @@ -305,6 +299,8 @@ GpuAligner::GpuAligner() CU_CHECK_ERR(cudaHostAlloc(&preFlankingHost, maxBuffer, cudaHostAllocDefault)); CU_CHECK_ERR(cudaHostAlloc(&postFlankingHost, maxBuffer, cudaHostAllocDefault)); + // Allocate host memory for model + CU_CHECK_ERR(cudaHostAlloc(&poreModelHost, numModelElements * sizeof(float) * 3, cudaHostAllocDefault)); int max_num_sequences = 8; int max_sequence_length = 50; kmerRanksDevPointers.resize(max_num_sequences); @@ -340,7 +336,6 @@ GpuAligner::GpuAligner() //Destructor GpuAligner::~GpuAligner() { - CU_CHECK_ERR(cudaFree(poreModelLevelMeanDev)); CU_CHECK_ERR(cudaFree(scaleDev)); CU_CHECK_ERR(cudaFree(shiftDev)); CU_CHECK_ERR(cudaFree(varDev)); @@ -351,15 +346,16 @@ GpuAligner::~GpuAligner() { CU_CHECK_ERR(cudaFree(eventStartsDev)); CU_CHECK_ERR(cudaFree(eventStridesDev)); 
CU_CHECK_ERR(cudaFree(eventOffsetsDev)); - CU_CHECK_ERR(cudaFree(poreModelLevelLogStdvDev)); - CU_CHECK_ERR(cudaFree(poreModelLevelStdvDev)); CU_CHECK_ERR(cudaFree(preFlankingDev)); CU_CHECK_ERR(cudaFree(postFlankingDev)); CU_CHECK_ERR(cudaFree(kmerRanksDev)); + CU_CHECK_ERR(cudaFree(poreModelDev)); + CU_CHECK_ERR(cudaFreeHost(eventMeans)); CU_CHECK_ERR(cudaFreeHost(preFlankingHost)); CU_CHECK_ERR(cudaFreeHost(postFlankingHost)); CU_CHECK_ERR(cudaFreeHost(kmerRanks)); + CU_CHECK_ERR(cudaFreeHost(poreModelHost)); int max_num_sequences = 8; // should be a private variable // Free device and host memory @@ -444,16 +440,6 @@ std::vector> GpuAligner::scoreKernel(std::vectorstates.size(); - std::vector pore_model_level_log_stdv(num_states); - std::vector pore_model_level_mean(num_states); - std::vector pore_model_level_stdv(num_states); - for(int st=0; ststates[st]; - pore_model_level_log_stdv[st] = params.level_log_stdv; //TODO: I am seeing level log stdv and level stdv return the same value. need to investigate this. - pore_model_level_stdv[st] = params.level_stdv; - pore_model_level_mean[st] = params.level_mean; - } - //Populating read-statistics buffers std::vector scale(num_reads); std::vector shift(num_reads); @@ -482,13 +468,17 @@ std::vector> GpuAligner::scoreKernel(std::vectorstates[st]; + poreModelHost[st * poreModelEntriesPerState] = params.level_mean; + poreModelHost[st * poreModelEntriesPerState + 1] = params.level_stdv; + poreModelHost[st * poreModelEntriesPerState + 2] = params.level_log_stdv; + } + // copy over the pore model + cudaMemcpyAsync(poreModelDev, poreModelHost, + poreModelEntriesPerState * 4096 * sizeof(float), cudaMemcpyHostToDevice); // TODO don't hardcode num kmers + poreModelInitialized = true; } //Let's populate a host buffer with all the sequences. 
@@ -543,9 +533,7 @@ std::vector> GpuAligner::scoreKernel(std::vector Date: Thu, 12 Jul 2018 14:59:27 +0100 Subject: [PATCH 36/80] One buffer for pore model --- src/cuda_kernels/GpuAligner.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index e12ab8a8..21cdc0f1 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -20,7 +20,7 @@ __device__ float logsumexpf(float x, float y){ __device__ float lp_match_r9(int rank, float mean, - float * poreModelDev, + const float * poreModelDev, float scale, float shift, float var, @@ -47,7 +47,7 @@ __global__ void getScores(float * eventData, int * kmer_ranks, int * kmer_ranks_rc, int * eventOffsets, // Offset to use for getting an event IDX for a specific read (read obtained by block IDX) - float * poreModelDev, + const float * poreModelDev, float * scaleDev, float * shiftDev, float * varDev, From 9b8f0297c4697e4be2557861b557788c07f14de9 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 12 Jul 2018 16:21:09 +0100 Subject: [PATCH 37/80] Keeping pore model in registers --- src/cuda_kernels/GpuAligner.cu | 83 ++++++++++++++++---------------- src/cuda_kernels/GpuAligner.h | 2 - src/nanopolish_call_variants.cpp | 20 +------- 3 files changed, 43 insertions(+), 62 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 21cdc0f1..830bb3e8 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -20,18 +20,20 @@ __device__ float logsumexpf(float x, float y){ __device__ float lp_match_r9(int rank, float mean, - const float * poreModelDev, + float pore_mean, + float pore_stdv, + float pore_log_level_stdv, float scale, float shift, float var, float logVar){ - float log_inv_sqrt_2pi = log(0.3989422804014327); + float log_inv_sqrt_2pi = logf(0.3989422804014327); float level = mean; - float gaussian_mean = scale * poreModelDev[rank * 3] + shift; - float 
gaussian_stdv = poreModelDev[rank * 3 + 1] * var; - float gaussian_log_level_stdv = poreModelDev[rank * 3 + 2] + logVar; + float gaussian_mean = scale * pore_mean + shift; + float gaussian_stdv = pore_stdv * var; + float gaussian_log_level_stdv = pore_log_level_stdv + logVar; float a = (level - gaussian_mean) / gaussian_stdv; float emission = log_inv_sqrt_2pi - gaussian_log_level_stdv + (-0.5f * a * a); @@ -39,21 +41,20 @@ __device__ float lp_match_r9(int rank, } -__global__ void getScores(float * eventData, - float * readEventsPerBase, - int * numRowsPerRead, - int * eventStarts, - int * eventStrides, - int * kmer_ranks, - int * kmer_ranks_rc, - int * eventOffsets, // Offset to use for getting an event IDX for a specific read (read obtained by block IDX) - const float * poreModelDev, - float * scaleDev, - float * shiftDev, - float * varDev, - float * logVarDev, - float * preFlankingDev, - float * postFlankingDev, +__global__ void getScores(float * const eventData, + float * const readEventsPerBase, + int * const numRowsPerRead, + int * const eventStarts, + int * const eventStrides, + int * const kmerRanks, + int * const eventOffsets, // Offset to use for getting an event IDX for a specific read (read obtained by block IDX) + float * const poreModelDev, + float * const scaleDev, + float * const shiftDev, + float * const varDev, + float * const logVarDev, + float * const preFlankingDev, + float * const postFlankingDev, float * returnValues) { // Initialise the prev probability row, which is the row of the DP table @@ -90,11 +91,16 @@ __global__ void getScores(float * eventData, uint32_t rank; if (rc == true) { - rank = kmer_ranks_rc[kmerIdx]; + rank = kmerRanks[kmerIdx + n_kmers]; }else{ - rank = kmer_ranks[kmerIdx]; + rank = kmerRanks[kmerIdx]; } + float pore_mean = poreModelDev[rank * 3]; + float pore_stdv = poreModelDev[rank * 3 + 1]; + float pore_log_level_stdv = poreModelDev[rank * 3 + 2]; + + float p_stay = 1 - (1 / read_events_per_base); float p_skip = 
0.0025; float p_bad = 0.001; @@ -116,16 +122,16 @@ __global__ void getScores(float * eventData, float p_km = 1.0f - p_kk; // We assign some transition probabilities. I believe this is correct and they don't vary by location in the sequence - float lp_mk = log(p_mk); - float lp_mb = log(p_mb); - float lp_mm_self = log(p_mm_self); - float lp_mm_next = log(p_mm_next); - float lp_bb = log(p_bb); - float lp_bk = log(p_bk); - float lp_bm_next = log(p_bm_next); - float lp_bm_self = log(p_bm_self); - float lp_kk = log(p_kk); - float lp_km = log(p_km); + float lp_mk = logf(p_mk); + float lp_mb = logf(p_mb); + float lp_mm_self = logf(p_mm_self); + float lp_mm_next = logf(p_mm_next); + float lp_bb = logf(p_bb); + float lp_bk = logf(p_bk); + float lp_bm_next = logf(p_bm_next); + float lp_bm_self = logf(p_bm_self); + float lp_kk = logf(p_kk); + float lp_km = logf(p_km); float lp_sm, lp_ms; lp_sm = lp_ms = 0.0f; @@ -152,9 +158,11 @@ __global__ void getScores(float * eventData, float preFlank = preFlankingDev[e_offset + row - 1]; float postFlank = postFlankingDev[e_offset + row - 1]; - float lp_emission_m = lp_match_r9(rank, + float lp_emission_m = lp_match_r9(rank, event_mean, - poreModelDev, + pore_mean, + pore_stdv, + pore_log_level_stdv, scale, shift, var, @@ -304,7 +312,6 @@ GpuAligner::GpuAligner() int max_num_sequences = 8; int max_sequence_length = 50; kmerRanksDevPointers.resize(max_num_sequences); - kmerRanksRCDevPointers.resize(max_num_sequences); returnValuesDevResultsPointers.resize(max_num_sequences); returnValuesHostResultsPointers.resize(max_num_sequences); @@ -315,17 +322,14 @@ GpuAligner::GpuAligner() for (int i =0; i> GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector kmerRanksDevPointers; - std::vector kmerRanksRCDevPointers; std::vector returnValuesDevResultsPointers; std::vector returnValuesHostResultsPointers; diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 
f89dc3ca..33d7e8c9 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -287,19 +287,13 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali int region_end, uint32_t alignment_flags) { - auto start = std::chrono::high_resolution_clock::now(); - + printf("In the outer loop, %i, %i\n",region_start, region_end); std::vector out_variants; - std::vector out_variants_gpu; std::string contig = alignments.get_region_contig(); // Add all positively-scoring single-base changes into the candidate set - - auto scoring = std::chrono::high_resolution_clock::now() - std::chrono::high_resolution_clock::now(); - auto gpu_exec = std::chrono::high_resolution_clock::now() - std::chrono::high_resolution_clock::now(); - GpuAligner aligner; for(size_t i = region_start; i < region_end; ++i) { @@ -355,8 +349,6 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali calling_start, alignments.get_reference_substring(contig, calling_start, calling_end)); - auto t0_gpu = std::chrono::high_resolution_clock::now(); - if (opt::gpu){ std::vector scoredVariants = aligner.variantScoresThresholded(tmp_variants, test_haplotype, event_sequences, alignment_flags, opt::screen_score_threshold, @@ -375,7 +367,6 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali alignment_flags, opt::screen_score_threshold, opt::methylation_types); - auto t1 = std::chrono::high_resolution_clock::now(); scored_variant.info = ""; if (scored_variant.quality > 0) { out_variants.push_back(scored_variant); @@ -383,15 +374,6 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali } } } - - auto end = std::chrono::high_resolution_clock::now(); - - auto duration = std::chrono::duration_cast( end - start ).count(); - - auto screening = std::chrono::duration_cast(scoring).count(); - - auto gpu_screening = std::chrono::duration_cast(gpu_exec).count(); - return out_variants; } From 
20eca32d86cd440d6a5bc9d156cf41ddb7af360d Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 12 Jul 2018 16:24:29 +0100 Subject: [PATCH 38/80] Removed print statement --- src/nanopolish_call_variants.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 33d7e8c9..6a3cd816 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -287,7 +287,6 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali int region_end, uint32_t alignment_flags) { - printf("In the outer loop, %i, %i\n",region_start, region_end); std::vector out_variants; std::string contig = alignments.get_region_contig(); From ca1796f477097b2c93297023cca8bde1b7913cb2 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Fri, 13 Jul 2018 13:30:26 +0100 Subject: [PATCH 39/80] Async kernel invocations for improved occupancy --- src/cuda_kernels/GpuAligner.cu | 32 ++--- src/nanopolish_call_variants.cpp | 222 ++++++++++++++++++++----------- 2 files changed, 163 insertions(+), 91 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 830bb3e8..e31da5b8 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -458,18 +458,18 @@ std::vector> GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector #include - +#include +#include +#include // Macros #define max3(x,y,z) std::max(std::max(x,y), z) @@ -281,97 +283,165 @@ void annotate_with_all_support(std::vector& variants, } } -// Given the input region, calculate all single base edits to the current assembly -std::vector generate_candidate_single_base_edits(const AlignmentDB& alignments, - int region_start, - int region_end, - uint32_t alignment_flags) -{ - std::vector out_variants; - - std::string contig = alignments.get_region_contig(); - // Add all positively-scoring single-base changes into 
the candidate set +void singleLocusBaseEditCandidate(int i, + const AlignmentDB& alignments, + uint32_t alignment_flags, + std::vector &out_variants, + std::string contig, + GpuAligner &aligner, + std::mutex &outVariantsMutex +){ - GpuAligner aligner; + int calling_start = i - opt::screen_flanking_sequence; + int calling_end = i + 1 + opt::screen_flanking_sequence; - for(size_t i = region_start; i < region_end; ++i) { + if(!alignments.are_coordinates_valid(contig, calling_start, calling_end)) { + return; + } - int calling_start = i - opt::screen_flanking_sequence; - int calling_end = i + 1 + opt::screen_flanking_sequence; + std::vector tmp_variants; + for(size_t j = 0; j < 4; ++j) { + // Substitutions + Variant v; + v.ref_name = contig; + v.ref_position = i; + v.ref_seq = alignments.get_reference_substring(contig, i, i); + v.alt_seq = "ACGT"[j]; + + if(v.ref_seq != v.alt_seq) { + tmp_variants.push_back(v); + } - if(!alignments.are_coordinates_valid(contig, calling_start, calling_end)) { - continue; + // Insertions + v.alt_seq = v.ref_seq + "ACGT"[j]; + // ignore insertions of the type "A" -> "AA" as these are redundant + if(v.alt_seq[1] != v.ref_seq[0]) { + tmp_variants.push_back(v); } + } - std::vector tmp_variants; - for(size_t j = 0; j < 4; ++j) { - // Substitutions - Variant v; - v.ref_name = contig; - v.ref_position = i; - v.ref_seq = alignments.get_reference_substring(contig, i, i); - v.alt_seq = "ACGT"[j]; - - if(v.ref_seq != v.alt_seq) { - tmp_variants.push_back(v); - } + // deletion + Variant del; + del.ref_name = contig; + del.ref_position = i - 1; + del.ref_seq = alignments.get_reference_substring(contig, i - 1, i); + del.alt_seq = del.ref_seq[0]; - // Insertions - v.alt_seq = v.ref_seq + "ACGT"[j]; - // ignore insertions of the type "A" -> "AA" as these are redundant - if(v.alt_seq[1] != v.ref_seq[0]) { - tmp_variants.push_back(v); - } - } + // ignore deletions of the type "AA" -> "A" as these are redundant + if(del.alt_seq[0] != del.ref_seq[1]) { + 
tmp_variants.push_back(del); + } - // deletion - Variant del; - del.ref_name = contig; - del.ref_position = i - 1; - del.ref_seq = alignments.get_reference_substring(contig, i - 1, i); - del.alt_seq = del.ref_seq[0]; - - // ignore deletions of the type "AA" -> "A" as these are redundant - if(del.alt_seq[0] != del.ref_seq[1]) { - tmp_variants.push_back(del); + // Screen variants by score + // We do this internally here as it is much faster to get the event sequences + // for the entire window for all variants at this position once, rather than + // for each variant individually + std::vector event_sequences = + alignments.get_event_subsequences(contig, calling_start, calling_end); + + Haplotype test_haplotype(contig, + calling_start, + alignments.get_reference_substring(contig, calling_start, calling_end)); + + if (opt::gpu){ + std::vector scoredVariants = aligner.variantScoresThresholded(tmp_variants, test_haplotype, event_sequences, + alignment_flags, opt::screen_score_threshold, + opt::methylation_types); + for (auto variant: scoredVariants){ + if (variant.quality > 0) { + std::lock_guard lock(outVariantsMutex); + out_variants.push_back(variant); + } + } + } else { + for (const Variant &v : tmp_variants) { + auto t0 = std::chrono::high_resolution_clock::now(); + Variant scored_variant = score_variant_thresholded(v, + test_haplotype, + event_sequences, + alignment_flags, + opt::screen_score_threshold, + opt::methylation_types); + scored_variant.info = ""; + if (scored_variant.quality > 0) { + out_variants.push_back(scored_variant); + } } + } +} - // Screen variants by score - // We do this internally here as it is much faster to get the event sequences - // for the entire window for all variants at this position once, rather than - // for each variant individually - std::vector event_sequences = - alignments.get_event_subsequences(contig, calling_start, calling_end); +// Given the input region, calculate all single base edits to the current assembly +std::vector 
generate_candidate_single_base_edits(const AlignmentDB& alignments, + int region_start, + int region_end, + uint32_t alignment_flags){ + std::vector out_variants; + std::string contig = alignments.get_region_contig(); + std::mutex outVariantsMutex; - Haplotype test_haplotype(contig, - calling_start, - alignments.get_reference_substring(contig, calling_start, calling_end)); + // Add all positively-scoring single-base changes into the candidate set + if (opt::gpu){ + size_t num_workers = 8; + std::vector gpuAligners(num_workers); + + //std::vector workerThreads(num_workers); + std::vector> handles(num_workers); + int nextLocus = region_start; + + //Initialise workers + for (int workerIdx=0; workerIdx scoredVariants = aligner.variantScoresThresholded(tmp_variants, test_haplotype, event_sequences, - alignment_flags, opt::screen_score_threshold, - opt::methylation_types); - for (auto variant: scoredVariants){ - if (variant.quality > 0) { - out_variants.push_back(variant); - } - } - } else { - for (const Variant &v : tmp_variants) { - auto t0 = std::chrono::high_resolution_clock::now(); - Variant scored_variant = score_variant_thresholded(v, - test_haplotype, - event_sequences, - alignment_flags, - opt::screen_score_threshold, - opt::methylation_types); - scored_variant.info = ""; - if (scored_variant.quality > 0) { - out_variants.push_back(scored_variant); + //Round robin the workers until done + while(nextLocus < region_end){ + for (int i = 0; i Date: Mon, 16 Jul 2018 12:00:11 +0100 Subject: [PATCH 40/80] Adding restrict flag to nvcc --- Makefile | 2 +- src/cuda_kernels/GpuAligner.cu | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/Makefile b/Makefile index 5d96b37b..0d86b514 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,7 @@ CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include -O3 -use_fast_math --default-stream per-thread +NVCCFLAGS ?= -std=c++11 -I. 
-I/usr/local/cuda-9.0/include -O3 -use_fast_math --default-stream per-thread -restrict CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index e31da5b8..f12b3a0e 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -210,10 +210,6 @@ __global__ void getScores(float * const eventData, sum = HMT_FROM_SAME_M; sum = logsumexpf(sum, HMT_FROM_SAME_B); sum += lp_emission_b; - //sum = logsumexpf(sum, HMT_FROM_PREV_B); - //sum = logsumexpf(sum, HMT_FROM_PREV_K); - //sum = logsumexpf(sum, HMT_FROM_SOFT); - //sum = logsumexpf(sum, HMT_FROM_PREV_M); float newBadEventScore = sum; @@ -232,9 +228,6 @@ __global__ void getScores(float * const eventData, sum = HMT_FROM_PREV_M; sum = logsumexpf(sum, HMT_FROM_PREV_B); sum = logsumexpf(sum, HMT_FROM_PREV_K); - //sum = logsumexpf(sum, HMT_FROM_SAME_M); - //sum = logsumexpf(sum, HMT_FROM_SAME_B); - //sum = logsumexpf(sum, HMT_FROM_SOFT); float newSkipScore = sum; From 8b020be7c72159ab475789af7ddf7a9660e044e9 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Mon, 16 Jul 2018 16:15:35 +0100 Subject: [PATCH 41/80] transferring means data and pre/post-flanks --- Makefile | 4 +- src/cuda_kernels/GpuAligner.cu | 87 ++++++++++++++++++++++++++++++++ src/cuda_kernels/GpuAligner.h | 9 ++++ src/nanopolish_call_variants.cpp | 2 +- 4 files changed, 99 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 0d86b514..bb339cce 100644 --- a/Makefile +++ b/Makefile @@ -10,12 +10,12 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz CXXFLAGS ?= -O3 -CXXFLAGS += -std=c++11 -fopenmp -fsigned-char +CXXFLAGS += -std=c++11 -fopenmp -fsigned-char -g CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. 
-I/usr/local/cuda-9.0/include -O3 -use_fast_math --default-stream per-thread -restrict +NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include -O3 -use_fast_math --default-stream per-thread -restrict -g CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index f12b3a0e..550b3a9a 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -364,6 +364,75 @@ GpuAligner::~GpuAligner() { } +std::vector>> GpuAligner::scoreKernelMod(std::vector scoreSets, + uint32_t alignment_flags){ + std::vector>> result(scoreSets.size()); + + int numScores = 0; + int numScoreSets = scoreSets.size(); // the number of sequence/read sets to be scored + + std::vector> read_lengths(numScoreSets); + std::vector> e_starts(numScoreSets); + std::vector> event_strides(numScoreSets); + + //Each sequence-event combination is its own thread and requires the following information: + //1. Event offsets (raw data offset) + //2. Sequence offset + //3. Event length (How long it will need to run before computing the score) + //4. Other sequence/event specific data + + // STEP1. Unpack read data. + // STEP2. Unpack sequence data. + // STEP3. Prepare buffers for job (thread) - specific data e.g read lengths, sequence lengths, read and sequence indexes etc. This can also be done on the fly. 
+ + size_t rawReadOffset = 0; + size_t numEventsTotal = 0; + std::vector eventOffsets; //offsets of all the raw reads + + for (int scoreSetIdx=0; scoreSetIdx < numScoreSets; scoreSetIdx++){ + auto &scoreSet = scoreSets[scoreSetIdx]; + + //First unpack per-read data from the scoreSet + for (int eventSequenceIdx=0; eventSequenceIdx < scoreSet.rawData.size();eventSequenceIdx++){ + auto e = scoreSet.rawData[eventSequenceIdx]; + int e_start = e.event_start_idx; + + e_starts[scoreSetIdx].push_back(e_start); + + int e_stride = e.event_stride; + event_strides[scoreSetIdx].push_back(e_stride); + + uint32_t e_end = e.event_stop_idx; + uint32_t n_events = 0; + if(e_end > e_start) + n_events = e_end - e_start + 1; + else + n_events = e_start - e_end + 1; + + read_lengths[scoreSetIdx].push_back(n_events); + numEventsTotal += n_events; + + eventOffsets.push_back(rawReadOffset); + + std::vector pre_flank = make_pre_flanking(e, e_start, n_events); + std::vector post_flank = make_post_flanking(e, e_start, n_events); + + for (int i=0;iget_drift_scaled_level(event_idx, e.strand); // send the data in drift scaled + eventMeans[rawReadOffset + i] = scaled; + + //populate the pre/post-flanking data, since it has a 1-1 correspondence with events + preFlankingHost[rawReadOffset + i] = pre_flank[i]; + postFlankingHost[rawReadOffset + i] = post_flank[i]; + } + rawReadOffset += n_events; + } + } + + return result; +} + std::vector> GpuAligner::scoreKernel(std::vector sequences, std::vector event_sequences, uint32_t alignment_flags){ @@ -560,6 +629,7 @@ std::vector> GpuAligner::scoreKernel(std::vector GpuAligner::variantScoresThresholded(std::vector input_variants, Haplotype base_haplotype, std::vector event_sequences, @@ -594,6 +664,23 @@ std::vector GpuAligner::variantScoresThresholded(std::vector i if (!event_sequences.empty()) { std::vector> scores = scoreKernel(sequences, event_sequences, alignment_flags); + + // Now try it with the new method + ScoreSet s = { + sequences, + 
event_sequences + }; + + std::vector scoreSets(1,s); + + std::vector>> scoresMod = scoreKernelMod(scoreSets, alignment_flags); + +// for (int i=0; i stateSequences; + std::vector rawData; +} ScoreSet; + class GpuAligner { public: @@ -56,6 +62,8 @@ class GpuAligner std::vector> scoreKernel(std::vector sequences, std::vector event_sequences, uint32_t alignment_flags); + std::vector>> scoreKernelMod(std::vector scoreSets, + uint32_t alignment_flags); private: float* scaleDev; float* shiftDev; @@ -89,4 +97,5 @@ class GpuAligner cudaStream_t streams[8]; // TODO 8 should not be hardcoded here }; + #endif // GPU_ALIGNER_H diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 62992bce..2cd5982e 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -382,7 +382,7 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali // Add all positively-scoring single-base changes into the candidate set if (opt::gpu){ - size_t num_workers = 8; + size_t num_workers = 1; std::vector gpuAligners(num_workers); //std::vector workerThreads(num_workers); From 05a8896ec5503ff34e8c15e3f1c6c7d6bae4f569 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 19 Jul 2018 15:45:13 +0100 Subject: [PATCH 42/80] WIP - modifications to kernel for performance improvments --- Makefile | 6 +- src/common/nanopolish_variant.cpp | 2 +- src/cuda_kernels/GpuAligner.cu | 606 ++++++++++++++++++++++---- src/cuda_kernels/GpuAligner.h | 28 +- src/hmm/nanopolish_profile_hmm_r7.inl | 10 +- src/nanopolish_call_variants.cpp | 164 +++---- 6 files changed, 645 insertions(+), 171 deletions(-) diff --git a/Makefile b/Makefile index bb339cce..bf0324da 100644 --- a/Makefile +++ b/Makefile @@ -9,13 +9,13 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz -CXXFLAGS ?= -O3 +CXXFLAGS ?= -O0 CXXFLAGS += -std=c++11 -fopenmp -fsigned-char -g -CFLAGS ?= -std=c99 -O3 +CFLAGS ?= 
-std=c99 -O0 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include -O3 -use_fast_math --default-stream per-thread -restrict -g +NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include -O0 -use_fast_math --default-stream per-thread -restrict -g -G CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code diff --git a/src/common/nanopolish_variant.cpp b/src/common/nanopolish_variant.cpp index 725a62ab..b73a6b2b 100644 --- a/src/common/nanopolish_variant.cpp +++ b/src/common/nanopolish_variant.cpp @@ -664,7 +664,7 @@ std::vector multi_call(VariantGroup& variant_group, // Variant score_variant_thresholded(const Variant& input_variant, Haplotype base_haplotype, - const std::vector& input, + const std::vector& input, // raw reads (I think) const uint32_t alignment_flags, const uint32_t score_threshold, const std::vector& methylation_types) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 550b3a9a..7150b3dc 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -4,11 +4,11 @@ #include #include "nanopolish_profile_hmm_r9.h" -#define MAX_STATES 128 +#define MAX_STATES 512 #define EXPAND_TO_STRING(X) #X #define TO_STRING(X) EXPAND_TO_STRING(X) -#define CU_CHECK_ERR(X) if (X != cudaSuccess){printf("CUDA error: %s at line %s\n", cudaGetErrorString(X), TO_STRING(__LINE__));} +#define CU_CHECK_ERR(X) if (X != cudaSuccess){printf("CUDA error: %s at line %s\n", cudaGetErrorString(X), TO_STRING(__LINE__));throw std::runtime_error("CUDA ERROR");} __device__ float logsumexpf(float x, float y){ if(x == -INFINITY && y == -INFINITY){ @@ -41,6 +41,241 @@ __device__ float lp_match_r9(int rank, } + + +__global__ void getScoresMod (float * poreModelDev, + int * readLengthsDev, + int * eventStartsDev, + int * eventStridesDev, + float * eventsPerBaseDev, + float * scaleDev, + float * shiftDev, + float 
* varDev, + float * logVarDev, + int * eventOffsetsDev, + float * eventMeansDev, + float * preFlankingDev, + float * postFlankingDev, + int * sequenceLengthsDev, + int * sequenceOffsetsDev, + int * kmerRanksDev, + int * seqIdxDev, + int * readIdxDev, + float * returnValuesDev){ + + // get buffer indices + int scoreIdx = threadIdx.x; + int readIdx = readIdxDev[scoreIdx]; + int seqIdx = seqIdxDev[scoreIdx]; + + // get read statistics + int numEvents = readLengthsDev[readIdx]; + int readOffset = eventOffsetsDev[readIdx]; + float read_events_per_base = eventsPerBaseDev[readIdx]; + int e_start = eventStartsDev[readIdx]; // Event start for read + int e_stride = eventStridesDev[readIdx]; + int e_offset = eventOffsetsDev[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event + float scale = scaleDev[readIdx]; + float shift = shiftDev[readIdx]; + float var = varDev[readIdx]; + float logVar = logVarDev[readIdx]; + + // get sequence statistics + int numKmers = sequenceLengthsDev[seqIdx]; + + int lastRowIdx = numEvents -1; + int lastKmerIdx = numKmers - 1; + + float returnValue = -INFINITY; //Used to sum over the last column. + float prevProbabilities[MAX_STATES]; + + int numBlocks = numKmers + 2; + int numStates = numBlocks * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. 
+ + // Initialise the prev probabilities vector + for (int i = 0; i < numStates - PSR9_NUM_STATES; i++) { + prevProbabilities[i] = -INFINITY; + } + for (int i = numStates - PSR9_NUM_STATES; i < numStates; i++) { + prevProbabilities[i] = 0.0f; + } + + bool rc = false; + if (e_stride == -1){ + rc = true; + } + + //int kmerIdx = threadIdx.x; + uint32_t rank; + + float p_stay = 1 - (1 / read_events_per_base); + float p_skip = 0.0025; + float p_bad = 0.001; + float p_bad_self = p_bad; + float p_skip_self = 0.3; + float p_mk = p_skip; // probability of not observing an event at all + float p_mb = p_bad; // probabilty of observing a bad event + float p_mm_self = p_stay; // probability of observing additional events from this k-mer + float p_mm_next = 1.0f - p_mm_self - p_mk - p_mb; // normal movement from state to state + // transitions from event split state in previous block + float p_bb = p_bad_self; + float p_bk, p_bm_next, p_bm_self; + p_bk = p_bm_next = p_bm_self = (1.0f - p_bb) / 3; + // transitions from kmer skip state in previous block + float p_kk = p_skip_self; + float p_km = 1.0f - p_kk; + // We assign some transition probabilities. 
I believe this is correct and they don't vary by location in the sequence + float lp_mk = logf(p_mk); + float lp_mb = logf(p_mb); + float lp_mm_self = logf(p_mm_self); + float lp_mm_next = logf(p_mm_next); + float lp_bb = logf(p_bb); + float lp_bk = logf(p_bk); + float lp_bm_next = logf(p_bm_next); + float lp_bm_self = logf(p_bm_self); + float lp_kk = logf(p_kk); + float lp_km = logf(p_km); + float lp_sm, lp_ms; + lp_sm = lp_ms = 0.0f; + + // the penalty is controlled by the transition probability + float BAD_EVENT_PENALTY = 0.0f; + + //Fill out the dynamic programming table + for(int row=1; row>> GpuAligner::scoreKernelMod(std::vector scoreSets, - uint32_t alignment_flags){ - std::vector>> result(scoreSets.size()); - - int numScores = 0; - int numScoreSets = scoreSets.size(); // the number of sequence/read sets to be scored + uint32_t alignment_flags){ - std::vector> read_lengths(numScoreSets); - std::vector> e_starts(numScoreSets); - std::vector> event_strides(numScoreSets); + int numEventsTotal = 0; // The number of events across all scoreSets + int numSequences = 0; // The number of sequences across all scoreSets + int kmerOffset = 0; + int numReads = 0; // The number of reads across all scoreSets + int numBases = 0; + int numScoreSets = scoreSets.size(); - //Each sequence-event combination is its own thread and requires the following information: - //1. Event offsets (raw data offset) - //2. Sequence offset - //3. Event length (How long it will need to run before computing the score) - //4. Other sequence/event specific data - - // STEP1. Unpack read data. - // STEP2. Unpack sequence data. - // STEP3. Prepare buffers for job (thread) - specific data e.g read lengths, sequence lengths, read and sequence indexes etc. This can also be done on the fly. 
- - size_t rawReadOffset = 0; - size_t numEventsTotal = 0; - std::vector eventOffsets; //offsets of all the raw reads + int rawReadOffset = 0; + int globalReadIdx = 0; + int globalSequenceIdx = 0; + int globalScoreIdx = 0; + //Loop over every scoreset, filling out buffers and counters for (int scoreSetIdx=0; scoreSetIdx < numScoreSets; scoreSetIdx++){ auto &scoreSet = scoreSets[scoreSetIdx]; - - //First unpack per-read data from the scoreSet + int firstReadIdxinScoreSet = globalReadIdx; + //Read data for (int eventSequenceIdx=0; eventSequenceIdx < scoreSet.rawData.size();eventSequenceIdx++){ auto e = scoreSet.rawData[eventSequenceIdx]; - int e_start = e.event_start_idx; + numReads++; + + //Read statistics - populate host buffers + scaleHost[globalReadIdx] = e.read->scalings[e.strand].scale; + shiftHost[globalReadIdx] = e.read->scalings[e.strand].shift; + varHost[globalReadIdx] = e.read->scalings[e.strand].var; + logVarHost[globalReadIdx] = e.read->scalings[e.strand].log_var; - e_starts[scoreSetIdx].push_back(e_start); + int e_start = e.event_start_idx; + eventStartsHost[globalReadIdx] = e_start; int e_stride = e.event_stride; - event_strides[scoreSetIdx].push_back(e_stride); + eventStridesHost[globalReadIdx] = e_stride; uint32_t e_end = e.event_stop_idx; - uint32_t n_events = 0; + uint32_t n_events; if(e_end > e_start) n_events = e_end - e_start + 1; else n_events = e_start - e_end + 1; - - read_lengths[scoreSetIdx].push_back(n_events); + readLengthsHost[globalReadIdx] = n_events; numEventsTotal += n_events; - eventOffsets.push_back(rawReadOffset); + eventOffsetsHost[globalReadIdx] = rawReadOffset; + + float readEventsPerBase = e.read->events_per_base[e.strand]; + eventsPerBaseHost[globalReadIdx] = readEventsPerBase; std::vector pre_flank = make_pre_flanking(e, e_start, n_events); std::vector post_flank = make_post_flanking(e, e_start, n_events); @@ -426,7 +700,170 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve preFlankingHost[rawReadOffset + i] = 
pre_flank[i]; postFlankingHost[rawReadOffset + i] = post_flank[i]; } + rawReadOffset += n_events; + globalReadIdx++; + } + //Pore Model + const uint32_t k = scoreSets[0].rawData[0].pore_model->k; //k is the length of a kmer in the pore model + if (poreModelInitialized == false) { + int num_states = scoreSets[0].rawData[0].pore_model->states.size(); + int poreModelEntriesPerState = 3; + for(int st=0; ststates[st]; + poreModelHost[st * poreModelEntriesPerState] = params.level_mean; + poreModelHost[st * poreModelEntriesPerState + 1] = params.level_stdv; + poreModelHost[st * poreModelEntriesPerState + 2] = params.level_log_stdv; + } + // copy over the pore model + CU_CHECK_ERR(cudaMemcpyAsync(poreModelDev, poreModelHost, + poreModelEntriesPerState * 4096 * sizeof(float), cudaMemcpyHostToDevice, streams[0])); // TODO don't hardcode num kmers + poreModelInitialized = true; + } + // Sequences + // Sequences + auto & sequences = scoreSet.stateSequences; + numSequences += sequences.size(); + + for (int i = 0; i>> (poreModelDev, + readLengthsDev, + eventStartsDev, + eventStridesDev, + eventsPerBaseDev, + scaleDev, + shiftDev, + varDev, + logVarDev, + eventOffsetsDev, + eventMeansDev, + preFlankingDev, + postFlankingDev, + sequenceLengthsDev, + sequenceOffsetsDev, + kmerRanksDev, + seqIdxDev, + readIdxDev, + scoresDev); + cudaError_t err = cudaGetLastError(); + + if (err != cudaSuccess) + printf("Errors during kernel execution: %s\n", cudaGetErrorString(err)); + + cudaMemcpyAsync(returnValuesHost, scoresDev, + globalScoreIdx * sizeof(float), cudaMemcpyDeviceToHost, streams[0]); + cudaStreamSynchronize(streams[0]); + + //Unpack results + int k = 0; + std::vector>> result(scoreSets.size()); + + for(int scoreSetIdx=0; scoreSetIdx seqScores; + for (int readIdx=0; readIdx> GpuAligner::scoreKernel(std::vectorstates.size(); //Populating read-statistics buffers std::vector scale(num_reads); @@ -520,19 +955,21 @@ std::vector> GpuAligner::scoreKernel(std::vector> 
GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector>> (eventMeansDev, eventsPerBaseDev, @@ -604,6 +1042,10 @@ std::vector> GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector GpuAligner::variantScoresThresholded(std::vector input_variants, - Haplotype base_haplotype, - std::vector event_sequences, - uint32_t alignment_flags, - int screen_score_threshold, - std::vector methylation_types) { + Haplotype base_haplotype, + std::vector event_sequences, + uint32_t alignment_flags, + int screen_score_threshold, + std::vector methylation_types) { int numVariants = input_variants.size(); std::vector out_variants = input_variants; @@ -675,12 +1117,6 @@ std::vector GpuAligner::variantScoresThresholded(std::vector i std::vector>> scoresMod = scoreKernelMod(scoreSets, alignment_flags); -// for (int i=0; i stateSequences; - std::vector rawData; + std::vector &stateSequences; + std::vector &rawData; } ScoreSet; class GpuAligner @@ -85,6 +85,30 @@ class GpuAligner float* poreModelLevelMeanDev; float* poreModelDev; float* poreModelHost; + int * sequenceOffsetsDev; + + // NEW - for MOD kernel + int * readLengthsHost; + int * eventStartsHost; + int * eventStridesHost; + float * eventsPerBaseHost; + float * scaleHost; + float * shiftHost; + float * varHost; + float * logVarHost; + int * sequenceLengthsHost; + int * eventOffsetsHost; + int * sequenceOffsetsHost; + int * readIdxHost; + int * seqIdxHost; + + int * readLengthsDev; + int * sequenceLengthsDev; + int * readIdxDev; + int * seqIdxDev; + + float * returnValuesHost; + float * scoresDev; int * kmerRanks; int * kmerRanksDev; diff --git a/src/hmm/nanopolish_profile_hmm_r7.inl b/src/hmm/nanopolish_profile_hmm_r7.inl index bf0edd28..3fe4b309 100644 --- a/src/hmm/nanopolish_profile_hmm_r7.inl +++ b/src/hmm/nanopolish_profile_hmm_r7.inl @@ -306,9 +306,13 @@ inline float profile_hmm_fill_generic_r7(const HMMInputSequence& _sequence, assert( 
data.pore_model->states.size() == sequence.get_num_kmer_ranks(k) ); std::vector kmer_ranks(num_kmers); - for(size_t ki = 0; ki < num_kmers; ++ki) - kmer_ranks[ki] = sequence.get_kmer_rank(ki, k, data.rc); - + for(size_t ki = 0; ki < num_kmers; ++ki) { + int rank = sequence.get_kmer_rank(ki, k, data.rc); + if(rank>4096){ + printf("Rank: %i", rank); + } + kmer_ranks[ki] = rank; + } size_t num_events = output.get_num_rows() - 1; std::vector pre_flank = make_pre_flanking_r7(data, parameters, e_start, num_events); diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 2cd5982e..1b01ff2c 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -292,16 +292,16 @@ void singleLocusBaseEditCandidate(int i, GpuAligner &aligner, std::mutex &outVariantsMutex ){ - +try { int calling_start = i - opt::screen_flanking_sequence; int calling_end = i + 1 + opt::screen_flanking_sequence; - if(!alignments.are_coordinates_valid(contig, calling_start, calling_end)) { + if (!alignments.are_coordinates_valid(contig, calling_start, calling_end)) { return; } std::vector tmp_variants; - for(size_t j = 0; j < 4; ++j) { + for (size_t j = 0; j < 4; ++j) { // Substitutions Variant v; v.ref_name = contig; @@ -309,14 +309,14 @@ void singleLocusBaseEditCandidate(int i, v.ref_seq = alignments.get_reference_substring(contig, i, i); v.alt_seq = "ACGT"[j]; - if(v.ref_seq != v.alt_seq) { + if (v.ref_seq != v.alt_seq) { tmp_variants.push_back(v); } // Insertions v.alt_seq = v.ref_seq + "ACGT"[j]; // ignore insertions of the type "A" -> "AA" as these are redundant - if(v.alt_seq[1] != v.ref_seq[0]) { + if (v.alt_seq[1] != v.ref_seq[0]) { tmp_variants.push_back(v); } } @@ -329,7 +329,7 @@ void singleLocusBaseEditCandidate(int i, del.alt_seq = del.ref_seq[0]; // ignore deletions of the type "AA" -> "A" as these are redundant - if(del.alt_seq[0] != del.ref_seq[1]) { + if (del.alt_seq[0] != del.ref_seq[1]) { tmp_variants.push_back(del); } @@ -344,16 
+344,18 @@ void singleLocusBaseEditCandidate(int i, calling_start, alignments.get_reference_substring(contig, calling_start, calling_end)); - if (opt::gpu){ - std::vector scoredVariants = aligner.variantScoresThresholded(tmp_variants, test_haplotype, event_sequences, - alignment_flags, opt::screen_score_threshold, + if (opt::gpu) { + std::vector scoredVariants = aligner.variantScoresThresholded(tmp_variants, test_haplotype, + event_sequences, + alignment_flags, + opt::screen_score_threshold, opt::methylation_types); - for (auto variant: scoredVariants){ - if (variant.quality > 0) { - std::lock_guard lock(outVariantsMutex); - out_variants.push_back(variant); - } - } + for (auto variant: scoredVariants) { + if (variant.quality > 0) { + std::lock_guard lock(outVariantsMutex); + out_variants.push_back(variant); + } + } } else { for (const Variant &v : tmp_variants) { auto t0 = std::chrono::high_resolution_clock::now(); @@ -369,6 +371,9 @@ void singleLocusBaseEditCandidate(int i, } } } +}catch (std::exception &e){ + printf("Exception in thread! 
%s\n", e.what()); +} } // Given the input region, calculate all single base edits to the current assembly @@ -376,74 +381,79 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali int region_start, int region_end, uint32_t alignment_flags){ - std::vector out_variants; - std::string contig = alignments.get_region_contig(); - std::mutex outVariantsMutex; - - // Add all positively-scoring single-base changes into the candidate set - if (opt::gpu){ - size_t num_workers = 1; - std::vector gpuAligners(num_workers); - - //std::vector workerThreads(num_workers); - std::vector> handles(num_workers); - int nextLocus = region_start; - - //Initialise workers - for (int workerIdx=0; workerIdx out_variants; + std::string contig = alignments.get_region_contig(); + std::mutex outVariantsMutex; + + // Add all positively-scoring single-base changes into the candidate set + if (opt::gpu) { + size_t num_workers = 1; + std::vector gpuAligners(num_workers); + + //std::vector workerThreads(num_workers); + std::vector> handles(num_workers); + int nextLocus = region_start; + + //Initialise workers + for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { + auto aligner = std::ref(gpuAligners[workerIdx]); + if (nextLocus < region_end) { + handles[workerIdx] = std::async(std::launch::async, + singleLocusBaseEditCandidate, + nextLocus, + std::ref(alignments), + alignment_flags, + std::ref(out_variants), + std::ref(contig), + aligner, + std::ref(outVariantsMutex)); + nextLocus++; + } + } - //Round robin the workers until done - while(nextLocus < region_end){ - for (int i = 0; i Date: Fri, 20 Jul 2018 13:26:10 +0100 Subject: [PATCH 43/80] Both Kernels giving similar but not identical results --- Makefile | 2 +- src/cuda_kernels/GpuAligner.cu | 143 ++++++++++++++++++--------------- 2 files changed, 77 insertions(+), 68 deletions(-) diff --git a/Makefile b/Makefile index bf0324da..a213f8c8 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,7 @@ CFLAGS ?= -std=c99 -O0 
CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0include -O0 -use_fast_math --default-stream per-thread -restrict -g -G +NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0include -O0 -use_fast_math --default-stream per-thread -restrict -g CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 7150b3dc..030881c2 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -41,8 +41,6 @@ __device__ float lp_match_r9(int rank, } - - __global__ void getScoresMod (float * poreModelDev, int * readLengthsDev, int * eventStartsDev, @@ -63,6 +61,11 @@ __global__ void getScoresMod (float * poreModelDev, int * readIdxDev, float * returnValuesDev){ + bool debug = false; + if ((threadIdx.x == 0) && (blockIdx.x == 0)){ + debug = false; + } + // get buffer indices int scoreIdx = threadIdx.x; int readIdx = readIdxDev[scoreIdx]; @@ -82,6 +85,9 @@ __global__ void getScoresMod (float * poreModelDev, // get sequence statistics int numKmers = sequenceLengthsDev[seqIdx]; + int seqOffset = sequenceOffsetsDev[seqIdx]; + + printf("This is thread %i, seqIdx is %i, readIdx is %i, numKmers is %i, seqOffset is %i\n", threadIdx.x, seqIdx, readIdx, numKmers, seqOffset); int lastRowIdx = numEvents -1; int lastKmerIdx = numKmers - 1; @@ -92,6 +98,13 @@ __global__ void getScoresMod (float * poreModelDev, int numBlocks = numKmers + 2; int numStates = numBlocks * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. 
+ if(debug){ + printf("Kernel 1 >>> Num Kmers is %i\n", numKmers); + printf("Kernel 1 >>> n_states %i\n", numStates); + printf("Kernel 1 >>> num events in read is %i\n", numEvents); + printf("Kernel 1 >>> event offset is %i\n", e_offset); + } + // Initialise the prev probabilities vector for (int i = 0; i < numStates - PSR9_NUM_STATES; i++) { prevProbabilities[i] = -INFINITY; @@ -105,9 +118,6 @@ __global__ void getScoresMod (float * poreModelDev, rc = true; } - //int kmerIdx = threadIdx.x; - uint32_t rank; - float p_stay = 1 - (1 / read_events_per_base); float p_skip = 0.0025; float p_bad = 0.001; @@ -151,21 +161,19 @@ __global__ void getScoresMod (float * poreModelDev, float lp_emission_b = BAD_EVENT_PENALTY; //TODO: Can this be taken out of the inner loop? + //Initialise temp registers + float prevMatch = prevProbabilities[PSR9_MATCH];; + float prevSkip = prevProbabilities[PSR9_KMER_SKIP]; + float prevBad = prevProbabilities[PSR9_BAD_EVENT]; + for (int blkIdx = 1; blkIdx>> Num Kmers is %i\n", n_kmers); + printf("Kernel 0 >>> n_states %i\n", n_states); + printf("Kernel 0 >>> num events in read is %i\n", numRows); + printf("Kernel 0 >>> event offset is is %i\n", e_offset); + } + bool rc = false; if (e_stride == -1){ rc = true; } int kmerIdx = threadIdx.x; - uint32_t rank; - if (rc == true) { - rank = kmerRanks[kmerIdx + n_kmers]; - }else{ - rank = kmerRanks[kmerIdx]; - } + uint32_t rank = kmerRanks[kmerIdx + (n_kmers * rc)]; float pore_mean = poreModelDev[rank * 3]; float pore_stdv = poreModelDev[rank * 3 + 1]; @@ -645,7 +654,6 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve int numSequences = 0; // The number of sequences across all scoreSets int kmerOffset = 0; int numReads = 0; // The number of reads across all scoreSets - int numBases = 0; int numScoreSets = scoreSets.size(); int rawReadOffset = 0; @@ -731,23 +739,24 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve sequenceOffsetsHost[globalSequenceIdx] = kmerOffset; int sequenceLength = 
sequence.length(); - numBases += sequenceLength; - for(size_t ki = 0; ki < sequenceLength; ++ki) { + int numKmers = sequenceLength - k + 1; + + for(size_t ki = 0; ki < numKmers; ++ki) { int rank = sequence.get_kmer_rank(ki, k, false); kmerRanks[ki + kmerOffset] = rank; } //kmerRanksDevPointers[i] = kmerRanksDev + kmerOffset; - kmerOffset += sequenceLength; + kmerOffset += numKmers; - for(size_t ki = 0; ki < sequenceLength; ++ki) { + for(size_t ki = 0; ki < numKmers; ++ki) { int rank = sequence.get_kmer_rank(ki, k, true); kmerRanks[ki + kmerOffset] = rank; } - kmerOffset += sequenceLength; + kmerOffset += numKmers; - sequenceLengthsHost[globalSequenceIdx] = sequenceLength; + sequenceLengthsHost[globalSequenceIdx] = numKmers; // Loop over the raw reads, producing a cartesian product of the two @@ -816,7 +825,7 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve // Launch Kernels - dim3 dimBlock(1); //TODO change back to globalScoreIDx this is only for debugging + dim3 dimBlock(globalScoreIdx); // TODO: divide work into smaller blocks dim3 dimGrid(1); //printf("Launching get scores mod kernel\n"); @@ -987,14 +996,14 @@ std::vector> GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector GpuAligner::variantScoresThresholded(std::vector i if (!event_sequences.empty()) { std::vector> scores = scoreKernel(sequences, event_sequences, alignment_flags); - + //std::vector> scores; // Now try it with the new method ScoreSet s = { sequences, From 409fb3a0663ae87dea4cb4fb0a1f6657fc135206 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Fri, 20 Jul 2018 21:17:02 +0100 Subject: [PATCH 44/80] Split work into smaller threadBlocks --- Makefile | 8 +- src/cuda_kernels/GpuAligner.cu | 450 ++++++++++++++++--------------- src/nanopolish_call_variants.cpp | 2 +- 3 files changed, 239 insertions(+), 221 deletions(-) diff --git a/Makefile b/Makefile index a213f8c8..5f22a05b 100644 --- a/Makefile +++ b/Makefile @@ -9,13 +9,13 @@ SUBDIRS := src src/hmm src/thirdparty 
src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz -CXXFLAGS ?= -O0 -CXXFLAGS += -std=c++11 -fopenmp -fsigned-char -g -CFLAGS ?= -std=c99 -O0 +CXXFLAGS ?= -O3 +CXXFLAGS += -std=c++11 -fopenmp -fsigned-char #-g +CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0include -O0 -use_fast_math --default-stream per-thread -restrict -g +NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0include -O3 -use_fast_math --default-stream per-thread -restrict #-g CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 030881c2..af84e60c 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -4,7 +4,7 @@ #include #include "nanopolish_profile_hmm_r9.h" -#define MAX_STATES 512 +#define MAX_STATES 256 #define EXPAND_TO_STRING(X) #X #define TO_STRING(X) EXPAND_TO_STRING(X) @@ -59,6 +59,7 @@ __global__ void getScoresMod (float * poreModelDev, int * kmerRanksDev, int * seqIdxDev, int * readIdxDev, + int numScores, float * returnValuesDev){ bool debug = false; @@ -67,213 +68,218 @@ __global__ void getScoresMod (float * poreModelDev, } // get buffer indices - int scoreIdx = threadIdx.x; - int readIdx = readIdxDev[scoreIdx]; - int seqIdx = seqIdxDev[scoreIdx]; - - // get read statistics - int numEvents = readLengthsDev[readIdx]; - int readOffset = eventOffsetsDev[readIdx]; - float read_events_per_base = eventsPerBaseDev[readIdx]; - int e_start = eventStartsDev[readIdx]; // Event start for read - int e_stride = eventStridesDev[readIdx]; - int e_offset = eventOffsetsDev[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event - float scale = scaleDev[readIdx]; - float shift = shiftDev[readIdx]; - float var = varDev[readIdx]; - float logVar = logVarDev[readIdx]; - - // 
get sequence statistics - int numKmers = sequenceLengthsDev[seqIdx]; - int seqOffset = sequenceOffsetsDev[seqIdx]; - - printf("This is thread %i, seqIdx is %i, readIdx is %i, numKmers is %i, seqOffset is %i\n", threadIdx.x, seqIdx, readIdx, numKmers, seqOffset); - - int lastRowIdx = numEvents -1; - int lastKmerIdx = numKmers - 1; - - float returnValue = -INFINITY; //Used to sum over the last column. - float prevProbabilities[MAX_STATES]; - - int numBlocks = numKmers + 2; - int numStates = numBlocks * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. - - if(debug){ - printf("Kernel 1 >>> Num Kmers is %i\n", numKmers); - printf("Kernel 1 >>> n_states %i\n", numStates); - printf("Kernel 1 >>> num events in read is %i\n", numEvents); - printf("Kernel 1 >>> event offset is %i\n", e_offset); - } - - // Initialise the prev probabilities vector - for (int i = 0; i < numStates - PSR9_NUM_STATES; i++) { - prevProbabilities[i] = -INFINITY; - } - for (int i = numStates - PSR9_NUM_STATES; i < numStates; i++) { - prevProbabilities[i] = 0.0f; - } - - bool rc = false; - if (e_stride == -1){ - rc = true; - } - - float p_stay = 1 - (1 / read_events_per_base); - float p_skip = 0.0025; - float p_bad = 0.001; - float p_bad_self = p_bad; - float p_skip_self = 0.3; - float p_mk = p_skip; // probability of not observing an event at all - float p_mb = p_bad; // probabilty of observing a bad event - float p_mm_self = p_stay; // probability of observing additional events from this k-mer - float p_mm_next = 1.0f - p_mm_self - p_mk - p_mb; // normal movement from state to state - // transitions from event split state in previous block - float p_bb = p_bad_self; - float p_bk, p_bm_next, p_bm_self; - p_bk = p_bm_next = p_bm_self = (1.0f - p_bb) / 3; - // transitions from kmer skip state in previous block - float p_kk = p_skip_self; - float p_km = 1.0f - p_kk; - // We assign some transition probabilities. 
I believe this is correct and they don't vary by location in the sequence - float lp_mk = logf(p_mk); - float lp_mb = logf(p_mb); - float lp_mm_self = logf(p_mm_self); - float lp_mm_next = logf(p_mm_next); - float lp_bb = logf(p_bb); - float lp_bk = logf(p_bk); - float lp_bm_next = logf(p_bm_next); - float lp_bm_self = logf(p_bm_self); - float lp_kk = logf(p_kk); - float lp_km = logf(p_km); - float lp_sm, lp_ms; - lp_sm = lp_ms = 0.0f; + int scoreIdx = blockIdx.x * blockDim.x + threadIdx.x; + + if (scoreIdx < numScores) { + + int readIdx = readIdxDev[scoreIdx]; + int seqIdx = seqIdxDev[scoreIdx]; + + // get read statistics + int numEvents = readLengthsDev[readIdx]; + int readOffset = eventOffsetsDev[readIdx]; + float read_events_per_base = eventsPerBaseDev[readIdx]; + int e_start = eventStartsDev[readIdx]; // Event start for read + int e_stride = eventStridesDev[readIdx]; + int e_offset = eventOffsetsDev[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event + float scale = scaleDev[readIdx]; + float shift = shiftDev[readIdx]; + float var = varDev[readIdx]; + float logVar = logVarDev[readIdx]; + + // get sequence statistics + int numKmers = sequenceLengthsDev[seqIdx]; + int seqOffset = sequenceOffsetsDev[seqIdx]; + + int lastRowIdx = numEvents - 1; + int lastKmerIdx = numKmers - 1; + + float returnValue = -INFINITY; //Used to sum over the last column. + float prevProbabilities[MAX_STATES]; + + int numBlocks = numKmers + 2; + int numStates = numBlocks * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. 
+ + if (debug) { + printf("Kernel 1 >>> Num Kmers is %i\n", numKmers); + printf("Kernel 1 >>> n_states %i\n", numStates); + printf("Kernel 1 >>> num events in read is %i\n", numEvents); + printf("Kernel 1 >>> event offset is %i\n", e_offset); + } - // the penalty is controlled by the transition probability - float BAD_EVENT_PENALTY = 0.0f; + // Initialise the prev probabilities vector + for (int i = 0; i < numStates - PSR9_NUM_STATES; i++) { + prevProbabilities[i] = -INFINITY; + } + for (int i = numStates - PSR9_NUM_STATES; i < numStates; i++) { + prevProbabilities[i] = 0.0f; + } - //Fill out the dynamic programming table - for(int row=1; row>> GpuAligner::scoreKernelMod(std::ve globalSequenceIdx++; } - } // All data is now in host buffers - perform memcpys @@ -825,11 +830,13 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve // Launch Kernels - dim3 dimBlock(globalScoreIdx); // TODO: divide work into smaller blocks - dim3 dimGrid(1); + int blockSize = 32; + int numBlocks = (globalScoreIdx + blockSize - 1 ) / blockSize; + dim3 dimBlock(blockSize); + dim3 dimGrid(numBlocks); //printf("Launching get scores mod kernel\n"); - getScoresMod <<< dimGrid, dimBlock, MAX_STATES * sizeof(int), streams[0]>>> (poreModelDev, + getScoresMod <<< dimGrid, dimBlock, 0, streams[0]>>> (poreModelDev, readLengthsDev, eventStartsDev, eventStridesDev, @@ -847,6 +854,7 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve kmerRanksDev, seqIdxDev, readIdxDev, + globalScoreIdx, scoresDev); cudaError_t err = cudaGetLastError(); @@ -866,12 +874,15 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve int numSequences = scoreSet.stateSequences.size(); int numReads = scoreSet.rawData.size(); for (int seqIdx=0; seqIdx seqScores; + + std::vector seqScores(numReads); + for (int readIdx=0; readIdx GpuAligner::variantScoresThresholded(std::vector i std::vector v = input_variants; if (!event_sequences.empty()) { - std::vector> scores = scoreKernel(sequences, event_sequences, alignment_flags); - 
//std::vector> scores; + //std::vector> scores = scoreKernel(sequences, event_sequences, alignment_flags); + // Now try it with the new method ScoreSet s = { sequences, event_sequences }; - std::vector scoreSets(1,s); + std::vector scoreSets; + scoreSets.push_back(s); + //scoreSets.push_back(s); + //scoreSets.push_back(s); + //scoreSets.push_back(s); + //scoreSets.push_back(s); + //scoreSets.push_back(s); - std::vector>> scoresMod = scoreKernelMod(scoreSets, alignment_flags); + auto scoresMod = scoreKernelMod(scoreSets, alignment_flags); + std::vector> scores = scoresMod[0]; uint32_t numScores = scores[0].size(); for (int variantIndex = 0; variantIndex < numVariants; variantIndex++) { // index 0 is the base scores @@ -1138,7 +1156,7 @@ std::vector GpuAligner::variantScoresThresholded(std::vector i v[variantIndex].quality = totalScore; v[variantIndex].info = ""; } - } + } return v; } diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 1b01ff2c..509c98e8 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -388,7 +388,7 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali // Add all positively-scoring single-base changes into the candidate set if (opt::gpu) { - size_t num_workers = 1; + size_t num_workers = 8; std::vector gpuAligners(num_workers); //std::vector workerThreads(num_workers); From 8136fa6888ab297d188e99980f358e817af8a663 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Sat, 21 Jul 2018 01:17:22 +0100 Subject: [PATCH 45/80] New Kernel working in multi-base mode. 
Code needs big refactor and testing --- src/cuda_kernels/GpuAligner.cu | 91 +++++++++------- src/cuda_kernels/GpuAligner.h | 10 +- src/nanopolish_call_variants.cpp | 179 ++++++++++++++++++------------- 3 files changed, 163 insertions(+), 117 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index af84e60c..985c6a95 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -653,7 +653,7 @@ GpuAligner::~GpuAligner() { } -std::vector>> GpuAligner::scoreKernelMod(std::vector scoreSets, +std::vector>> GpuAligner::scoreKernelMod(std::vector &scoreSets, uint32_t alignment_flags){ int numEventsTotal = 0; // The number of events across all scoreSets @@ -669,7 +669,7 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve //Loop over every scoreset, filling out buffers and counters for (int scoreSetIdx=0; scoreSetIdx < numScoreSets; scoreSetIdx++){ - auto &scoreSet = scoreSets[scoreSetIdx]; + auto scoreSet = scoreSets[scoreSetIdx]; int firstReadIdxinScoreSet = globalReadIdx; //Read data for (int eventSequenceIdx=0; eventSequenceIdx < scoreSet.rawData.size();eventSequenceIdx++){ @@ -1092,12 +1092,22 @@ std::vector> GpuAligner::scoreKernel(std::vector GpuAligner::variantScoresThresholded(std::vector input_variants, - Haplotype base_haplotype, - std::vector event_sequences, +std::vector GpuAligner::variantScoresThresholded(std::vector> input_variants_vector, + std::vector base_haplotypes, + std::vector> event_sequences_vector, uint32_t alignment_flags, int screen_score_threshold, std::vector methylation_types) { + int numScoreSets = base_haplotypes.size(); + std::vector scoreSets; + scoreSets.resize(numScoreSets); + + for(int scoreSetIdx=0; scoreSetIdx out_variants = input_variants; @@ -1122,41 +1132,42 @@ std::vector GpuAligner::variantScoresThresholded(std::vector i sequences.push_back(variant_sequence); } - std::vector v = input_variants; - - if (!event_sequences.empty()) { - //std::vector> scores = 
scoreKernel(sequences, event_sequences, alignment_flags); - - // Now try it with the new method - ScoreSet s = { - sequences, - event_sequences - }; - - std::vector scoreSets; - scoreSets.push_back(s); - //scoreSets.push_back(s); - //scoreSets.push_back(s); - //scoreSets.push_back(s); - //scoreSets.push_back(s); - //scoreSets.push_back(s); - - auto scoresMod = scoreKernelMod(scoreSets, alignment_flags); - std::vector> scores = scoresMod[0]; - - uint32_t numScores = scores[0].size(); - for (int variantIndex = 0; variantIndex < numVariants; variantIndex++) { // index 0 is the base scores - double totalScore = 0.0; - for (int k = 0; k < numScores; k++) { - if (fabs(totalScore) < screen_score_threshold) { - double baseScore = scores[0][k]; - totalScore += (scores[variantIndex + 1][k] - baseScore); - } - } - v[variantIndex].quality = totalScore; - v[variantIndex].info = ""; - } + ScoreSet s = { + sequences, + event_sequences + }; + scoreSets[scoreSetIdx] = s; + + } + + std::vector v; + if (!event_sequences_vector.empty()) { + //std::vector> scores = scoreKernel(sequences, event_sequences, alignment_flags); + + auto scoresMod = scoreKernelMod(scoreSets, alignment_flags); + + // results are now ready, need to unpack them + for (int scoreSetIdx=0; scoreSetIdx> scores = scoresMod[scoreSetIdx]; // scores for this candidate, including all variants and base(zeroth) + int numVariants = scores.size() - 1; // subtract one for the base + int numScores = scores[0].size(); + + for (int variantIndex = 0; variantIndex < numVariants; variantIndex++) { // index 0 is the base scores + double totalScore = 0.0; + for (int k = 0; k < numScores; k++) { + if (fabs(totalScore) < screen_score_threshold) { + double baseScore = scores[0][k]; + totalScore += (scores[variantIndex + 1][k] - baseScore); + } + } + // get the old variant: + auto unScoredVariant = input_variants_vector[scoreSetIdx][variantIndex]; + unScoredVariant.quality = totalScore; + unScoredVariant.info = ""; + 
v.push_back(unScoredVariant); + } } - return v; + } + return v; } diff --git a/src/cuda_kernels/GpuAligner.h b/src/cuda_kernels/GpuAligner.h index e586bfdc..d72ce9c2 100644 --- a/src/cuda_kernels/GpuAligner.h +++ b/src/cuda_kernels/GpuAligner.h @@ -45,8 +45,8 @@ //Data to be scored typedef struct { - std::vector &stateSequences; - std::vector &rawData; + std::vector stateSequences; + std::vector rawData; } ScoreSet; class GpuAligner @@ -56,13 +56,15 @@ class GpuAligner ~GpuAligner(); std::vector - variantScoresThresholded(std::vector tmp_variants, Haplotype haplotype, std::vector event_sequences, + variantScoresThresholded(std::vector>, + std::vector, + std::vector>, uint32_t alignment_flags, int screen_score_threshold, std::vector methylation_types); std::vector> scoreKernel(std::vector sequences, std::vector event_sequences, uint32_t alignment_flags); - std::vector>> scoreKernelMod(std::vector scoreSets, + std::vector>> scoreKernelMod(std::vector &scoreSets, uint32_t alignment_flags); private: float* scaleDev; diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 509c98e8..1a7b2e96 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -284,24 +284,27 @@ void annotate_with_all_support(std::vector& variants, } -void singleLocusBaseEditCandidate(int i, - const AlignmentDB& alignments, - uint32_t alignment_flags, - std::vector &out_variants, - std::string contig, - GpuAligner &aligner, - std::mutex &outVariantsMutex -){ -try { - int calling_start = i - opt::screen_flanking_sequence; - int calling_end = i + 1 + opt::screen_flanking_sequence; - - if (!alignments.are_coordinates_valid(contig, calling_start, calling_end)) { +void locusRangeBaseEditCandidate(int start, int end, + const AlignmentDB& alignments, + uint32_t alignment_flags, + std::vector &out_variants, + std::string contig, + GpuAligner &aligner, + std::mutex &outVariantsMutex){ + try { + std::vector> tmp_variants_vector; + std::vector 
haplotypes; + std::vector> event_sequences_vector; + for(int i = start; i<=end; i++){ + int calling_start = i - opt::screen_flanking_sequence; + int calling_end = i + 1 + opt::screen_flanking_sequence; + + if (!alignments.are_coordinates_valid(contig, calling_start, calling_end)) { return; - } + } - std::vector tmp_variants; - for (size_t j = 0; j < 4; ++j) { + std::vector tmp_variants; + for (size_t j = 0; j < 4; ++j) { // Substitutions Variant v; v.ref_name = contig; @@ -310,43 +313,51 @@ try { v.alt_seq = "ACGT"[j]; if (v.ref_seq != v.alt_seq) { - tmp_variants.push_back(v); + tmp_variants.push_back(v); } // Insertions v.alt_seq = v.ref_seq + "ACGT"[j]; // ignore insertions of the type "A" -> "AA" as these are redundant if (v.alt_seq[1] != v.ref_seq[0]) { - tmp_variants.push_back(v); + tmp_variants.push_back(v); } - } + } - // deletion - Variant del; - del.ref_name = contig; - del.ref_position = i - 1; - del.ref_seq = alignments.get_reference_substring(contig, i - 1, i); - del.alt_seq = del.ref_seq[0]; + // deletion + Variant del; + del.ref_name = contig; + del.ref_position = i - 1; + del.ref_seq = alignments.get_reference_substring(contig, i - 1, i); + del.alt_seq = del.ref_seq[0]; - // ignore deletions of the type "AA" -> "A" as these are redundant - if (del.alt_seq[0] != del.ref_seq[1]) { + // ignore deletions of the type "AA" -> "A" as these are redundant + if (del.alt_seq[0] != del.ref_seq[1]) { tmp_variants.push_back(del); + } + + // Screen variants by score + // We do this internally here as it is much faster to get the event sequences + // for the entire window for all variants at this position once, rather than + // for each variant individually + std::vector event_sequences = + alignments.get_event_subsequences(contig, calling_start, calling_end); + + Haplotype test_haplotype(contig, + calling_start, + alignments.get_reference_substring(contig, + calling_start, + calling_end)); + + haplotypes.push_back(test_haplotype); + 
event_sequences_vector.push_back(event_sequences); + tmp_variants_vector.push_back(tmp_variants); } - - // Screen variants by score - // We do this internally here as it is much faster to get the event sequences - // for the entire window for all variants at this position once, rather than - // for each variant individually - std::vector event_sequences = - alignments.get_event_subsequences(contig, calling_start, calling_end); - - Haplotype test_haplotype(contig, - calling_start, - alignments.get_reference_substring(contig, calling_start, calling_end)); - + if (opt::gpu) { - std::vector scoredVariants = aligner.variantScoresThresholded(tmp_variants, test_haplotype, - event_sequences, + std::vector scoredVariants = aligner.variantScoresThresholded(tmp_variants_vector, + haplotypes, + event_sequences_vector, alignment_flags, opt::screen_score_threshold, opt::methylation_types); @@ -357,19 +368,19 @@ try { } } } else { - for (const Variant &v : tmp_variants) { - auto t0 = std::chrono::high_resolution_clock::now(); - Variant scored_variant = score_variant_thresholded(v, - test_haplotype, - event_sequences, - alignment_flags, - opt::screen_score_threshold, - opt::methylation_types); - scored_variant.info = ""; - if (scored_variant.quality > 0) { - out_variants.push_back(scored_variant); - } - } + //for (const Variant &v : tmp_variants) { + // auto t0 = std::chrono::high_resolution_clock::now(); + // Variant scored_variant = score_variant_thresholded(v, + // test_haplotype, + // event_sequences, + // alignment_flags, + // opt::screen_score_threshold, + // opt::methylation_types); + // scored_variant.info = ""; + // if (scored_variant.quality > 0) { + // out_variants.push_back(scored_variant); + // } + //} } }catch (std::exception &e){ printf("Exception in thread! 
%s\n", e.what()); @@ -393,42 +404,64 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali //std::vector workerThreads(num_workers); std::vector> handles(num_workers); - int nextLocus = region_start; + int lociPerWorker = 12; + int nextLocusBegin = region_start; + int nextLocusEnd = region_start; + + //printf("Initialising workers\n"); //Initialise workers for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { auto aligner = std::ref(gpuAligners[workerIdx]); - if (nextLocus < region_end) { + if (nextLocusEnd < region_end) { //TODO: Check this is correct. May be leaving some off at the end. May want to put icrements at start and redo this whole block. handles[workerIdx] = std::async(std::launch::async, - singleLocusBaseEditCandidate, - nextLocus, + locusRangeBaseEditCandidate, + nextLocusBegin, + nextLocusEnd, std::ref(alignments), alignment_flags, std::ref(out_variants), std::ref(contig), aligner, std::ref(outVariantsMutex)); - nextLocus++; + if ((nextLocusEnd + lociPerWorker) < region_end){ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = nextLocusBegin + lociPerWorker - 1; + }else{ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = region_end; + } } } + //printf("Workers initialised\n"); //Round robin the workers until done - while (nextLocus < region_end) { + while (nextLocusEnd < region_end) { for (int i = 0; i < num_workers; i++) { auto status = handles[i].wait_for(std::chrono::microseconds(100)); - if (status == std::future_status::ready && (nextLocus < region_end)) { + //printf("Got status\n"); + if (status == std::future_status::ready && (nextLocusEnd < region_end)) { + //printf("Entering the event loop, locus start is %i and end is %i\n", nextLocusBegin, nextLocusEnd); auto aligner = std::ref(gpuAligners[i]); + //printf("Sending work to a worker\n"); handles[i].get(); handles[i] = std::async(std::launch::async, - singleLocusBaseEditCandidate, - nextLocus, + locusRangeBaseEditCandidate, + nextLocusBegin, + nextLocusEnd, 
std::ref(alignments), alignment_flags, std::ref(out_variants), std::ref(contig), aligner, std::ref(outVariantsMutex)); - nextLocus++; + if ((nextLocusEnd + lociPerWorker) < region_end){ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = nextLocusBegin + lociPerWorker - 1; + }else{ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = region_end; + } } } } @@ -439,15 +472,15 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali } } else { GpuAligner aligner; //TODO: temporary - refactor to get rid of this - for (size_t i = region_start; i < region_end; ++i) { - singleLocusBaseEditCandidate(i, - std::ref(alignments), - alignment_flags, - std::ref(out_variants), - std::ref(contig), - std::ref(aligner), - std::ref(outVariantsMutex)); - } + //for (size_t i = region_start; i < region_end; ++i) { + // singleLocusBaseEditCandidate(i, + // std::ref(alignments), + // alignment_flags, + // std::ref(out_variants), + // std::ref(contig), + // std::ref(aligner), + // std::ref(outVariantsMutex)); + // } return out_variants; } From 9d4323907d6a8634147485fa9e46bff197537ac4 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Mon, 23 Jul 2018 13:17:25 +0100 Subject: [PATCH 46/80] Increased buffer sizes --- src/cuda_kernels/GpuAligner.cu | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 985c6a95..8767a40f 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -8,7 +8,7 @@ #define EXPAND_TO_STRING(X) #X #define TO_STRING(X) EXPAND_TO_STRING(X) -#define CU_CHECK_ERR(X) if (X != cudaSuccess){printf("CUDA error: %s at line %s\n", cudaGetErrorString(X), TO_STRING(__LINE__));throw std::runtime_error("CUDA ERRROR");} +#define CU_CHECK_ERR(X) if (X != cudaSuccess){printf("CUDA error: <<%s>> at line %s\n", cudaGetErrorString(X), TO_STRING(__LINE__));throw std::runtime_error("CUDA ERROR");} __device__ float logsumexpf(float x, float y){ if(x == 
-INFINITY && y == -INFINITY){ @@ -523,10 +523,10 @@ __global__ void getScores(float * const eventData, GpuAligner::GpuAligner() { int numModelElements = 4096; - int max_num_reads = 1000; + int max_num_reads = 5000; int readsSizeBuffer = max_num_reads * sizeof(int); int max_n_rows = 100; - int maxBuffer = 100000 * sizeof(float); //TODO: allocate more smartly + int maxBuffer = 500000 * sizeof(float); //TODO: allocate more smartly int max_num_sequences = 8; int max_sequence_length = 50; @@ -550,13 +550,11 @@ GpuAligner::GpuAligner() CU_CHECK_ERR(cudaMalloc( (void**)&readLengthsDev, readsSizeBuffer)); CU_CHECK_ERR(cudaHostAlloc(&readLengthsHost, readsSizeBuffer, cudaHostAllocDefault)); - // Allocate Device memory for pore model CU_CHECK_ERR(cudaMalloc((void**)&poreModelDev, numModelElements * 3 * sizeof(float))); CU_CHECK_ERR(cudaHostAlloc(&poreModelHost, numModelElements * sizeof(float) * 3, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&numRowsDev, max_n_rows * sizeof(int))); + CU_CHECK_ERR(cudaMalloc((void**)&numRowsDev, readsSizeBuffer * sizeof(int))); CU_CHECK_ERR(cudaMalloc((void**)&eventStartsDev, readsSizeBuffer)); CU_CHECK_ERR(cudaHostAlloc(&eventStartsHost, readsSizeBuffer, cudaHostAllocDefault)); @@ -782,9 +780,6 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve CU_CHECK_ERR(cudaMemcpyAsync(eventStartsDev, eventStartsHost, numReads * sizeof(int), cudaMemcpyHostToDevice, streams[0])); - CU_CHECK_ERR(cudaMemcpyAsync(eventStridesDev, eventStridesHost, - numReads * sizeof(int), cudaMemcpyHostToDevice, streams[0])); - CU_CHECK_ERR(cudaMemcpyAsync(eventsPerBaseDev, eventsPerBaseHost, numReads * sizeof(float), cudaMemcpyHostToDevice, streams[0])); @@ -794,6 +789,9 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve CU_CHECK_ERR(cudaMemcpyAsync(shiftDev, shiftHost, numReads * sizeof(int), cudaMemcpyHostToDevice, streams[0])); + CU_CHECK_ERR(cudaMemcpyAsync(eventStridesDev, eventStridesHost, + numReads * sizeof(int), cudaMemcpyHostToDevice, 
streams[0])); + CU_CHECK_ERR(cudaMemcpyAsync(varDev, varHost, numReads * sizeof(int), cudaMemcpyHostToDevice, streams[0])); From d7f2e3184489e67856118034e4aa4499cb58d1e2 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Mon, 23 Jul 2018 16:12:35 +0100 Subject: [PATCH 47/80] Fixed issue with bases at end not being corrected --- src/nanopolish_call_variants.cpp | 61 ++++++++++++++++---------------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 1a7b2e96..95304c71 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -405,68 +405,69 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali //std::vector workerThreads(num_workers); std::vector> handles(num_workers); - int lociPerWorker = 12; + int lociPerWorker = 12; int nextLocusBegin = region_start; - int nextLocusEnd = region_start; + int nextLocusEnd = nextLocusBegin + lociPerWorker; + bool finished = false; - //printf("Initialising workers\n"); - //Initialise workers for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { auto aligner = std::ref(gpuAligners[workerIdx]); - if (nextLocusEnd < region_end) { //TODO: Check this is correct. May be leaving some off at the end. May want to put icrements at start and redo this whole block. 
+ if (!finished) { + if (nextLocusEnd == region_end) { + finished = true; + } handles[workerIdx] = std::async(std::launch::async, locusRangeBaseEditCandidate, nextLocusBegin, - nextLocusEnd, + nextLocusEnd, std::ref(alignments), alignment_flags, std::ref(out_variants), std::ref(contig), aligner, std::ref(outVariantsMutex)); - if ((nextLocusEnd + lociPerWorker) < region_end){ - nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = nextLocusBegin + lociPerWorker - 1; - }else{ - nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = region_end; - } + if ((nextLocusEnd + lociPerWorker) < region_end){ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = nextLocusBegin + lociPerWorker - 1; + }else{ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = region_end; + } } } - //printf("Workers initialised\n"); //Round robin the workers until done - while (nextLocusEnd < region_end) { + while (!finished) { for (int i = 0; i < num_workers; i++) { auto status = handles[i].wait_for(std::chrono::microseconds(100)); - //printf("Got status\n"); - if (status == std::future_status::ready && (nextLocusEnd < region_end)) { - //printf("Entering the event loop, locus start is %i and end is %i\n", nextLocusBegin, nextLocusEnd); + if (status == std::future_status::ready && (!finished)) { + if (nextLocusEnd == region_end){ + finished = true; + } auto aligner = std::ref(gpuAligners[i]); - //printf("Sending work to a worker\n"); handles[i].get(); handles[i] = std::async(std::launch::async, - locusRangeBaseEditCandidate, - nextLocusBegin, - nextLocusEnd, + locusRangeBaseEditCandidate, + nextLocusBegin, + nextLocusEnd, std::ref(alignments), alignment_flags, std::ref(out_variants), std::ref(contig), aligner, std::ref(outVariantsMutex)); - if ((nextLocusEnd + lociPerWorker) < region_end){ - nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = nextLocusBegin + lociPerWorker - 1; - }else{ - nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = region_end; - } + if ((nextLocusEnd + lociPerWorker) < 
region_end){ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = nextLocusBegin + lociPerWorker - 1; + }else{ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = region_end; + } } } } - //Synchronize the remaining ones + //Block until all workers are complete for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { handles[workerIdx].wait(); } From 18effc5d29f9799d860914cc00eba61acee034e0 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Mon, 23 Jul 2018 16:42:32 +0100 Subject: [PATCH 48/80] 16 workers - better on V100 for now --- src/nanopolish_call_variants.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 95304c71..8f6d2caa 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -399,7 +399,7 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali // Add all positively-scoring single-base changes into the candidate set if (opt::gpu) { - size_t num_workers = 8; + size_t num_workers = 16; std::vector gpuAligners(num_workers); //std::vector workerThreads(num_workers); From 5b09cbc5ee4e824ce84227a80e86e0a1d8b5a04b Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Tue, 24 Jul 2018 16:13:46 +0100 Subject: [PATCH 49/80] Refactor of nanopolish_call_variants.cpp --- src/nanopolish_call_variants.cpp | 419 +++++++++++++++++-------------- 1 file changed, 232 insertions(+), 187 deletions(-) diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 8f6d2caa..fbf033c6 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -283,211 +283,248 @@ void annotate_with_all_support(std::vector& variants, } } +void prepareForBaseEditCandidates(int start, + int end, + const AlignmentDB& alignments, + std::string contig, + std::vector> &tmp_variants_vector, + std::vector &haplotypes, + std::vector> &event_sequences_vector +){ + for(int i = start; i<=end; i++){ + int 
calling_start = i - opt::screen_flanking_sequence; + int calling_end = i + 1 + opt::screen_flanking_sequence; + + if (!alignments.are_coordinates_valid(contig, calling_start, calling_end)) { + return; + } + + std::vector tmp_variants; + for (size_t j = 0; j < 4; ++j) { + // Substitutions + Variant v; + v.ref_name = contig; + v.ref_position = i; + v.ref_seq = alignments.get_reference_substring(contig, i, i); + v.alt_seq = "ACGT"[j]; + + if (v.ref_seq != v.alt_seq) { + tmp_variants.push_back(v); + } + + // Insertions + v.alt_seq = v.ref_seq + "ACGT"[j]; + // ignore insertions of the type "A" -> "AA" as these are redundant + if (v.alt_seq[1] != v.ref_seq[0]) { + tmp_variants.push_back(v); + } + } + + // deletion + Variant del; + del.ref_name = contig; + del.ref_position = i - 1; + del.ref_seq = alignments.get_reference_substring(contig, i - 1, i); + del.alt_seq = del.ref_seq[0]; + + // ignore deletions of the type "AA" -> "A" as these are redundant + if (del.alt_seq[0] != del.ref_seq[1]) { + tmp_variants.push_back(del); + } + + // Screen variants by score + // We do this internally here as it is much faster to get the event sequences + // for the entire window for all variants at this position once, rather than + // for each variant individually + std::vector event_sequences = alignments.get_event_subsequences(contig, calling_start, calling_end); + + Haplotype test_haplotype(contig, + calling_start, + alignments.get_reference_substring(contig, + calling_start, + calling_end)); + + haplotypes.push_back(test_haplotype); + event_sequences_vector.push_back(event_sequences); + tmp_variants_vector.push_back(tmp_variants); + } +} -void locusRangeBaseEditCandidate(int start, int end, - const AlignmentDB& alignments, - uint32_t alignment_flags, - std::vector &out_variants, - std::string contig, - GpuAligner &aligner, - std::mutex &outVariantsMutex){ - try { + +void locusRangeBaseEditCandidateGPU(int start, + int end, + const AlignmentDB& alignments, + uint32_t alignment_flags, 
+ std::vector &out_variants, + std::string contig, + GpuAligner &aligner, + std::mutex &outVariantsMutex) { std::vector> tmp_variants_vector; std::vector haplotypes; std::vector> event_sequences_vector; - for(int i = start; i<=end; i++){ - int calling_start = i - opt::screen_flanking_sequence; - int calling_end = i + 1 + opt::screen_flanking_sequence; - - if (!alignments.are_coordinates_valid(contig, calling_start, calling_end)) { - return; - } - - std::vector tmp_variants; - for (size_t j = 0; j < 4; ++j) { - // Substitutions - Variant v; - v.ref_name = contig; - v.ref_position = i; - v.ref_seq = alignments.get_reference_substring(contig, i, i); - v.alt_seq = "ACGT"[j]; - - if (v.ref_seq != v.alt_seq) { - tmp_variants.push_back(v); - } - // Insertions - v.alt_seq = v.ref_seq + "ACGT"[j]; - // ignore insertions of the type "A" -> "AA" as these are redundant - if (v.alt_seq[1] != v.ref_seq[0]) { - tmp_variants.push_back(v); + prepareForBaseEditCandidates(start, + end, + alignments, + contig, + tmp_variants_vector, + haplotypes, + event_sequences_vector); + + std::vector scoredVariants = aligner.variantScoresThresholded(tmp_variants_vector, + haplotypes, + event_sequences_vector, + alignment_flags, + opt::screen_score_threshold, + opt::methylation_types); + for (auto variant: scoredVariants) { + if (variant.quality > 0) { + std::lock_guard lock(outVariantsMutex); + out_variants.push_back(variant); } - } - - // deletion - Variant del; - del.ref_name = contig; - del.ref_position = i - 1; - del.ref_seq = alignments.get_reference_substring(contig, i - 1, i); - del.alt_seq = del.ref_seq[0]; - - // ignore deletions of the type "AA" -> "A" as these are redundant - if (del.alt_seq[0] != del.ref_seq[1]) { - tmp_variants.push_back(del); - } - - // Screen variants by score - // We do this internally here as it is much faster to get the event sequences - // for the entire window for all variants at this position once, rather than - // for each variant individually - std::vector 
event_sequences = - alignments.get_event_subsequences(contig, calling_start, calling_end); - - Haplotype test_haplotype(contig, - calling_start, - alignments.get_reference_substring(contig, - calling_start, - calling_end)); - - haplotypes.push_back(test_haplotype); - event_sequences_vector.push_back(event_sequences); - tmp_variants_vector.push_back(tmp_variants); } - - if (opt::gpu) { - std::vector scoredVariants = aligner.variantScoresThresholded(tmp_variants_vector, - haplotypes, - event_sequences_vector, - alignment_flags, - opt::screen_score_threshold, - opt::methylation_types); - for (auto variant: scoredVariants) { - if (variant.quality > 0) { - std::lock_guard lock(outVariantsMutex); - out_variants.push_back(variant); + +} + +void locusRangeBaseEditCandidate(int start, + int end, + const AlignmentDB& alignments, + uint32_t alignment_flags, + std::vector &out_variants, + std::string contig) { + std::vector> tmp_variants_vector; + std::vector haplotypes; + std::vector> event_sequences_vector; + + prepareForBaseEditCandidates(start, + end, + alignments, + contig, + tmp_variants_vector, + haplotypes, + event_sequences_vector); + + int numHaplotypes = haplotypes.size(); + for (int haplotypeIDX = 0; haplotypeIDX < numHaplotypes; haplotypeIDX++) { + auto variants = tmp_variants_vector[haplotypeIDX]; + auto test_haplotype = haplotypes[haplotypeIDX]; + auto event_sequences = event_sequences_vector[haplotypeIDX]; + for (const Variant &v : variants) { + auto t0 = std::chrono::high_resolution_clock::now(); + Variant scored_variant = score_variant_thresholded(v, + test_haplotype, + event_sequences, + alignment_flags, + opt::screen_score_threshold, + opt::methylation_types); + scored_variant.info = ""; + if (scored_variant.quality > 0) { + out_variants.push_back(scored_variant); } } - } else { - //for (const Variant &v : tmp_variants) { - // auto t0 = std::chrono::high_resolution_clock::now(); - // Variant scored_variant = score_variant_thresholded(v, - // test_haplotype, 
- // event_sequences, - // alignment_flags, - // opt::screen_score_threshold, - // opt::methylation_types); - // scored_variant.info = ""; - // if (scored_variant.quality > 0) { - // out_variants.push_back(scored_variant); - // } - //} } -}catch (std::exception &e){ - printf("Exception in thread! %s\n", e.what()); -} } -// Given the input region, calculate all single base edits to the current assembly -std::vector generate_candidate_single_base_edits(const AlignmentDB& alignments, - int region_start, - int region_end, - uint32_t alignment_flags){ - try { - std::vector out_variants; - std::string contig = alignments.get_region_contig(); - std::mutex outVariantsMutex; - - // Add all positively-scoring single-base changes into the candidate set - if (opt::gpu) { - size_t num_workers = 16; - std::vector gpuAligners(num_workers); - - //std::vector workerThreads(num_workers); - std::vector> handles(num_workers); - - int lociPerWorker = 12; - int nextLocusBegin = region_start; - int nextLocusEnd = nextLocusBegin + lociPerWorker; - bool finished = false; - - for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { - auto aligner = std::ref(gpuAligners[workerIdx]); - if (!finished) { - if (nextLocusEnd == region_end) { - finished = true; - } - handles[workerIdx] = std::async(std::launch::async, - locusRangeBaseEditCandidate, - nextLocusBegin, - nextLocusEnd, - std::ref(alignments), - alignment_flags, - std::ref(out_variants), - std::ref(contig), - aligner, - std::ref(outVariantsMutex)); - if ((nextLocusEnd + lociPerWorker) < region_end){ - nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = nextLocusBegin + lociPerWorker - 1; - }else{ - nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = region_end; - } - } - } +std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& alignments, + int region_start, + int region_end, + uint32_t alignment_flags){ - //Round robin the workers until done - while (!finished) { - for (int i = 0; i < num_workers; i++) { - 
auto status = handles[i].wait_for(std::chrono::microseconds(100)); - if (status == std::future_status::ready && (!finished)) { - if (nextLocusEnd == region_end){ - finished = true; - } - auto aligner = std::ref(gpuAligners[i]); - handles[i].get(); - handles[i] = std::async(std::launch::async, - locusRangeBaseEditCandidate, - nextLocusBegin, - nextLocusEnd, - std::ref(alignments), - alignment_flags, - std::ref(out_variants), - std::ref(contig), - aligner, - std::ref(outVariantsMutex)); - if ((nextLocusEnd + lociPerWorker) < region_end){ - nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = nextLocusBegin + lociPerWorker - 1; - }else{ - nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = region_end; - } - } - } + std::mutex outVariantsMutex; + std::vector out_variants; + std::string contig = alignments.get_region_contig(); + + // Add all positively-scoring single-base changes into the candidate set + size_t num_workers = 16; + std::vector gpuAligners(num_workers); + + //std::vector workerThreads(num_workers); + std::vector> handles(num_workers); + + int lociPerWorker = 12; + int nextLocusBegin = region_start; + int nextLocusEnd = nextLocusBegin + lociPerWorker; + bool finished = false; + + for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { + auto aligner = std::ref(gpuAligners[workerIdx]); + if (!finished) { + if (nextLocusEnd == region_end) { + finished = true; + } + handles[workerIdx] = std::async(std::launch::async, + locusRangeBaseEditCandidateGPU, + nextLocusBegin, + nextLocusEnd, + std::ref(alignments), + alignment_flags, + std::ref(out_variants), + std::ref(contig), + aligner, + std::ref(outVariantsMutex)); + if ((nextLocusEnd + lociPerWorker) < region_end){ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = nextLocusBegin + lociPerWorker - 1; + }else{ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = region_end; } + } + } - //Block until all workers are complete - for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { - 
handles[workerIdx].wait(); + //Round robin the workers until done + while (!finished) { + for (int i = 0; i < num_workers; i++) { + auto status = handles[i].wait_for(std::chrono::microseconds(100)); + if (status == std::future_status::ready && (!finished)) { + if (nextLocusEnd == region_end){ + finished = true; + } + auto aligner = std::ref(gpuAligners[i]); + handles[i].get(); + handles[i] = std::async(std::launch::async, + locusRangeBaseEditCandidateGPU, + nextLocusBegin, + nextLocusEnd, + std::ref(alignments), + alignment_flags, + std::ref(out_variants), + std::ref(contig), + aligner, + std::ref(outVariantsMutex)); + if ((nextLocusEnd + lociPerWorker) < region_end){ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = nextLocusBegin + lociPerWorker - 1; + }else{ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = region_end; + } } - } else { - GpuAligner aligner; //TODO: temporary - refactor to get rid of this - //for (size_t i = region_start; i < region_end; ++i) { - // singleLocusBaseEditCandidate(i, - // std::ref(alignments), - // alignment_flags, - // std::ref(out_variants), - // std::ref(contig), - // std::ref(aligner), - // std::ref(outVariantsMutex)); - // } - return out_variants; } - catch(std::exception &e){ - printf("Excpetion in calling thread: %s\n", e.what()); + + //Block until all workers are complete + for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { + handles[workerIdx].wait(); } + return out_variants; +} + +// Given the input region, calculate all single base edits to the current assembly +std::vector generate_candidate_single_base_edits(const AlignmentDB& alignments, + int region_start, + int region_end, + uint32_t alignment_flags){ + std::vector out_variants; + std::string contig = alignments.get_region_contig(); + locusRangeBaseEditCandidate(region_start, + region_end, + alignments, + alignment_flags, + out_variants, + std::ref(contig)); + + return out_variants; } // Given the input set of variants, calculate the variants 
that have a positive score @@ -1048,7 +1085,15 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, if(opt::consensus_mode) { // generate single-base edits that have a positive haplotype score - std::vector single_base_edits = generate_candidate_single_base_edits(alignments, region_start, region_end, alignment_flags); + + std::vector single_base_edits; + if(opt::gpu) { + single_base_edits = generate_candidate_single_base_edits_gpu(alignments, region_start, region_end, + alignment_flags); + } else { + single_base_edits = generate_candidate_single_base_edits(alignments, region_start, region_end, + alignment_flags); + } // insert these into the candidate set candidate_variants.insert(candidate_variants.end(), single_base_edits.begin(), single_base_edits.end()); From 68fb38eb38fadc196ba57f19db1879d37e7228b1 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Tue, 24 Jul 2018 16:58:11 +0100 Subject: [PATCH 50/80] Fewer and bigger streams --- src/cuda_kernels/GpuAligner.cu | 22 ++++++++++++---------- src/cuda_kernels/GpuAligner.h | 4 ++++ src/nanopolish_call_variants.cpp | 13 ++++++------- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 8767a40f..5f2f287c 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -522,13 +522,15 @@ __global__ void getScores(float * const eventData, GpuAligner::GpuAligner() { - int numModelElements = 4096; - int max_num_reads = 5000; - int readsSizeBuffer = max_num_reads * sizeof(int); - int max_n_rows = 100; - int maxBuffer = 500000 * sizeof(float); //TODO: allocate more smartly + size_t numModelElements = 4096; + size_t max_reads_per_worker = LOCI_PER_WORKER * MAX_COVERAGE; + int readsSizeBuffer = max_reads_per_worker * sizeof(int); + int maxBuffer = max_reads_per_worker * MAX_SEQUENCE_LENGTH * sizeof(int); //4MB buffer + + //OLD int max_num_sequences = 8; int max_sequence_length = 50; + int max_n_rows 
= 100; poreModelInitialized = false; @@ -544,12 +546,12 @@ GpuAligner::GpuAligner() CU_CHECK_ERR(cudaMalloc((void**)&logVarDev, readsSizeBuffer)); CU_CHECK_ERR(cudaHostAlloc(&logVarHost, readsSizeBuffer, cudaHostAllocDefault)); - CU_CHECK_ERR(cudaMalloc( (void**)&eventsPerBaseDev, maxBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&eventsPerBaseHost, maxBuffer, cudaHostAllocDefault)); - CU_CHECK_ERR(cudaMalloc( (void**)&readLengthsDev, readsSizeBuffer)); CU_CHECK_ERR(cudaHostAlloc(&readLengthsHost, readsSizeBuffer, cudaHostAllocDefault)); + CU_CHECK_ERR(cudaMalloc( (void**)&eventsPerBaseDev, maxBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&eventsPerBaseHost, maxBuffer, cudaHostAllocDefault)); + // Allocate Device memory for pore model CU_CHECK_ERR(cudaMalloc((void**)&poreModelDev, numModelElements * 3 * sizeof(float))); CU_CHECK_ERR(cudaHostAlloc(&poreModelHost, numModelElements * sizeof(float) * 3, cudaHostAllocDefault)); @@ -605,8 +607,8 @@ GpuAligner::GpuAligner() float * returnValuesDev; float * returnedValues; - CU_CHECK_ERR(cudaMalloc((void**)&returnValuesDev, sizeof(float) * max_num_reads)); //one score per read - CU_CHECK_ERR(cudaHostAlloc(&returnedValues, max_num_reads * sizeof(float) , cudaHostAllocDefault)); + CU_CHECK_ERR(cudaMalloc((void**)&returnValuesDev, sizeof(float) * 50)); //one score per read + CU_CHECK_ERR(cudaHostAlloc(&returnedValues, 59 * sizeof(float) , cudaHostAllocDefault)); CU_CHECK_ERR(cudaMalloc((void**)&kmerRanksDev, max_n_rows * sizeof(int))); kmerRanksDevPointers[i] = kmerRanksDev; diff --git a/src/cuda_kernels/GpuAligner.h b/src/cuda_kernels/GpuAligner.h index d72ce9c2..4169ecd3 100644 --- a/src/cuda_kernels/GpuAligner.h +++ b/src/cuda_kernels/GpuAligner.h @@ -43,6 +43,10 @@ #ifndef GPU_ALIGNER_H #define GPU_ALIGNER_H +#define LOCI_PER_WORKER 64 +#define MAX_COVERAGE 500 +#define MAX_SEQUENCE_LENGTH 100 + //Data to be scored typedef struct { std::vector stateSequences; diff --git a/src/nanopolish_call_variants.cpp 
b/src/nanopolish_call_variants.cpp index fbf033c6..4bffbfb4 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -435,15 +435,14 @@ std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& std::string contig = alignments.get_region_contig(); // Add all positively-scoring single-base changes into the candidate set - size_t num_workers = 16; + size_t num_workers = 4; std::vector gpuAligners(num_workers); //std::vector workerThreads(num_workers); std::vector> handles(num_workers); - int lociPerWorker = 12; int nextLocusBegin = region_start; - int nextLocusEnd = nextLocusBegin + lociPerWorker; + int nextLocusEnd = nextLocusBegin + LOCI_PER_WORKER; bool finished = false; for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { @@ -462,9 +461,9 @@ std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& std::ref(contig), aligner, std::ref(outVariantsMutex)); - if ((nextLocusEnd + lociPerWorker) < region_end){ + if ((nextLocusEnd + LOCI_PER_WORKER) < region_end){ nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = nextLocusBegin + lociPerWorker - 1; + nextLocusEnd = nextLocusBegin + LOCI_PER_WORKER - 1; }else{ nextLocusBegin = nextLocusEnd + 1; nextLocusEnd = region_end; @@ -492,9 +491,9 @@ std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& std::ref(contig), aligner, std::ref(outVariantsMutex)); - if ((nextLocusEnd + lociPerWorker) < region_end){ + if ((nextLocusEnd + LOCI_PER_WORKER) < region_end){ nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = nextLocusBegin + lociPerWorker - 1; + nextLocusEnd = nextLocusBegin + LOCI_PER_WORKER - 1; }else{ nextLocusBegin = nextLocusEnd + 1; nextLocusEnd = region_end; From bb69f2e259e618b2fc5c8a102d63bcffcf97e86d Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Wed, 25 Jul 2018 10:19:10 +0100 Subject: [PATCH 51/80] fixing a memory leak --- src/cuda_kernels/GpuAligner.cu | 502 ++----------------------------- src/nanopolish_call_variants.cpp | 
6 +- 2 files changed, 29 insertions(+), 479 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 5f2f287c..078ef203 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -282,243 +282,6 @@ __global__ void getScoresMod (float * poreModelDev, } } -__global__ void getScores(float * const eventData, - float * const readEventsPerBase, - int * const numRowsPerRead, - int * const eventStarts, - int * const eventStrides, - int * const kmerRanks, - int * const eventOffsets, // Offset to use for getting an event IDX for a specific read (read obtained by block IDX) - float * const poreModelDev, - float * const scaleDev, - float * const shiftDev, - float * const varDev, - float * const logVarDev, - float * const preFlankingDev, - float * const postFlankingDev, - float * returnValues) { - - bool debug = false; - if (threadIdx.x == 0 && blockIdx.x == 0){ - debug = false; - } - - // Initialise the prev probability row, which is the row of the DP table - int n_kmers = blockDim.x; - int n_states = n_kmers * PSR9_NUM_STATES + 2 * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. - - - - __shared__ float returnValue; - returnValue = -INFINITY; - - __shared__ float prevProbabilities[MAX_STATES]; - - // Initialise the previous probabilities - this may not be quite correct as the intialization is different to the C++ version but I don't think it matter - for (int i = 0; i < n_states - PSR9_NUM_STATES; i++) { - prevProbabilities[i] = -INFINITY; - } - for (int i = n_states - PSR9_NUM_STATES; i < n_states; i++) { - prevProbabilities[i] = 0.0f; // Is this correct? - } - - //Step 1: calculate transitions. For now we are going to use external params. - int readIdx = blockIdx.x; - float read_events_per_base = readEventsPerBase[readIdx]; - int numRows = numRowsPerRead[readIdx]; // Number of rows in this DP table. 
- int e_start = eventStarts[readIdx]; // Event start for read - int e_stride = eventStrides[readIdx]; - int e_offset = eventOffsets[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event - - if(debug){ - printf("Kernel 0 >>> Num Kmers is %i\n", n_kmers); - printf("Kernel 0 >>> n_states %i\n", n_states); - printf("Kernel 0 >>> num events in read is %i\n", numRows); - printf("Kernel 0 >>> event offset is is %i\n", e_offset); - } - - bool rc = false; - if (e_stride == -1){ - rc = true; - } - - int kmerIdx = threadIdx.x; - - uint32_t rank = kmerRanks[kmerIdx + (n_kmers * rc)]; - - float pore_mean = poreModelDev[rank * 3]; - float pore_stdv = poreModelDev[rank * 3 + 1]; - float pore_log_level_stdv = poreModelDev[rank * 3 + 2]; - - - float p_stay = 1 - (1 / read_events_per_base); - float p_skip = 0.0025; - float p_bad = 0.001; - float p_bad_self = p_bad; - float p_skip_self = 0.3; - - float p_mk = p_skip; // probability of not observing an event at all - float p_mb = p_bad; // probabilty of observing a bad event - float p_mm_self = p_stay; // probability of observing additional events from this k-mer - float p_mm_next = 1.0f - p_mm_self - p_mk - p_mb; // normal movement from state to state - - // transitions from event split state in previous block - float p_bb = p_bad_self; - float p_bk, p_bm_next, p_bm_self; - p_bk = p_bm_next = p_bm_self = (1.0f - p_bb) / 3; - - // transitions from kmer skip state in previous block - float p_kk = p_skip_self; - float p_km = 1.0f - p_kk; - - // We assign some transition probabilities. 
I believe this is correct and they don't vary by location in the sequence - float lp_mk = logf(p_mk); - float lp_mb = logf(p_mb); - float lp_mm_self = logf(p_mm_self); - float lp_mm_next = logf(p_mm_next); - float lp_bb = logf(p_bb); - float lp_bk = logf(p_bk); - float lp_bm_next = logf(p_bm_next); - float lp_bm_self = logf(p_bm_self); - float lp_kk = logf(p_kk); - float lp_km = logf(p_km); - - float lp_sm, lp_ms; - lp_sm = lp_ms = 0.0f; - - // Start filling out the "DP table" - // Each thread is going to work on an individual P-HMM Block - int curBlockIdx = kmerIdx + 1; // Accounts for fact that we are not working with start block. - int prevBlockIdx = curBlockIdx -1; - int prevBlockOffset = PSR9_NUM_STATES * prevBlockIdx; - int curBlockOffset = PSR9_NUM_STATES * curBlockIdx; - - // the penalty is controlled by the transition probability - float BAD_EVENT_PENALTY = 0.0f; - - float scale = scaleDev[readIdx]; - float shift = shiftDev[readIdx]; - float var = varDev[readIdx]; - float logVar = logVarDev[readIdx]; - - for(int row=1; row>> GpuAligner::scoreKernelMod(std::vector &scoreSets, @@ -808,25 +561,25 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve numReads * sizeof(int), cudaMemcpyHostToDevice, streams[0])); // Reads + Flanks - CU_CHECK_ERR(cudaMemcpyAsync( eventMeansDev, eventMeans, numEventsTotal * sizeof(float), cudaMemcpyHostToDevice, streams[0] )); + CU_CHECK_ERR(cudaMemcpyAsync(eventMeansDev, eventMeans, numEventsTotal * sizeof(float), cudaMemcpyHostToDevice, streams[0] )); - CU_CHECK_ERR(cudaMemcpyAsync( preFlankingDev, preFlankingHost, numEventsTotal * sizeof(float), cudaMemcpyHostToDevice, streams[0] )); + CU_CHECK_ERR(cudaMemcpyAsync(preFlankingDev, preFlankingHost, numEventsTotal * sizeof(float), cudaMemcpyHostToDevice, streams[0] )); - CU_CHECK_ERR(cudaMemcpyAsync( postFlankingDev, postFlankingHost, numEventsTotal * sizeof(float), cudaMemcpyHostToDevice, streams[0] )); + CU_CHECK_ERR(cudaMemcpyAsync(postFlankingDev, postFlankingHost, 
numEventsTotal * sizeof(float), cudaMemcpyHostToDevice, streams[0] )); // Sequence statistics - CU_CHECK_ERR(cudaMemcpyAsync( sequenceLengthsDev, sequenceLengthsHost, numSequences * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); + CU_CHECK_ERR(cudaMemcpyAsync(sequenceLengthsDev, sequenceLengthsHost, numSequences * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); // Sequence offsets - CU_CHECK_ERR(cudaMemcpyAsync( sequenceOffsetsDev, sequenceOffsetsHost, numSequences * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); + CU_CHECK_ERR(cudaMemcpyAsync(sequenceOffsetsDev, sequenceOffsetsHost, numSequences * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); // Sequences - CU_CHECK_ERR(cudaMemcpyAsync( kmerRanksDev, kmerRanks, kmerOffset * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); + CU_CHECK_ERR(cudaMemcpyAsync(kmerRanksDev, kmerRanks, kmerOffset * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); // Job details - CU_CHECK_ERR(cudaMemcpyAsync( seqIdxDev, seqIdxHost, globalScoreIdx * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); - CU_CHECK_ERR(cudaMemcpyAsync( readIdxDev, readIdxHost, globalScoreIdx * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); + CU_CHECK_ERR(cudaMemcpyAsync(seqIdxDev, seqIdxHost, globalScoreIdx * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); + CU_CHECK_ERR(cudaMemcpyAsync(readIdxDev, readIdxHost, globalScoreIdx * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); // Launch Kernels @@ -890,207 +643,6 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve return result; } -std::vector> GpuAligner::scoreKernel(std::vector sequences, - std::vector event_sequences, - uint32_t alignment_flags){ - // pre-running asserts - assert(!sequences.empty()); - assert(!event_sequences.empty()); - assert(std::string(sequences[0].get_alphabet()->get_name()) == "nucleotide"); - for (auto e: event_sequences) { - assert(std::string(e.pore_model->pmalphabet->get_name()) == "nucleotide"); - assert(e.read->pore_type == PT_R9); - assert( 
(e.rc && e.event_stride == -1) || (!e.rc && e.event_stride == 1)); - } - - int num_reads = event_sequences.size(); - - const uint32_t k = event_sequences[0].pore_model->k; //k is the length of a kmer - - std::vector n_rows; //number of rows in the DP table (n_events) for each read - std::vector e_starts; //event starts in the read for each read - std::vector event_strides; //event strides for each read - std::vector> pre_flanks; - std::vector> post_flanks; - std::vector eventsPerBase; - - //Populate per-read vectors - int numEventsTotal = 0; - for(auto e: event_sequences){ - uint32_t e_start = e.event_start_idx; - e_starts.push_back(e_start); - - uint32_t e_stride = e.event_stride; - event_strides.push_back(e_stride); - - uint32_t e_end = e.event_stop_idx; - uint32_t n_events = 0; - if(e_end > e_start) - n_events = e_end - e_start + 1; - else - n_events = e_start - e_end + 1; - - n_rows.push_back(n_events); - numEventsTotal += n_events; - - std::vector pre_flank = make_pre_flanking(e, e_start, n_events); - std::vector post_flank = make_post_flanking(e, e_start, n_events); - - pre_flanks.push_back(pre_flank); - post_flanks.push_back(post_flank); - - float readEventsPerBase = e.read->events_per_base[e.strand]; - eventsPerBase.push_back(readEventsPerBase); - } - - //Populate buffers for flanks and scaled means data - std::vector eventOffsets; - size_t offset = 0; - for(int j=0; jget_drift_scaled_level(event_idx, e.strand); // send the data in drift scaled - eventMeans[offset + i] = scaled; - preFlankingHost[offset + i] = pre_flanks[j][i]; //also copy over the pre-flanking data, since it has a 1-1 correspondence with events - postFlankingHost[offset + i] = post_flanks[j][i]; //also copy over the pre-flanking data, since it has a 1-1 correspondence with events - } - offset += num_events; - } - - int num_states = event_sequences[0].pore_model->states.size(); - //Populating read-statistics buffers - std::vector scale(num_reads); - std::vector shift(num_reads); - 
std::vector var(num_reads); - std::vector log_var(num_reads); - for (int i=0;iscalings[read.strand].scale; - shift[i] = event_sequences[i].read->scalings[read.strand].shift; - var[i] = event_sequences[i].read->scalings[read.strand].var; - log_var[i] = event_sequences[i].read->scalings[read.strand].log_var; - } - - // Copy to the device all buffers shared across kmer sequences. - CU_CHECK_ERR(cudaMemcpyAsync( scaleDev, scale.data(), scale.size() * sizeof(float), cudaMemcpyHostToDevice, streams[0] )); - CU_CHECK_ERR(cudaMemcpyAsync( shiftDev, shift.data(), shift.size() * sizeof(float), cudaMemcpyHostToDevice, streams[0])); - CU_CHECK_ERR(cudaMemcpyAsync( varDev, var.data(), var.size() * sizeof(float), cudaMemcpyHostToDevice, streams[0])); - CU_CHECK_ERR(cudaMemcpyAsync( logVarDev, log_var.data(), log_var.size() * sizeof(float), cudaMemcpyHostToDevice, streams[0] )); - CU_CHECK_ERR(cudaMemcpyAsync( eventsPerBaseDev, eventsPerBase.data(), eventsPerBase.size() * sizeof(float), cudaMemcpyHostToDevice, streams[0])); - CU_CHECK_ERR(cudaMemcpyAsync( eventMeansDev, eventMeans, numEventsTotal * sizeof(float), cudaMemcpyHostToDevice, streams[0] )); - CU_CHECK_ERR(cudaMemcpyAsync( preFlankingDev, preFlankingHost, numEventsTotal * sizeof(float), cudaMemcpyHostToDevice, streams[0] )); - CU_CHECK_ERR(cudaMemcpyAsync( postFlankingDev, postFlankingHost, numEventsTotal * sizeof(float), cudaMemcpyHostToDevice, streams[0] )); - CU_CHECK_ERR(cudaMemcpyAsync( numRowsDev, n_rows.data(), n_rows.size() * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); - CU_CHECK_ERR(cudaMemcpyAsync( eventStartsDev, e_starts.data(), e_starts.size() * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); - CU_CHECK_ERR(cudaMemcpyAsync( eventStridesDev, event_strides.data(), event_strides.size() * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); - CU_CHECK_ERR(cudaMemcpyAsync( eventOffsetsDev, eventOffsets.data(), eventOffsets.size() * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); - - // Populate 
pore model buffers - // Assume that every event sequence has the same pore model - if (poreModelInitialized == false) { - int poreModelEntriesPerState = 3; - for(int st=0; ststates[st]; - poreModelHost[st * poreModelEntriesPerState] = params.level_mean; - poreModelHost[st * poreModelEntriesPerState + 1] = params.level_stdv; - poreModelHost[st * poreModelEntriesPerState + 2] = params.level_log_stdv; - } - // copy over the pore model - CU_CHECK_ERR(cudaMemcpyAsync(poreModelDev, poreModelHost, - poreModelEntriesPerState * 4096 * sizeof(float), cudaMemcpyHostToDevice, streams[0])); // TODO don't hardcode num kmers - poreModelInitialized = true; - } - - //Let's populate a host buffer with all the sequences. - size_t numKmers = 0; - for (auto sequence: sequences) { - numKmers += (sequence.length() - k + 1); - } - - size_t kmerOffset = 0; - for (int i = 0; i>> (eventMeansDev, - eventsPerBaseDev, - numRowsDev, - eventStartsDev, - eventStridesDev, - kmerRanksDevPtr, - eventOffsetsDev, - poreModelDev, - scaleDev, - shiftDev, - varDev, - logVarDev, - preFlankingDev, - postFlankingDev, - returnValuesDev); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - printf("Errors during kernel execution: %s\n", cudaGetErrorString(err)); - - } - for (int i = 0; i<8;i++) { - cudaMemcpyAsync(returnValuesHostResultsPointers[i], returnValuesDevResultsPointers[i], - num_reads * sizeof(float), cudaMemcpyDeviceToHost, streams[i]); - } - std::vector> results(sequences.size()); - for (size_t i =0; i GpuAligner::variantScoresThresholded(std::vector> input_variants_vector, std::vector base_haplotypes, diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 4bffbfb4..47cc2272 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -38,7 +38,6 @@ #include "profiler.h" #include "progress.h" #include "stdaln.h" -#include #include #include #include @@ -410,7 +409,6 @@ void locusRangeBaseEditCandidate(int start, auto 
test_haplotype = haplotypes[haplotypeIDX]; auto event_sequences = event_sequences_vector[haplotypeIDX]; for (const Variant &v : variants) { - auto t0 = std::chrono::high_resolution_clock::now(); Variant scored_variant = score_variant_thresholded(v, test_haplotype, event_sequences, @@ -435,7 +433,7 @@ std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& std::string contig = alignments.get_region_contig(); // Add all positively-scoring single-base changes into the candidate set - size_t num_workers = 4; + size_t num_workers = 8; std::vector gpuAligners(num_workers); //std::vector workerThreads(num_workers); @@ -474,7 +472,7 @@ std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& //Round robin the workers until done while (!finished) { for (int i = 0; i < num_workers; i++) { - auto status = handles[i].wait_for(std::chrono::microseconds(100)); + auto status = handles[i].wait_for(std::chrono::microseconds(0)); if (status == std::future_status::ready && (!finished)) { if (nextLocusEnd == region_end){ finished = true; From 2b14a68e21d29de2d53fbab77ede27761ea63f29 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Wed, 25 Jul 2018 16:15:33 +0100 Subject: [PATCH 52/80] 40x coverage --- src/cuda_kernels/GpuAligner.cu | 2 ++ src/cuda_kernels/GpuAligner.h | 5 +---- src/nanopolish_call_variants.cpp | 13 +++++++++---- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 078ef203..aedc4592 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -660,6 +660,8 @@ std::vector GpuAligner::variantScoresThresholded(std::vector out_variants = input_variants; diff --git a/src/cuda_kernels/GpuAligner.h b/src/cuda_kernels/GpuAligner.h index 4169ecd3..d7b8a826 100644 --- a/src/cuda_kernels/GpuAligner.h +++ b/src/cuda_kernels/GpuAligner.h @@ -1,6 +1,3 @@ -// -// Created by mike on 05/06/18. 
-// #include #include "nanopolish_variant.h" #include @@ -43,7 +40,7 @@ #ifndef GPU_ALIGNER_H #define GPU_ALIGNER_H -#define LOCI_PER_WORKER 64 +#define LOCI_PER_WORKER 32 #define MAX_COVERAGE 500 #define MAX_SEQUENCE_LENGTH 100 diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 47cc2272..5c67d82d 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -132,7 +132,8 @@ namespace opt static int min_flanking_sequence = 30; static int max_haplotypes = 1000; static int max_rounds = 50; - static int screen_score_threshold = 1000; + static int screen_score_threshold = 100; + static int max_coverage_gpu = 40; static int screen_flanking_sequence = 10; static int debug_alignments = 0; static std::vector methylation_types; @@ -433,7 +434,7 @@ std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& std::string contig = alignments.get_region_contig(); // Add all positively-scoring single-base changes into the candidate set - size_t num_workers = 8; + size_t num_workers = opt::num_threads; std::vector gpuAligners(num_workers); //std::vector workerThreads(num_workers); @@ -1085,10 +1086,14 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, std::vector single_base_edits; if(opt::gpu) { - single_base_edits = generate_candidate_single_base_edits_gpu(alignments, region_start, region_end, + single_base_edits = generate_candidate_single_base_edits_gpu(alignments, + region_start, + region_end, alignment_flags); } else { - single_base_edits = generate_candidate_single_base_edits(alignments, region_start, region_end, + single_base_edits = generate_candidate_single_base_edits(alignments, + region_start, + region_end, alignment_flags); } // insert these into the candidate set From f4d53cc3e7dea1ee7faf2d17343de0c360a1c456 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 26 Jul 2018 10:10:29 +0100 Subject: [PATCH 53/80] added max coverage --- src/cuda_kernels/GpuAligner.cu 
| 10 ++++++---- src/cuda_kernels/GpuAligner.h | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index aedc4592..c283bc93 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -286,13 +286,13 @@ __global__ void getScoresMod (float * poreModelDev, GpuAligner::GpuAligner() { size_t numModelElements = 4096; - size_t max_reads_per_worker = LOCI_PER_WORKER * MAX_COVERAGE; + size_t max_reads_per_worker = LOCI_PER_WORKER * MAX_COVERAGE * MAX_NUM_VARIANTS_PER_LOCUS; int readsSizeBuffer = max_reads_per_worker * sizeof(int); - int maxBuffer = max_reads_per_worker * MAX_SEQUENCE_LENGTH * sizeof(int); //4MB buffer + int maxBuffer = max_reads_per_worker * MAX_SEQUENCE_LENGTH * sizeof(int); //OLD int max_num_sequences = 1; //TODO can get rid of this - int max_sequence_length = 50; + int max_sequence_length = 100; int max_n_rows = 100; poreModelInitialized = false; @@ -660,7 +660,9 @@ std::vector GpuAligner::variantScoresThresholded(std::vector MAX_COVERAGE) { + event_sequences.resize(MAX_COVERAGE); + } int numVariants = input_variants.size(); diff --git a/src/cuda_kernels/GpuAligner.h b/src/cuda_kernels/GpuAligner.h index d7b8a826..6e94fc62 100644 --- a/src/cuda_kernels/GpuAligner.h +++ b/src/cuda_kernels/GpuAligner.h @@ -41,8 +41,9 @@ #define GPU_ALIGNER_H #define LOCI_PER_WORKER 32 -#define MAX_COVERAGE 500 +#define MAX_COVERAGE 400 #define MAX_SEQUENCE_LENGTH 100 +#define MAX_NUM_VARIANTS_PER_LOCUS 10 //Data to be scored typedef struct { From cf5be6aff9c1e1b80c7864342e359829a978bc90 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 26 Jul 2018 15:30:10 +0100 Subject: [PATCH 54/80] Finding good max coverage to use --- src/cuda_kernels/GpuAligner.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.h b/src/cuda_kernels/GpuAligner.h index 6e94fc62..94989125 100644 --- a/src/cuda_kernels/GpuAligner.h +++ 
b/src/cuda_kernels/GpuAligner.h @@ -41,8 +41,8 @@ #define GPU_ALIGNER_H #define LOCI_PER_WORKER 32 -#define MAX_COVERAGE 400 -#define MAX_SEQUENCE_LENGTH 100 +#define MAX_COVERAGE 300 +#define MAX_SEQUENCE_LENGTH 50 #define MAX_NUM_VARIANTS_PER_LOCUS 10 //Data to be scored From 6e22a85080690bbef4eca1066994763887281982 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 26 Jul 2018 16:52:55 +0100 Subject: [PATCH 55/80] Performance tuning for V100 --- src/cuda_kernels/GpuAligner.h | 1 + src/nanopolish_call_variants.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cuda_kernels/GpuAligner.h b/src/cuda_kernels/GpuAligner.h index 94989125..731f2ed9 100644 --- a/src/cuda_kernels/GpuAligner.h +++ b/src/cuda_kernels/GpuAligner.h @@ -44,6 +44,7 @@ #define MAX_COVERAGE 300 #define MAX_SEQUENCE_LENGTH 50 #define MAX_NUM_VARIANTS_PER_LOCUS 10 +#define MAX_NUM_WORKERS 16 //Data to be scored typedef struct { diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 5c67d82d..3dff1b49 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -434,7 +434,7 @@ std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& std::string contig = alignments.get_region_contig(); // Add all positively-scoring single-base changes into the candidate set - size_t num_workers = opt::num_threads; + size_t num_workers = (opt::num_threads < MAX_NUM_WORKERS) ? 
opt::num_threads : MAX_NUM_WORKERS; std::vector gpuAligners(num_workers); //std::vector workerThreads(num_workers); From e2a35252b92e9e1e8fb8e58b7146e2874130e1a0 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Fri, 27 Jul 2018 11:28:32 +0100 Subject: [PATCH 56/80] set sleep to 100us --- src/nanopolish_call_variants.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 3dff1b49..5d614867 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -473,7 +473,7 @@ std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& //Round robin the workers until done while (!finished) { for (int i = 0; i < num_workers; i++) { - auto status = handles[i].wait_for(std::chrono::microseconds(0)); + auto status = handles[i].wait_for(std::chrono::microseconds(100)); if (status == std::future_status::ready && (!finished)) { if (nextLocusEnd == region_end){ finished = true; From 1adf4b8ad25c79ea9433af7bd1bbe2fc5337503d Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 2 Aug 2018 15:33:05 +0100 Subject: [PATCH 57/80] Merged upstream master --- README.md | 11 ++++-- src/common/nanopolish_common.h | 2 +- src/common/nanopolish_variant.cpp | 11 +++++- src/common/nanopolish_variant.h | 17 ++++++-- src/main/nanopolish.cpp | 4 +- src/nanopolish_call_variants.cpp | 65 +++++++++++++++++++------------ 6 files changed, 74 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index 9609577b..74c9c23b 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,10 @@ nanopolish variants --consensus polished_gpu.fa -w "tig00000001:200000-230000" - Software package for signal-level analysis of Oxford Nanopore sequencing data. Nanopolish can calculate an improved consensus sequence for a draft genome assembly, detect base modifications, call SNPs and indels with respect to a reference genome and more (see Nanopolish modules, below). 
+## Release notes + +* 0.10.1: `nanopolish variants --consensus` now only outputs a VCF file instead of a fasta sequence. The VCF file describes the changes that need to be made to turn the draft sequence into the polished assembly. A new program, `nanopolish vcf2fasta`, is provided to generate the polished genome (this replaces `nanopolish_merge.py`, see usage instructions below). This change is to avoid issues when merging segments that end on repeat boundaries (reported by Michael Wykes and Chris Wright). + ## Dependencies A compiler that supports C++11 is needed to build nanopolish. Development of the code is performed using [gcc-4.8](https://gcc.gnu.org/gcc-4.8/). @@ -43,7 +47,7 @@ When major features have been added or bugs fixed, we will tag and release a new ``` git clone --recursive https://github.com/jts/nanopolish.git cd nanopolish -git checkout v0.7.1 +git checkout v0.9.2 make ``` @@ -52,7 +56,6 @@ make The main subprograms of nanopolish are: ``` -nanopolish extract: extract reads in FASTA or FASTQ format from a directory of FAST5 files nanopolish call-methylation: predict genomic bases that may be methylated nanopolish variants: detect SNPs and indels with respect to a reference genome nanopolish variants --consensus: calculate an improved consensus sequence for a draft genome assembly @@ -89,7 +92,7 @@ Now, we use nanopolish to compute the consensus sequence (the genome is polished ``` python nanopolish_makerange.py draft.fa | parallel --results nanopolish.results -P 8 \ - nanopolish variants --consensus polished.{1}.fa -w {1} -r reads.fa -b reads.sorted.bam -g draft.fa -t 4 --min-candidate-frequency 0.1 + nanopolish variants --consensus -o polished.{1}.vcf -w {1} -r reads.fa -b reads.sorted.bam -g draft.fa -t 4 --min-candidate-frequency 0.1 ``` This command will run the consensus algorithm on eight 50kbp segments of the genome at a time, using 4 threads each. 
Change the ```-P``` and ```--threads``` options as appropriate for the machines you have available. @@ -97,7 +100,7 @@ This command will run the consensus algorithm on eight 50kbp segments of the gen After all polishing jobs are complete, you can merge the individual 50kb segments together back into the final assembly: ``` -python nanopolish_merge.py polished.*.fa > polished_genome.fa +nanopolish vcf2fasta -g draft.fa polished.*.vcf > polished_genome.fa ``` ## Calling Methylation diff --git a/src/common/nanopolish_common.h b/src/common/nanopolish_common.h index 887a5e67..d287ac09 100644 --- a/src/common/nanopolish_common.h +++ b/src/common/nanopolish_common.h @@ -18,7 +18,7 @@ #include "logsum.h" #define PACKAGE_NAME "nanopolish" -#define PACKAGE_VERSION "0.9.2" +#define PACKAGE_VERSION "0.10.1" #define PACKAGE_BUGREPORT "https://github.com/jts/nanopolish/issues" // diff --git a/src/common/nanopolish_variant.cpp b/src/common/nanopolish_variant.cpp index b73a6b2b..902756f3 100644 --- a/src/common/nanopolish_variant.cpp +++ b/src/common/nanopolish_variant.cpp @@ -18,6 +18,13 @@ //#define DEBUG_HAPLOTYPE_SELECTION 1 +std::string Variant::make_vcf_header_key_value(const std::string& key, const std::string& value) +{ + std::stringstream ss; + ss << "##" << key << "=" << value; + return ss.str(); +} + std::string Variant::make_vcf_tag_string(const std::string& tag, const std::string& id, int count, @@ -31,11 +38,11 @@ std::string Variant::make_vcf_tag_string(const std::string& tag, } void Variant::write_vcf_header(FILE* fp, - const std::vector& tag_lines) + const std::vector& header_lines) { fprintf(fp, "##fileformat=VCFv4.2\n"); - for(const std::string& line : tag_lines) { + for(const std::string& line : header_lines) { fprintf(fp, "%s\n", line.c_str()); } fprintf(fp, "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample\n"); diff --git a/src/common/nanopolish_variant.h b/src/common/nanopolish_variant.h index f3f350ba..c9ef3b73 100644 --- 
a/src/common/nanopolish_variant.h +++ b/src/common/nanopolish_variant.h @@ -21,7 +21,9 @@ class AlignmentDB; struct Variant { static void write_vcf_header(FILE* fp, - const std::vector& tag_lines = std::vector()); + const std::vector& header_lines = std::vector()); + + static std::string make_vcf_header_key_value(const std::string& key, const std::string& value); static std::string make_vcf_tag_string(const std::string& tag, const std::string& id, @@ -43,8 +45,8 @@ struct Variant void write_vcf(FILE* fp) const { assert(fp != NULL); - const char* gt_def = genotype.empty() ? NULL : "GT"; - const char* gt_str = genotype.empty() ? NULL : genotype.c_str(); + const char* gt_def = "GT"; + const char* gt_str = genotype.empty() ? "." : genotype.c_str(); fprintf(fp, "%s\t%zu\t%s\t", ref_name.c_str(), ref_position + 1, "."); fprintf(fp, "%s\t%s\t%.1lf\t", ref_seq.c_str(), alt_seq.c_str(), quality); @@ -116,6 +118,15 @@ class VariantKeyComp } }; +class VariantKeyEqualityComp +{ + public: + inline bool operator()(const Variant& a, const Variant& b) + { + return a.key() == b.key(); + } +}; + // Read a collection of variants from a VCF file std::vector read_variants_from_file(const std::string& filename); std::vector read_variants_for_region(const std::string& filename, diff --git a/src/main/nanopolish.cpp b/src/main/nanopolish.cpp index d25df269..459a3e2e 100644 --- a/src/main/nanopolish.cpp +++ b/src/main/nanopolish.cpp @@ -18,6 +18,7 @@ #include "nanopolish_call_methylation.h" #include "nanopolish_scorereads.h" #include "nanopolish_phase_reads.h" +#include "nanopolish_vcf2fasta.h" #include "nanopolish_train_poremodel_from_basecalls.h" int print_usage(int argc, char **argv); @@ -34,7 +35,8 @@ static std::map< std::string, std::function > programs = { {"variants", call_variants_main}, {"methyltrain", methyltrain_main}, {"scorereads", scorereads_main} , - {"phase-reads", phase_reads_main} , + {"phase-reads", phase_reads_main} , + {"vcf2fasta", vcf2fasta_main} , 
{"call-methylation", call_methylation_main} }; diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 5d614867..1d9cf081 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -79,7 +79,7 @@ static const char *CONSENSUS_USAGE_MESSAGE = " --version display version\n" " --help display this help and exit\n" " --snps only call SNPs\n" -" --consensus=FILE run in consensus calling mode and write polished sequence to FILE\n" +" --consensus run in consensus calling mode\n" " --fix-homopolymers run the experimental homopolymer caller\n" " --faster minimize compute time while slightly reducing consensus accuracy\n" " -w, --window=STR find variants in window STR (format: :-)\n" @@ -188,7 +188,7 @@ static const struct option longopts[] = { { "p-skip-self", required_argument, NULL, OPT_P_SKIP_SELF }, { "p-bad", required_argument, NULL, OPT_P_BAD }, { "p-bad-self", required_argument, NULL, OPT_P_BAD_SELF }, - { "consensus", required_argument, NULL, OPT_CONSENSUS }, + { "consensus", no_argument, NULL, OPT_CONSENSUS }, { "gpu", required_argument, NULL, OPT_GPU }, { "faster", no_argument, NULL, OPT_FASTER }, { "fix-homopolymers", no_argument, NULL, OPT_FIX_HOMOPOLYMERS }, @@ -889,6 +889,11 @@ Haplotype fix_homopolymers(const Haplotype& input_haplotype, int size_diff = call - hp_length; std::string contig = fixed_haplotype.get_reference_name(); + Variant v; + v.ref_name = contig; + v.add_info("TotalReads", event_sequences.size()); + v.add_info("AlleleCount", 1); + if(size_diff > 0) { // add a 1bp insertion in this region // the variant might conflict with other variants in the region @@ -896,12 +901,12 @@ Haplotype fix_homopolymers(const Haplotype& input_haplotype, // NB: it is intended that if the call is a 2bp (or greater) insertion // we only insert 1bp (for now) for(size_t k = hap_hp_start; k <= hap_hp_end; ++k) { - Variant v; - v.ref_name = contig; v.ref_position = 
input_haplotype.get_reference_position_for_haplotype_base(k); + if(v.ref_position == std::string::npos) { continue; } + v.ref_seq = fixed_haplotype.substr_by_reference(v.ref_position, v.ref_position).get_sequence(); if(v.ref_seq.size() == 1 && v.ref_seq[0] == hp_base) { v.alt_seq = v.ref_seq + hp_base; @@ -916,10 +921,9 @@ Haplotype fix_homopolymers(const Haplotype& input_haplotype, } else if(size_diff < 0) { // add a 1bp deletion at this position for(size_t k = hap_hp_start; k <= hap_hp_end; ++k) { - Variant v; - v.ref_name = contig; v.ref_position = input_haplotype.get_reference_position_for_haplotype_base(k); v.quality = score; + if(v.ref_position == std::string::npos) { continue; } @@ -1035,14 +1039,12 @@ Haplotype call_haplotype_from_candidates(const AlignmentDB& alignments, // Apply them to the final haplotype for(size_t vi = 0; vi < called_variants.size(); vi++) { derived_haplotype.apply_variant(called_variants[vi]); - called_variants[vi].write_vcf(vcf_out); } } } return derived_haplotype; } - Haplotype call_variants_for_region(const std::string& contig, int region_start, int region_end, FILE* out_fp) { const int BUFFER = opt::min_flanking_sequence + 10; @@ -1167,13 +1169,6 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, called_haplotype = fix_homopolymers(called_haplotype, alignments); } - // write consensus result - FILE* consensus_fp = fopen(opt::consensus_output.c_str(), "w"); - fprintf(consensus_fp, ">%s:%d-%d\n%s\n", contig.c_str(), - alignments.get_region_start(), - alignments.get_region_end(), - called_haplotype.get_sequence().c_str()); - fclose(consensus_fp); } else { // // Calling strategy in reference-based variant calling mode @@ -1210,7 +1205,7 @@ void parse_call_variants_options(int argc, char** argv) case '?': die = true; break; case 't': arg >> opt::num_threads; break; case 'v': opt::verbose++; break; - case OPT_CONSENSUS: arg >> opt::consensus_output; opt::consensus_mode = 1; break; + case OPT_CONSENSUS: 
opt::consensus_mode = 1; break; case OPT_GPU: opt::gpu = 1; break; case OPT_FIX_HOMOPOLYMERS: opt::fix_homopolymers = 1; break; case OPT_EFFORT: arg >> opt::screen_score_threshold; break; @@ -1342,43 +1337,63 @@ int call_variants_main(int argc, char** argv) } // Build the VCF header - std::vector tag_fields; + std::vector header_fields; + + std::stringstream polish_window; + polish_window << contig << ":" << start_base << "-" << end_base; + header_fields.push_back(Variant::make_vcf_header_key_value("nanopolish_window", polish_window.str())); // - tag_fields.push_back( + header_fields.push_back( Variant::make_vcf_tag_string("INFO", "TotalReads", 1, "Integer", "The number of event-space reads used to call the variant")); - tag_fields.push_back( + header_fields.push_back( Variant::make_vcf_tag_string("INFO", "SupportFraction", 1, "Float", "The fraction of event-space reads that support the variant")); - tag_fields.push_back( + header_fields.push_back( Variant::make_vcf_tag_string("INFO", "BaseCalledReadsWithVariant", 1, "Integer", "The number of base-space reads that support the variant")); - tag_fields.push_back( + header_fields.push_back( Variant::make_vcf_tag_string("INFO", "BaseCalledFraction", 1, "Float", "The fraction of base-space reads that support the variant")); - tag_fields.push_back( + header_fields.push_back( Variant::make_vcf_tag_string("INFO", "AlleleCount", 1, "Integer", "The inferred number of copies of the allele")); if(opt::calculate_all_support) { - tag_fields.push_back( + header_fields.push_back( Variant::make_vcf_tag_string("INFO", "SupportFractionByBase", 4, "Integer", "The fraction of reads supporting A,C,G,T at this position")); } - tag_fields.push_back( + header_fields.push_back( Variant::make_vcf_tag_string("FORMAT", "GT", 1, "String", "Genotype")); - Variant::write_vcf_header(out_fp, tag_fields); + Variant::write_vcf_header(out_fp, header_fields); Haplotype haplotype = call_variants_for_region(contig, start_base, end_base, out_fp); + // 
write the consensus result as a fasta file if requested + if(!opt::consensus_output.empty()) { + FILE* consensus_fp = fopen(opt::consensus_output.c_str(), "w"); + fprintf(consensus_fp, ">%s:%d-%d\n%s\n", contig.c_str(), + start_base, + end_base, + haplotype.get_sequence().c_str()); + fclose(consensus_fp); + } + + // write the variants + for(const auto& v : haplotype.get_variants()) { + v.write_vcf(out_fp); + } + + // if(out_fp != stdout) { fclose(out_fp); } From 2856bbb15a790c1d5b862810cd18da869ca4bbb8 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Wed, 15 Aug 2018 15:07:48 +0100 Subject: [PATCH 58/80] Adding files for VCF handling which for some reason are absent --- src/nanopolish_vcf2fasta.cpp | 274 +++++++++++++++++++++++++++++++++++ src/nanopolish_vcf2fasta.h | 14 ++ 2 files changed, 288 insertions(+) create mode 100644 src/nanopolish_vcf2fasta.cpp create mode 100644 src/nanopolish_vcf2fasta.h diff --git a/src/nanopolish_vcf2fasta.cpp b/src/nanopolish_vcf2fasta.cpp new file mode 100644 index 00000000..93187985 --- /dev/null +++ b/src/nanopolish_vcf2fasta.cpp @@ -0,0 +1,274 @@ +//--------------------------------------------------------- +// Copyright 2018 Ontario Institute for Cancer Research +// Written by Jared Simpson (jared.simpson@oicr.on.ca) +//--------------------------------------------------------- +// +// nanopolish_vcf2fasta - write a new genome sequence +// by introducing variants from a set of vcf files +// +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "htslib/faidx.h" +#include "nanopolish_common.h" +#include "nanopolish_variant.h" +#include "nanopolish_eventalign.h" +#include "nanopolish_haplotype.h" + +// +// Getopt +// +#define SUBPROGRAM "vcf2fasta" + +static const char *VCF2FASTA_VERSION_MESSAGE = +SUBPROGRAM " Version " PACKAGE_VERSION "\n" +"Written by Jared Simpson.\n" +"\n" +"Copyright 2018 Ontario Institute for 
Cancer Research\n"; + +static const char *VCF2FASTA_USAGE_MESSAGE = +"Usage: " PACKAGE_NAME " " SUBPROGRAM " -g draft.fa segment1.vcf segment2.vcf ...\n" +"Write a new genome sequence by introducing variants from the input files\n" +"\n" +" -v, --verbose display verbose output\n" +" --version display version\n" +" --help display this help and exit\n" +" -g, --genome=FILE the input genome is in FILE\n" +"\nReport bugs to " PACKAGE_BUGREPORT "\n\n"; + +namespace opt +{ + static unsigned int verbose; + static std::vector input_vcf_files; + static std::string genome_file; +} + +static const char* shortopts = "g:v"; + +enum { OPT_HELP = 1, OPT_VERSION }; + +static const struct option longopts[] = { + { "verbose", no_argument, NULL, 'v' }, + { "help", no_argument, NULL, OPT_HELP }, + { "version", no_argument, NULL, OPT_VERSION }, + { "genome", required_argument, NULL, 'g' }, + { NULL, 0, NULL, 0 } +}; + +void parse_vcf2fasta_options(int argc, char** argv) +{ + bool die = false; + for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { + std::istringstream arg(optarg != NULL ? 
optarg : ""); + switch (c) { + case '?': die = true; break; + case 'v': opt::verbose++; break; + case 'g': arg >> opt::genome_file; break; + case OPT_HELP: + std::cout << VCF2FASTA_USAGE_MESSAGE; + exit(EXIT_SUCCESS); + case OPT_VERSION: + std::cout << VCF2FASTA_VERSION_MESSAGE; + exit(EXIT_SUCCESS); + } + } + + if(opt::genome_file.empty()) { + std::cerr << SUBPROGRAM ": -g/--genome file is required\n"; + die = true; + } + + if (argc - optind < 1) { + std::cerr << SUBPROGRAM ": not enough arguments\n"; + die = true; + } + + if (die) + { + std::cout << "\n" << VCF2FASTA_USAGE_MESSAGE; + exit(EXIT_FAILURE); + } + + for(; optind < argc; ++optind) { + opt::input_vcf_files.push_back(argv[optind]); + } +} + +int vcf2fasta_main(int argc, char** argv) +{ + parse_vcf2fasta_options(argc, argv); + + // Read genome file + faidx_t *fai = fai_load(opt::genome_file.c_str()); + + // Read VCF files and gather variants for each contig and the polishing window coordinates + std::map> variants_by_contig; + std::map>> windows_by_contig; + + for(const auto& filename : opt::input_vcf_files) { + + std::string window_str; + std::vector out; + std::ifstream infile(filename); + std::string line; + while(getline(infile, line)) { + + // parse header + if(line[0] == '#') { + + // check for window coordinates + if(line.find("nanopolish_window") != std::string::npos) { + std::vector fields = split(line, '='); + assert(fields.size() == 2); + window_str = fields[1]; + } + } else { + Variant v(line); + variants_by_contig[v.ref_name].push_back(v); + } + } + + if(window_str.empty()) { + fprintf(stderr, "error: could not detect polishing window from input file %s\n", filename.c_str()); + exit(EXIT_FAILURE); + } + + std::string window_contig; + int window_start, window_end; + parse_region_string(window_str, window_contig, window_start, window_end); + windows_by_contig[window_contig].push_back(std::make_pair(window_start, window_end)); + } + + size_t n_contigs = faidx_nseq(fai); + + for(size_t contig_idx 
= 0; contig_idx < n_contigs; ++contig_idx) { + std::string contig = faidx_iseq(fai, contig_idx); + int contig_length = faidx_seq_len(fai, contig.c_str()); + + // Confirm that all windows on this contig have been polished + bool window_check_ok = true; + auto& windows = windows_by_contig[contig]; + + std::sort(windows.begin(), windows.end()); + if(windows[0].first != 0) { + fprintf(stderr, "error: first %d bases are not covered by a polished window for contig %s.\n", windows[0].first, contig.c_str()); + window_check_ok = false; + } + + for(size_t window_idx = 1; window_idx < windows.size(); ++window_idx) { + int prev_start = windows[window_idx - 1].first; + int prev_end = windows[window_idx - 1].second; + int curr_start = windows[window_idx].first; + int curr_end = windows[window_idx].second; + if(curr_start > prev_end) { + fprintf(stderr, "error: adjacent polishing windows do not overlap (%d-%d and %d-%d)\n", prev_start, prev_end, curr_start, curr_end); + window_check_ok = false; + } + } + + int end_gap = contig_length - windows.back().second; + if(end_gap > 500) { + fprintf(stderr, "error: last %d bases are not covered by a polished window for contig %s.\n", end_gap, contig.c_str()); + window_check_ok = false; + } + + if(!window_check_ok) { + fprintf(stderr, "error: one or more polishing windows are missing. 
Please check that all nanopolish variants --consensus jobs ran to completion\n"); + exit(EXIT_FAILURE); + } + + int length; + char* seq = fai_fetch(fai, contig.c_str(), &length); + if(length < 0) { + fprintf(stderr, "error: could not fetch contig %s\n", contig.c_str()); + exit(EXIT_FAILURE); + } + + auto& variants = variants_by_contig[contig]; + std::sort(variants.begin(), variants.end(), sortByPosition); + + // remove duplicate variants + VariantKeyEqualityComp vkec; + auto last = std::unique(variants.begin(), variants.end(), vkec); + variants.erase(last, variants.end()); + + assert(variants.size() < (1 << 30)); + uint32_t deleted_tag = 1 << 30; + uint32_t variant_tag = 1 << 31; + + // make a vector holding either a literal character or an index to the variant that needs to be applied + std::vector consensus_record(length); + for(size_t i = 0; i < length; ++i) { + consensus_record[i] = seq[i]; + } + + size_t num_skipped = 0; + size_t num_subs = 0; + size_t num_insertions = 0; + size_t num_deletions = 0; + + // update the consensus record according to the variants for this contig + size_t applied_variants = 0; + for(size_t variant_idx = 0; variant_idx < variants.size(); ++variant_idx) { + const Variant& v = variants[variant_idx]; + + // check if the variant record matches the reference sequence + bool matches_ref = true; + for(size_t i = 0; i < v.ref_seq.length(); ++i) { + matches_ref = matches_ref && v.ref_seq[i] == consensus_record[v.ref_position + i]; + } + + if(!matches_ref) { + num_skipped += 1; + continue; + } + + // mark the first base of the reference sequence as a variant and set the index + consensus_record[v.ref_position] = variant_tag | variant_idx; + + // mark the subsequent bases of the reference as deleted + for(size_t i = 1; i < v.ref_seq.length(); ++i) { + consensus_record[v.ref_position + i] = deleted_tag; + } + + num_subs += v.ref_seq.length() == v.alt_seq.length(); + num_insertions += v.ref_seq.length() < v.alt_seq.length(); + num_deletions += 
v.ref_seq.length() > v.alt_seq.length(); + } + + // write out the consensus record + std::string out; + out.reserve(length); + for(size_t i = 0; i < length; ++i) { + uint32_t r = consensus_record[i]; + if(r & variant_tag) { + out.append(variants[r & ~variant_tag].alt_seq); + } else if(r & ~deleted_tag) { + out.append(1, r); + } else { + assert(r & deleted_tag); + } + } + + fprintf(stderr, "[vcf2fasta] rewrote contig %s with %zu subs, %zu ins, %zu dels (%zu skipped)\n", contig.c_str(), num_subs, num_insertions, num_deletions, num_skipped); + fprintf(stdout, ">%s\n%s\n", contig.c_str(), out.c_str()); + + free(seq); + seq = NULL; + } + + return 0; +} diff --git a/src/nanopolish_vcf2fasta.h b/src/nanopolish_vcf2fasta.h new file mode 100644 index 00000000..729cebe8 --- /dev/null +++ b/src/nanopolish_vcf2fasta.h @@ -0,0 +1,14 @@ +//--------------------------------------------------------- +// Copyright 2018 Ontario Institute for Cancer Research +// Written by Jared Simpson (jared.simpson@oicr.on.ca) +//--------------------------------------------------------- +// +// nanopolish_vcf2fasta - write a new genome sequence +// by introducing variants from a set of vcf files +// +#ifndef NANOPOLISH_VCF2FASTA_H +#define NANOPOLISH_VCF2FASTA_H + +int vcf2fasta_main(int argc, char** argv); + +#endif From ca5f7a2967333fc65be976229bd093b23ecd99fe Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 20 Sep 2018 10:50:28 +0100 Subject: [PATCH 59/80] tidying makefile --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 5f22a05b..30897c9b 100644 --- a/Makefile +++ b/Makefile @@ -10,12 +10,12 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz CXXFLAGS ?= -O3 -CXXFLAGS += -std=c++11 -fopenmp -fsigned-char #-g +CXXFLAGS += -std=c++11 -fopenmp -fsigned-char CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. 
-I/usr/local/cuda-9.0include -O3 -use_fast_math --default-stream per-thread -restrict #-g +NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0include -O3 -use_fast_math --default-stream per-thread -restrict CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code From 5ecf0668635aa16fb7cafd1b39969b9178ac11d0 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 20 Sep 2018 11:06:58 +0100 Subject: [PATCH 60/80] tidying --- src/cuda_kernels/GpuAligner.cu | 11 +++-------- src/nanopolish_call_variants.cpp | 3 ++- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index c283bc93..31d50890 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -487,8 +487,6 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve poreModelEntriesPerState * 4096 * sizeof(float), cudaMemcpyHostToDevice, streams[0])); // TODO don't hardcode num kmers poreModelInitialized = true; } - // Sequences - // Sequences auto & sequences = scoreSet.stateSequences; numSequences += sequences.size(); @@ -505,7 +503,7 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve int rank = sequence.get_kmer_rank(ki, k, false); kmerRanks[ki + kmerOffset] = rank; } - //kmerRanksDevPointers[i] = kmerRanksDev + kmerOffset; + kmerOffset += numKmers; for(size_t ki = 0; ki < numKmers; ++ki) { @@ -517,8 +515,7 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve sequenceLengthsHost[globalSequenceIdx] = numKmers; - // Loop over the raw reads, producing a cartesian product of the two - + // Loop over the raw reads, producing a cartesian product of reads and sequences auto numReadsInScoreSet = scoreSet.rawData.size(); for (int r=0; r>> GpuAligner::scoreKernelMod(std::ve if (err != cudaSuccess) printf("Errors during kernel execution: %s\n", cudaGetErrorString(err)); - cudaMemcpyAsync(returnValuesHost, scoresDev, - globalScoreIdx * 
sizeof(float), cudaMemcpyDeviceToHost, streams[0]); + cudaMemcpyAsync(returnValuesHost, scoresDev, globalScoreIdx * sizeof(float), cudaMemcpyDeviceToHost, streams[0]); cudaStreamSynchronize(streams[0]); //Unpack results @@ -699,7 +695,6 @@ std::vector GpuAligner::variantScoresThresholded(std::vector v; if (!event_sequences_vector.empty()) { - //std::vector> scores = scoreKernel(sequences, event_sequences, alignment_flags); auto scoresMod = scoreKernelMod(scoreSets, alignment_flags); diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 1d9cf081..806cde2c 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -444,6 +444,7 @@ std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& int nextLocusEnd = nextLocusBegin + LOCI_PER_WORKER; bool finished = false; + //Initialise the workers for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { auto aligner = std::ref(gpuAligners[workerIdx]); if (!finished) { @@ -470,7 +471,7 @@ std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& } } - //Round robin the workers until done + //Round robin - assigning work to the workers until out of candidates while (!finished) { for (int i = 0; i < num_workers; i++) { auto status = handles[i].wait_for(std::chrono::microseconds(100)); From 075cee3b83cb76568b81632840f391b0c31920d6 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 20 Sep 2018 14:53:36 +0100 Subject: [PATCH 61/80] GPU acceleration of nanopolish consensus --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 74c9c23b..ec1bd330 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,5 @@ # Nanopolish -## GPU acceleration branch - experimental/Work in progress - -This branch adds CUDA-enabled GPU acceleration to the nanopolish consensus improvement algorithm. 
To try this feature run with the `--gpu` flag e.g:
-```
-nanopolish variants --consensus polished_gpu.fa -w "tig00000001:200000-230000" -r reads.fasta -b reads.sorted.bam -g draft.fa --threads=8 --gpu=1
-```
-
 [![Build Status](https://travis-ci.org/jts/nanopolish.svg?branch=master)](https://travis-ci.org/jts/nanopolish)
 
 Software package for signal-level analysis of Oxford Nanopore sequencing data. Nanopolish can calculate an improved consensus sequence for a draft genome assembly, detect base modifications, call SNPs and indels with respect to a reference genome and more (see Nanopolish modules, below).
@@ -119,6 +112,13 @@ Then you can run nanopolish from the image:
 docker run -v /path/to/local/data/data/:/data/ -it :image_id ./nanopolish eventalign -r /data/reads.fa -b /data/alignments.sorted.bam -g /data/ref.fa
 ```
 
+## GPU acceleration
+
+The nanopolish consensus improvement algorithm can be performed faster using CUDA-enabled GPU acceleration. This is an experimental feature; to try it, run with the `--gpu` flag, e.g.:
+```
+nanopolish variants --consensus polished_gpu.fa -w "tig00000001:200000-230000" -r reads.fasta -b reads.sorted.bam -g draft.fa --threads=8 --gpu=1
+```
+
 ## Credits and Thanks
 
 The fast table-driven logsum implementation was provided by Sean Eddy as public domain code. This code was originally part of [hmmer3](http://hmmer.janelia.org/). Nanopolish also includes code from Oxford Nanopore's [scrappie](https://github.com/nanoporetech/scrappie) basecaller. This code is licensed under the MPL. 
From 979475093f9f17fed62ee51f4b6a9d5729b547d9 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 20 Sep 2018 14:55:34 +0100 Subject: [PATCH 62/80] removed spurious comment --- src/common/nanopolish_variant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/nanopolish_variant.cpp b/src/common/nanopolish_variant.cpp index 902756f3..d9fa2498 100644 --- a/src/common/nanopolish_variant.cpp +++ b/src/common/nanopolish_variant.cpp @@ -671,7 +671,7 @@ std::vector multi_call(VariantGroup& variant_group, // Variant score_variant_thresholded(const Variant& input_variant, Haplotype base_haplotype, - const std::vector& input, // raw reads (I think) + const std::vector& input, const uint32_t alignment_flags, const uint32_t score_threshold, const std::vector& methylation_types) From 3fc628eb7fa858981967f0a30cd07080edba2d8e Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 20 Sep 2018 15:04:39 +0100 Subject: [PATCH 63/80] setting indentation to 4 to match rest of nanopolish --- src/cuda_kernels/GpuAligner.cu | 826 ++++++++++++++++----------------- 1 file changed, 413 insertions(+), 413 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 31d50890..7e8ece87 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -12,7 +12,7 @@ __device__ float logsumexpf(float x, float y){ if(x == -INFINITY && y == -INFINITY){ - return -INFINITY; + return -INFINITY; } float result = fmax(x, y) + log1pf(expf(-fabsf(y - x))); return result; @@ -64,7 +64,7 @@ __global__ void getScoresMod (float * poreModelDev, bool debug = false; if ((threadIdx.x == 0) && (blockIdx.x == 0)){ - debug = false; + debug = false; } // get buffer indices @@ -72,213 +72,213 @@ __global__ void getScoresMod (float * poreModelDev, if (scoreIdx < numScores) { - int readIdx = readIdxDev[scoreIdx]; - int seqIdx = seqIdxDev[scoreIdx]; - - // get read statistics - int numEvents = readLengthsDev[readIdx]; - int readOffset = 
eventOffsetsDev[readIdx]; - float read_events_per_base = eventsPerBaseDev[readIdx]; - int e_start = eventStartsDev[readIdx]; // Event start for read - int e_stride = eventStridesDev[readIdx]; - int e_offset = eventOffsetsDev[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event - float scale = scaleDev[readIdx]; - float shift = shiftDev[readIdx]; - float var = varDev[readIdx]; - float logVar = logVarDev[readIdx]; - - // get sequence statistics - int numKmers = sequenceLengthsDev[seqIdx]; - int seqOffset = sequenceOffsetsDev[seqIdx]; - - int lastRowIdx = numEvents - 1; - int lastKmerIdx = numKmers - 1; - - float returnValue = -INFINITY; //Used to sum over the last column. - float prevProbabilities[MAX_STATES]; - - int numBlocks = numKmers + 2; - int numStates = numBlocks * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. - - if (debug) { - printf("Kernel 1 >>> Num Kmers is %i\n", numKmers); - printf("Kernel 1 >>> n_states %i\n", numStates); - printf("Kernel 1 >>> num events in read is %i\n", numEvents); - printf("Kernel 1 >>> event offset is %i\n", e_offset); - } - - // Initialise the prev probabilities vector - for (int i = 0; i < numStates - PSR9_NUM_STATES; i++) { - prevProbabilities[i] = -INFINITY; - } - for (int i = numStates - PSR9_NUM_STATES; i < numStates; i++) { - prevProbabilities[i] = 0.0f; - } - - bool rc = false; - if (e_stride == -1) { - rc = true; - } - - float p_stay = 1 - (1 / read_events_per_base); - float p_skip = 0.0025; - float p_bad = 0.001; - float p_bad_self = p_bad; - float p_skip_self = 0.3; - float p_mk = p_skip; // probability of not observing an event at all - float p_mb = p_bad; // probabilty of observing a bad event - float p_mm_self = p_stay; // probability of observing additional events from this k-mer - float p_mm_next = 1.0f - p_mm_self - p_mk - p_mb; // normal movement from state to state - // transitions from event split state in previous block - float p_bb 
= p_bad_self; - float p_bk, p_bm_next, p_bm_self; - p_bk = p_bm_next = p_bm_self = (1.0f - p_bb) / 3; - // transitions from kmer skip state in previous block - float p_kk = p_skip_self; - float p_km = 1.0f - p_kk; - // We assign some transition probabilities. I believe this is correct and they don't vary by location in the sequence - float lp_mk = logf(p_mk); - float lp_mb = logf(p_mb); - float lp_mm_self = logf(p_mm_self); - float lp_mm_next = logf(p_mm_next); - float lp_bb = logf(p_bb); - float lp_bk = logf(p_bk); - float lp_bm_next = logf(p_bm_next); - float lp_bm_self = logf(p_bm_self); - float lp_kk = logf(p_kk); - float lp_km = logf(p_km); - float lp_sm, lp_ms; - lp_sm = lp_ms = 0.0f; - - // the penalty is controlled by the transition probability - float BAD_EVENT_PENALTY = 0.0f; - - //Fill out the dynamic programming table - for (int row = 1; row < numEvents + 1; row++) {//TODO: check that numRows is correct value. - //row-specific values - int event_idx = e_start + (row - 1) * e_stride; - float eventMean = eventMeansDev[e_offset + row - 1]; - float preFlank = preFlankingDev[e_offset + row - 1]; - float postFlank = postFlankingDev[e_offset + row - 1]; - - float lp_emission_b = BAD_EVENT_PENALTY; //TODO: Can this be taken out of the inner loop? 
- - //Initialise temp registers - float prevMatch = prevProbabilities[PSR9_MATCH];; - float prevSkip = prevProbabilities[PSR9_KMER_SKIP]; - float prevBad = prevProbabilities[PSR9_BAD_EVENT]; - - for (int blkIdx = 1; blkIdx < numBlocks - 1; blkIdx++) { - int curBlockIdx = blkIdx; - int prevBlockIdx = curBlockIdx - 1; - int prevBlockOffset = PSR9_NUM_STATES * prevBlockIdx; - int curBlockOffset = PSR9_NUM_STATES * curBlockIdx; - - int kmerIdx = blkIdx - 1; // because there is a start block with no associated kmer - uint32_t rank = kmerRanksDev[seqOffset + kmerIdx + (numKmers * - rc)]; // TODO understand why this is segfaulting sometimes, why does kmerIdx sometimes exceed 4096 - - float pore_mean = poreModelDev[rank * 3]; - float pore_stdv = poreModelDev[rank * 3 + 1]; - float pore_log_level_stdv = poreModelDev[rank * 3 + 2]; - - float lp_emission_m = lp_match_r9(rank, - eventMean, - pore_mean, - pore_stdv, - pore_log_level_stdv, - scale, - shift, - var, - logVar); - - // Get all the scores for a match - float curMatch = prevProbabilities[curBlockOffset + PSR9_MATCH]; - float curBad = prevProbabilities[curBlockOffset + PSR9_BAD_EVENT]; - float curSkip = prevProbabilities[curBlockOffset + PSR9_KMER_SKIP]; - - float HMT_FROM_SAME_M = lp_mm_self + curMatch; - float HMT_FROM_PREV_M = lp_mm_next + prevMatch; - float HMT_FROM_SAME_B = lp_bm_self + curBad; - float HMT_FROM_PREV_B = lp_bm_next + prevBad; - float HMT_FROM_PREV_K = lp_km + prevSkip; - - // m_s is the probability of going from the start state - // to this kmer. The start state is (currently) only - // allowed to go to the first kmer. If ALLOW_PRE_CLIP - // is defined, we allow all events before this one to be skipped, - // with a penalty; - float HMT_FROM_SOFT = (kmerIdx == 0 && - (event_idx == e_start || - (HAF_ALLOW_PRE_CLIP))) ? 
lp_sm + preFlank : -INFINITY; - - // calculate the score - float sum = HMT_FROM_SAME_M; - sum = logsumexpf(sum, HMT_FROM_SOFT); - sum = logsumexpf(sum, HMT_FROM_PREV_M); - sum = logsumexpf(sum, HMT_FROM_SAME_B); - sum = logsumexpf(sum, HMT_FROM_PREV_B); - sum = logsumexpf(sum, HMT_FROM_PREV_K); - sum += lp_emission_m; - - float newMatchScore = sum; - - // Calculate the bad event scores - // state PSR9_BAD_EVENT - HMT_FROM_SAME_M = lp_mb + curMatch; - HMT_FROM_PREV_M = -INFINITY; - HMT_FROM_SAME_B = lp_bb + prevBad; - HMT_FROM_PREV_B = -INFINITY; - HMT_FROM_PREV_K = -INFINITY; - HMT_FROM_SOFT = -INFINITY; - - sum = HMT_FROM_SAME_M; - sum = logsumexpf(sum, HMT_FROM_SAME_B); - sum += lp_emission_b; - - float newBadEventScore = sum; - - // Write row out. prevProbabilities now becomes "current probabilities" for evaluating skips. - prevProbabilities[curBlockOffset + PSR9_MATCH] = newMatchScore; - prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] = newBadEventScore; - - //Update tmp vars - prevMatch = curMatch; - prevSkip = curSkip; - prevBad = prevBad; - - //Now do the non-skip-skip transition. This relies on the updated vector values. - // state PSR9_KMER_SKIP - HMT_FROM_PREV_M = lp_mk + prevProbabilities[prevBlockOffset + PSR9_MATCH]; - HMT_FROM_PREV_B = lp_bk + prevProbabilities[prevBlockOffset + PSR9_BAD_EVENT]; - HMT_FROM_PREV_K = lp_kk + prevProbabilities[prevBlockOffset + PSR9_KMER_SKIP]; - - sum = HMT_FROM_PREV_M; - sum = logsumexpf(sum, HMT_FROM_PREV_B); - sum = logsumexpf(sum, - HMT_FROM_PREV_K); //TODO - this is in the 'normal' kernel instead of HMT_FROM_PREV_M - is it wrong? 
- sum = logsumexpf(sum, - HMT_FROM_PREV_M); //TODO - assume this should probably be in there, but not in current - - float newSkipScore = sum; - - prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] = newSkipScore; - - //post-clip transition - if (kmerIdx == lastKmerIdx && ((HAF_ALLOW_POST_CLIP) || row == lastRowIdx)) { - float lp1 = lp_ms + prevProbabilities[curBlockOffset + PSR9_MATCH] + postFlank; - float lp2 = lp_ms + prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] + postFlank; - float lp3 = lp_ms + prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] + postFlank; - - float end = returnValue; - end = logsumexpf(end, lp1); - end = logsumexpf(end, lp2); - end = logsumexpf(end, lp3); - returnValue = end; - } - } - } - returnValuesDev[scoreIdx] = returnValue; + int readIdx = readIdxDev[scoreIdx]; + int seqIdx = seqIdxDev[scoreIdx]; + + // get read statistics + int numEvents = readLengthsDev[readIdx]; + int readOffset = eventOffsetsDev[readIdx]; + float read_events_per_base = eventsPerBaseDev[readIdx]; + int e_start = eventStartsDev[readIdx]; // Event start for read + int e_stride = eventStridesDev[readIdx]; + int e_offset = eventOffsetsDev[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event + float scale = scaleDev[readIdx]; + float shift = shiftDev[readIdx]; + float var = varDev[readIdx]; + float logVar = logVarDev[readIdx]; + + // get sequence statistics + int numKmers = sequenceLengthsDev[seqIdx]; + int seqOffset = sequenceOffsetsDev[seqIdx]; + + int lastRowIdx = numEvents - 1; + int lastKmerIdx = numKmers - 1; + + float returnValue = -INFINITY; //Used to sum over the last column. + float prevProbabilities[MAX_STATES]; + + int numBlocks = numKmers + 2; + int numStates = numBlocks * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. 
+ + if (debug) { + printf("Kernel 1 >>> Num Kmers is %i\n", numKmers); + printf("Kernel 1 >>> n_states %i\n", numStates); + printf("Kernel 1 >>> num events in read is %i\n", numEvents); + printf("Kernel 1 >>> event offset is %i\n", e_offset); + } + + // Initialise the prev probabilities vector + for (int i = 0; i < numStates - PSR9_NUM_STATES; i++) { + prevProbabilities[i] = -INFINITY; + } + for (int i = numStates - PSR9_NUM_STATES; i < numStates; i++) { + prevProbabilities[i] = 0.0f; + } + + bool rc = false; + if (e_stride == -1) { + rc = true; + } + + float p_stay = 1 - (1 / read_events_per_base); + float p_skip = 0.0025; + float p_bad = 0.001; + float p_bad_self = p_bad; + float p_skip_self = 0.3; + float p_mk = p_skip; // probability of not observing an event at all + float p_mb = p_bad; // probabilty of observing a bad event + float p_mm_self = p_stay; // probability of observing additional events from this k-mer + float p_mm_next = 1.0f - p_mm_self - p_mk - p_mb; // normal movement from state to state + // transitions from event split state in previous block + float p_bb = p_bad_self; + float p_bk, p_bm_next, p_bm_self; + p_bk = p_bm_next = p_bm_self = (1.0f - p_bb) / 3; + // transitions from kmer skip state in previous block + float p_kk = p_skip_self; + float p_km = 1.0f - p_kk; + // We assign some transition probabilities. 
I believe this is correct and they don't vary by location in the sequence + float lp_mk = logf(p_mk); + float lp_mb = logf(p_mb); + float lp_mm_self = logf(p_mm_self); + float lp_mm_next = logf(p_mm_next); + float lp_bb = logf(p_bb); + float lp_bk = logf(p_bk); + float lp_bm_next = logf(p_bm_next); + float lp_bm_self = logf(p_bm_self); + float lp_kk = logf(p_kk); + float lp_km = logf(p_km); + float lp_sm, lp_ms; + lp_sm = lp_ms = 0.0f; + + // the penalty is controlled by the transition probability + float BAD_EVENT_PENALTY = 0.0f; + + //Fill out the dynamic programming table + for (int row = 1; row < numEvents + 1; row++) {//TODO: check that numRows is correct value. + //row-specific values + int event_idx = e_start + (row - 1) * e_stride; + float eventMean = eventMeansDev[e_offset + row - 1]; + float preFlank = preFlankingDev[e_offset + row - 1]; + float postFlank = postFlankingDev[e_offset + row - 1]; + + float lp_emission_b = BAD_EVENT_PENALTY; //TODO: Can this be taken out of the inner loop? 
+ + //Initialise temp registers + float prevMatch = prevProbabilities[PSR9_MATCH];; + float prevSkip = prevProbabilities[PSR9_KMER_SKIP]; + float prevBad = prevProbabilities[PSR9_BAD_EVENT]; + + for (int blkIdx = 1; blkIdx < numBlocks - 1; blkIdx++) { + int curBlockIdx = blkIdx; + int prevBlockIdx = curBlockIdx - 1; + int prevBlockOffset = PSR9_NUM_STATES * prevBlockIdx; + int curBlockOffset = PSR9_NUM_STATES * curBlockIdx; + + int kmerIdx = blkIdx - 1; // because there is a start block with no associated kmer + uint32_t rank = kmerRanksDev[seqOffset + kmerIdx + (numKmers * + rc)]; // TODO understand why this is segfaulting sometimes, why does kmerIdx sometimes exceed 4096 + + float pore_mean = poreModelDev[rank * 3]; + float pore_stdv = poreModelDev[rank * 3 + 1]; + float pore_log_level_stdv = poreModelDev[rank * 3 + 2]; + + float lp_emission_m = lp_match_r9(rank, + eventMean, + pore_mean, + pore_stdv, + pore_log_level_stdv, + scale, + shift, + var, + logVar); + + // Get all the scores for a match + float curMatch = prevProbabilities[curBlockOffset + PSR9_MATCH]; + float curBad = prevProbabilities[curBlockOffset + PSR9_BAD_EVENT]; + float curSkip = prevProbabilities[curBlockOffset + PSR9_KMER_SKIP]; + + float HMT_FROM_SAME_M = lp_mm_self + curMatch; + float HMT_FROM_PREV_M = lp_mm_next + prevMatch; + float HMT_FROM_SAME_B = lp_bm_self + curBad; + float HMT_FROM_PREV_B = lp_bm_next + prevBad; + float HMT_FROM_PREV_K = lp_km + prevSkip; + + // m_s is the probability of going from the start state + // to this kmer. The start state is (currently) only + // allowed to go to the first kmer. If ALLOW_PRE_CLIP + // is defined, we allow all events before this one to be skipped, + // with a penalty; + float HMT_FROM_SOFT = (kmerIdx == 0 && + (event_idx == e_start || + (HAF_ALLOW_PRE_CLIP))) ? 
lp_sm + preFlank : -INFINITY; + + // calculate the score + float sum = HMT_FROM_SAME_M; + sum = logsumexpf(sum, HMT_FROM_SOFT); + sum = logsumexpf(sum, HMT_FROM_PREV_M); + sum = logsumexpf(sum, HMT_FROM_SAME_B); + sum = logsumexpf(sum, HMT_FROM_PREV_B); + sum = logsumexpf(sum, HMT_FROM_PREV_K); + sum += lp_emission_m; + + float newMatchScore = sum; + + // Calculate the bad event scores + // state PSR9_BAD_EVENT + HMT_FROM_SAME_M = lp_mb + curMatch; + HMT_FROM_PREV_M = -INFINITY; + HMT_FROM_SAME_B = lp_bb + prevBad; + HMT_FROM_PREV_B = -INFINITY; + HMT_FROM_PREV_K = -INFINITY; + HMT_FROM_SOFT = -INFINITY; + + sum = HMT_FROM_SAME_M; + sum = logsumexpf(sum, HMT_FROM_SAME_B); + sum += lp_emission_b; + + float newBadEventScore = sum; + + // Write row out. prevProbabilities now becomes "current probabilities" for evaluating skips. + prevProbabilities[curBlockOffset + PSR9_MATCH] = newMatchScore; + prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] = newBadEventScore; + + //Update tmp vars + prevMatch = curMatch; + prevSkip = curSkip; + prevBad = prevBad; + + //Now do the non-skip-skip transition. This relies on the updated vector values. + // state PSR9_KMER_SKIP + HMT_FROM_PREV_M = lp_mk + prevProbabilities[prevBlockOffset + PSR9_MATCH]; + HMT_FROM_PREV_B = lp_bk + prevProbabilities[prevBlockOffset + PSR9_BAD_EVENT]; + HMT_FROM_PREV_K = lp_kk + prevProbabilities[prevBlockOffset + PSR9_KMER_SKIP]; + + sum = HMT_FROM_PREV_M; + sum = logsumexpf(sum, HMT_FROM_PREV_B); + sum = logsumexpf(sum, + HMT_FROM_PREV_K); //TODO - this is in the 'normal' kernel instead of HMT_FROM_PREV_M - is it wrong? 
+ sum = logsumexpf(sum, + HMT_FROM_PREV_M); //TODO - assume this should probably be in there, but not in current + + float newSkipScore = sum; + + prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] = newSkipScore; + + //post-clip transition + if (kmerIdx == lastKmerIdx && ((HAF_ALLOW_POST_CLIP) || row == lastRowIdx)) { + float lp1 = lp_ms + prevProbabilities[curBlockOffset + PSR9_MATCH] + postFlank; + float lp2 = lp_ms + prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] + postFlank; + float lp3 = lp_ms + prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] + postFlank; + + float end = returnValue; + end = logsumexpf(end, lp1); + end = logsumexpf(end, lp2); + end = logsumexpf(end, lp3); + returnValue = end; + } + } + } + returnValuesDev[scoreIdx] = returnValue; } } @@ -362,7 +362,7 @@ GpuAligner::GpuAligner() returnValuesDevResultsPointers.resize(max_num_sequences); for (int i =0; i>> GpuAligner::scoreKernelMod(std::ve //Loop over every scoreset, filling out buffers and counters for (int scoreSetIdx=0; scoreSetIdx < numScoreSets; scoreSetIdx++){ - auto scoreSet = scoreSets[scoreSetIdx]; - int firstReadIdxinScoreSet = globalReadIdx; - //Read data - for (int eventSequenceIdx=0; eventSequenceIdx < scoreSet.rawData.size();eventSequenceIdx++){ - auto e = scoreSet.rawData[eventSequenceIdx]; - numReads++; - - //Read statistics - populate host buffers - scaleHost[globalReadIdx] = e.read->scalings[e.strand].scale; - shiftHost[globalReadIdx] = e.read->scalings[e.strand].shift; - varHost[globalReadIdx] = e.read->scalings[e.strand].var; - logVarHost[globalReadIdx] = e.read->scalings[e.strand].log_var; - - int e_start = e.event_start_idx; - eventStartsHost[globalReadIdx] = e_start; - - int e_stride = e.event_stride; - eventStridesHost[globalReadIdx] = e_stride; - - uint32_t e_end = e.event_stop_idx; - uint32_t n_events; - if(e_end > e_start) - n_events = e_end - e_start + 1; - else - n_events = e_start - e_end + 1; - readLengthsHost[globalReadIdx] = n_events; - numEventsTotal += 
n_events; - - eventOffsetsHost[globalReadIdx] = rawReadOffset; - - float readEventsPerBase = e.read->events_per_base[e.strand]; - eventsPerBaseHost[globalReadIdx] = readEventsPerBase; - - std::vector pre_flank = make_pre_flanking(e, e_start, n_events); - std::vector post_flank = make_post_flanking(e, e_start, n_events); - - for (int i=0;iget_drift_scaled_level(event_idx, e.strand); // send the data in drift scaled - eventMeans[rawReadOffset + i] = scaled; - - //populate the pre/post-flanking data, since it has a 1-1 correspondence with events - preFlankingHost[rawReadOffset + i] = pre_flank[i]; - postFlankingHost[rawReadOffset + i] = post_flank[i]; - } - - rawReadOffset += n_events; - globalReadIdx++; - } - //Pore Model - const uint32_t k = scoreSets[0].rawData[0].pore_model->k; //k is the length of a kmer in the pore model - if (poreModelInitialized == false) { - int num_states = scoreSets[0].rawData[0].pore_model->states.size(); - int poreModelEntriesPerState = 3; - for(int st=0; ststates[st]; - poreModelHost[st * poreModelEntriesPerState] = params.level_mean; - poreModelHost[st * poreModelEntriesPerState + 1] = params.level_stdv; - poreModelHost[st * poreModelEntriesPerState + 2] = params.level_log_stdv; - } - // copy over the pore model - CU_CHECK_ERR(cudaMemcpyAsync(poreModelDev, poreModelHost, - poreModelEntriesPerState * 4096 * sizeof(float), cudaMemcpyHostToDevice, streams[0])); // TODO don't hardcode num kmers - poreModelInitialized = true; - } - auto & sequences = scoreSet.stateSequences; - numSequences += sequences.size(); - - for (int i = 0; iscalings[e.strand].scale; + shiftHost[globalReadIdx] = e.read->scalings[e.strand].shift; + varHost[globalReadIdx] = e.read->scalings[e.strand].var; + logVarHost[globalReadIdx] = e.read->scalings[e.strand].log_var; + + int e_start = e.event_start_idx; + eventStartsHost[globalReadIdx] = e_start; + + int e_stride = e.event_stride; + eventStridesHost[globalReadIdx] = e_stride; + + uint32_t e_end = e.event_stop_idx; + 
uint32_t n_events; + if(e_end > e_start) + n_events = e_end - e_start + 1; + else + n_events = e_start - e_end + 1; + readLengthsHost[globalReadIdx] = n_events; + numEventsTotal += n_events; + + eventOffsetsHost[globalReadIdx] = rawReadOffset; + + float readEventsPerBase = e.read->events_per_base[e.strand]; + eventsPerBaseHost[globalReadIdx] = readEventsPerBase; + + std::vector pre_flank = make_pre_flanking(e, e_start, n_events); + std::vector post_flank = make_post_flanking(e, e_start, n_events); + + for (int i=0;iget_drift_scaled_level(event_idx, e.strand); // send the data in drift scaled + eventMeans[rawReadOffset + i] = scaled; + + //populate the pre/post-flanking data, since it has a 1-1 correspondence with events + preFlankingHost[rawReadOffset + i] = pre_flank[i]; + postFlankingHost[rawReadOffset + i] = post_flank[i]; + } + + rawReadOffset += n_events; + globalReadIdx++; + } + //Pore Model + const uint32_t k = scoreSets[0].rawData[0].pore_model->k; //k is the length of a kmer in the pore model + if (poreModelInitialized == false) { + int num_states = scoreSets[0].rawData[0].pore_model->states.size(); + int poreModelEntriesPerState = 3; + for(int st=0; ststates[st]; + poreModelHost[st * poreModelEntriesPerState] = params.level_mean; + poreModelHost[st * poreModelEntriesPerState + 1] = params.level_stdv; + poreModelHost[st * poreModelEntriesPerState + 2] = params.level_log_stdv; + } + // copy over the pore model + CU_CHECK_ERR(cudaMemcpyAsync(poreModelDev, poreModelHost, + poreModelEntriesPerState * 4096 * sizeof(float), cudaMemcpyHostToDevice, streams[0])); // TODO don't hardcode num kmers + poreModelInitialized = true; + } + auto & sequences = scoreSet.stateSequences; + numSequences += sequences.size(); + + for (int i = 0; i>> GpuAligner::scoreKernelMod(std::ve //printf("Launching get scores mod kernel\n"); getScoresMod <<< dimGrid, dimBlock, 0, streams[0]>>> (poreModelDev, - readLengthsDev, - eventStartsDev, - eventStridesDev, - eventsPerBaseDev, - 
scaleDev, - shiftDev, - varDev, - logVarDev, - eventOffsetsDev, - eventMeansDev, - preFlankingDev, - postFlankingDev, - sequenceLengthsDev, - sequenceOffsetsDev, - kmerRanksDev, - seqIdxDev, - readIdxDev, - globalScoreIdx, - scoresDev); + readLengthsDev, + eventStartsDev, + eventStridesDev, + eventsPerBaseDev, + scaleDev, + shiftDev, + varDev, + logVarDev, + eventOffsetsDev, + eventMeansDev, + preFlankingDev, + postFlankingDev, + sequenceLengthsDev, + sequenceOffsetsDev, + kmerRanksDev, + seqIdxDev, + readIdxDev, + globalScoreIdx, + scoresDev); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) - printf("Errors during kernel execution: %s\n", cudaGetErrorString(err)); + printf("Errors during kernel execution: %s\n", cudaGetErrorString(err)); cudaMemcpyAsync(returnValuesHost, scoresDev, globalScoreIdx * sizeof(float), cudaMemcpyDeviceToHost, streams[0]); cudaStreamSynchronize(streams[0]); @@ -619,21 +619,21 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve std::vector>> result(scoreSets.size()); for(int scoreSetIdx=0; scoreSetIdx seqScores(numReads); + std::vector seqScores(numReads); - for (int readIdx=0; readIdx GpuAligner::variantScoresThresholded(std::vector methylation_types) { - int numScoreSets = base_haplotypes.size(); - std::vector scoreSets; - scoreSets.resize(numScoreSets); - - for(int scoreSetIdx=0; scoreSetIdx scoreSets; + scoreSets.resize(numScoreSets); - if (event_sequences.size() > MAX_COVERAGE) { - event_sequences.resize(MAX_COVERAGE); - } - - int numVariants = input_variants.size(); + for(int scoreSetIdx=0; scoreSetIdx out_variants = input_variants; - std::vector variant_haplotypes(numVariants, base_haplotype); + auto input_variants = input_variants_vector[scoreSetIdx]; + auto base_haplotype = base_haplotypes[scoreSetIdx]; + auto event_sequences = event_sequences_vector[scoreSetIdx]; - //loop over the vector, applying the variants to the haplotypes - for (int i = 0; i MAX_COVERAGE) { + event_sequences.resize(MAX_COVERAGE); + } - // 
Make methylated versions of each input sequence. Once for the base haplotype and once each for each variant + int numVariants = input_variants.size(); - std::vector sequences; + std::vector out_variants = input_variants; + std::vector variant_haplotypes(numVariants, base_haplotype); - HMMInputSequence base_sequence = generate_methylated_alternatives(base_haplotype.get_sequence(), - methylation_types)[0]; //TODO: fix for non-zero + //loop over the vector, applying the variants to the haplotypes + for (int i = 0; i sequences; - ScoreSet s = { - sequences, - event_sequences - }; + HMMInputSequence base_sequence = generate_methylated_alternatives(base_haplotype.get_sequence(), + methylation_types)[0]; //TODO: fix for non-zero - scoreSets[scoreSetIdx] = s; + sequences.push_back(base_sequence); - } + for (auto v: variant_haplotypes){ + auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types)[0]; //TODO: fix for non-zero + sequences.push_back(variant_sequence); + } - std::vector v; - if (!event_sequences_vector.empty()) { + ScoreSet s = { + sequences, + event_sequences + }; - auto scoresMod = scoreKernelMod(scoreSets, alignment_flags); + scoreSets[scoreSetIdx] = s; - // results are now ready, need to unpack them - for (int scoreSetIdx=0; scoreSetIdx> scores = scoresMod[scoreSetIdx]; // scores for this candidate, including all variants and base(zeroth) - int numVariants = scores.size() - 1; // subtract one for the base - int numScores = scores[0].size(); + } - for (int variantIndex = 0; variantIndex < numVariants; variantIndex++) { // index 0 is the base scores - double totalScore = 0.0; - for (int k = 0; k < numScores; k++) { - if (fabs(totalScore) < screen_score_threshold) { - double baseScore = scores[0][k]; - totalScore += (scores[variantIndex + 1][k] - baseScore); - } + std::vector v; + if (!event_sequences_vector.empty()) { + + auto scoresMod = scoreKernelMod(scoreSets, alignment_flags); + + // results are now ready, need to 
unpack them + for (int scoreSetIdx=0; scoreSetIdx> scores = scoresMod[scoreSetIdx]; // scores for this candidate, including all variants and base(zeroth) + int numVariants = scores.size() - 1; // subtract one for the base + int numScores = scores[0].size(); + + for (int variantIndex = 0; variantIndex < numVariants; variantIndex++) { // index 0 is the base scores + double totalScore = 0.0; + for (int k = 0; k < numScores; k++) { + if (fabs(totalScore) < screen_score_threshold) { + double baseScore = scores[0][k]; + totalScore += (scores[variantIndex + 1][k] - baseScore); + } + } + // get the old variant: + auto unScoredVariant = input_variants_vector[scoreSetIdx][variantIndex]; + unScoredVariant.quality = totalScore; + unScoredVariant.info = ""; + v.push_back(unScoredVariant); + } } - // get the old variant: - auto unScoredVariant = input_variants_vector[scoreSetIdx][variantIndex]; - unScoredVariant.quality = totalScore; - unScoredVariant.info = ""; - v.push_back(unScoredVariant); - } } - } - return v; + return v; } From b2fb309b16dbe5147eb623e6b77b0e3554bd9d71 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 20 Sep 2018 15:06:43 +0100 Subject: [PATCH 64/80] removed some outdated comments --- src/cuda_kernels/GpuAligner.cu | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 7e8ece87..36b6378a 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -154,14 +154,14 @@ __global__ void getScoresMod (float * poreModelDev, float BAD_EVENT_PENALTY = 0.0f; //Fill out the dynamic programming table - for (int row = 1; row < numEvents + 1; row++) {//TODO: check that numRows is correct value. 
+ for (int row = 1; row < numEvents + 1; row++) { //row-specific values int event_idx = e_start + (row - 1) * e_stride; float eventMean = eventMeansDev[e_offset + row - 1]; float preFlank = preFlankingDev[e_offset + row - 1]; float postFlank = postFlankingDev[e_offset + row - 1]; - float lp_emission_b = BAD_EVENT_PENALTY; //TODO: Can this be taken out of the inner loop? + float lp_emission_b = BAD_EVENT_PENALTY; //Initialise temp registers float prevMatch = prevProbabilities[PSR9_MATCH];; @@ -176,7 +176,7 @@ __global__ void getScoresMod (float * poreModelDev, int kmerIdx = blkIdx - 1; // because there is a start block with no associated kmer uint32_t rank = kmerRanksDev[seqOffset + kmerIdx + (numKmers * - rc)]; // TODO understand why this is segfaulting sometimes, why does kmerIdx sometimes exceed 4096 + rc)]; float pore_mean = poreModelDev[rank * 3]; float pore_stdv = poreModelDev[rank * 3 + 1]; @@ -255,10 +255,8 @@ __global__ void getScoresMod (float * poreModelDev, sum = HMT_FROM_PREV_M; sum = logsumexpf(sum, HMT_FROM_PREV_B); - sum = logsumexpf(sum, - HMT_FROM_PREV_K); //TODO - this is in the 'normal' kernel instead of HMT_FROM_PREV_M - is it wrong? 
- sum = logsumexpf(sum, - HMT_FROM_PREV_M); //TODO - assume this should probably be in there, but not in current + sum = logsumexpf(sum, HMT_FROM_PREV_K); + sum = logsumexpf(sum, HMT_FROM_PREV_M); float newSkipScore = sum; @@ -291,7 +289,7 @@ GpuAligner::GpuAligner() int maxBuffer = max_reads_per_worker * MAX_SEQUENCE_LENGTH * sizeof(int); //OLD - int max_num_sequences = 1; //TODO can get rid of this + int max_num_sequences = 1; int max_sequence_length = 100; int max_n_rows = 100; @@ -400,7 +398,7 @@ GpuAligner::~GpuAligner() { CU_CHECK_ERR(cudaFreeHost(seqIdxHost)); CU_CHECK_ERR(cudaFreeHost(readIdxHost)); - int max_num_sequences = 1; //TODO can get rid of this + int max_num_sequences = 1; for (int i =0; i>> GpuAligner::scoreKernelMod(std::ve } // copy over the pore model CU_CHECK_ERR(cudaMemcpyAsync(poreModelDev, poreModelHost, - poreModelEntriesPerState * 4096 * sizeof(float), cudaMemcpyHostToDevice, streams[0])); // TODO don't hardcode num kmers + poreModelEntriesPerState * 4096 * sizeof(float), cudaMemcpyHostToDevice, streams[0])); poreModelInitialized = true; } auto & sequences = scoreSet.stateSequences; @@ -675,12 +673,12 @@ std::vector GpuAligner::variantScoresThresholded(std::vector sequences; HMMInputSequence base_sequence = generate_methylated_alternatives(base_haplotype.get_sequence(), - methylation_types)[0]; //TODO: fix for non-zero + methylation_types)[0]; sequences.push_back(base_sequence); for (auto v: variant_haplotypes){ - auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types)[0]; //TODO: fix for non-zero + auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types)[0]; sequences.push_back(variant_sequence); } From 186ac5dcfbd38e902597105d18a6ebbb7400aa5f Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 20 Sep 2018 15:09:46 +0100 Subject: [PATCH 65/80] removed old debug code --- src/hmm/nanopolish_emissions.h | 13 +------------ 1 file changed, 1 insertion(+), 12 
deletions(-) diff --git a/src/hmm/nanopolish_emissions.h b/src/hmm/nanopolish_emissions.h index 5f99a410..3dca4746 100644 --- a/src/hmm/nanopolish_emissions.h +++ b/src/hmm/nanopolish_emissions.h @@ -63,19 +63,8 @@ inline float log_probability_match_r9(const SquiggleRead& read, { // event level mean, scaled with the drift value float level = read.get_drift_scaled_level(event_idx, strand); - //if (debug == true){ - // printf("Level being used to calculate emission: %f\n", level); - //} + GaussianParameters gp = read.get_scaled_gaussian_from_pore_model_state(pore_model, strand, kmer_rank); -// if (debug == true) { -// printf(">Event IDX is: %i\n", event_idx); -// printf(">CPU Strand is: %i\n", strand); -// printf(">CPU kmer_rank is: %i\n", kmer_rank); -// printf(">CPU level is: %f\n", level); -// printf(">CPU gaussian mean: %f\n", gp.mean); -// printf(">CPU gaussian stdv: %f\n", gp.stdv); -// printf(">CPU gaussian log_level_stdv: %f\n", gp.log_stdv); -// } float lp = log_normal_pdf(level, gp); return lp; } From e823003f9b8a7dcacdf1e7b979e6a8397067b5de Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 20 Sep 2018 15:19:19 +0100 Subject: [PATCH 66/80] removed deprecated code --- src/hmm/nanopolish_profile_hmm.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/hmm/nanopolish_profile_hmm.cpp b/src/hmm/nanopolish_profile_hmm.cpp index 0d9f5167..6d5d0f37 100644 --- a/src/hmm/nanopolish_profile_hmm.cpp +++ b/src/hmm/nanopolish_profile_hmm.cpp @@ -31,7 +31,6 @@ float profile_hmm_score(const HMMInputSequence& sequence, const HMMInputData& da float profile_hmm_score_set(const std::vector& sequences, const HMMInputData& data, const uint32_t flags) { - //printf("In profile_hmm_score set function...\n"); assert(!sequences.empty()); assert(std::string(sequences[0].get_alphabet()->get_name()) == "nucleotide"); assert(std::string(data.pore_model->pmalphabet->get_name()) == "nucleotide"); From 551cd230a037ec888abf569ba3ecfe22f51e5779 Mon Sep 17 00:00:00 2001 From: Mike Vella 
Date: Thu, 20 Sep 2018 15:28:19 +0100 Subject: [PATCH 67/80] removed old debug code --- src/hmm/nanopolish_profile_hmm_r7.inl | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/hmm/nanopolish_profile_hmm_r7.inl b/src/hmm/nanopolish_profile_hmm_r7.inl index 3fe4b309..7f9083e9 100644 --- a/src/hmm/nanopolish_profile_hmm_r7.inl +++ b/src/hmm/nanopolish_profile_hmm_r7.inl @@ -308,9 +308,6 @@ inline float profile_hmm_fill_generic_r7(const HMMInputSequence& _sequence, std::vector kmer_ranks(num_kmers); for(size_t ki = 0; ki < num_kmers; ++ki) { int rank = sequence.get_kmer_rank(ki, k, data.rc); - if(rank>4096){ - printf("Rank: %i", rank); - } kmer_ranks[ki] = rank; } size_t num_events = output.get_num_rows() - 1; From 5d67b61e464a10c04ce006f0863204bd35a9b553 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 20 Sep 2018 15:30:09 +0100 Subject: [PATCH 68/80] revert typo --- src/hmm/nanopolish_profile_hmm_r9.inl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/hmm/nanopolish_profile_hmm_r9.inl b/src/hmm/nanopolish_profile_hmm_r9.inl index 0d90b5c3..c09b4321 100644 --- a/src/hmm/nanopolish_profile_hmm_r9.inl +++ b/src/hmm/nanopolish_profile_hmm_r9.inl @@ -216,7 +216,6 @@ inline std::vector make_pre_flanking(const HMMInputData& data, pre_flank[i] = log(TRANS_CLIP_SELF) + log_probability_background(*data.read, event_idx, data.strand) + // emit from background pre_flank[i - 1]; // this accounts for the transition from the start & to the silent pre - } return pre_flank; From 27f4d5c574bb50031f18208729557734a9e9c6ee Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 20 Sep 2018 15:32:50 +0100 Subject: [PATCH 69/80] Made indentation consistent --- src/main/nanopolish.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/main/nanopolish.cpp b/src/main/nanopolish.cpp index 459a3e2e..d6bcf8d4 100644 --- a/src/main/nanopolish.cpp +++ b/src/main/nanopolish.cpp @@ -54,10 +54,10 @@ int print_usage(int, char **) int print_version(int, char 
**) { static const char *VERSION_MESSAGE = - "nanopolish version " PACKAGE_VERSION "\n" - "Written by Jared Simpson.\n" - "\n" - "Copyright 2015-2017 Ontario Institute for Cancer Research\n"; + "nanopolish version " PACKAGE_VERSION "\n" + "Written by Jared Simpson.\n" + "\n" + "Copyright 2015-2017 Ontario Institute for Cancer Research\n"; std::cout << VERSION_MESSAGE << std::endl; return 0; } @@ -78,7 +78,7 @@ int main(int argc, char** argv) if (iter != programs.end()) { ret = iter->second(argc - 1, argv + 1); } - else + else ret = print_usage( argc - 1, argv + 1); } @@ -92,7 +92,7 @@ int main(int argc, char** argv) extern int g_bad_fast5_file; if(g_total_reads > 0) { fprintf(stderr, "[post-run summaryz] total reads: %d, unparseable: %d, qc fail: %d, could not calibrate: %d, no alignment: %d, bad fast5: %d\n", - g_total_reads, g_unparseable_reads, g_qc_fail_reads, g_failed_calibration_reads, g_failed_alignment_reads, g_bad_fast5_file); + g_total_reads, g_unparseable_reads, g_qc_fail_reads, g_failed_calibration_reads, g_failed_alignment_reads, g_bad_fast5_file); } return ret; } From 585302a9d7e3220c1079876bc6d750f237d01610 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 20 Sep 2018 15:39:41 +0100 Subject: [PATCH 70/80] fixed indentation --- src/nanopolish_call_variants.cpp | 221 +++++++++++++++---------------- 1 file changed, 110 insertions(+), 111 deletions(-) diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 806cde2c..a5de13d9 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -66,43 +66,43 @@ float g_p_skip, g_p_skip_self, g_p_bad, g_p_bad_self; #define SUBPROGRAM "variants" static const char *CONSENSUS_VERSION_MESSAGE = -SUBPROGRAM " Version " PACKAGE_VERSION "\n" -"Written by Jared Simpson.\n" -"\n" -"Copyright 2015 Ontario Institute for Cancer Research\n"; + SUBPROGRAM " Version " PACKAGE_VERSION "\n" + "Written by Jared Simpson.\n" + "\n" + "Copyright 2015 Ontario Institute for 
Cancer Research\n"; static const char *CONSENSUS_USAGE_MESSAGE = -"Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTIONS] --reads reads.fa --bam alignments.bam --genome genome.fa\n" -"Find SNPs using a signal-level HMM\n" -"\n" -" -v, --verbose display verbose output\n" -" --version display version\n" -" --help display this help and exit\n" -" --snps only call SNPs\n" -" --consensus run in consensus calling mode\n" -" --fix-homopolymers run the experimental homopolymer caller\n" -" --faster minimize compute time while slightly reducing consensus accuracy\n" -" -w, --window=STR find variants in window STR (format: :-)\n" -" -r, --reads=FILE the ONT reads are in fasta FILE\n" -" -b, --bam=FILE the reads aligned to the reference genome are in bam FILE\n" -" -e, --event-bam=FILE the events aligned to the reference genome are in bam FILE\n" -" -g, --genome=FILE the reference genome is in FILE\n" -" -p, --ploidy=NUM the ploidy level of the sequenced genome\n" -" -q --methylation-aware=STR turn on methylation aware polishing and test motifs given in STR (example: -q dcm,dam)\n" -" --genotype=FILE call genotypes for the variants in the vcf FILE\n" -" -o, --outfile=FILE write result to FILE [default: stdout]\n" -" -t, --threads=NUM use NUM threads (default: 1)\n" -" -m, --min-candidate-frequency=F extract candidate variants from the aligned reads when the variant frequency is at least F (default 0.2)\n" -" -d, --min-candidate-depth=D extract candidate variants from the aligned reads when the depth is at least D (default: 20)\n" -" -x, --max-haplotypes=N consider at most N haplotype combinations (default: 1000)\n" -" --min-flanking-sequence=N distance from alignment end to calculate variants (default: 30)\n" -" --max-rounds=N perform N rounds of consensus sequence improvement (default: 50)\n" -" -c, --candidates=VCF read variant candidates from VCF, rather than discovering them from aligned reads\n" -" -a, --alternative-basecalls-bam=FILE if an alternative basecaller was used that 
does not output event annotations\n" -" then use basecalled sequences from FILE. The signal-level events will still be taken from the -b bam.\n" -" --calculate-all-support when making a call, also calculate the support of the 3 other possible bases\n" -" --models-fofn=FILE read alternative k-mer models from FILE\n" -"\nReport bugs to " PACKAGE_BUGREPORT "\n\n"; + "Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTIONS] --reads reads.fa --bam alignments.bam --genome genome.fa\n" + "Find SNPs using a signal-level HMM\n" + "\n" + " -v, --verbose display verbose output\n" + " --version display version\n" + " --help display this help and exit\n" + " --snps only call SNPs\n" + " --consensus run in consensus calling mode\n" + " --fix-homopolymers run the experimental homopolymer caller\n" + " --faster minimize compute time while slightly reducing consensus accuracy\n" + " -w, --window=STR find variants in window STR (format: :-)\n" + " -r, --reads=FILE the ONT reads are in fasta FILE\n" + " -b, --bam=FILE the reads aligned to the reference genome are in bam FILE\n" + " -e, --event-bam=FILE the events aligned to the reference genome are in bam FILE\n" + " -g, --genome=FILE the reference genome is in FILE\n" + " -p, --ploidy=NUM the ploidy level of the sequenced genome\n" + " -q --methylation-aware=STR turn on methylation aware polishing and test motifs given in STR (example: -q dcm,dam)\n" + " --genotype=FILE call genotypes for the variants in the vcf FILE\n" + " -o, --outfile=FILE write result to FILE [default: stdout]\n" + " -t, --threads=NUM use NUM threads (default: 1)\n" + " -m, --min-candidate-frequency=F extract candidate variants from the aligned reads when the variant frequency is at least F (default 0.2)\n" + " -d, --min-candidate-depth=D extract candidate variants from the aligned reads when the depth is at least D (default: 20)\n" + " -x, --max-haplotypes=N consider at most N haplotype combinations (default: 1000)\n" + " --min-flanking-sequence=N distance from 
alignment end to calculate variants (default: 30)\n" + " --max-rounds=N perform N rounds of consensus sequence improvement (default: 50)\n" + " -c, --candidates=VCF read variant candidates from VCF, rather than discovering them from aligned reads\n" + " -a, --alternative-basecalls-bam=FILE if an alternative basecaller was used that does not output event annotations\n" + " then use basecalled sequences from FILE. The signal-level events will still be taken from the -b bam.\n" + " --calculate-all-support when making a call, also calculate the support of the 3 other possible bases\n" + " --models-fofn=FILE read alternative k-mer models from FILE\n" + "\nReport bugs to " PACKAGE_BUGREPORT "\n\n"; namespace opt { @@ -289,8 +289,7 @@ void prepareForBaseEditCandidates(int start, std::string contig, std::vector> &tmp_variants_vector, std::vector &haplotypes, - std::vector> &event_sequences_vector -){ + std::vector> &event_sequences_vector){ for(int i = start; i<=end; i++){ int calling_start = i - opt::screen_flanking_sequence; int calling_end = i + 1 + opt::screen_flanking_sequence; @@ -388,7 +387,7 @@ void locusRangeBaseEditCandidateGPU(int start, void locusRangeBaseEditCandidate(int start, int end, - const AlignmentDB& alignments, + const AlignmentDB& alignments, uint32_t alignment_flags, std::vector &out_variants, std::string contig) { @@ -843,7 +842,7 @@ Haplotype fix_homopolymers(const Haplotype& input_haplotype, duration_likelihoods[var_sequence_length] += log_gamma; } if(opt::verbose > 3) { - fprintf(stderr, "SUM_VAR\t%zu\t%zu\t%d\t%d\t%lu\t%.5lf\t%.2lf\n", ref_hp_start, hp_length, var_sequence_length, call_window, variant_offset_end - variant_offset_start, sum_duration, log_gamma); + fprintf(stderr, "SUM_VAR\t%zu\t%zu\t%d\t%d\t%lu\t%.5lf\t%.2lf\n", ref_hp_start, hp_length, var_sequence_length, call_window, variant_offset_end - variant_offset_start, sum_duration, log_gamma); } } } @@ -960,7 +959,7 @@ Haplotype call_haplotype_from_candidates(const AlignmentDB& 
alignments, size_t end_variant_idx = curr_variant_idx + 1; while(end_variant_idx < candidate_variants.size()) { int distance = candidate_variants[end_variant_idx].ref_position - - candidate_variants[end_variant_idx - 1].ref_position; + candidate_variants[end_variant_idx - 1].ref_position; if(distance > opt::min_distance_between_variants) break; end_variant_idx++; @@ -969,8 +968,8 @@ Haplotype call_haplotype_from_candidates(const AlignmentDB& alignments, size_t num_variants = end_variant_idx - curr_variant_idx; int calling_start = candidate_variants[curr_variant_idx].ref_position - opt::min_flanking_sequence; int calling_end = candidate_variants[end_variant_idx - 1].ref_position + - candidate_variants[end_variant_idx - 1].ref_seq.length() + - opt::min_flanking_sequence; + candidate_variants[end_variant_idx - 1].ref_seq.length() + + opt::min_flanking_sequence; int calling_size = calling_end - calling_start; @@ -1014,7 +1013,7 @@ Haplotype call_haplotype_from_candidates(const AlignmentDB& alignments, } } else { fprintf(stderr, "Warning: %zu variants in span, region not called [%d %d]\n", num_variants, calling_start, calling_end); - } + } // advance to start of next region curr_variant_idx = end_variant_idx; @@ -1070,11 +1069,11 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, fprintf(stderr, "input region: %s\n", alignments.get_reference_substring(contig, region_start - BUFFER, region_end + BUFFER).c_str()); } -/* - Haplotype called_haplotype(alignments.get_region_contig(), - alignments.get_region_start(), - alignments.get_reference()); -*/ + /* + Haplotype called_haplotype(alignments.get_region_contig(), + alignments.get_region_start(), + alignments.get_reference()); + */ // Step 1. 
Discover putative variants across the whole region std::vector candidate_variants; if(opt::candidates_file.empty()) { @@ -1155,10 +1154,10 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, last_round_variant_keys = this_round_variant_keys; if(variant_set_changed) { candidate_variants = expand_variants(alignments, - called_variants, - region_start, - region_end, - alignment_flags); + called_variants, + region_start, + region_end, + alignment_flags); } else { break; @@ -1190,44 +1189,44 @@ void parse_call_variants_options(int argc, char** argv) for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { std::istringstream arg(optarg != NULL ? optarg : ""); switch (c) { - case 'r': arg >> opt::reads_file; break; - case 'g': arg >> opt::genome_file; break; - case 'b': arg >> opt::bam_file; break; - case 'e': arg >> opt::event_bam_file; break; - case 'w': arg >> opt::window; break; - case 'o': arg >> opt::output_file; break; - case 'm': arg >> opt::min_candidate_frequency; break; - case 'd': arg >> opt::min_candidate_depth; break; - case 'x': arg >> opt::max_haplotypes; break; - case 'c': arg >> opt::candidates_file; break; - case 'p': arg >> opt::ploidy; break; - case 'q': arg >> methylation_motifs_str; break; - case 'a': arg >> opt::alternative_basecalls_bam; break; - case '?': die = true; break; - case 't': arg >> opt::num_threads; break; - case 'v': opt::verbose++; break; - case OPT_CONSENSUS: opt::consensus_mode = 1; break; - case OPT_GPU: opt::gpu = 1; break; - case OPT_FIX_HOMOPOLYMERS: opt::fix_homopolymers = 1; break; - case OPT_EFFORT: arg >> opt::screen_score_threshold; break; - case OPT_FASTER: opt::screen_score_threshold = 25; break; - case OPT_MAX_ROUNDS: arg >> opt::max_rounds; break; - case OPT_GENOTYPE: opt::genotype_only = 1; arg >> opt::candidates_file; break; - case OPT_MODELS_FOFN: arg >> opt::models_fofn; break; - case OPT_CALC_ALL_SUPPORT: opt::calculate_all_support = 1; break; - case 
OPT_SNPS_ONLY: opt::snps_only = 1; break; - case OPT_PROGRESS: opt::show_progress = 1; break; - case OPT_P_SKIP: arg >> g_p_skip; break; - case OPT_P_SKIP_SELF: arg >> g_p_skip_self; break; - case OPT_P_BAD: arg >> g_p_bad; break; - case OPT_P_BAD_SELF: arg >> g_p_bad_self; break; - case OPT_MIN_FLANKING_SEQUENCE: arg >> opt::min_flanking_sequence; break; - case OPT_HELP: - std::cout << CONSENSUS_USAGE_MESSAGE; - exit(EXIT_SUCCESS); - case OPT_VERSION: - std::cout << CONSENSUS_VERSION_MESSAGE; - exit(EXIT_SUCCESS); + case 'r': arg >> opt::reads_file; break; + case 'g': arg >> opt::genome_file; break; + case 'b': arg >> opt::bam_file; break; + case 'e': arg >> opt::event_bam_file; break; + case 'w': arg >> opt::window; break; + case 'o': arg >> opt::output_file; break; + case 'm': arg >> opt::min_candidate_frequency; break; + case 'd': arg >> opt::min_candidate_depth; break; + case 'x': arg >> opt::max_haplotypes; break; + case 'c': arg >> opt::candidates_file; break; + case 'p': arg >> opt::ploidy; break; + case 'q': arg >> methylation_motifs_str; break; + case 'a': arg >> opt::alternative_basecalls_bam; break; + case '?': die = true; break; + case 't': arg >> opt::num_threads; break; + case 'v': opt::verbose++; break; + case OPT_CONSENSUS: opt::consensus_mode = 1; break; + case OPT_GPU: opt::gpu = 1; break; + case OPT_FIX_HOMOPOLYMERS: opt::fix_homopolymers = 1; break; + case OPT_EFFORT: arg >> opt::screen_score_threshold; break; + case OPT_FASTER: opt::screen_score_threshold = 25; break; + case OPT_MAX_ROUNDS: arg >> opt::max_rounds; break; + case OPT_GENOTYPE: opt::genotype_only = 1; arg >> opt::candidates_file; break; + case OPT_MODELS_FOFN: arg >> opt::models_fofn; break; + case OPT_CALC_ALL_SUPPORT: opt::calculate_all_support = 1; break; + case OPT_SNPS_ONLY: opt::snps_only = 1; break; + case OPT_PROGRESS: opt::show_progress = 1; break; + case OPT_P_SKIP: arg >> g_p_skip; break; + case OPT_P_SKIP_SELF: arg >> g_p_skip_self; break; + case OPT_P_BAD: arg >> 
g_p_bad; break; + case OPT_P_BAD_SELF: arg >> g_p_bad_self; break; + case OPT_MIN_FLANKING_SEQUENCE: arg >> opt::min_flanking_sequence; break; + case OPT_HELP: + std::cout << CONSENSUS_USAGE_MESSAGE; + exit(EXIT_SUCCESS); + case OPT_VERSION: + std::cout << CONSENSUS_VERSION_MESSAGE; + exit(EXIT_SUCCESS); } } @@ -1281,10 +1280,10 @@ void parse_call_variants_options(int argc, char** argv) } if (die) - { - std::cout << "\n" << CONSENSUS_USAGE_MESSAGE; - exit(EXIT_FAILURE); - } + { + std::cout << "\n" << CONSENSUS_USAGE_MESSAGE; + exit(EXIT_FAILURE); + } } void print_invalid_window_error(int start_base, int end_base) @@ -1346,34 +1345,34 @@ int call_variants_main(int argc, char** argv) // header_fields.push_back( - Variant::make_vcf_tag_string("INFO", "TotalReads", 1, "Integer", - "The number of event-space reads used to call the variant")); + Variant::make_vcf_tag_string("INFO", "TotalReads", 1, "Integer", + "The number of event-space reads used to call the variant")); header_fields.push_back( - Variant::make_vcf_tag_string("INFO", "SupportFraction", 1, "Float", - "The fraction of event-space reads that support the variant")); + Variant::make_vcf_tag_string("INFO", "SupportFraction", 1, "Float", + "The fraction of event-space reads that support the variant")); header_fields.push_back( - Variant::make_vcf_tag_string("INFO", "BaseCalledReadsWithVariant", 1, "Integer", - "The number of base-space reads that support the variant")); + Variant::make_vcf_tag_string("INFO", "BaseCalledReadsWithVariant", 1, "Integer", + "The number of base-space reads that support the variant")); header_fields.push_back( - Variant::make_vcf_tag_string("INFO", "BaseCalledFraction", 1, "Float", - "The fraction of base-space reads that support the variant")); + Variant::make_vcf_tag_string("INFO", "BaseCalledFraction", 1, "Float", + "The fraction of base-space reads that support the variant")); header_fields.push_back( - Variant::make_vcf_tag_string("INFO", "AlleleCount", 1, "Integer", - "The 
inferred number of copies of the allele")); + Variant::make_vcf_tag_string("INFO", "AlleleCount", 1, "Integer", + "The inferred number of copies of the allele")); if(opt::calculate_all_support) { header_fields.push_back( - Variant::make_vcf_tag_string("INFO", "SupportFractionByBase", 4, "Integer", - "The fraction of reads supporting A,C,G,T at this position")); + Variant::make_vcf_tag_string("INFO", "SupportFractionByBase", 4, "Integer", + "The fraction of reads supporting A,C,G,T at this position")); } header_fields.push_back( - Variant::make_vcf_tag_string("FORMAT", "GT", 1, "String", - "Genotype")); + Variant::make_vcf_tag_string("FORMAT", "GT", 1, "String", + "Genotype")); Variant::write_vcf_header(out_fp, header_fields); @@ -1383,9 +1382,9 @@ int call_variants_main(int argc, char** argv) if(!opt::consensus_output.empty()) { FILE* consensus_fp = fopen(opt::consensus_output.c_str(), "w"); fprintf(consensus_fp, ">%s:%d-%d\n%s\n", contig.c_str(), - start_base, - end_base, - haplotype.get_sequence().c_str()); + start_base, + end_base, + haplotype.get_sequence().c_str()); fclose(consensus_fp); } From f3bf3e1f27e986e4ee3c884ab182e81ca7e52d28 Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Wed, 25 Sep 2019 17:38:18 +1000 Subject: [PATCH 71/80] changes to the makefile to get it compiled --- Makefile | 9 ++++----- test/.gitignore | 2 ++ 2 files changed, 6 insertions(+), 5 deletions(-) create mode 100644 test/.gitignore diff --git a/Makefile b/Makefile index dad29e07..0ccb65b5 100644 --- a/Makefile +++ b/Makefile @@ -15,8 +15,8 @@ CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0include -O3 -use_fast_math --default-stream per-thread -restrict -CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart +NVCCFLAGS ?= -std=c++11 -I. 
-I/usr/local/cuda/include -O3 -use_fast_math --default-stream per-thread -restrict +CURTFLAGS ?= -L/usr/local/cuda/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code HDF5 ?= install @@ -69,7 +69,7 @@ EIGEN_INCLUDE = -I./eigen/ # Include the src subdirectories NP_INCLUDE = $(addprefix -I./, $(SUBDIRS)) -CUDA_INCLUDE=-I/usr/local/cuda-9.0/include +CUDA_INCLUDE=-I/usr/local/cuda/include # Add include flags CPPFLAGS += $(H5_INCLUDE) $(HTS_INCLUDE) $(FAST5_INCLUDE) $(NP_INCLUDE) $(EIGEN_INCLUDE) $(CUDA_INCLUDE) @@ -158,5 +158,4 @@ test: $(TEST_PROGRAM) .PHONY: clean clean: - rm -f $(PROGRAM) $(TEST_PROGRAM) $(CPP_OBJ) $(CU_OBJ) $(C_OBJ) src/main/nanopolish.o src/test/nanopolish_test.o - src/main/nanopolish.o src/test/nanopolish_test.o \ No newline at end of file + rm -f $(PROGRAM) $(TEST_PROGRAM) $(CPP_OBJ) $(CU_OBJ) $(C_OBJ) src/main/nanopolish.o src/test/nanopolish_test.o src/main/nanopolish.o src/test/nanopolish_test.o diff --git a/test/.gitignore b/test/.gitignore new file mode 100644 index 00000000..479a396b --- /dev/null +++ b/test/.gitignore @@ -0,0 +1,2 @@ +ecoli_2kb_region + From e484b291ece556922e2d367a440fe5f01b933880 Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Fri, 27 Sep 2019 19:20:57 +1000 Subject: [PATCH 72/80] cleaned up the make file and added cuda support as an option with minimal changes to the original source --- .travis.yml | 2 +- Makefile | 64 ++++++++++++++++++++------------ src/nanopolish_call_variants.cpp | 20 +++++++--- 3 files changed, 56 insertions(+), 30 deletions(-) diff --git a/.travis.yml b/.travis.yml index ca383521..7b15b855 100644 --- a/.travis.yml +++ b/.travis.yml @@ -43,4 +43,4 @@ script: # to display the log without downloading the raw log on Travis log page. # Travis finishs with error when exceeding the limit of 4 MB of log length. 
- export H5_CFLAGS="-w" - - make nanopolish && make test + - make && make test diff --git a/Makefile b/Makefile index 0ccb65b5..f45fda70 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # # Sub directories containing source code, except for the main programs -SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/alignment src/pore_model src/cuda_kernels +SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/alignment src/pore_model # # Set libraries, paths, flags and options @@ -11,12 +11,9 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali LIBS = -lz CXXFLAGS ?= -g -O3 CXXFLAGS += -std=c++11 -fopenmp -fsigned-char -CFLAGS ?= -std=c99 -O3 +CFLAGS ?= -O3 -std=c99 CXX ?= g++ CC ?= gcc -NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda/include -O3 -use_fast_math --default-stream per-thread -restrict -CURTFLAGS ?= -L/usr/local/cuda/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code HDF5 ?= install @@ -69,17 +66,15 @@ EIGEN_INCLUDE = -I./eigen/ # Include the src subdirectories NP_INCLUDE = $(addprefix -I./, $(SUBDIRS)) -CUDA_INCLUDE=-I/usr/local/cuda/include - # Add include flags -CPPFLAGS += $(H5_INCLUDE) $(HTS_INCLUDE) $(FAST5_INCLUDE) $(NP_INCLUDE) $(EIGEN_INCLUDE) $(CUDA_INCLUDE) +CPPFLAGS += $(H5_INCLUDE) $(HTS_INCLUDE) $(FAST5_INCLUDE) $(NP_INCLUDE) $(EIGEN_INCLUDE) # Main programs to build PROGRAM = nanopolish TEST_PROGRAM = nanopolish_test .PHONY: all -all: $(PROGRAM) $(TEST_PROGRAM) +all: depend $(PROGRAM) # # Build libhts @@ -113,27 +108,50 @@ eigen/INSTALL: # Find the source files by searching subdirectories CPP_SRC := $(foreach dir, $(SUBDIRS), $(wildcard $(dir)/*.cpp)) -CU_SRC := $(foreach dir, $(SUBDIRS), $(wildcard $(dir)/*.cu)) C_SRC := $(foreach dir, $(SUBDIRS), $(wildcard $(dir)/*.c)) EXE_SRC = src/main/nanopolish.cpp src/test/nanopolish_test.cpp # Automatically generated object names 
-CPP_OBJ=$(CPP_SRC:.cpp=.o) -C_OBJ=$(C_SRC:.c=.o) -CU_OBJ=$(CU_SRC:.cu=.o) +CPP_OBJ = $(CPP_SRC:.cpp=.o) +C_OBJ = $(C_SRC:.c=.o) + +ifdef cuda + + NVCC = nvcc + NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda/include -O3 -use_fast_math --default-stream per-thread -restrict + CURTFLAGS ?= -L/usr/local/cuda/lib64 -lcudart + + CUDA_INCLUDE?=-I/usr/local/cuda/include + CPPFLAGS+=$(CUDA_INCLUDE) + CPPFLAGS+=-DHAVE_CUDA=1 + + # Sub directories containing CUDA source code + SUBDIRS+=src/cuda_kernels + # Find the source files by searching subdirectories + CU_SRC := $(foreach dir, $(SUBDIRS), $(wildcard $(dir)/*.cu)) + # Automatically generated object names + CU_OBJ=$(CU_SRC:.cu=.o) + CPP_OBJ+=$(CU_OBJ) + LDFLAGS+=$(CURTFLAGS) .SUFFIXES: .cu +# Compile objects +.cu.o: + $(NVCC) -o $@ -c $(NVCCFLAGS) $(CPPFLAGS) $< + +endif + + + # Generate dependencies .PHONY: depend depend: .depend -.depend: $(CPP_SRC) $(C_SRC) $(CU_SRC) $(EXE_SRC) $(H5_LIB) $(EIGEN_CHECK) +.depend: $(CPP_SRC) $(C_SRC) $(EXE_SRC) $(H5_LIB) $(EIGEN_CHECK) rm -f ./.depend $(CXX) $(CXXFLAGS) $(CPPFLAGS) -MM $(CPP_SRC) $(C_SRC) > ./.depend; -include .depend - # Compile objects .cpp.o: $(CXX) -o $@ -c $(CXXFLAGS) $(CPPFLAGS) -fPIC $< @@ -141,16 +159,13 @@ include .depend .c.o: $(CC) -o $@ -c $(CFLAGS) $(CPPFLAGS) $(H5_INCLUDE) -fPIC $< -.cu.o: - $(NVCC) -o $@ -c $(NVCCFLAGS) $(CPPFLAGS) $< - # Link main executable -$(PROGRAM): src/main/nanopolish.o $(CU_OBJ) $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(EIGEN_CHECK) - $(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $< $(CPP_OBJ) $(CU_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(LIBS) $(LDFLAGS) $(CURTFLAGS) +$(PROGRAM): src/main/nanopolish.o $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(EIGEN_CHECK) + $(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $< $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(LIBS) $(LDFLAGS) # Link test executable -$(TEST_PROGRAM): src/test/nanopolish_test.o $(CPP_OBJ) $(CU_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) - $(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $< $(CPP_OBJ) 
$(CU_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(LIBS) $(LDFLAGS) $(CURTFLAGS) +$(TEST_PROGRAM): src/test/nanopolish_test.o $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) + $(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $< $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(LIBS) $(LDFLAGS) .PHONY: test test: $(TEST_PROGRAM) @@ -158,4 +173,5 @@ test: $(TEST_PROGRAM) .PHONY: clean clean: - rm -f $(PROGRAM) $(TEST_PROGRAM) $(CPP_OBJ) $(CU_OBJ) $(C_OBJ) src/main/nanopolish.o src/test/nanopolish_test.o src/main/nanopolish.o src/test/nanopolish_test.o + rm -f $(PROGRAM) $(TEST_PROGRAM) $(CPP_OBJ) $(C_OBJ) \ + src/main/nanopolish.o src/test/nanopolish_test.o diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 28f86574..90f991b3 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -38,10 +38,12 @@ #include "profiler.h" #include "progress.h" #include "stdaln.h" -#include -#include -#include -#include +#ifdef HAVE_CUDA + #include + #include + #include + #include +#endif // Macros #define max3(x,y,z) std::max(std::max(x,y), z) @@ -349,7 +351,7 @@ void prepareForBaseEditCandidates(int start, } } - +#ifdef HAVE_CUDA void locusRangeBaseEditCandidateGPU(int start, int end, const AlignmentDB& alignments, @@ -384,6 +386,7 @@ void locusRangeBaseEditCandidateGPU(int start, } } +#endif void locusRangeBaseEditCandidate(int start, int end, @@ -423,6 +426,7 @@ void locusRangeBaseEditCandidate(int start, } } +#ifdef HAVE_CUDA std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& alignments, int region_start, int region_end, @@ -507,6 +511,7 @@ std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& } return out_variants; } +#endif // Given the input region, calculate all single base edits to the current assembly std::vector generate_candidate_single_base_edits(const AlignmentDB& alignments, @@ -1087,10 +1092,15 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, std::vector 
single_base_edits; if(opt::gpu) { + #ifdef HAVE_CUDA single_base_edits = generate_candidate_single_base_edits_gpu(alignments, region_start, region_end, alignment_flags); + #else + fprintf(stderr,"Not compiled for CUDA\n"); + exit(1); + #endif } else { single_base_edits = generate_candidate_single_base_edits(alignments, region_start, From f19f9b8d9371870f9a3553e365ab90fa76683fe0 Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Fri, 27 Sep 2019 19:53:24 +1000 Subject: [PATCH 73/80] cleaned up to be consistent with the original code --- src/hmm/nanopolish_emissions.h | 4 +--- src/hmm/nanopolish_profile_hmm_r7.inl | 7 +++---- src/hmm/nanopolish_profile_hmm_r9.cpp | 2 +- src/hmm/nanopolish_profile_hmm_r9.inl | 28 +++++++++++---------------- src/main/nanopolish.cpp | 19 +++++++++--------- 5 files changed, 25 insertions(+), 35 deletions(-) diff --git a/src/hmm/nanopolish_emissions.h b/src/hmm/nanopolish_emissions.h index 3dca4746..f9e85142 100644 --- a/src/hmm/nanopolish_emissions.h +++ b/src/hmm/nanopolish_emissions.h @@ -58,12 +58,10 @@ inline float log_probability_match_r9(const SquiggleRead& read, const PoreModel& pore_model, uint32_t kmer_rank, uint32_t event_idx, - uint8_t strand, - bool debug = false) + uint8_t strand) { // event level mean, scaled with the drift value float level = read.get_drift_scaled_level(event_idx, strand); - GaussianParameters gp = read.get_scaled_gaussian_from_pore_model_state(pore_model, strand, kmer_rank); float lp = log_normal_pdf(level, gp); return lp; diff --git a/src/hmm/nanopolish_profile_hmm_r7.inl b/src/hmm/nanopolish_profile_hmm_r7.inl index 7f9083e9..bf0edd28 100644 --- a/src/hmm/nanopolish_profile_hmm_r7.inl +++ b/src/hmm/nanopolish_profile_hmm_r7.inl @@ -306,10 +306,9 @@ inline float profile_hmm_fill_generic_r7(const HMMInputSequence& _sequence, assert( data.pore_model->states.size() == sequence.get_num_kmer_ranks(k) ); std::vector kmer_ranks(num_kmers); - for(size_t ki = 0; ki < num_kmers; ++ki) { - int rank = 
sequence.get_kmer_rank(ki, k, data.rc); - kmer_ranks[ki] = rank; - } + for(size_t ki = 0; ki < num_kmers; ++ki) + kmer_ranks[ki] = sequence.get_kmer_rank(ki, k, data.rc); + size_t num_events = output.get_num_rows() - 1; std::vector pre_flank = make_pre_flanking_r7(data, parameters, e_start, num_events); diff --git a/src/hmm/nanopolish_profile_hmm_r9.cpp b/src/hmm/nanopolish_profile_hmm_r9.cpp index 1f365ebe..773394a7 100644 --- a/src/hmm/nanopolish_profile_hmm_r9.cpp +++ b/src/hmm/nanopolish_profile_hmm_r9.cpp @@ -46,7 +46,7 @@ float profile_hmm_score_r9(const HMMInputSequence& sequence, const HMMInputData& FloatMatrix fm; allocate_matrix(fm, n_rows, n_states); - profile_hmm_forward_initialize_r9(fm); // what does this do? + profile_hmm_forward_initialize_r9(fm); ProfileHMMForwardOutputR9 output(&fm); diff --git a/src/hmm/nanopolish_profile_hmm_r9.inl b/src/hmm/nanopolish_profile_hmm_r9.inl index c09b4321..71d52aba 100644 --- a/src/hmm/nanopolish_profile_hmm_r9.inl +++ b/src/hmm/nanopolish_profile_hmm_r9.inl @@ -216,6 +216,7 @@ inline std::vector make_pre_flanking(const HMMInputData& data, pre_flank[i] = log(TRANS_CLIP_SELF) + log_probability_background(*data.read, event_idx, data.strand) + // emit from background pre_flank[i - 1]; // this accounts for the transition from the start & to the silent pre + } return pre_flank; @@ -260,7 +261,7 @@ inline std::vector make_post_flanking(const HMMInputData& data, template inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, const HMMInputData& _data, - const uint32_t, //e_start apparently not used by this function + const uint32_t, uint32_t flags, ProfileHMMOutput& output) { @@ -281,10 +282,10 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, #endif uint32_t e_start = data.event_start_idx; - + // Calculate number of blocks // A block of the HMM is a set of states for one kmer - uint32_t num_blocks = output.get_num_columns() / PSR9_NUM_STATES; // num_columns is the number 
of HMM STATES + uint32_t num_blocks = output.get_num_columns() / PSR9_NUM_STATES; uint32_t last_event_row_idx = output.get_num_rows() - 1; // Precompute the transition probabilites for each kmer block @@ -300,10 +301,8 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, assert( data.pore_model->states.size() == sequence.get_num_kmer_ranks(k) ); std::vector kmer_ranks(num_kmers); - for(size_t ki = 0; ki < num_kmers; ++ki) { - int kr = sequence.get_kmer_rank(ki, k, data.rc); // can * -1 here to see if 3rd is correct - kmer_ranks[ki] = kr; - } + for(size_t ki = 0; ki < num_kmers; ++ki) + kmer_ranks[ki] = sequence.get_kmer_rank(ki, k, data.rc); size_t num_events = output.get_num_rows() - 1; @@ -338,8 +337,7 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, // Emission probabilities uint32_t event_idx = e_start + (row - 1) * data.event_stride; uint32_t rank = kmer_ranks[kmer_idx]; - float lp_emission_m = log_probability_match_r9(*data.read, *data.pore_model, rank, event_idx, data.strand, true); - + float lp_emission_m = log_probability_match_r9(*data.read, *data.pore_model, rank, event_idx, data.strand); float lp_emission_b = BAD_EVENT_PENALTY; HMMUpdateScores scores; @@ -351,8 +349,6 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, scores.x[HMT_FROM_PREV_B] = bt.lp_bm_next + output.get(row - 1, prev_block_offset + PSR9_BAD_EVENT); scores.x[HMT_FROM_PREV_K] = bt.lp_km + output.get(row - 1, prev_block_offset + PSR9_KMER_SKIP); - scores.x[HMT_FROM_PREV_B] = bt.lp_bm_next + output.get(row - 1, prev_block_offset + PSR9_BAD_EVENT); - // m_s is the probability of going from the start state // to this kmer. The start state is (currently) only // allowed to go to the first kmer. If ALLOW_PRE_CLIP @@ -361,10 +357,10 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, scores.x[HMT_FROM_SOFT] = (kmer_idx == 0 && (event_idx == e_start || (flags & HAF_ALLOW_PRE_CLIP))) ? 
lp_sm + pre_flank[row - 1] : -INFINITY; - + output.update_cell(row, curr_block_offset + PSR9_MATCH, scores, lp_emission_m); - // state PSR9_BAD_EVENT + // state PSR9_BAD_EVENT scores.x[HMT_FROM_SAME_M] = bt.lp_mb + output.get(row - 1, curr_block_offset + PSR9_MATCH); scores.x[HMT_FROM_PREV_M] = -INFINITY; // not allowed scores.x[HMT_FROM_SAME_B] = bt.lp_bb + output.get(row - 1, curr_block_offset + PSR9_BAD_EVENT); @@ -385,7 +381,6 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, // If POST_CLIP is enabled we allow the last kmer to transition directly // to the end after any event. Otherwise we only allow it from the // last kmer/event match. - if(kmer_idx == last_kmer_idx && ( (flags & HAF_ALLOW_POST_CLIP) || row == last_event_row_idx)) { float lp1 = lp_ms + output.get(row, curr_block_offset + PSR9_MATCH) + post_flank[row - 1]; float lp2 = lp_ms + output.get(row, curr_block_offset + PSR9_BAD_EVENT) + post_flank[row - 1]; @@ -396,7 +391,6 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, output.update_end(lp3, row, curr_block_offset + PSR9_KMER_SKIP); } - #ifdef DEBUG_LOCAL_ALIGNMENT printf("[%d %d] start: %.2lf pre: %.2lf fm: %.2lf\n", event_idx, kmer_idx, m_s + lp_emission_m, pre_flank[row - 1], output.get(row, curr_block_offset + PSR9_MATCH)); printf("[%d %d] end: %.2lf post: %.2lf\n", event_idx, kmer_idx, lp_end, post_flank[row - 1]); @@ -430,7 +424,7 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, #endif } } - - return output.get_end(); + + return output.get_end(); } diff --git a/src/main/nanopolish.cpp b/src/main/nanopolish.cpp index 417790ca..3962d79b 100644 --- a/src/main/nanopolish.cpp +++ b/src/main/nanopolish.cpp @@ -56,10 +56,10 @@ int print_usage(int, char **) int print_version(int, char **) { static const char *VERSION_MESSAGE = - "nanopolish version " PACKAGE_VERSION "\n" - "Written by Jared Simpson.\n" - "\n" - "Copyright 2015-2017 Ontario Institute for Cancer 
Research\n"; + "nanopolish version " PACKAGE_VERSION "\n" + "Written by Jared Simpson.\n" + "\n" + "Copyright 2015-2017 Ontario Institute for Cancer Research\n"; std::cout << VERSION_MESSAGE << std::endl; return 0; } @@ -77,10 +77,9 @@ int main(int argc, char** argv) } else { std::string command(argv[1]); auto iter = programs.find(command); - if (iter != programs.end()) { - ret = iter->second(argc - 1, argv + 1); - } - else + if (iter != programs.end()) + ret = iter->second( argc - 1, argv + 1); + else ret = print_usage( argc - 1, argv + 1); } @@ -93,8 +92,8 @@ int main(int argc, char** argv) extern int g_failed_alignment_reads; extern int g_bad_fast5_file; if(g_total_reads > 0) { - fprintf(stderr, "[post-run summaryz] total reads: %d, unparseable: %d, qc fail: %d, could not calibrate: %d, no alignment: %d, bad fast5: %d\n", - g_total_reads, g_unparseable_reads, g_qc_fail_reads, g_failed_calibration_reads, g_failed_alignment_reads, g_bad_fast5_file); + fprintf(stderr, "[post-run summary] total reads: %d, unparseable: %d, qc fail: %d, could not calibrate: %d, no alignment: %d, bad fast5: %d\n", + g_total_reads, g_unparseable_reads, g_qc_fail_reads, g_failed_calibration_reads, g_failed_alignment_reads, g_bad_fast5_file); } return ret; } From 56585975849a46d25bc54adc93c9b8058342f74d Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Sat, 28 Sep 2019 15:23:40 +1000 Subject: [PATCH 74/80] restructured to minimise changes to the original source code --- Makefile | 27 +- cuda.mk | 27 ++ src/cuda_kernels/gpu_call_variants.inl | 191 ++++++++++ src/nanopolish_call_variants.cpp | 475 ++++++++----------------- 4 files changed, 370 insertions(+), 350 deletions(-) create mode 100644 cuda.mk create mode 100644 src/cuda_kernels/gpu_call_variants.inl diff --git a/Makefile b/Makefile index f45fda70..5d3bbbe0 100644 --- a/Makefile +++ b/Makefile @@ -116,34 +116,9 @@ CPP_OBJ = $(CPP_SRC:.cpp=.o) C_OBJ = $(C_SRC:.c=.o) ifdef cuda - - NVCC = nvcc - NVCCFLAGS ?= -std=c++11 -I. 
-I/usr/local/cuda/include -O3 -use_fast_math --default-stream per-thread -restrict - CURTFLAGS ?= -L/usr/local/cuda/lib64 -lcudart - - CUDA_INCLUDE?=-I/usr/local/cuda/include - CPPFLAGS+=$(CUDA_INCLUDE) - CPPFLAGS+=-DHAVE_CUDA=1 - - # Sub directories containing CUDA source code - SUBDIRS+=src/cuda_kernels - # Find the source files by searching subdirectories - CU_SRC := $(foreach dir, $(SUBDIRS), $(wildcard $(dir)/*.cu)) - # Automatically generated object names - CU_OBJ=$(CU_SRC:.cu=.o) - CPP_OBJ+=$(CU_OBJ) - LDFLAGS+=$(CURTFLAGS) - -.SUFFIXES: .cu - -# Compile objects -.cu.o: - $(NVCC) -o $@ -c $(NVCCFLAGS) $(CPPFLAGS) $< - +include cuda.mk endif - - # Generate dependencies .PHONY: depend depend: .depend diff --git a/cuda.mk b/cuda.mk new file mode 100644 index 00000000..57c97bf4 --- /dev/null +++ b/cuda.mk @@ -0,0 +1,27 @@ +#Make file options for CUDA support + +NVCC ?= nvcc +CUDA_ROOT = /usr/local/cuda +CUDA_LIB ?= $(CUDA_ROOT)/lib64 +CUDA_INCLUDE ?= $(CUDA_ROOT)/include +CURTFLAGS = -L$(CUDA_LIB) -lcudart +NVCCFLAGS ?= -std=c++11 -I. 
-I$(CUDA_INCLUDE) -O3 -use_fast_math --default-stream per-thread -restrict + +CPPFLAGS += -I$(CUDA_INCLUDE) +CPPFLAGS += -DHAVE_CUDA=1 + +# Sub directories containing CUDA source code +SUBDIRS += src/cuda_kernels +# Find the source files by searching subdirectories +CU_SRC := $(foreach dir, $(SUBDIRS), $(wildcard $(dir)/*.cu)) +# Automatically generated object names +CU_OBJ = $(CU_SRC:.cu=.o) +CPP_OBJ += $(CU_OBJ) +LDFLAGS += $(CURTFLAGS) + +.SUFFIXES: .cu + +# Compile objects +.cu.o: + $(NVCC) -o $@ -c $(NVCCFLAGS) $(CPPFLAGS) $< + diff --git a/src/cuda_kernels/gpu_call_variants.inl b/src/cuda_kernels/gpu_call_variants.inl new file mode 100644 index 00000000..c5036dcf --- /dev/null +++ b/src/cuda_kernels/gpu_call_variants.inl @@ -0,0 +1,191 @@ +#include +#include +#include +#include + +void prepareForBaseEditCandidates(int start, + int end, + const AlignmentDB& alignments, + std::string contig, + std::vector> &tmp_variants_vector, + std::vector &haplotypes, + std::vector> &event_sequences_vector){ + for(int i = start; i<=end; i++){ + int calling_start = i - opt::screen_flanking_sequence; + int calling_end = i + 1 + opt::screen_flanking_sequence; + + if (!alignments.are_coordinates_valid(contig, calling_start, calling_end)) { + return; + } + + std::vector tmp_variants; + for (size_t j = 0; j < 4; ++j) { + // Substitutions + Variant v; + v.ref_name = contig; + v.ref_position = i; + v.ref_seq = alignments.get_reference_substring(contig, i, i); + v.alt_seq = "ACGT"[j]; + + if (v.ref_seq != v.alt_seq) { + tmp_variants.push_back(v); + } + + // Insertions + v.alt_seq = v.ref_seq + "ACGT"[j]; + // ignore insertions of the type "A" -> "AA" as these are redundant + if (v.alt_seq[1] != v.ref_seq[0]) { + tmp_variants.push_back(v); + } + } + + // deletion + Variant del; + del.ref_name = contig; + del.ref_position = i - 1; + del.ref_seq = alignments.get_reference_substring(contig, i - 1, i); + del.alt_seq = del.ref_seq[0]; + + // ignore deletions of the type "AA" -> "A" as 
these are redundant + if (del.alt_seq[0] != del.ref_seq[1]) { + tmp_variants.push_back(del); + } + + // Screen variants by score + // We do this internally here as it is much faster to get the event sequences + // for the entire window for all variants at this position once, rather than + // for each variant individually + std::vector event_sequences = alignments.get_event_subsequences(contig, calling_start, calling_end); + + Haplotype test_haplotype(contig, + calling_start, + alignments.get_reference_substring(contig, + calling_start, + calling_end)); + + haplotypes.push_back(test_haplotype); + event_sequences_vector.push_back(event_sequences); + tmp_variants_vector.push_back(tmp_variants); + } +} + + +void locusRangeBaseEditCandidateGPU(int start, + int end, + const AlignmentDB& alignments, + uint32_t alignment_flags, + std::vector &out_variants, + std::string contig, + GpuAligner &aligner, + std::mutex &outVariantsMutex) { + std::vector> tmp_variants_vector; + std::vector haplotypes; + std::vector> event_sequences_vector; + + prepareForBaseEditCandidates(start, + end, + alignments, + contig, + tmp_variants_vector, + haplotypes, + event_sequences_vector); + + std::vector scoredVariants = aligner.variantScoresThresholded(tmp_variants_vector, + haplotypes, + event_sequences_vector, + alignment_flags, + opt::screen_score_threshold, + opt::methylation_types); + for (auto variant: scoredVariants) { + if (variant.quality > 0) { + std::lock_guard lock(outVariantsMutex); + out_variants.push_back(variant); + } + } + +} + +std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& alignments, + int region_start, + int region_end, + uint32_t alignment_flags){ + + std::mutex outVariantsMutex; + std::vector out_variants; + std::string contig = alignments.get_region_contig(); + + // Add all positively-scoring single-base changes into the candidate set + size_t num_workers = (opt::num_threads < MAX_NUM_WORKERS) ? 
opt::num_threads : MAX_NUM_WORKERS; + std::vector gpuAligners(num_workers); + + //std::vector workerThreads(num_workers); + std::vector> handles(num_workers); + + int nextLocusBegin = region_start; + int nextLocusEnd = nextLocusBegin + LOCI_PER_WORKER; + bool finished = false; + + //Initialise the workers + for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { + auto aligner = std::ref(gpuAligners[workerIdx]); + if (!finished) { + if (nextLocusEnd == region_end) { + finished = true; + } + handles[workerIdx] = std::async(std::launch::async, + locusRangeBaseEditCandidateGPU, + nextLocusBegin, + nextLocusEnd, + std::ref(alignments), + alignment_flags, + std::ref(out_variants), + std::ref(contig), + aligner, + std::ref(outVariantsMutex)); + if ((nextLocusEnd + LOCI_PER_WORKER) < region_end){ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = nextLocusBegin + LOCI_PER_WORKER - 1; + }else{ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = region_end; + } + } + } + + //Round robin - assigning work to the workers until out of candidates + while (!finished) { + for (int i = 0; i < num_workers; i++) { + auto status = handles[i].wait_for(std::chrono::microseconds(100)); + if (status == std::future_status::ready && (!finished)) { + if (nextLocusEnd == region_end){ + finished = true; + } + auto aligner = std::ref(gpuAligners[i]); + handles[i].get(); + handles[i] = std::async(std::launch::async, + locusRangeBaseEditCandidateGPU, + nextLocusBegin, + nextLocusEnd, + std::ref(alignments), + alignment_flags, + std::ref(out_variants), + std::ref(contig), + aligner, + std::ref(outVariantsMutex)); + if ((nextLocusEnd + LOCI_PER_WORKER) < region_end){ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = nextLocusBegin + LOCI_PER_WORKER - 1; + }else{ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = region_end; + } + } + } + } + + //Block until all workers are complete + for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { + handles[workerIdx].wait(); + } 
+ return out_variants; +} diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 90f991b3..3aaf371a 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -38,12 +38,6 @@ #include "profiler.h" #include "progress.h" #include "stdaln.h" -#ifdef HAVE_CUDA - #include - #include - #include - #include -#endif // Macros #define max3(x,y,z) std::max(std::max(x,y), z) @@ -68,43 +62,43 @@ float g_p_skip, g_p_skip_self, g_p_bad, g_p_bad_self; #define SUBPROGRAM "variants" static const char *CONSENSUS_VERSION_MESSAGE = - SUBPROGRAM " Version " PACKAGE_VERSION "\n" - "Written by Jared Simpson.\n" - "\n" - "Copyright 2015 Ontario Institute for Cancer Research\n"; +SUBPROGRAM " Version " PACKAGE_VERSION "\n" +"Written by Jared Simpson.\n" +"\n" +"Copyright 2015 Ontario Institute for Cancer Research\n"; static const char *CONSENSUS_USAGE_MESSAGE = - "Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTIONS] --reads reads.fa --bam alignments.bam --genome genome.fa\n" - "Find SNPs using a signal-level HMM\n" - "\n" - " -v, --verbose display verbose output\n" - " --version display version\n" - " --help display this help and exit\n" - " --snps only call SNPs\n" - " --consensus run in consensus calling mode\n" - " --fix-homopolymers run the experimental homopolymer caller\n" - " --faster minimize compute time while slightly reducing consensus accuracy\n" - " -w, --window=STR find variants in window STR (format: :-)\n" - " -r, --reads=FILE the ONT reads are in fasta FILE\n" - " -b, --bam=FILE the reads aligned to the reference genome are in bam FILE\n" - " -e, --event-bam=FILE the events aligned to the reference genome are in bam FILE\n" - " -g, --genome=FILE the reference genome is in FILE\n" - " -p, --ploidy=NUM the ploidy level of the sequenced genome\n" - " -q --methylation-aware=STR turn on methylation aware polishing and test motifs given in STR (example: -q dcm,dam)\n" - " --genotype=FILE call genotypes for the variants in 
the vcf FILE\n" - " -o, --outfile=FILE write result to FILE [default: stdout]\n" - " -t, --threads=NUM use NUM threads (default: 1)\n" - " -m, --min-candidate-frequency=F extract candidate variants from the aligned reads when the variant frequency is at least F (default 0.2)\n" - " -d, --min-candidate-depth=D extract candidate variants from the aligned reads when the depth is at least D (default: 20)\n" - " -x, --max-haplotypes=N consider at most N haplotype combinations (default: 1000)\n" - " --min-flanking-sequence=N distance from alignment end to calculate variants (default: 30)\n" - " --max-rounds=N perform N rounds of consensus sequence improvement (default: 50)\n" - " -c, --candidates=VCF read variant candidates from VCF, rather than discovering them from aligned reads\n" - " -a, --alternative-basecalls-bam=FILE if an alternative basecaller was used that does not output event annotations\n" - " then use basecalled sequences from FILE. The signal-level events will still be taken from the -b bam.\n" - " --calculate-all-support when making a call, also calculate the support of the 3 other possible bases\n" - " --models-fofn=FILE read alternative k-mer models from FILE\n" - "\nReport bugs to " PACKAGE_BUGREPORT "\n\n"; +"Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTIONS] --reads reads.fa --bam alignments.bam --genome genome.fa\n" +"Find SNPs using a signal-level HMM\n" +"\n" +" -v, --verbose display verbose output\n" +" --version display version\n" +" --help display this help and exit\n" +" --snps only call SNPs\n" +" --consensus run in consensus calling mode\n" +" --fix-homopolymers run the experimental homopolymer caller\n" +" --faster minimize compute time while slightly reducing consensus accuracy\n" +" -w, --window=STR find variants in window STR (format: :-)\n" +" -r, --reads=FILE the ONT reads are in fasta FILE\n" +" -b, --bam=FILE the reads aligned to the reference genome are in bam FILE\n" +" -e, --event-bam=FILE the events aligned to the reference genome 
are in bam FILE\n" +" -g, --genome=FILE the reference genome is in FILE\n" +" -p, --ploidy=NUM the ploidy level of the sequenced genome\n" +" -q --methylation-aware=STR turn on methylation aware polishing and test motifs given in STR (example: -q dcm,dam)\n" +" --genotype=FILE call genotypes for the variants in the vcf FILE\n" +" -o, --outfile=FILE write result to FILE [default: stdout]\n" +" -t, --threads=NUM use NUM threads (default: 1)\n" +" -m, --min-candidate-frequency=F extract candidate variants from the aligned reads when the variant frequency is at least F (default 0.2)\n" +" -d, --min-candidate-depth=D extract candidate variants from the aligned reads when the depth is at least D (default: 20)\n" +" -x, --max-haplotypes=N consider at most N haplotype combinations (default: 1000)\n" +" --min-flanking-sequence=N distance from alignment end to calculate variants (default: 30)\n" +" --max-rounds=N perform N rounds of consensus sequence improvement (default: 50)\n" +" -c, --candidates=VCF read variant candidates from VCF, rather than discovering them from aligned reads\n" +" -a, --alternative-basecalls-bam=FILE if an alternative basecaller was used that does not output event annotations\n" +" then use basecalled sequences from FILE. 
The signal-level events will still be taken from the -b bam.\n" +" --calculate-all-support when making a call, also calculate the support of the 3 other possible bases\n" +" --models-fofn=FILE read alternative k-mer models from FILE\n" +"\nReport bugs to " PACKAGE_BUGREPORT "\n\n"; namespace opt { @@ -140,7 +134,6 @@ namespace opt static int debug_alignments = 0; static std::vector methylation_types; static int gpu = 0; - } static const char* shortopts = "r:b:g:t:w:o:e:m:c:d:a:x:q:p:v"; @@ -285,23 +278,28 @@ void annotate_with_all_support(std::vector& variants, } } -void prepareForBaseEditCandidates(int start, - int end, - const AlignmentDB& alignments, - std::string contig, - std::vector> &tmp_variants_vector, - std::vector &haplotypes, - std::vector> &event_sequences_vector){ - for(int i = start; i<=end; i++){ +// Given the input region, calculate all single base edits to the current assembly +std::vector generate_candidate_single_base_edits(const AlignmentDB& alignments, + int region_start, + int region_end, + uint32_t alignment_flags) +{ + std::vector out_variants; + + std::string contig = alignments.get_region_contig(); + + // Add all positively-scoring single-base changes into the candidate set + for(size_t i = region_start; i < region_end; ++i) { + int calling_start = i - opt::screen_flanking_sequence; int calling_end = i + 1 + opt::screen_flanking_sequence; - if (!alignments.are_coordinates_valid(contig, calling_start, calling_end)) { - return; + if(!alignments.are_coordinates_valid(contig, calling_start, calling_end)) { + continue; } std::vector tmp_variants; - for (size_t j = 0; j < 4; ++j) { + for(size_t j = 0; j < 4; ++j) { // Substitutions Variant v; v.ref_name = contig; @@ -309,14 +307,14 @@ void prepareForBaseEditCandidates(int start, v.ref_seq = alignments.get_reference_substring(contig, i, i); v.alt_seq = "ACGT"[j]; - if (v.ref_seq != v.alt_seq) { + if(v.ref_seq != v.alt_seq) { tmp_variants.push_back(v); } // Insertions v.alt_seq = v.ref_seq + 
"ACGT"[j]; // ignore insertions of the type "A" -> "AA" as these are redundant - if (v.alt_seq[1] != v.ref_seq[0]) { + if(v.alt_seq[1] != v.ref_seq[0]) { tmp_variants.push_back(v); } } @@ -329,7 +327,7 @@ void prepareForBaseEditCandidates(int start, del.alt_seq = del.ref_seq[0]; // ignore deletions of the type "AA" -> "A" as these are redundant - if (del.alt_seq[0] != del.ref_seq[1]) { + if(del.alt_seq[0] != del.ref_seq[1]) { tmp_variants.push_back(del); } @@ -337,199 +335,29 @@ void prepareForBaseEditCandidates(int start, // We do this internally here as it is much faster to get the event sequences // for the entire window for all variants at this position once, rather than // for each variant individually - std::vector event_sequences = alignments.get_event_subsequences(contig, calling_start, calling_end); + std::vector event_sequences = + alignments.get_event_subsequences(contig, calling_start, calling_end); Haplotype test_haplotype(contig, calling_start, - alignments.get_reference_substring(contig, - calling_start, - calling_end)); - - haplotypes.push_back(test_haplotype); - event_sequences_vector.push_back(event_sequences); - tmp_variants_vector.push_back(tmp_variants); - } -} - -#ifdef HAVE_CUDA -void locusRangeBaseEditCandidateGPU(int start, - int end, - const AlignmentDB& alignments, - uint32_t alignment_flags, - std::vector &out_variants, - std::string contig, - GpuAligner &aligner, - std::mutex &outVariantsMutex) { - std::vector> tmp_variants_vector; - std::vector haplotypes; - std::vector> event_sequences_vector; - - prepareForBaseEditCandidates(start, - end, - alignments, - contig, - tmp_variants_vector, - haplotypes, - event_sequences_vector); - - std::vector scoredVariants = aligner.variantScoresThresholded(tmp_variants_vector, - haplotypes, - event_sequences_vector, - alignment_flags, - opt::screen_score_threshold, - opt::methylation_types); - for (auto variant: scoredVariants) { - if (variant.quality > 0) { - std::lock_guard lock(outVariantsMutex); 
- out_variants.push_back(variant); - } - } - -} -#endif + alignments.get_reference_substring(contig, calling_start, calling_end)); -void locusRangeBaseEditCandidate(int start, - int end, - const AlignmentDB& alignments, - uint32_t alignment_flags, - std::vector &out_variants, - std::string contig) { - std::vector> tmp_variants_vector; - std::vector haplotypes; - std::vector> event_sequences_vector; - - prepareForBaseEditCandidates(start, - end, - alignments, - contig, - tmp_variants_vector, - haplotypes, - event_sequences_vector); - - int numHaplotypes = haplotypes.size(); - for (int haplotypeIDX = 0; haplotypeIDX < numHaplotypes; haplotypeIDX++) { - auto variants = tmp_variants_vector[haplotypeIDX]; - auto test_haplotype = haplotypes[haplotypeIDX]; - auto event_sequences = event_sequences_vector[haplotypeIDX]; - for (const Variant &v : variants) { - Variant scored_variant = score_variant_thresholded(v, - test_haplotype, - event_sequences, - alignment_flags, - opt::screen_score_threshold, - opt::methylation_types); + for(const Variant& v : tmp_variants) { + Variant scored_variant = score_variant_thresholded(v, test_haplotype, event_sequences, alignment_flags, opt::screen_score_threshold, opt::methylation_types); scored_variant.info = ""; - if (scored_variant.quality > 0) { + if(scored_variant.quality > 0) { out_variants.push_back(scored_variant); } } + } + return out_variants; } #ifdef HAVE_CUDA -std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& alignments, - int region_start, - int region_end, - uint32_t alignment_flags){ - - std::mutex outVariantsMutex; - std::vector out_variants; - std::string contig = alignments.get_region_contig(); - - // Add all positively-scoring single-base changes into the candidate set - size_t num_workers = (opt::num_threads < MAX_NUM_WORKERS) ? 
opt::num_threads : MAX_NUM_WORKERS; - std::vector gpuAligners(num_workers); - - //std::vector workerThreads(num_workers); - std::vector> handles(num_workers); - - int nextLocusBegin = region_start; - int nextLocusEnd = nextLocusBegin + LOCI_PER_WORKER; - bool finished = false; - - //Initialise the workers - for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { - auto aligner = std::ref(gpuAligners[workerIdx]); - if (!finished) { - if (nextLocusEnd == region_end) { - finished = true; - } - handles[workerIdx] = std::async(std::launch::async, - locusRangeBaseEditCandidateGPU, - nextLocusBegin, - nextLocusEnd, - std::ref(alignments), - alignment_flags, - std::ref(out_variants), - std::ref(contig), - aligner, - std::ref(outVariantsMutex)); - if ((nextLocusEnd + LOCI_PER_WORKER) < region_end){ - nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = nextLocusBegin + LOCI_PER_WORKER - 1; - }else{ - nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = region_end; - } - } - } - - //Round robin - assigning work to the workers until out of candidates - while (!finished) { - for (int i = 0; i < num_workers; i++) { - auto status = handles[i].wait_for(std::chrono::microseconds(100)); - if (status == std::future_status::ready && (!finished)) { - if (nextLocusEnd == region_end){ - finished = true; - } - auto aligner = std::ref(gpuAligners[i]); - handles[i].get(); - handles[i] = std::async(std::launch::async, - locusRangeBaseEditCandidateGPU, - nextLocusBegin, - nextLocusEnd, - std::ref(alignments), - alignment_flags, - std::ref(out_variants), - std::ref(contig), - aligner, - std::ref(outVariantsMutex)); - if ((nextLocusEnd + LOCI_PER_WORKER) < region_end){ - nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = nextLocusBegin + LOCI_PER_WORKER - 1; - }else{ - nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = region_end; - } - } - } - } - - //Block until all workers are complete - for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { - handles[workerIdx].wait(); - } 
- return out_variants; -} + #include #endif -// Given the input region, calculate all single base edits to the current assembly -std::vector generate_candidate_single_base_edits(const AlignmentDB& alignments, - int region_start, - int region_end, - uint32_t alignment_flags){ - std::vector out_variants; - std::string contig = alignments.get_region_contig(); - locusRangeBaseEditCandidate(region_start, - region_end, - alignments, - alignment_flags, - out_variants, - std::ref(contig)); - - return out_variants; -} - // Given the input set of variants, calculate the variants that have a positive score std::vector screen_variants_by_score(const AlignmentDB& alignments, const std::vector& candidate_variants, @@ -847,7 +675,7 @@ Haplotype fix_homopolymers(const Haplotype& input_haplotype, duration_likelihoods[var_sequence_length] += log_gamma; } if(opt::verbose > 3) { - fprintf(stderr, "SUM_VAR\t%zu\t%zu\t%d\t%d\t%lu\t%.5lf\t%.2lf\n", ref_hp_start, hp_length, var_sequence_length, call_window, variant_offset_end - variant_offset_start, sum_duration, log_gamma); + fprintf(stderr, "SUM_VAR\t%zu\t%zu\t%d\t%d\t%lu\t%.5lf\t%.2lf\n", ref_hp_start, hp_length, var_sequence_length, call_window, variant_offset_end - variant_offset_start, sum_duration, log_gamma); } } } @@ -963,7 +791,7 @@ Haplotype call_haplotype_from_candidates(const AlignmentDB& alignments, size_t end_variant_idx = curr_variant_idx + 1; while(end_variant_idx < candidate_variants.size()) { int distance = candidate_variants[end_variant_idx].ref_position - - candidate_variants[end_variant_idx - 1].ref_position; + candidate_variants[end_variant_idx - 1].ref_position; if(distance > opt::min_distance_between_variants) break; end_variant_idx++; @@ -972,8 +800,8 @@ Haplotype call_haplotype_from_candidates(const AlignmentDB& alignments, size_t num_variants = end_variant_idx - curr_variant_idx; int calling_start = candidate_variants[curr_variant_idx].ref_position - opt::min_flanking_sequence; int calling_end = 
candidate_variants[end_variant_idx - 1].ref_position + - candidate_variants[end_variant_idx - 1].ref_seq.length() + - opt::min_flanking_sequence; + candidate_variants[end_variant_idx - 1].ref_seq.length() + + opt::min_flanking_sequence; int calling_size = calling_end - calling_start; @@ -1017,7 +845,7 @@ Haplotype call_haplotype_from_candidates(const AlignmentDB& alignments, } } else { fprintf(stderr, "Warning: %zu variants in span, region not called [%d %d]\n", num_variants, calling_start, calling_end); - } + } // advance to start of next region curr_variant_idx = end_variant_idx; @@ -1073,11 +901,12 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, fprintf(stderr, "input region: %s\n", alignments.get_reference_substring(contig, region_start - BUFFER, region_end + BUFFER).c_str()); } - /* - Haplotype called_haplotype(alignments.get_region_contig(), - alignments.get_region_start(), - alignments.get_reference()); - */ +/* + Haplotype called_haplotype(alignments.get_region_contig(), + alignments.get_region_start(), + alignments.get_reference()); +*/ + // Step 1. 
Discover putative variants across the whole region std::vector candidate_variants; if(opt::candidates_file.empty()) { @@ -1089,24 +918,20 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, if(opt::consensus_mode) { // generate single-base edits that have a positive haplotype score - std::vector single_base_edits; - if(opt::gpu) { - #ifdef HAVE_CUDA - single_base_edits = generate_candidate_single_base_edits_gpu(alignments, - region_start, - region_end, - alignment_flags); - #else - fprintf(stderr,"Not compiled for CUDA\n"); - exit(1); - #endif - } else { - single_base_edits = generate_candidate_single_base_edits(alignments, - region_start, - region_end, - alignment_flags); + if(opt::gpu==0) { + single_base_edits= generate_candidate_single_base_edits(alignments, region_start, region_end, alignment_flags); + } + else{ + #ifdef HAVE_CUDA + single_base_edits= generate_candidate_single_base_edits_gpu(alignments, region_start, region_end, alignment_flags); + #else + fprintf(stderr,"--gpu option is only effective when compiled with CUDA support\n"); + fprintf(stderr,"Please compile nanopolish by 'make cuda=1'. You need to have CUDA toolkit setup for this."); + exit(EXIT_FAILURE); + #endif } + // insert these into the candidate set candidate_variants.insert(candidate_variants.end(), single_base_edits.begin(), single_base_edits.end()); @@ -1117,6 +942,8 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, std::sort(candidate_variants.begin(), candidate_variants.end(), sortByPosition); } + // Step 2. 
Call variants + Haplotype called_haplotype(alignments.get_region_contig(), alignments.get_region_start(), alignments.get_reference()); @@ -1162,10 +989,10 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, last_round_variant_keys = this_round_variant_keys; if(variant_set_changed) { candidate_variants = expand_variants(alignments, - called_variants, - region_start, - region_end, - alignment_flags); + called_variants, + region_start, + region_end, + alignment_flags); } else { break; @@ -1196,44 +1023,44 @@ void parse_call_variants_options(int argc, char** argv) for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { std::istringstream arg(optarg != NULL ? optarg : ""); switch (c) { - case 'r': arg >> opt::reads_file; break; - case 'g': arg >> opt::genome_file; break; - case 'b': arg >> opt::bam_file; break; - case 'e': arg >> opt::event_bam_file; break; - case 'w': arg >> opt::window; break; - case 'o': arg >> opt::output_file; break; - case 'm': arg >> opt::min_candidate_frequency; break; - case 'd': arg >> opt::min_candidate_depth; break; - case 'x': arg >> opt::max_haplotypes; break; - case 'c': arg >> opt::candidates_file; break; - case 'p': arg >> opt::ploidy; break; - case 'q': arg >> methylation_motifs_str; break; - case 'a': arg >> opt::alternative_basecalls_bam; break; - case '?': die = true; break; - case 't': arg >> opt::num_threads; break; - case 'v': opt::verbose++; break; - case OPT_CONSENSUS: opt::consensus_mode = 1; break; - case OPT_GPU: opt::gpu = 1; break; - case OPT_FIX_HOMOPOLYMERS: opt::fix_homopolymers = 1; break; - case OPT_EFFORT: arg >> opt::screen_score_threshold; break; - case OPT_FASTER: opt::screen_score_threshold = 25; break; - case OPT_MAX_ROUNDS: arg >> opt::max_rounds; break; - case OPT_GENOTYPE: opt::genotype_only = 1; arg >> opt::candidates_file; break; - case OPT_MODELS_FOFN: arg >> opt::models_fofn; break; - case OPT_CALC_ALL_SUPPORT: opt::calculate_all_support = 1; break; - 
case OPT_SNPS_ONLY: opt::snps_only = 1; break; - case OPT_PROGRESS: opt::show_progress = 1; break; - case OPT_P_SKIP: arg >> g_p_skip; break; - case OPT_P_SKIP_SELF: arg >> g_p_skip_self; break; - case OPT_P_BAD: arg >> g_p_bad; break; - case OPT_P_BAD_SELF: arg >> g_p_bad_self; break; - case OPT_MIN_FLANKING_SEQUENCE: arg >> opt::min_flanking_sequence; break; - case OPT_HELP: - std::cout << CONSENSUS_USAGE_MESSAGE; - exit(EXIT_SUCCESS); - case OPT_VERSION: - std::cout << CONSENSUS_VERSION_MESSAGE; - exit(EXIT_SUCCESS); + case 'r': arg >> opt::reads_file; break; + case 'g': arg >> opt::genome_file; break; + case 'b': arg >> opt::bam_file; break; + case 'e': arg >> opt::event_bam_file; break; + case 'w': arg >> opt::window; break; + case 'o': arg >> opt::output_file; break; + case 'm': arg >> opt::min_candidate_frequency; break; + case 'd': arg >> opt::min_candidate_depth; break; + case 'x': arg >> opt::max_haplotypes; break; + case 'c': arg >> opt::candidates_file; break; + case 'p': arg >> opt::ploidy; break; + case 'q': arg >> methylation_motifs_str; break; + case 'a': arg >> opt::alternative_basecalls_bam; break; + case '?': die = true; break; + case 't': arg >> opt::num_threads; break; + case 'v': opt::verbose++; break; + case OPT_CONSENSUS: opt::consensus_mode = 1; break; + case OPT_GPU: opt::gpu = 1; break; + case OPT_FIX_HOMOPOLYMERS: opt::fix_homopolymers = 1; break; + case OPT_EFFORT: arg >> opt::screen_score_threshold; break; + case OPT_FASTER: opt::screen_score_threshold = 25; break; + case OPT_MAX_ROUNDS: arg >> opt::max_rounds; break; + case OPT_GENOTYPE: opt::genotype_only = 1; arg >> opt::candidates_file; break; + case OPT_MODELS_FOFN: arg >> opt::models_fofn; break; + case OPT_CALC_ALL_SUPPORT: opt::calculate_all_support = 1; break; + case OPT_SNPS_ONLY: opt::snps_only = 1; break; + case OPT_PROGRESS: opt::show_progress = 1; break; + case OPT_P_SKIP: arg >> g_p_skip; break; + case OPT_P_SKIP_SELF: arg >> g_p_skip_self; break; + case OPT_P_BAD: arg 
>> g_p_bad; break; + case OPT_P_BAD_SELF: arg >> g_p_bad_self; break; + case OPT_MIN_FLANKING_SEQUENCE: arg >> opt::min_flanking_sequence; break; + case OPT_HELP: + std::cout << CONSENSUS_USAGE_MESSAGE; + exit(EXIT_SUCCESS); + case OPT_VERSION: + std::cout << CONSENSUS_VERSION_MESSAGE; + exit(EXIT_SUCCESS); } } @@ -1287,10 +1114,10 @@ void parse_call_variants_options(int argc, char** argv) } if (die) - { - std::cout << "\n" << CONSENSUS_USAGE_MESSAGE; - exit(EXIT_FAILURE); - } + { + std::cout << "\n" << CONSENSUS_USAGE_MESSAGE; + exit(EXIT_FAILURE); + } } void print_invalid_window_error(int start_base, int end_base) @@ -1356,34 +1183,34 @@ int call_variants_main(int argc, char** argv) // header_fields.push_back( - Variant::make_vcf_tag_string("INFO", "TotalReads", 1, "Integer", - "The number of event-space reads used to call the variant")); + Variant::make_vcf_tag_string("INFO", "TotalReads", 1, "Integer", + "The number of event-space reads used to call the variant")); header_fields.push_back( - Variant::make_vcf_tag_string("INFO", "SupportFraction", 1, "Float", - "The fraction of event-space reads that support the variant")); + Variant::make_vcf_tag_string("INFO", "SupportFraction", 1, "Float", + "The fraction of event-space reads that support the variant")); header_fields.push_back( - Variant::make_vcf_tag_string("INFO", "BaseCalledReadsWithVariant", 1, "Integer", - "The number of base-space reads that support the variant")); + Variant::make_vcf_tag_string("INFO", "BaseCalledReadsWithVariant", 1, "Integer", + "The number of base-space reads that support the variant")); header_fields.push_back( - Variant::make_vcf_tag_string("INFO", "BaseCalledFraction", 1, "Float", - "The fraction of base-space reads that support the variant")); + Variant::make_vcf_tag_string("INFO", "BaseCalledFraction", 1, "Float", + "The fraction of base-space reads that support the variant")); header_fields.push_back( - Variant::make_vcf_tag_string("INFO", "AlleleCount", 1, "Integer", - "The 
inferred number of copies of the allele")); + Variant::make_vcf_tag_string("INFO", "AlleleCount", 1, "Integer", + "The inferred number of copies of the allele")); if(opt::calculate_all_support) { header_fields.push_back( - Variant::make_vcf_tag_string("INFO", "SupportFractionByBase", 4, "Integer", - "The fraction of reads supporting A,C,G,T at this position")); + Variant::make_vcf_tag_string("INFO", "SupportFractionByBase", 4, "Integer", + "The fraction of reads supporting A,C,G,T at this position")); } header_fields.push_back( - Variant::make_vcf_tag_string("FORMAT", "GT", 1, "String", - "Genotype")); + Variant::make_vcf_tag_string("FORMAT", "GT", 1, "String", + "Genotype")); Variant::write_vcf_header(out_fp, header_fields); @@ -1393,9 +1220,9 @@ int call_variants_main(int argc, char** argv) if(!opt::consensus_output.empty()) { FILE* consensus_fp = fopen(opt::consensus_output.c_str(), "w"); fprintf(consensus_fp, ">%s:%d-%d\n%s\n", contig.c_str(), - start_base, - end_base, - haplotype.get_sequence().c_str()); + start_base, + end_base, + haplotype.get_sequence().c_str()); fclose(consensus_fp); } From 8338b92df9d54de3b2457068df910ab47138a6b6 Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Sat, 28 Sep 2019 15:42:06 +1000 Subject: [PATCH 75/80] make the --gpu more clear --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ebf66275..fc36fc65 100644 --- a/README.md +++ b/README.md @@ -116,10 +116,11 @@ docker run -v /path/to/local/data/data/:/data/ -it :image_id ./nanopolish event ## GPU acceleration -The nanopolish consensus improvement algorithm can be performed faster using CUDA-enabled GPU acceleration. This is an experimental feature, to try this feature run with the `--gpu` flag e.g: +The nanopolish consensus improvement algorithm can be performed faster using CUDA-enabled GPU acceleration. 
This is an experimental feature; to try it, run with the `--gpu=1` flag, e.g.: ``` nanopolish variants --consensus polished_gpu.fa -w "tig00000001:200000-230000" -r reads.fasta -b reads.sorted.bam -g draft.fa --threads=8 --gpu=1 ``` +Note that this feature requires nanopolish to be compiled with `make cuda=1`. You should have the [CUDA toolkit installed and configured](https://docs.nvidia.com/cuda/cuda-quick-start-guide/). If your CUDA installation is not in the default location, you can provide the path to make as `make cuda=1 NVCC=/path/to/nvidia_c_compiler CUDA_LIB=/path/to/cuda/lib CUDA_INCLUDE=/path/to/cuda/include`. ## Credits and Thanks From c05733bbfea00f0b3e375d19f5be637ca3a10bd6 Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Sat, 28 Sep 2019 16:09:03 +1000 Subject: [PATCH 76/80] set to cuda static runtime library --- cuda.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda.mk b/cuda.mk index 57c97bf4..59c91c4f 100644 --- a/cuda.mk +++ b/cuda.mk @@ -4,7 +4,7 @@ NVCC ?= nvcc CUDA_ROOT = /usr/local/cuda CUDA_LIB ?= $(CUDA_ROOT)/lib64 CUDA_INCLUDE ?= $(CUDA_ROOT)/include -CURTFLAGS = -L$(CUDA_LIB) -lcudart +CURTFLAGS = -L$(CUDA_LIB) -lcudart_static -lrt NVCCFLAGS ?= -std=c++11 -I.
-I$(CUDA_INCLUDE) -O3 -use_fast_math --default-stream per-thread -restrict CPPFLAGS += -I$(CUDA_INCLUDE) From 8964db064785eb039b43f23657520680610ef01f Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Sat, 28 Sep 2019 18:30:52 +1000 Subject: [PATCH 77/80] removed .gitignore in test/ --- test/.gitignore | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 test/.gitignore diff --git a/test/.gitignore b/test/.gitignore deleted file mode 100644 index 479a396b..00000000 --- a/test/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -ecoli_2kb_region - From 896b8066e49d9004f0e62aed70d1f966b49bdb3a Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Thu, 3 Oct 2019 21:47:29 +1000 Subject: [PATCH 78/80] add cuda object file to make file clean option --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 1e56cca4..9d7a6974 100644 --- a/Makefile +++ b/Makefile @@ -155,4 +155,5 @@ test: $(TEST_PROGRAM) .PHONY: clean clean: rm -f $(PROGRAM) $(TEST_PROGRAM) $(CPP_OBJ) $(C_OBJ) \ + src/cuda_kernels/GpuAligner.o \ src/main/nanopolish.o src/test/nanopolish_test.o From 09df08dbe6e3aefa2332acf2988f72b6124b72c6 Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Fri, 13 Mar 2020 11:14:14 +1100 Subject: [PATCH 79/80] implementation of the methylation aware polishing option for the GPU --- Makefile | 2 +- cuda.mk | 2 +- src/cuda_kernels/GpuAligner.cu | 722 --------------- src/cuda_kernels/gpu_aligner.cu | 864 ++++++++++++++++++ .../{GpuAligner.h => gpu_aligner.h} | 29 +- src/cuda_kernels/gpu_call_variants.inl | 2 +- src/pore_model/nanopolish_pore_model_set.cpp | 10 + src/pore_model/nanopolish_pore_model_set.h | 10 +- 8 files changed, 909 insertions(+), 732 deletions(-) delete mode 100644 src/cuda_kernels/GpuAligner.cu create mode 100644 src/cuda_kernels/gpu_aligner.cu rename src/cuda_kernels/{GpuAligner.h => gpu_aligner.h} (76%) diff --git a/Makefile b/Makefile index bd5eb566..40d90313 100644 --- a/Makefile +++ b/Makefile @@ -176,5 +176,5
@@ test: $(TEST_PROGRAM) .PHONY: clean clean: rm -f $(PROGRAM) $(TEST_PROGRAM) $(CPP_OBJ) $(C_OBJ) \ - src/cuda_kernels/GpuAligner.o \ + src/cuda_kernels/gpu_aligner.o \ src/main/nanopolish.o src/test/nanopolish_test.o diff --git a/cuda.mk b/cuda.mk index 59c91c4f..50330d3e 100644 --- a/cuda.mk +++ b/cuda.mk @@ -5,7 +5,7 @@ CUDA_ROOT = /usr/local/cuda CUDA_LIB ?= $(CUDA_ROOT)/lib64 CUDA_INCLUDE ?= $(CUDA_ROOT)/include CURTFLAGS = -L$(CUDA_LIB) -lcudart_static -lrt -NVCCFLAGS ?= -std=c++11 -I. -I$(CUDA_INCLUDE) -O3 -use_fast_math --default-stream per-thread -restrict +NVCCFLAGS ?= -g -lineinfo -std=c++11 -I. -I$(CUDA_INCLUDE) -O3 -use_fast_math --default-stream per-thread -restrict CPPFLAGS += -I$(CUDA_INCLUDE) CPPFLAGS += -DHAVE_CUDA=1 diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu deleted file mode 100644 index 36b6378a..00000000 --- a/src/cuda_kernels/GpuAligner.cu +++ /dev/null @@ -1,722 +0,0 @@ -#include -#include -#include "GpuAligner.h" -#include -#include "nanopolish_profile_hmm_r9.h" - -#define MAX_STATES 256 - -#define EXPAND_TO_STRING(X) #X -#define TO_STRING(X) EXPAND_TO_STRING(X) -#define CU_CHECK_ERR(X) if (X != cudaSuccess){printf("CUDA error: <<%s>> at line %s\n", cudaGetErrorString(X), TO_STRING(__LINE__));throw std::runtime_error("CUDA ERROR");} - -__device__ float logsumexpf(float x, float y){ - if(x == -INFINITY && y == -INFINITY){ - return -INFINITY; - } - float result = fmax(x, y) + log1pf(expf(-fabsf(y - x))); - return result; -} - -__device__ float lp_match_r9(int rank, - float mean, - float pore_mean, - float pore_stdv, - float pore_log_level_stdv, - float scale, - float shift, - float var, - float logVar){ - - float log_inv_sqrt_2pi = logf(0.3989422804014327); - - float level = mean; - float gaussian_mean = scale * pore_mean + shift; - float gaussian_stdv = pore_stdv * var; - float gaussian_log_level_stdv = pore_log_level_stdv + logVar; - - float a = (level - gaussian_mean) / gaussian_stdv; - float emission = 
log_inv_sqrt_2pi - gaussian_log_level_stdv + (-0.5f * a * a); - return emission; - -} - -__global__ void getScoresMod (float * poreModelDev, - int * readLengthsDev, - int * eventStartsDev, - int * eventStridesDev, - float * eventsPerBaseDev, - float * scaleDev, - float * shiftDev, - float * varDev, - float * logVarDev, - int * eventOffsetsDev, - float * eventMeansDev, - float * preFlankingDev, - float * postFlankingDev, - int * sequenceLengthsDev, - int * sequenceOffsetsDev, - int * kmerRanksDev, - int * seqIdxDev, - int * readIdxDev, - int numScores, - float * returnValuesDev){ - - bool debug = false; - if ((threadIdx.x == 0) && (blockIdx.x == 0)){ - debug = false; - } - - // get buffer indices - int scoreIdx = blockIdx.x * blockDim.x + threadIdx.x; - - if (scoreIdx < numScores) { - - int readIdx = readIdxDev[scoreIdx]; - int seqIdx = seqIdxDev[scoreIdx]; - - // get read statistics - int numEvents = readLengthsDev[readIdx]; - int readOffset = eventOffsetsDev[readIdx]; - float read_events_per_base = eventsPerBaseDev[readIdx]; - int e_start = eventStartsDev[readIdx]; // Event start for read - int e_stride = eventStridesDev[readIdx]; - int e_offset = eventOffsetsDev[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event - float scale = scaleDev[readIdx]; - float shift = shiftDev[readIdx]; - float var = varDev[readIdx]; - float logVar = logVarDev[readIdx]; - - // get sequence statistics - int numKmers = sequenceLengthsDev[seqIdx]; - int seqOffset = sequenceOffsetsDev[seqIdx]; - - int lastRowIdx = numEvents - 1; - int lastKmerIdx = numKmers - 1; - - float returnValue = -INFINITY; //Used to sum over the last column. - float prevProbabilities[MAX_STATES]; - - int numBlocks = numKmers + 2; - int numStates = numBlocks * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. 
- - if (debug) { - printf("Kernel 1 >>> Num Kmers is %i\n", numKmers); - printf("Kernel 1 >>> n_states %i\n", numStates); - printf("Kernel 1 >>> num events in read is %i\n", numEvents); - printf("Kernel 1 >>> event offset is %i\n", e_offset); - } - - // Initialise the prev probabilities vector - for (int i = 0; i < numStates - PSR9_NUM_STATES; i++) { - prevProbabilities[i] = -INFINITY; - } - for (int i = numStates - PSR9_NUM_STATES; i < numStates; i++) { - prevProbabilities[i] = 0.0f; - } - - bool rc = false; - if (e_stride == -1) { - rc = true; - } - - float p_stay = 1 - (1 / read_events_per_base); - float p_skip = 0.0025; - float p_bad = 0.001; - float p_bad_self = p_bad; - float p_skip_self = 0.3; - float p_mk = p_skip; // probability of not observing an event at all - float p_mb = p_bad; // probabilty of observing a bad event - float p_mm_self = p_stay; // probability of observing additional events from this k-mer - float p_mm_next = 1.0f - p_mm_self - p_mk - p_mb; // normal movement from state to state - // transitions from event split state in previous block - float p_bb = p_bad_self; - float p_bk, p_bm_next, p_bm_self; - p_bk = p_bm_next = p_bm_self = (1.0f - p_bb) / 3; - // transitions from kmer skip state in previous block - float p_kk = p_skip_self; - float p_km = 1.0f - p_kk; - // We assign some transition probabilities. 
I believe this is correct and they don't vary by location in the sequence - float lp_mk = logf(p_mk); - float lp_mb = logf(p_mb); - float lp_mm_self = logf(p_mm_self); - float lp_mm_next = logf(p_mm_next); - float lp_bb = logf(p_bb); - float lp_bk = logf(p_bk); - float lp_bm_next = logf(p_bm_next); - float lp_bm_self = logf(p_bm_self); - float lp_kk = logf(p_kk); - float lp_km = logf(p_km); - float lp_sm, lp_ms; - lp_sm = lp_ms = 0.0f; - - // the penalty is controlled by the transition probability - float BAD_EVENT_PENALTY = 0.0f; - - //Fill out the dynamic programming table - for (int row = 1; row < numEvents + 1; row++) { - //row-specific values - int event_idx = e_start + (row - 1) * e_stride; - float eventMean = eventMeansDev[e_offset + row - 1]; - float preFlank = preFlankingDev[e_offset + row - 1]; - float postFlank = postFlankingDev[e_offset + row - 1]; - - float lp_emission_b = BAD_EVENT_PENALTY; - - //Initialise temp registers - float prevMatch = prevProbabilities[PSR9_MATCH];; - float prevSkip = prevProbabilities[PSR9_KMER_SKIP]; - float prevBad = prevProbabilities[PSR9_BAD_EVENT]; - - for (int blkIdx = 1; blkIdx < numBlocks - 1; blkIdx++) { - int curBlockIdx = blkIdx; - int prevBlockIdx = curBlockIdx - 1; - int prevBlockOffset = PSR9_NUM_STATES * prevBlockIdx; - int curBlockOffset = PSR9_NUM_STATES * curBlockIdx; - - int kmerIdx = blkIdx - 1; // because there is a start block with no associated kmer - uint32_t rank = kmerRanksDev[seqOffset + kmerIdx + (numKmers * - rc)]; - - float pore_mean = poreModelDev[rank * 3]; - float pore_stdv = poreModelDev[rank * 3 + 1]; - float pore_log_level_stdv = poreModelDev[rank * 3 + 2]; - - float lp_emission_m = lp_match_r9(rank, - eventMean, - pore_mean, - pore_stdv, - pore_log_level_stdv, - scale, - shift, - var, - logVar); - - // Get all the scores for a match - float curMatch = prevProbabilities[curBlockOffset + PSR9_MATCH]; - float curBad = prevProbabilities[curBlockOffset + PSR9_BAD_EVENT]; - float curSkip = 
prevProbabilities[curBlockOffset + PSR9_KMER_SKIP]; - - float HMT_FROM_SAME_M = lp_mm_self + curMatch; - float HMT_FROM_PREV_M = lp_mm_next + prevMatch; - float HMT_FROM_SAME_B = lp_bm_self + curBad; - float HMT_FROM_PREV_B = lp_bm_next + prevBad; - float HMT_FROM_PREV_K = lp_km + prevSkip; - - // m_s is the probability of going from the start state - // to this kmer. The start state is (currently) only - // allowed to go to the first kmer. If ALLOW_PRE_CLIP - // is defined, we allow all events before this one to be skipped, - // with a penalty; - float HMT_FROM_SOFT = (kmerIdx == 0 && - (event_idx == e_start || - (HAF_ALLOW_PRE_CLIP))) ? lp_sm + preFlank : -INFINITY; - - // calculate the score - float sum = HMT_FROM_SAME_M; - sum = logsumexpf(sum, HMT_FROM_SOFT); - sum = logsumexpf(sum, HMT_FROM_PREV_M); - sum = logsumexpf(sum, HMT_FROM_SAME_B); - sum = logsumexpf(sum, HMT_FROM_PREV_B); - sum = logsumexpf(sum, HMT_FROM_PREV_K); - sum += lp_emission_m; - - float newMatchScore = sum; - - // Calculate the bad event scores - // state PSR9_BAD_EVENT - HMT_FROM_SAME_M = lp_mb + curMatch; - HMT_FROM_PREV_M = -INFINITY; - HMT_FROM_SAME_B = lp_bb + prevBad; - HMT_FROM_PREV_B = -INFINITY; - HMT_FROM_PREV_K = -INFINITY; - HMT_FROM_SOFT = -INFINITY; - - sum = HMT_FROM_SAME_M; - sum = logsumexpf(sum, HMT_FROM_SAME_B); - sum += lp_emission_b; - - float newBadEventScore = sum; - - // Write row out. prevProbabilities now becomes "current probabilities" for evaluating skips. - prevProbabilities[curBlockOffset + PSR9_MATCH] = newMatchScore; - prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] = newBadEventScore; - - //Update tmp vars - prevMatch = curMatch; - prevSkip = curSkip; - prevBad = prevBad; - - //Now do the non-skip-skip transition. This relies on the updated vector values. 
- // state PSR9_KMER_SKIP - HMT_FROM_PREV_M = lp_mk + prevProbabilities[prevBlockOffset + PSR9_MATCH]; - HMT_FROM_PREV_B = lp_bk + prevProbabilities[prevBlockOffset + PSR9_BAD_EVENT]; - HMT_FROM_PREV_K = lp_kk + prevProbabilities[prevBlockOffset + PSR9_KMER_SKIP]; - - sum = HMT_FROM_PREV_M; - sum = logsumexpf(sum, HMT_FROM_PREV_B); - sum = logsumexpf(sum, HMT_FROM_PREV_K); - sum = logsumexpf(sum, HMT_FROM_PREV_M); - - float newSkipScore = sum; - - prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] = newSkipScore; - - //post-clip transition - if (kmerIdx == lastKmerIdx && ((HAF_ALLOW_POST_CLIP) || row == lastRowIdx)) { - float lp1 = lp_ms + prevProbabilities[curBlockOffset + PSR9_MATCH] + postFlank; - float lp2 = lp_ms + prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] + postFlank; - float lp3 = lp_ms + prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] + postFlank; - - float end = returnValue; - end = logsumexpf(end, lp1); - end = logsumexpf(end, lp2); - end = logsumexpf(end, lp3); - returnValue = end; - } - } - } - returnValuesDev[scoreIdx] = returnValue; - } -} - - -GpuAligner::GpuAligner() -{ - size_t numModelElements = 4096; - size_t max_reads_per_worker = LOCI_PER_WORKER * MAX_COVERAGE * MAX_NUM_VARIANTS_PER_LOCUS; - int readsSizeBuffer = max_reads_per_worker * sizeof(int); - int maxBuffer = max_reads_per_worker * MAX_SEQUENCE_LENGTH * sizeof(int); - - //OLD - int max_num_sequences = 1; - int max_sequence_length = 100; - int max_n_rows = 100; - - poreModelInitialized = false; - - CU_CHECK_ERR(cudaMalloc((void**)&scaleDev, readsSizeBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&scaleHost, readsSizeBuffer, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&shiftDev, readsSizeBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&shiftHost, readsSizeBuffer, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&varDev, readsSizeBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&varHost, readsSizeBuffer, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&logVarDev, 
readsSizeBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&logVarHost, readsSizeBuffer, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&readLengthsDev, readsSizeBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&readLengthsHost, readsSizeBuffer, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&eventsPerBaseDev, maxBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&eventsPerBaseHost, maxBuffer, cudaHostAllocDefault)); - - // Allocate Device memory for pore model - CU_CHECK_ERR(cudaMalloc((void**)&poreModelDev, numModelElements * 3 * sizeof(float))); - CU_CHECK_ERR(cudaHostAlloc(&poreModelHost, numModelElements * sizeof(float) * 3, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&eventStartsDev, readsSizeBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&eventStartsHost, readsSizeBuffer, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&eventStridesDev, maxBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&eventStridesHost, maxBuffer, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&eventOffsetsDev, maxBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&eventOffsetsHost, maxBuffer, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&eventMeansDev, maxBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&eventMeans, maxBuffer , cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&preFlankingDev, maxBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&preFlankingHost, maxBuffer, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&postFlankingDev, maxBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&postFlankingHost, maxBuffer, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&sequenceOffsetsDev, maxBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&sequenceOffsetsHost, maxBuffer, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&sequenceLengthsDev, maxBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&sequenceLengthsHost, maxBuffer, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&scoresDev, maxBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&returnValuesHost, maxBuffer, 
cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&seqIdxDev, maxBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&seqIdxHost, maxBuffer, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&readIdxDev, maxBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&readIdxHost, maxBuffer, cudaHostAllocDefault)); - - int numKmers = max_sequence_length * max_num_sequences; - CU_CHECK_ERR(cudaHostAlloc(&kmerRanks, maxBuffer , cudaHostAllocDefault)); - CU_CHECK_ERR(cudaMalloc((void**)&kmerRanksDev, maxBuffer )); - - // Allocate host memory for model - returnValuesHostResultsPointers.resize(max_num_sequences); - kmerRanksDevPointers.resize(max_num_sequences); - returnValuesDevResultsPointers.resize(max_num_sequences); - - for (int i =0; i>> GpuAligner::scoreKernelMod(std::vector &scoreSets, - uint32_t alignment_flags){ - - int numEventsTotal = 0; // The number of events across all scoreSets - int numSequences = 0; // The number of sequences across all scoreSets - int kmerOffset = 0; - int numReads = 0; // The number of reads across all scoreSets - int numScoreSets = scoreSets.size(); - - int rawReadOffset = 0; - int globalReadIdx = 0; - int globalSequenceIdx = 0; - int globalScoreIdx = 0; - - //Loop over every scoreset, filling out buffers and counters - for (int scoreSetIdx=0; scoreSetIdx < numScoreSets; scoreSetIdx++){ - auto scoreSet = scoreSets[scoreSetIdx]; - int firstReadIdxinScoreSet = globalReadIdx; - //Read data - for (int eventSequenceIdx=0; eventSequenceIdx < scoreSet.rawData.size();eventSequenceIdx++){ - auto e = scoreSet.rawData[eventSequenceIdx]; - numReads++; - - //Read statistics - populate host buffers - scaleHost[globalReadIdx] = e.read->scalings[e.strand].scale; - shiftHost[globalReadIdx] = e.read->scalings[e.strand].shift; - varHost[globalReadIdx] = e.read->scalings[e.strand].var; - logVarHost[globalReadIdx] = e.read->scalings[e.strand].log_var; - - int e_start = e.event_start_idx; - eventStartsHost[globalReadIdx] = e_start; - - int e_stride = e.event_stride; - 
eventStridesHost[globalReadIdx] = e_stride; - - uint32_t e_end = e.event_stop_idx; - uint32_t n_events; - if(e_end > e_start) - n_events = e_end - e_start + 1; - else - n_events = e_start - e_end + 1; - readLengthsHost[globalReadIdx] = n_events; - numEventsTotal += n_events; - - eventOffsetsHost[globalReadIdx] = rawReadOffset; - - float readEventsPerBase = e.read->events_per_base[e.strand]; - eventsPerBaseHost[globalReadIdx] = readEventsPerBase; - - std::vector pre_flank = make_pre_flanking(e, e_start, n_events); - std::vector post_flank = make_post_flanking(e, e_start, n_events); - - for (int i=0;iget_drift_scaled_level(event_idx, e.strand); // send the data in drift scaled - eventMeans[rawReadOffset + i] = scaled; - - //populate the pre/post-flanking data, since it has a 1-1 correspondence with events - preFlankingHost[rawReadOffset + i] = pre_flank[i]; - postFlankingHost[rawReadOffset + i] = post_flank[i]; - } - - rawReadOffset += n_events; - globalReadIdx++; - } - //Pore Model - const uint32_t k = scoreSets[0].rawData[0].pore_model->k; //k is the length of a kmer in the pore model - if (poreModelInitialized == false) { - int num_states = scoreSets[0].rawData[0].pore_model->states.size(); - int poreModelEntriesPerState = 3; - for(int st=0; ststates[st]; - poreModelHost[st * poreModelEntriesPerState] = params.level_mean; - poreModelHost[st * poreModelEntriesPerState + 1] = params.level_stdv; - poreModelHost[st * poreModelEntriesPerState + 2] = params.level_log_stdv; - } - // copy over the pore model - CU_CHECK_ERR(cudaMemcpyAsync(poreModelDev, poreModelHost, - poreModelEntriesPerState * 4096 * sizeof(float), cudaMemcpyHostToDevice, streams[0])); - poreModelInitialized = true; - } - auto & sequences = scoreSet.stateSequences; - numSequences += sequences.size(); - - for (int i = 0; i>> (poreModelDev, - readLengthsDev, - eventStartsDev, - eventStridesDev, - eventsPerBaseDev, - scaleDev, - shiftDev, - varDev, - logVarDev, - eventOffsetsDev, - eventMeansDev, - 
preFlankingDev, - postFlankingDev, - sequenceLengthsDev, - sequenceOffsetsDev, - kmerRanksDev, - seqIdxDev, - readIdxDev, - globalScoreIdx, - scoresDev); - cudaError_t err = cudaGetLastError(); - - if (err != cudaSuccess) - printf("Errors during kernel execution: %s\n", cudaGetErrorString(err)); - - cudaMemcpyAsync(returnValuesHost, scoresDev, globalScoreIdx * sizeof(float), cudaMemcpyDeviceToHost, streams[0]); - cudaStreamSynchronize(streams[0]); - - //Unpack results - int k = 0; - std::vector>> result(scoreSets.size()); - - for(int scoreSetIdx=0; scoreSetIdx seqScores(numReads); - - for (int readIdx=0; readIdx GpuAligner::variantScoresThresholded(std::vector> input_variants_vector, - std::vector base_haplotypes, - std::vector> event_sequences_vector, - uint32_t alignment_flags, - int screen_score_threshold, - std::vector methylation_types) { - int numScoreSets = base_haplotypes.size(); - std::vector scoreSets; - scoreSets.resize(numScoreSets); - - for(int scoreSetIdx=0; scoreSetIdx MAX_COVERAGE) { - event_sequences.resize(MAX_COVERAGE); - } - - int numVariants = input_variants.size(); - - std::vector out_variants = input_variants; - std::vector variant_haplotypes(numVariants, base_haplotype); - - //loop over the vector, applying the variants to the haplotypes - for (int i = 0; i sequences; - - HMMInputSequence base_sequence = generate_methylated_alternatives(base_haplotype.get_sequence(), - methylation_types)[0]; - - sequences.push_back(base_sequence); - - for (auto v: variant_haplotypes){ - auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types)[0]; - sequences.push_back(variant_sequence); - } - - ScoreSet s = { - sequences, - event_sequences - }; - - scoreSets[scoreSetIdx] = s; - - } - - std::vector v; - if (!event_sequences_vector.empty()) { - - auto scoresMod = scoreKernelMod(scoreSets, alignment_flags); - - // results are now ready, need to unpack them - for (int scoreSetIdx=0; scoreSetIdx> scores = 
// NOTE(review): this region was recovered from a corrupted patch whose
// "#include <...>" header names were stripped.  The headers below are the
// minimal set the visible code actually uses -- TODO confirm against the repo.
#include <cstdio>
#include <cmath>
#include <stdexcept>
#include "gpu_aligner.h"
#include "nanopolish_profile_hmm_r9.h"

int gpu_aligner_debug = 0;

// Upper bound on the per-thread HMM state vector kept in local memory.
// A score pair is only valid when PSR9_NUM_STATES * (numKmers + 2) <= MAX_STATES.
#define MAX_STATES 256

#define EXPAND_TO_STRING(X) #X
#define TO_STRING(X) EXPAND_TO_STRING(X)

// Check a CUDA runtime call.  The argument is evaluated exactly once: the
// original macro expanded X both in the comparison and inside
// cudaGetErrorString(X), re-issuing the CUDA call on the failure path.
#define CU_CHECK_ERR(X)                                                        \
    do {                                                                       \
        cudaError_t cu_check_err_ = (X);                                       \
        if (cu_check_err_ != cudaSuccess) {                                    \
            printf("CUDA error: <<%s>> at line %s\n",                          \
                   cudaGetErrorString(cu_check_err_), TO_STRING(__LINE__));    \
            throw std::runtime_error("CUDA ERROR");                            \
        }                                                                      \
    } while (0)

// log(exp(x) + exp(y)) computed without overflow.
// Returns -INFINITY when both inputs are -INFINITY (the naive formula would
// produce NaN from (-inf) - (-inf)).
__device__ float logsumexpf(float x, float y)
{
    if (x == -INFINITY && y == -INFINITY) {
        return -INFINITY;
    }
    return fmaxf(x, y) + log1pf(expf(-fabsf(y - x)));
}

// Log-probability of observing an event with drift-scaled level `mean` under
// the Gaussian emission model of one k-mer state, after applying the per-read
// scale/shift/var calibration.
// `rank` is unused here (the caller has already looked the state parameters
// up) but is kept so the signature matches the CPU implementation.
__device__ float lp_match_r9(int rank,
                             float mean,
                             float pore_mean,
                             float pore_stdv,
                             float pore_log_level_stdv,
                             float scale,
                             float shift,
                             float var,
                             float logVar)
{
    float log_inv_sqrt_2pi = logf(0.3989422804014327f); // log(1/sqrt(2*pi))

    float gaussian_mean = scale * pore_mean + shift;
    float gaussian_stdv = pore_stdv * var;
    float gaussian_log_level_stdv = pore_log_level_stdv + logVar;

    float a = (mean - gaussian_mean) / gaussian_stdv;
    return log_inv_sqrt_2pi - gaussian_log_level_stdv + (-0.5f * a * a);
}

// Profile-HMM forward-algorithm scoring kernel (R9 model).
// One thread scores one (read, state-sequence) pair; pairs are flattened into
// the parallel arrays seqIdxDev/readIdxDev of length numScores.  Launch with a
// 1D grid covering numScores threads; out-of-range threads exit immediately.
//
// Inputs (all device pointers, read-only):
//   poreModelDev       - flattened {level_mean, level_stdv, level_log_stdv}
//                        triples; modelOffsetsDev[readIdx] selects the model.
//   readLengths/eventStarts/eventStrides/eventOffsets/eventsPerBase,
//   scale/shift/var/logVar                 - per-read statistics.
//   eventMeansDev, preFlankingDev, postFlankingDev - per-event data, indexed
//                        via eventOffsetsDev[readIdx].
//   sequenceLengthsDev/sequenceOffsetsDev/kmerRanksDev - per-sequence k-mer
//                        ranks; reverse-complement ranks follow at +numKmers.
// Output:
//   returnValuesDev[scoreIdx] - forward log-probability of the pair
//                               (-INFINITY when the sequence is too long for
//                               the fixed per-thread state buffer).
__global__ void getScoresMod(float * poreModelDev,
                             int * readLengthsDev,
                             int * eventStartsDev,
                             int * eventStridesDev,
                             float * eventsPerBaseDev,
                             float * scaleDev,
                             float * shiftDev,
                             float * varDev,
                             float * logVarDev,
                             int * eventOffsetsDev,
                             float * eventMeansDev,
                             int * modelOffsetsDev,
                             float * preFlankingDev,
                             float * postFlankingDev,
                             int * sequenceLengthsDev,
                             int * sequenceOffsetsDev,
                             int * kmerRanksDev,
                             int * seqIdxDev,
                             int * readIdxDev,
                             int numScores,
                             float * returnValuesDev)
{
    const bool debug = false; // flip for single-thread tracing

    // One thread per score pair.
    int scoreIdx = blockIdx.x * blockDim.x + threadIdx.x;
    if (scoreIdx >= numScores) {
        return;
    }

    int readIdx = readIdxDev[scoreIdx];
    int seqIdx = seqIdxDev[scoreIdx];

    // Per-read statistics.
    int numEvents = readLengthsDev[readIdx];
    float read_events_per_base = eventsPerBaseDev[readIdx];
    int e_start = eventStartsDev[readIdx];  // first event index for this read
    int e_stride = eventStridesDev[readIdx]; // +1 forward, -1 reverse
    int e_offset = eventOffsetsDev[readIdx]; // offset of this read's events in the flat buffers
    int m_offset = modelOffsetsDev[readIdx]; // offset of this read's pore model
    float scale = scaleDev[readIdx];
    float shift = shiftDev[readIdx];
    float var = varDev[readIdx];
    float logVar = logVarDev[readIdx];

    // Per-sequence statistics.
    int numKmers = sequenceLengthsDev[seqIdx];
    int seqOffset = sequenceOffsetsDev[seqIdx];

    int lastRowIdx = numEvents - 1;
    int lastKmerIdx = numKmers - 1;

    float returnValue = -INFINITY; // accumulates the end-state sum

    // 3 states per k-mer block, plus one leading (start) and one trailing
    // (end) block.
    int numBlocks = numKmers + 2;
    int numStates = numBlocks * PSR9_NUM_STATES;

    // Guard the fixed-size local state buffer: the original wrote past
    // prevProbabilities[] for long sequences, corrupting thread-local memory.
    float prevProbabilities[MAX_STATES];
    if (numStates > MAX_STATES) {
        returnValuesDev[scoreIdx] = -INFINITY;
        return;
    }

    if (debug) {
        printf("Kernel 1 >>> Num Kmers is %i\n", numKmers);
        printf("Kernel 1 >>> n_states %i\n", numStates);
        printf("Kernel 1 >>> num events in read is %i\n", numEvents);
        printf("Kernel 1 >>> event offset is %i\n", e_offset);
    }

    // Initialise row 0: everything -inf except the trailing block (which is
    // never read back by the sweep below).
    for (int i = 0; i < numStates - PSR9_NUM_STATES; i++) {
        prevProbabilities[i] = -INFINITY;
    }
    for (int i = numStates - PSR9_NUM_STATES; i < numStates; i++) {
        prevProbabilities[i] = 0.0f;
    }

    // A negative stride means the events run 3'->5'; use the
    // reverse-complement half of the k-mer rank table.
    bool rc = (e_stride == -1);

    // Transition probabilities (constant across the sequence).
    float p_stay = 1 - (1 / read_events_per_base);
    float p_skip = 0.0025;
    float p_bad = 0.001;
    float p_bad_self = p_bad;
    float p_skip_self = 0.3;

    float p_mk = p_skip;                              // no event observed for a k-mer
    float p_mb = p_bad;                               // bad event observed
    float p_mm_self = p_stay;                         // extra events from the same k-mer
    float p_mm_next = 1.0f - p_mm_self - p_mk - p_mb; // normal movement
    // transitions out of the bad-event state
    float p_bb = p_bad_self;
    float p_bk, p_bm_next, p_bm_self;
    p_bk = p_bm_next = p_bm_self = (1.0f - p_bb) / 3;
    // transitions out of the kmer-skip state
    float p_kk = p_skip_self;
    float p_km = 1.0f - p_kk;

    float lp_mk = logf(p_mk);
    float lp_mb = logf(p_mb);
    float lp_mm_self = logf(p_mm_self);
    float lp_mm_next = logf(p_mm_next);
    float lp_bb = logf(p_bb);
    float lp_bk = logf(p_bk);
    float lp_bm_next = logf(p_bm_next);
    float lp_bm_self = logf(p_bm_self);
    float lp_kk = logf(p_kk);
    float lp_km = logf(p_km);
    float lp_sm, lp_ms;
    lp_sm = lp_ms = 0.0f;

    // Bad events are penalised through the transition probability only.
    float BAD_EVENT_PENALTY = 0.0f;

    // Fill the dynamic-programming table one event row at a time, updating
    // prevProbabilities in place (left-to-right sweep over k-mer blocks).
    for (int row = 1; row < numEvents + 1; row++) {
        int event_idx = e_start + (row - 1) * e_stride;
        float eventMean = eventMeansDev[e_offset + row - 1];
        float preFlank = preFlankingDev[e_offset + row - 1];
        float postFlank = postFlankingDev[e_offset + row - 1];

        float lp_emission_b = BAD_EVENT_PENALTY;

        // Previous-block values from the PREVIOUS row; block 0 is the start
        // block and is never overwritten during the sweep.
        float prevMatch = prevProbabilities[PSR9_MATCH];
        float prevSkip = prevProbabilities[PSR9_KMER_SKIP];
        float prevBad = prevProbabilities[PSR9_BAD_EVENT];

        for (int blkIdx = 1; blkIdx < numBlocks - 1; blkIdx++) {
            int prevBlockOffset = PSR9_NUM_STATES * (blkIdx - 1);
            int curBlockOffset = PSR9_NUM_STATES * blkIdx;

            int kmerIdx = blkIdx - 1; // start block has no associated k-mer
            uint32_t rank = kmerRanksDev[seqOffset + kmerIdx + (numKmers * rc)];

            float pore_mean = poreModelDev[m_offset + rank * 3];
            float pore_stdv = poreModelDev[m_offset + rank * 3 + 1];
            float pore_log_level_stdv = poreModelDev[m_offset + rank * 3 + 2];

            float lp_emission_m = lp_match_r9(rank,
                                              eventMean,
                                              pore_mean,
                                              pore_stdv,
                                              pore_log_level_stdv,
                                              scale,
                                              shift,
                                              var,
                                              logVar);

            // Previous-row values of the CURRENT block (read before they are
            // overwritten below).
            float curMatch = prevProbabilities[curBlockOffset + PSR9_MATCH];
            float curBad = prevProbabilities[curBlockOffset + PSR9_BAD_EVENT];
            float curSkip = prevProbabilities[curBlockOffset + PSR9_KMER_SKIP];

            // state PSR9_MATCH
            float HMT_FROM_SAME_M = lp_mm_self + curMatch;
            float HMT_FROM_PREV_M = lp_mm_next + prevMatch;
            float HMT_FROM_SAME_B = lp_bm_self + curBad;
            float HMT_FROM_PREV_B = lp_bm_next + prevBad;
            float HMT_FROM_PREV_K = lp_km + prevSkip;

            // m_s is the probability of entering this k-mer from the start
            // state.  The start state may only enter the first k-mer; with
            // pre-clipping allowed, all earlier events may be skipped at a
            // penalty carried by preFlank.
            // NOTE(review): HAF_ALLOW_PRE_CLIP is used as a bare truthy
            // constant here, not as `flags & HAF_ALLOW_PRE_CLIP` -- pre-clip
            // is effectively always enabled.  TODO confirm intent.
            float HMT_FROM_SOFT = (kmerIdx == 0 &&
                                   (event_idx == e_start ||
                                    (HAF_ALLOW_PRE_CLIP))) ? lp_sm + preFlank : -INFINITY;

            float sum = HMT_FROM_SAME_M;
            sum = logsumexpf(sum, HMT_FROM_SOFT);
            sum = logsumexpf(sum, HMT_FROM_PREV_M);
            sum = logsumexpf(sum, HMT_FROM_SAME_B);
            sum = logsumexpf(sum, HMT_FROM_PREV_B);
            sum = logsumexpf(sum, HMT_FROM_PREV_K);
            float newMatchScore = sum + lp_emission_m;

            // state PSR9_BAD_EVENT: reachable only from the same block's
            // match and bad states.
            sum = lp_mb + curMatch;
            sum = logsumexpf(sum, lp_bb + curBad);
            float newBadEventScore = sum + lp_emission_b;

            // Write the row out; prevProbabilities now holds "current row"
            // values for this block, which the skip update below relies on.
            prevProbabilities[curBlockOffset + PSR9_MATCH] = newMatchScore;
            prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] = newBadEventScore;

            // Carry this block's previous-row values to the next iteration.
            // BUG FIX: the original wrote `prevBad = prevBad;`, so
            // HMT_FROM_PREV_B used the stale start-block value for every
            // block in the row.
            prevMatch = curMatch;
            prevSkip = curSkip;
            prevBad = curBad;

            // state PSR9_KMER_SKIP: depends on the just-updated values of the
            // previous block in THIS row.
            // BUG FIX: the original folded HMT_FROM_PREV_M into the
            // logsumexp twice, inflating skip scores by up to log(2).
            HMT_FROM_PREV_M = lp_mk + prevProbabilities[prevBlockOffset + PSR9_MATCH];
            HMT_FROM_PREV_B = lp_bk + prevProbabilities[prevBlockOffset + PSR9_BAD_EVENT];
            HMT_FROM_PREV_K = lp_kk + prevProbabilities[prevBlockOffset + PSR9_KMER_SKIP];

            sum = HMT_FROM_PREV_M;
            sum = logsumexpf(sum, HMT_FROM_PREV_B);
            sum = logsumexpf(sum, HMT_FROM_PREV_K);
            float newSkipScore = sum;

            prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] = newSkipScore;

            // Post-clip transition into the end state.
            // NOTE(review): HAF_ALLOW_POST_CLIP is again a bare truthy
            // constant, so the flag test is always true; also `row ==
            // lastRowIdx` looks off by one for this 1-based row loop (the
            // last row is numEvents).  Both are currently masked by the
            // always-true flag -- TODO confirm against the CPU HMM.
            if (kmerIdx == lastKmerIdx && ((HAF_ALLOW_POST_CLIP) || row == lastRowIdx)) {
                float lp1 = lp_ms + prevProbabilities[curBlockOffset + PSR9_MATCH] + postFlank;
                float lp2 = lp_ms + prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] + postFlank;
                float lp3 = lp_ms + prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] + postFlank;

                float end = returnValue;
                end = logsumexpf(end, lp1);
                end = logsumexpf(end, lp2);
                end = logsumexpf(end, lp3);
                returnValue = end;
            }
        }
    }
    returnValuesDev[scoreIdx] = returnValue;
}
CU_CHECK_ERR(cudaHostAlloc(&logVarHost, readsSizeBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&readLengthsDev, readsSizeBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&readLengthsHost, readsSizeBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&eventsPerBaseDev, maxBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&eventsPerBaseHost, maxBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&eventStartsDev, readsSizeBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&eventStartsHost, readsSizeBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&eventStridesDev, maxBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&eventStridesHost, maxBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&eventOffsetsDev, maxBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&eventOffsetsHost, maxBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&eventMeansDev, maxBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&eventMeans, maxBuffer , cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&modelOffsetsDev, maxBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&modelOffsetsHost, maxBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&preFlankingDev, maxBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&preFlankingHost, maxBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&postFlankingDev, maxBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&postFlankingHost, maxBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&sequenceOffsetsDev, maxBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&sequenceOffsetsHost, maxBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&sequenceLengthsDev, maxBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&sequenceLengthsHost, maxBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&scoresDev, maxBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&returnValuesHost, maxBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&seqIdxDev, maxBuffer)); + 
CU_CHECK_ERR(cudaHostAlloc(&seqIdxHost, maxBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&readIdxDev, maxBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&readIdxHost, maxBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaHostAlloc(&kmerRanks, maxBuffer , cudaHostAllocDefault)); + CU_CHECK_ERR(cudaMalloc((void**)&kmerRanksDev, maxBuffer )); + + // + // Allocate Device memory for pore model + // + + // Count the total number of k-mer states across all pore models + int numModelElements = 0; + int numModels = 0; + for(const PoreModel* model : PoreModelSet::get_all_models()) { + numModelElements += model->states.size(); + numModels += 1; + } + //fprintf(stderr, "Initialized %d states from %d models\n", numModelElements, numModels); + int poreModelEntriesPerState = 3; + int totalModelEntries = numModelElements * poreModelEntriesPerState; + CU_CHECK_ERR(cudaMalloc((void**)&poreModelDev, totalModelEntries * sizeof(float))); + CU_CHECK_ERR(cudaHostAlloc(&poreModelHost, totalModelEntries * sizeof(float), cudaHostAllocDefault)); + + // + // Initialize pore model + // + int modelOffset = 0; + for(const PoreModel* model : PoreModelSet::get_all_models()) { + modelToOffsetMap[model] = modelOffset; + fprintf(stderr, "inserted model %s at offset %d\n", PoreModelSet::get_model_key(*model).c_str(), modelOffset); + + int num_states = model->states.size(); + for(int st=0; ststates[st]; + poreModelHost[modelOffset++] = params.level_mean; + poreModelHost[modelOffset++] = params.level_stdv; + poreModelHost[modelOffset++] = params.level_log_stdv; + } + } + + fprintf(stderr, "Initialized %d/%d states from %d models\n", modelOffset, numModelElements, numModels); + assert(modelOffset == totalModelEntries); + + // Allocate host memory for model + returnValuesHostResultsPointers.resize(max_num_sequences); + kmerRanksDevPointers.resize(max_num_sequences); + returnValuesDevResultsPointers.resize(max_num_sequences); + + for (int i =0; i>> GpuAligner::scoreKernelMod(std::vector 
&scoreSets, + uint32_t alignment_flags){ + + int numEventsTotal = 0; // The number of events across all scoreSets + int numSequences = 0; // The number of sequences across all scoreSets + int kmerOffset = 0; + int numReads = 0; // The number of reads across all scoreSets + int numScoreSets = scoreSets.size(); + + int rawReadOffset = 0; + int globalReadIdx = 0; + int globalSequenceIdx = 0; + int globalScoreIdx = 0; + + //Loop over every scoreset, filling out buffers and counters + for (int scoreSetIdx=0; scoreSetIdx < numScoreSets; scoreSetIdx++) { + auto scoreSet = scoreSets[scoreSetIdx]; + int firstReadIdxinScoreSet = globalReadIdx; + + //Read data + for (int eventSequenceIdx=0; eventSequenceIdx < scoreSet.rawData.size(); eventSequenceIdx++) { + auto e = scoreSet.rawData[eventSequenceIdx]; + numReads++; + + //Read statistics - populate host buffers + scaleHost[globalReadIdx] = e.read->scalings[e.strand].scale; + shiftHost[globalReadIdx] = e.read->scalings[e.strand].shift; + varHost[globalReadIdx] = e.read->scalings[e.strand].var; + logVarHost[globalReadIdx] = e.read->scalings[e.strand].log_var; + + int e_start = e.event_start_idx; + eventStartsHost[globalReadIdx] = e_start; + + int e_stride = e.event_stride; + eventStridesHost[globalReadIdx] = e_stride; + + uint32_t e_end = e.event_stop_idx; + uint32_t n_events; + if(e_end > e_start) + n_events = e_end - e_start + 1; + else + n_events = e_start - e_end + 1; + readLengthsHost[globalReadIdx] = n_events; + numEventsTotal += n_events; + + eventOffsetsHost[globalReadIdx] = rawReadOffset; + + float readEventsPerBase = e.read->events_per_base[e.strand]; + eventsPerBaseHost[globalReadIdx] = readEventsPerBase; + + std::vector pre_flank = make_pre_flanking(e, e_start, n_events); + std::vector post_flank = make_post_flanking(e, e_start, n_events); + + for (int i=0;iget_drift_scaled_level(event_idx, e.strand); // send the data in drift scaled + eventMeans[rawReadOffset + i] = scaled; + + //populate the pre/post-flanking data, 
since it has a 1-1 correspondence with events + preFlankingHost[rawReadOffset + i] = pre_flank[i]; + postFlankingHost[rawReadOffset + i] = post_flank[i]; + } + + // look up model offset in the map + const auto& modelOffsetIter = modelToOffsetMap.find(e.pore_model); + assert(modelOffsetIter != modelToOffsetMap.end()); + modelOffsetsHost[globalReadIdx] = modelOffsetIter->second; + + rawReadOffset += n_events; + globalReadIdx++; + } + + auto & sequences = scoreSet.stateSequences; + numSequences += sequences.size(); + + for (int i = 0; ik; + int numKmers = sequenceLength - k + 1; + + for(size_t ki = 0; ki < numKmers; ++ki) { + int rank = sequence.get_kmer_rank(ki, k, false); + kmerRanks[ki + kmerOffset] = rank; + } + + kmerOffset += numKmers; + + for(size_t ki = 0; ki < numKmers; ++ki) { + int rank = sequence.get_kmer_rank(ki, k, true); + kmerRanks[ki + kmerOffset] = rank; + } + + kmerOffset += numKmers; + + sequenceLengthsHost[globalSequenceIdx] = numKmers; + + // Loop over the raw reads, producing a cartesian product of reads and sequences + auto numReadsInScoreSet = scoreSet.rawData.size(); + for (int r=0; r>> (poreModelDev, + readLengthsDev, + eventStartsDev, + eventStridesDev, + eventsPerBaseDev, + scaleDev, + shiftDev, + varDev, + logVarDev, + eventOffsetsDev, + eventMeansDev, + modelOffsetsDev, + preFlankingDev, + postFlankingDev, + sequenceLengthsDev, + sequenceOffsetsDev, + kmerRanksDev, + seqIdxDev, + readIdxDev, + globalScoreIdx, + scoresDev); + cudaError_t err = cudaGetLastError(); + + if (err != cudaSuccess) + printf("Errors during kernel execution: %s\n", cudaGetErrorString(err)); + + cudaMemcpyAsync(returnValuesHost, scoresDev, globalScoreIdx * sizeof(float), cudaMemcpyDeviceToHost, streams[0]); + cudaStreamSynchronize(streams[0]); + + //Unpack results + int k = 0; + std::vector>> result(scoreSets.size()); + + for(int scoreSetIdx=0; scoreSetIdx seqScores(numReads); + + for (int readIdx=0; readIdx GpuAligner::variantScoresThresholded(std::vector> 
input_variants_vector, + std::vector base_haplotypes, + std::vector> event_sequences_vector, + uint32_t alignment_flags, + int screen_score_threshold, + std::vector methylation_types) { + int numScoreSets = base_haplotypes.size(); + std::vector scoreSets; + scoreSets.resize(numScoreSets); + + if(gpu_aligner_debug){ + fprintf(stderr,"Generating variants:\n"); + } + + for(int scoreSetIdx=0; scoreSetIdx MAX_COVERAGE) { + event_sequences.resize(MAX_COVERAGE); + } + + int numVariants = input_variants.size(); + + std::vector out_variants = input_variants; + std::vector variant_haplotypes(numVariants, base_haplotype); + + //loop over the vector, applying the variants to the haplotypes + for (int i = 0; i sequences; + std::vector base_sequence_vector = generate_methylated_alternatives(base_haplotype.get_sequence(),methylation_types); + +#ifdef MULTI_MODEL + std::vector num_models_vector; + std::vector score_offsets_vector; + size_t offset = 0; + size_t num_models = base_sequence_vector.size(); + num_models_vector.push_back(num_models); + score_offsets_vector.push_back(offset); + if(gpu_aligner_debug){ + fprintf(stderr,"num_models_base=%ld,offset_base=%ld\t",num_models,offset); + } + offset += num_models; + for (auto base_sequence: base_sequence_vector){ + sequences.push_back(base_sequence); + } +#else + HMMInputSequence base_sequence = base_sequence_vector[0]; + sequences.push_back(base_sequence); +#endif + + for (auto v: variant_haplotypes){ + auto variant_sequence_vector = generate_methylated_alternatives(v.get_sequence(), methylation_types); +#ifdef MULTI_MODEL + size_t num_models = variant_sequence_vector.size(); + num_models_vector.push_back(num_models); + score_offsets_vector.push_back(offset); + if(gpu_aligner_debug){ + fprintf(stderr,"num_models_var=%ld,offset_var=%ld\t",num_models,offset); + } + offset += num_models; + for (auto variant_sequence: variant_sequence_vector){ + sequences.push_back(variant_sequence); + } +#else + auto variant_sequence = 
variant_sequence_vector[0]; + sequences.push_back(variant_sequence); +#endif + } + + ScoreSet s = { + sequences, + event_sequences +#ifdef MULTI_MODEL + , + num_models_vector, + score_offsets_vector +#endif + }; + + scoreSets[scoreSetIdx] = s; + if(gpu_aligner_debug){ + fprintf(stderr,"\n"); + } + } + if(gpu_aligner_debug){ + fprintf(stderr,"\n"); + } + + std::vector v; + if (!event_sequences_vector.empty()) { + + if(gpu_aligner_debug){ + fprintf(stderr,"Calling scoreKernelMod\n"); + } + auto scoresMod = scoreKernelMod(scoreSets, alignment_flags); + + if(gpu_aligner_debug){ + fprintf(stderr,"Unpacking scores\n"); + } + // results are now ready, need to unpack them + for (int scoreSetIdx=0; scoreSetIdx> scores = scoresMod[scoreSetIdx]; // scores for this candidate, including all variants and base(zeroth) + #ifdef MULTI_MODEL + ScoreSet s = scoreSets[scoreSetIdx]; + int numVariants = s.num_models_vector.size() -1; // subtract one for the base sequence + #else + int numVariants = scores.size() - 1; // subtract one for the base sequence + #endif + int numScores = scores[0].size(); + for (int variantIndex = 0; variantIndex < numVariants; variantIndex++) { // index 0 is the base scores + double totalScore = 0.0; + for (int k = 0; k < numScores; k++) { + if (fabs(totalScore) < screen_score_threshold) { + #ifdef MULTI_MODEL + + //compute the base score based on the base sequences + size_t num_models = s.num_models_vector[0]; + double num_model_penalty = log(num_models); + double score = scores[0][k] - num_model_penalty; + for(size_t seq_idx = 1; seq_idx < num_models; ++seq_idx) { + double alt_score = scores[seq_idx][k] - num_model_penalty; + score = add_logs(score, alt_score); + } + double baseScore = score; + if (k==0 && variantIndex==0 && gpu_aligner_debug){ + fprintf(stderr,"num_models_base=%ld,offset_base=%d\t",num_models,0); + } + + if(variantIndex+1 >= s.num_models_vector.size()){ //a sanity check + fprintf(stderr,"\nAn invalid memory access occured\nscoreSetIdx=%d, 
variantIndex=%d, k=%d, \n",scoreSetIdx,variantIndex,k); + assert(0); + } + + //compute the variant score based on the variant sequences + num_models = s.num_models_vector[variantIndex+1]; + size_t score_offset = s.score_offsets_vector[variantIndex+1]; + num_model_penalty = log(num_models); + score = scores[score_offset][k] - num_model_penalty; + for(size_t seq_idx = 1; seq_idx < num_models; ++seq_idx) { + double alt_score = scores[score_offset + seq_idx][k] - num_model_penalty; + score = add_logs(score, alt_score); + } + double variantScore = score; + if (k==0 && gpu_aligner_debug) { + fprintf(stderr,"num_models_var=%ld,offset_var=%ld\t",num_models,score_offset); + } + + #else + double baseScore = scores[0][k]; + double variantScore = scores[variantIndex + 1][k]; + #endif + totalScore += (variantScore - baseScore); + } + } + // get the old variant: + auto unScoredVariant = input_variants_vector[scoreSetIdx][variantIndex]; + unScoredVariant.quality = totalScore; + unScoredVariant.info = ""; + v.push_back(unScoredVariant); + } + if(gpu_aligner_debug){ + fprintf(stderr,"\n"); + } + } + if(gpu_aligner_debug){ + fprintf(stderr,"\n"); + } + } + return v; +} diff --git a/src/cuda_kernels/GpuAligner.h b/src/cuda_kernels/gpu_aligner.h similarity index 76% rename from src/cuda_kernels/GpuAligner.h rename to src/cuda_kernels/gpu_aligner.h index 731f2ed9..31121128 100644 --- a/src/cuda_kernels/GpuAligner.h +++ b/src/cuda_kernels/gpu_aligner.h @@ -46,29 +46,42 @@ #define MAX_NUM_VARIANTS_PER_LOCUS 10 #define MAX_NUM_WORKERS 16 +#define MULTI_MODEL 1 + //Data to be scored typedef struct { std::vector stateSequences; std::vector rawData; +#ifdef MULTI_MODEL + std::vector num_models_vector; //store the number of models for base sequence and then variant sequences + std::vector score_offsets_vector; //store the offsets based on number of models +#endif } ScoreSet; class GpuAligner { + public: GpuAligner(); ~GpuAligner(); + // GPU version of the candidate-variant scoring function 
std::vector variantScoresThresholded(std::vector>, - std::vector, - std::vector>, - uint32_t alignment_flags, int screen_score_threshold, std::vector methylation_types); + std::vector, + std::vector>, + uint32_t alignment_flags, + int screen_score_threshold, + std::vector methylation_types); std::vector> scoreKernel(std::vector sequences, - std::vector event_sequences, - uint32_t alignment_flags); + std::vector event_sequences, + uint32_t alignment_flags); + std::vector>> scoreKernelMod(std::vector &scoreSets, - uint32_t alignment_flags); + uint32_t alignment_flags); + + private: float* scaleDev; float* shiftDev; @@ -80,6 +93,7 @@ class GpuAligner int* eventOffsetsDev; int* eventStridesDev; int* eventStartsDev; + int* modelOffsetsDev; int* numRowsDev; float* postFlankingDev; float* preFlankingDev; @@ -104,6 +118,7 @@ class GpuAligner int * sequenceLengthsHost; int * eventOffsetsHost; int * sequenceOffsetsHost; + int * modelOffsetsHost; int * readIdxHost; int * seqIdxHost; @@ -123,6 +138,8 @@ class GpuAligner std::vector kmerRanksDevPointers; std::vector returnValuesDevResultsPointers; std::vector returnValuesHostResultsPointers; + + std::map modelToOffsetMap; cudaStream_t streams[8]; // TODO 8 should not be hardcoded here }; diff --git a/src/cuda_kernels/gpu_call_variants.inl b/src/cuda_kernels/gpu_call_variants.inl index c5036dcf..e2358d20 100644 --- a/src/cuda_kernels/gpu_call_variants.inl +++ b/src/cuda_kernels/gpu_call_variants.inl @@ -1,4 +1,4 @@ -#include +#include #include #include #include diff --git a/src/pore_model/nanopolish_pore_model_set.cpp b/src/pore_model/nanopolish_pore_model_set.cpp index 474621dc..c2bee02d 100644 --- a/src/pore_model/nanopolish_pore_model_set.cpp +++ b/src/pore_model/nanopolish_pore_model_set.cpp @@ -126,6 +126,16 @@ const PoreModel* PoreModelSet::get_model_by_key(const std::string& key) } } +std::vector PoreModelSet::get_all_models() +{ + PoreModelSet& model_set = getInstance(); + std::vector out; + for(auto& iter : 
model_set.model_map) { + out.push_back(iter.second); + } + return out; +} + // std::map PoreModelSet::copy_strand_models(const std::string& kit_name, const std::string& alphabet, diff --git a/src/pore_model/nanopolish_pore_model_set.h b/src/pore_model/nanopolish_pore_model_set.h index 63da243f..ee16f02a 100644 --- a/src/pore_model/nanopolish_pore_model_set.h +++ b/src/pore_model/nanopolish_pore_model_set.h @@ -52,9 +52,17 @@ class PoreModelSet const std::string& alphabet, const std::string& strand, size_t k); - + + // + // Get a single model + // static const PoreModel* get_model_by_key(const std::string& key); + // + // Get pointers to all models + // + static std::vector get_all_models(); + // // get all the models for the combination of parameters // From b13ab2d289a987143318125a6cd09d94e03db006 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Fri, 1 May 2020 14:38:39 +0100 Subject: [PATCH 80/80] Fixed edge case causing segfault when no reads are present in a scoreSet --- src/cuda_kernels/gpu_aligner.cu | 70 ++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 31 deletions(-) diff --git a/src/cuda_kernels/gpu_aligner.cu b/src/cuda_kernels/gpu_aligner.cu index 972086ed..30c19bd1 100644 --- a/src/cuda_kernels/gpu_aligner.cu +++ b/src/cuda_kernels/gpu_aligner.cu @@ -466,7 +466,7 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve auto scoreSet = scoreSets[scoreSetIdx]; int firstReadIdxinScoreSet = globalReadIdx; - //Read data + //Populate host buffers with data from raw reads. for (int eventSequenceIdx=0; eventSequenceIdx < scoreSet.rawData.size(); eventSequenceIdx++) { auto e = scoreSet.rawData[eventSequenceIdx]; numReads++; @@ -521,15 +521,22 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve auto & sequences = scoreSet.stateSequences; numSequences += sequences.size(); - + //Populate host buffers with data from sequences. for (int i = 0; ik; + // If there is no raw data associated with this scoreSet, then a default of k=1 is used. 
+ // The sequence is copied to the device, although it is not actually used since there are no + // raw reads to compute candidates with. + uint32_t k = 1; + if (scoreSet.rawData.size() > 0){ + k = scoreSet.rawData[0].pore_model->k; + } int numKmers = sequenceLength - k + 1; for(size_t ki = 0; ki < numKmers; ++ki) { @@ -548,14 +555,13 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve sequenceLengthsHost[globalSequenceIdx] = numKmers; - // Loop over the raw reads, producing a cartesian product of reads and sequences + // Loop over the raw reads, producing a Cartesian product of reads and sequences auto numReadsInScoreSet = scoreSet.rawData.size(); for (int r=0; r>> GpuAligner::scoreKernelMod(std::ve dim3 dimBlock(blockSize); dim3 dimGrid(numBlocks); - //printf("Launching get scores mod kernel\n"); - getScoresMod <<< dimGrid, dimBlock, 0, streams[0]>>> (poreModelDev, - readLengthsDev, - eventStartsDev, - eventStridesDev, - eventsPerBaseDev, - scaleDev, - shiftDev, - varDev, - logVarDev, - eventOffsetsDev, - eventMeansDev, - modelOffsetsDev, - preFlankingDev, - postFlankingDev, - sequenceLengthsDev, - sequenceOffsetsDev, - kmerRanksDev, - seqIdxDev, - readIdxDev, - globalScoreIdx, - scoresDev); - cudaError_t err = cudaGetLastError(); - - if (err != cudaSuccess) - printf("Errors during kernel execution: %s\n", cudaGetErrorString(err)); + if (globalScoreIdx > 0){ + getScoresMod <<< dimGrid, dimBlock, 0, streams[0]>>> (poreModelDev, + readLengthsDev, + eventStartsDev, + eventStridesDev, + eventsPerBaseDev, + scaleDev, + shiftDev, + varDev, + logVarDev, + eventOffsetsDev, + eventMeansDev, + modelOffsetsDev, + preFlankingDev, + postFlankingDev, + sequenceLengthsDev, + sequenceOffsetsDev, + kmerRanksDev, + seqIdxDev, + readIdxDev, + globalScoreIdx, + scoresDev); + cudaError_t err = cudaGetLastError(); + + if (err != cudaSuccess){ + printf("Errors during GPU kernel execution: %s\n", cudaGetErrorString(err)); + } + } cudaMemcpyAsync(returnValuesHost, scoresDev, 
globalScoreIdx * sizeof(float), cudaMemcpyDeviceToHost, streams[0]); cudaStreamSynchronize(streams[0]);