From bb403657a5f5c7d50e84244b97e69ece022d798a Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 7 Jun 2018 11:26:16 +0100 Subject: [PATCH 01/80] acuda stubs --- Makefile | 28 +++++--- src/cuda_kernels/GpuAligner.cu | 121 +++++++++++++++++++++++++++++++++ src/cuda_kernels/GpuAligner.h | 61 +++++++++++++++++ 3 files changed, 201 insertions(+), 9 deletions(-) create mode 100644 src/cuda_kernels/GpuAligner.cu create mode 100644 src/cuda_kernels/GpuAligner.h diff --git a/Makefile b/Makefile index 9b484626..afe84ea1 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # # Sub directories containing source code, except for the main programs -SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/alignment src/pore_model +SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/alignment src/pore_model src/cuda_kernels # # Set libraries, paths, flags and options @@ -11,9 +11,12 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali LIBS=-lz CXXFLAGS ?= -g -O3 CXXFLAGS += -std=c++11 -fopenmp -fsigned-char -CFLAGS ?= -O3 -std=c99 +CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc +NVCC = nvcc +NVCCFLAGS ?= -std=c++11 -I. 
-I/usr/local/cuda-9.0/include -g +CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code HDF5?=install @@ -102,20 +105,24 @@ eigen/INSTALL: # Find the source files by searching subdirectories CPP_SRC := $(foreach dir, $(SUBDIRS), $(wildcard $(dir)/*.cpp)) +CU_SRC := $(foreach dir, $(SUBDIRS), $(wildcard $(dir)/*.cu)) C_SRC := $(foreach dir, $(SUBDIRS), $(wildcard $(dir)/*.c)) EXE_SRC=src/main/nanopolish.cpp src/test/nanopolish_test.cpp # Automatically generated object names CPP_OBJ=$(CPP_SRC:.cpp=.o) C_OBJ=$(C_SRC:.c=.o) +CU_OBJ=$(CU_SRC:.cu=.o) + +.SUFFIXES: .cu # Generate dependencies PHONY=depend depend: .depend -.depend: $(CPP_SRC) $(C_SRC) $(EXE_SRC) $(H5_LIB) $(EIGEN_CHECK) +.depend: $(CPP_SRC) $(C_SRC) $(CU_SRC) $(EXE_SRC) $(H5_LIB) $(EIGEN_CHECK) rm -f ./.depend - $(CXX) $(CXXFLAGS) $(CPPFLAGS) -MM $(CPP_SRC) $(C_SRC) > ./.depend; + $(CXX) $(CXXFLAGS) $(CPPFLAGS) $(NVCCFLAGS) $(NVCC) -MM $(CPP_SRC) $(C_SRC) $(CU_SRC) > ./.depend; include .depend @@ -126,16 +133,19 @@ include .depend .c.o: $(CC) -o $@ -c $(CFLAGS) $(CPPFLAGS) $(H5_INCLUDE) -fPIC $< +.cu.o: + $(NVCC) -o $@ -c $(NVCCFLAGS) $(CPPFLAGS) $< + # Link main executable -$(PROGRAM): src/main/nanopolish.o $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(EIGEN_CHECK) - $(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $< $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(LIBS) $(LDFLAGS) +$(PROGRAM): src/main/nanopolish.o $(CU_OBJ) $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(EIGEN_CHECK) + $(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $< $(CPP_OBJ) $(CU_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(LIBS) $(LDFLAGS) $(CURTFLAGS) # Link test executable -$(TEST_PROGRAM): src/test/nanopolish_test.o $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) - $(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $< $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(LIBS) $(LDFLAGS) +$(TEST_PROGRAM): src/test/nanopolish_test.o $(CPP_OBJ) $(CU_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) + $(CXX) 
-o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $< $(CPP_OBJ) $(CU_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(LIBS) $(LDFLAGS) $(CURTFLAGS) test: $(TEST_PROGRAM) ./$(TEST_PROGRAM) clean: - rm -f $(PROGRAM) $(TEST_PROGRAM) $(CPP_OBJ) $(C_OBJ) src/main/nanopolish.o src/test/nanopolish_test.o + rm -f $(PROGRAM) $(TEST_PROGRAM) $(CPP_OBJ) $(CU_OBJ) $(C_OBJ) src/main/nanopolish.o src/test/nanopolish_test.o diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu new file mode 100644 index 00000000..1f9ae48d --- /dev/null +++ b/src/cuda_kernels/GpuAligner.cu @@ -0,0 +1,121 @@ +#include +#include +#include "GpuAligner.h" +#include + +__global__ void findSumToN(int *n, int limit) +{ + int tId = threadIdx.x; + + for (int i=0; i<=(int)log2((double)limit); i++) + { + if (tId%(int)(pow(2.0,(double)(i+1))) == 0){ + if (tId+(int)pow(2.0, (double)i) >= limit) break; + n[tId] += n[tId+(int)pow(2.0, (double)i)]; + } + __syncthreads(); + } +} + +GpuAligner::GpuAligner() +{ + y = 20; + asize = y*sizeof(int); + for (int i=0; i>>(n_d, y); + cudaMemcpy(n, n_d, asize, cudaMemcpyDeviceToHost); + cudaFree (n_d); + return n[0]; +} + +void GpuAligner::setY(int newVal) +{ + y = newVal; + asize = y*sizeof(int); + for (int i=0; i sequences, + std::vector event_sequences, + uint32_t alignment_flags){ + + assert(!sequences.empty()); + assert(std::string(sequences[0].get_alphabet()->get_name()) == "nucleotide"); + for (auto e: event_sequences) { + assert(std::string(e.pore_model->pmalphabet->get_name()) == "nucleotide"); + assert(e.read->pore_type == PT_R9); + } + + size_t num_models = sequences.size(); + double num_model_penalty = log(num_models); + + assert(num_models == 1); //this is temporary + + // start preparing the data for the CUDA Kernel + + + + return 0.210964; +} + +std::vector GpuAligner::variantScoresThresholded(std::vector input_variants, + Haplotype base_haplotype, + std::vector event_sequences, + uint32_t alignment_flags, + int screen_score_threshold, + std::vector 
methylation_types) { + int numVariants = input_variants.size(); + + std::vector out_variants = input_variants; + std::vector variant_haplotypes(numVariants, base_haplotype); + + //loop over the vector, applying the variants to the haplotypes + for (int i = 0; i base_sequences = generate_methylated_alternatives(base_haplotype.get_sequence(), + methylation_types); + + assert(base_sequences.size() == 1); + + std::vector> methylatedVariantSequences; + for(auto variant: variant_haplotypes) { + std::vector variant_sequences = generate_methylated_alternatives( + variant.get_sequence(), methylation_types); + methylatedVariantSequences.push_back(variant_sequences); + + } + + //For now let's not worry about methylation + assert(methylatedVariantSequences.size() == numVariants); + for (auto m: methylatedVariantSequences) { + assert(m.size() == 1); + } + //Next we need to get the scores. + + // return the sum of the score for the base sequences over all the event sequences + double base_score = scoreKernel(base_sequences, event_sequences, alignment_flags); + + std::vector v; + v.push_back(base_score); + return v; +} \ No newline at end of file diff --git a/src/cuda_kernels/GpuAligner.h b/src/cuda_kernels/GpuAligner.h new file mode 100644 index 00000000..b6a8bbe1 --- /dev/null +++ b/src/cuda_kernels/GpuAligner.h @@ -0,0 +1,61 @@ +// +// Created by mike on 05/06/18. 
+// +#include +#include "nanopolish_variant.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "htslib/faidx.h" +#include "nanopolish_poremodel.h" +#include "nanopolish_transition_parameters.h" +#include "nanopolish_matrix.h" +#include "nanopolish_klcs.h" +#include "nanopolish_profile_hmm.h" +#include "nanopolish_alignment_db.h" +#include "nanopolish_anchor.h" +#include "nanopolish_variant.h" +#include "nanopolish_haplotype.h" +#include "nanopolish_pore_model_set.h" +#include "nanopolish_duration_model.h" +#include "nanopolish_variant_db.h" +#include "profiler.h" +#include "progress.h" +#include "stdaln.h" +#include + +#ifndef GPU_ALIGNER_H +#define GPU_ALIGNER_H1 + +class GpuAligner +{ +public: + int n[20]; + int y; + int asize; + + GpuAligner(); + int calculateSum(); + void setY(int); + + std::vector + variantScoresThresholded(std::vector tmp_variants, Haplotype haplotype, std::vector event_sequences, + uint32_t alignment_flags, int screen_score_threshold, std::vector methylation_types);// { + //return std::vector(); + //} +}; +#endif // GPU_ALIGNER_H \ No newline at end of file From 26f042d2eb8691be9f07eb3f7ae25e1dfca501d0 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 7 Jun 2018 17:31:36 +0100 Subject: [PATCH 02/80] Sending event means to device --- src/common/nanopolish_variant.cpp | 2 +- src/cuda_kernels/GpuAligner.cu | 88 ++++++++++++++++++++++++++- src/hmm/nanopolish_profile_hmm_r9.cpp | 2 +- src/hmm/nanopolish_profile_hmm_r9.inl | 2 +- src/main/nanopolish.cpp | 11 ++-- src/nanopolish_call_variants.cpp | 60 ++++++++++++++++-- 6 files changed, 152 insertions(+), 13 deletions(-) diff --git a/src/common/nanopolish_variant.cpp b/src/common/nanopolish_variant.cpp index 725a62ab..b73a6b2b 100644 --- a/src/common/nanopolish_variant.cpp +++ b/src/common/nanopolish_variant.cpp @@ -664,7 +664,7 @@ std::vector 
multi_call(VariantGroup& variant_group, // Variant score_variant_thresholded(const Variant& input_variant, Haplotype base_haplotype, - const std::vector& input, + const std::vector& input, // raw reads (I think) const uint32_t alignment_flags, const uint32_t score_threshold, const std::vector& methylation_types) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 1f9ae48d..122195e7 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -2,9 +2,12 @@ #include #include "GpuAligner.h" #include +#include "nanopolish_profile_hmm_r9.h" + __global__ void findSumToN(int *n, int limit) { + //printf("HELLO FROM SUM\n"); int tId = threadIdx.x; for (int i=0; i<=(int)log2((double)limit); i++) @@ -17,6 +20,20 @@ __global__ void findSumToN(int *n, int limit) } } + +__global__ void getScores(float * eventData, float * returnValues) +{ + int tId = threadIdx.x; + if (tId == 0) { + printf("data: %f\n", eventData[0]); + printf("data: %f\n", eventData[1]); + printf("data: %f\n", eventData[2]); + } + returnValues[0] = 0.356; + //__syncthreads(); +} + + GpuAligner::GpuAligner() { y = 20; @@ -53,11 +70,13 @@ double scoreKernel(std::vector sequences, std::vector event_sequences, uint32_t alignment_flags){ + // These asserts are here during the development phase assert(!sequences.empty()); assert(std::string(sequences[0].get_alphabet()->get_name()) == "nucleotide"); for (auto e: event_sequences) { assert(std::string(e.pore_model->pmalphabet->get_name()) == "nucleotide"); assert(e.read->pore_type == PT_R9); + assert( (e.rc && e.event_stride == -1) || (!e.rc && e.event_stride == 1)); } size_t num_models = sequences.size(); @@ -65,11 +84,76 @@ double scoreKernel(std::vector sequences, assert(num_models == 1); //this is temporary - // start preparing the data for the CUDA Kernel + auto sequence = sequences[0]; // temporary. We are only going to score one sequence against a set of events for now. 
+ + const uint32_t k = event_sequences[0].pore_model->k; //k is the kmerity + uint32_t n_kmers = sequence.length() - k + 1; //number of kmers in the sequence + + uint32_t n_states = PSR9_NUM_STATES * (n_kmers + 2); // + 2 for explicit terminal states + + std::vector n_rows; //number of rows in the DP table (n_events + 1) + std::vector e_starts; //event starts + + for(auto e: event_sequences){ + uint32_t e_start = e.event_start_idx; + e_starts.push_back(e_start); + uint32_t e_end = e.event_stop_idx; + uint32_t n_events = 0; + if(e_end > e_start) + n_events = e_end - e_start + 1; + else + n_events = e_start - e_end + 1; + + n_rows.push_back(n_events + 1); + } + + + // Prepare raw data and send it over to the score calculator kernel + + // Buffer 1: Raw event data and associated starts and stops + + size_t numEventsTotal = 0; + //1. Count the total number of events across all reads + std::vector eventLengths; + for (auto e: event_sequences){ + size_t numEvents = e.read->events->size(); + + eventLengths.push_back(numEvents); + numEventsTotal += numEvents; + } + + float * eventMeans; + //Allocate a host buffer to store the event means + size_t eventMeansSize = numEventsTotal * sizeof(float); + cudaHostAlloc(&eventMeans, eventMeansSize , cudaHostAllocDefault); + + size_t offset = 0; + for (auto ev: event_sequences){ + size_t num_events = ev.read->events->size(); + for (int i=0;ievents[0][i].mean; //taking the first element. Not sure what the second one is.. 
+ } + offset += num_events; + } + + + float* devicePtr; + cudaMalloc( (void**)&devicePtr, eventMeansSize); + cudaMemcpy( devicePtr, eventMeans, eventMeansSize, cudaMemcpyHostToDevice ); + + dim3 dimBlock( 1, 1 ); + dim3 dimGrid( 1, 1 ); + + float * returnValues; + cudaMalloc((void **) &returnValues, sizeof(float) * num_models); //one score per read + float * returnedValues; + getScores<<>>(devicePtr, returnValues); + cudaMemcpy(returnedValues, returnValues, num_models *sizeof(float), cudaMemcpyDeviceToHost ); - return 0.210964; + auto r = returnedValues[0]; + return r; } std::vector GpuAligner::variantScoresThresholded(std::vector input_variants, diff --git a/src/hmm/nanopolish_profile_hmm_r9.cpp b/src/hmm/nanopolish_profile_hmm_r9.cpp index 773394a7..1f365ebe 100644 --- a/src/hmm/nanopolish_profile_hmm_r9.cpp +++ b/src/hmm/nanopolish_profile_hmm_r9.cpp @@ -46,7 +46,7 @@ float profile_hmm_score_r9(const HMMInputSequence& sequence, const HMMInputData& FloatMatrix fm; allocate_matrix(fm, n_rows, n_states); - profile_hmm_forward_initialize_r9(fm); + profile_hmm_forward_initialize_r9(fm); // what does this do? 
ProfileHMMForwardOutputR9 output(&fm); diff --git a/src/hmm/nanopolish_profile_hmm_r9.inl b/src/hmm/nanopolish_profile_hmm_r9.inl index 71d52aba..76de768f 100644 --- a/src/hmm/nanopolish_profile_hmm_r9.inl +++ b/src/hmm/nanopolish_profile_hmm_r9.inl @@ -285,7 +285,7 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, // Calculate number of blocks // A block of the HMM is a set of states for one kmer - uint32_t num_blocks = output.get_num_columns() / PSR9_NUM_STATES; + uint32_t num_blocks = output.get_num_columns() / PSR9_NUM_STATES; // num_columns is the number of kmers uint32_t last_event_row_idx = output.get_num_rows() - 1; // Precompute the transition probabilites for each kmer block diff --git a/src/main/nanopolish.cpp b/src/main/nanopolish.cpp index 054f5063..cc6fcab7 100644 --- a/src/main/nanopolish.cpp +++ b/src/main/nanopolish.cpp @@ -64,6 +64,7 @@ int main(int argc, char** argv) { // Turn off HDF's exception printing, which is generally unhelpful for users H5Eset_auto(0, NULL, NULL); + std::cout << "CHECKPOINT 1\n"; int ret = 0; if(argc <= 1) { @@ -73,9 +74,11 @@ int main(int argc, char** argv) } else { std::string command(argv[1]); auto iter = programs.find(command); - if (iter != programs.end()) - ret = iter->second( argc - 1, argv + 1); - else + if (iter != programs.end()) { + std::cout << "CHECKPOINT 2: " << iter->first <second(argc - 1, argv + 1); + } + else ret = print_usage( argc - 1, argv + 1); } @@ -88,7 +91,7 @@ int main(int argc, char** argv) extern int g_failed_alignment_reads; extern int g_bad_fast5_file; if(g_total_reads > 0) { - fprintf(stderr, "[post-run summary] total reads: %d, unparseable: %d, qc fail: %d, could not calibrate: %d, no alignment: %d, bad fast5: %d\n", + fprintf(stderr, "[post-run summaryz] total reads: %d, unparseable: %d, qc fail: %d, could not calibrate: %d, no alignment: %d, bad fast5: %d\n", g_total_reads, g_unparseable_reads, g_qc_fail_reads, g_failed_calibration_reads, 
g_failed_alignment_reads, g_bad_fast5_file); } return ret; diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 053dff15..34d46ddc 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -38,6 +38,9 @@ #include "profiler.h" #include "progress.h" #include "stdaln.h" +#include +#include + // Macros #define max3(x,y,z) std::max(std::max(x,y), z) @@ -277,11 +280,18 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali int region_end, uint32_t alignment_flags) { + std::cout << "CHECKPOINT 13" << std::endl; + auto start = std::chrono::high_resolution_clock::now(); + std::vector out_variants; std::string contig = alignments.get_region_contig(); // Add all positively-scoring single-base changes into the candidate set + + + auto scoring = std::chrono::high_resolution_clock::now() - std::chrono::high_resolution_clock::now(); + for(size_t i = region_start; i < region_end; ++i) { int calling_start = i - opt::screen_flanking_sequence; @@ -335,15 +345,44 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali calling_start, alignments.get_reference_substring(contig, calling_start, calling_end)); + GpuAligner aligner; + aligner.setY(15); + std::cout << aligner.calculateSum() < scores = aligner.variantScoresThresholded(tmp_variants, test_haplotype, event_sequences, + alignment_flags, opt::screen_score_threshold, + opt::methylation_types); + for(const Variant& v : tmp_variants) { - Variant scored_variant = score_variant_thresholded(v, test_haplotype, event_sequences, alignment_flags, opt::screen_score_threshold, opt::methylation_types); + auto t0 = std::chrono::high_resolution_clock::now(); + Variant scored_variant = score_variant_thresholded(v, + test_haplotype, + event_sequences, + alignment_flags, + opt::screen_score_threshold, + opt::methylation_types); + auto t1 = std::chrono::high_resolution_clock::now(); + scoring += t1-t0; scored_variant.info = ""; 
if(scored_variant.quality > 0) { out_variants.push_back(scored_variant); } } - } + + std::cout << "CHECKPOINT 14 - Region end - start ength= " << region_end - region_start << std::endl; + + auto end = std::chrono::high_resolution_clock::now(); + + auto duration = std::chrono::duration_cast( end - start ).count(); + + auto screening = std::chrono::duration_cast(scoring).count(); + + std::cout << "FUNCTION TOOK " << duration << "ms" << std::endl; + std::cout << "SCREENING COMPONENT TOOK " << screening << "ms" << std::endl; + + + return out_variants; } @@ -894,7 +933,7 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, alignments.get_region_start(), alignments.get_reference()); */ - + std::cout<<"CHECKPOINT 8 - Data loaded"< candidate_variants; if(opt::candidates_file.empty()) { @@ -903,13 +942,16 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, candidate_variants = read_variants_for_region(opt::candidates_file, contig, region_start, region_end); } + std::cout<<"CHECKPOINT 9 - Candidate variants generated"< single_base_edits = generate_candidate_single_base_edits(alignments, region_start, region_end, alignment_flags); - + std::cout<<"CHECKPOINT 11 - Single base edits generated"< dedup_set(candidate_variants.begin(), candidate_variants.end()); @@ -918,6 +960,8 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, std::sort(candidate_variants.begin(), candidate_variants.end(), sortByPosition); } + std::cout<<"CHECKPOINT 10 - Additional candidate variants generated"< tag_fields; @@ -1187,10 +1235,14 @@ int call_variants_main(int argc, char** argv) Variant::make_vcf_tag_string("FORMAT", "GT", 1, "String", "Genotype")); + std::cout << "Checkpoint 5" << std::endl; + Variant::write_vcf_header(out_fp, tag_fields); Haplotype haplotype = call_variants_for_region(contig, start_base, end_base, out_fp); + std::cout << "Checkpoint 6" << std::endl; + if(out_fp != stdout) { fclose(out_fp); 
} From 381c3c5b7ea36707fa2dba23786b5e3bbf2f5a9e Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Mon, 11 Jun 2018 15:12:47 +0100 Subject: [PATCH 03/80] estimating emission probabilities --- Makefile | 4 +- src/cuda_kernels/GpuAligner.cu | 239 +++++++++++++++++++++++--- src/hmm/nanopolish_profile_hmm_r9.inl | 2 +- 3 files changed, 222 insertions(+), 23 deletions(-) diff --git a/Makefile b/Makefile index afe84ea1..27030a55 100644 --- a/Makefile +++ b/Makefile @@ -9,9 +9,9 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz -CXXFLAGS ?= -g -O3 +CXXFLAGS ?= -g #-O3 CXXFLAGS += -std=c++11 -fopenmp -fsigned-char -CFLAGS ?= -std=c99 -O3 +CFLAGS ?= -std=c99 #-O3 CXX ?= g++ CC ?= gcc NVCC = nvcc diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 122195e7..c0b5b798 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -4,7 +4,6 @@ #include #include "nanopolish_profile_hmm_r9.h" - __global__ void findSumToN(int *n, int limit) { //printf("HELLO FROM SUM\n"); @@ -20,16 +19,126 @@ __global__ void findSumToN(int *n, int limit) } } +//TODO: Implement, inc pore model +__device__ float lp_match_r9(int rank, + float mean, + float * poreModelLevelLogStdv, + float * poreModelLevelStdv, + float * poreModelLevelMean){ + float log_inv_sqrt_2pi = log(0.3989422804014327); // no need to calculate this every time. better solutions available.. + + // STEP 1: GET DRIFT-SCALED LEVEL: + float level = mean; //TODO: Do actual drift scaling. 
this is a cheat + // TODO: STEP 2: Get *scaled* Gaussian from pore model + //these can just be pulled from the model + //float gaussian_mean = 0.0; + //float gaussian_stdv = 0.0; + //float gaussian_log_level_stdv = 0.0; + float gaussian_mean = poreModelLevelMean[rank]; + float gaussian_stdv = poreModelLevelStdv[rank]; + float gaussian_log_level_stdv = poreModelLevelLogStdv[rank]; + // Step 3: calculate log-normal PDF + float a = (level - gaussian_mean) / gaussian_stdv; // g is the gaussian parameters + return log_inv_sqrt_2pi - gaussian_log_level_stdv + (-0.5f * a * a); // log_inv_sqrt_2pi is defined in a comment above + + return 0.1973; +} -__global__ void getScores(float * eventData, float * returnValues) +__global__ void getScores(float * eventData, + float * readEventsPerBase, + int * numRowsPerRead, + int * eventStarts, + int * eventStrides, + int * kmer_ranks, + int * kmer_ranks_rc, + int * eventOffsets, // Offset to use for getting an event IDX for a specific read (read obtained by block IDX) + float * poreModelLevelLogStdv, + float * poreModelLevelStdv, + float * poreModelLevelMean, + float * returnValues) { - int tId = threadIdx.x; - if (tId == 0) { - printf("data: %f\n", eventData[0]); - printf("data: %f\n", eventData[1]); - printf("data: %f\n", eventData[2]); + printf("Entered\n"); + //float log_inv_sqrt_2pi = log(0.3989422804014327); + + //Step 1: calculate transitions. For now we are going to use external params. + int readIdx = blockIdx.x; + float read_events_per_base = readEventsPerBase[readIdx]; + int numRows = numRowsPerRead[readIdx]; // Number of rows in this DP table. 
+ int e_start = eventStarts[readIdx]; // Event start for read + int e_stride = eventStrides[readIdx]; + int e_offset = eventOffsets[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event + //int kmer_ranks = kmerRanks[readIdx.x]; // TODO: Use RC for RC reads + + int kmerIdx = threadIdx.x; + + float p_stay = 1 - (1 / read_events_per_base); + + //printf("Events per base: %f \n", read_events_per_base); + float p_skip = 0.0025; + float p_bad = 0.001; + float p_bad_self = p_bad; + float p_skip_self = 0.3; + + float p_mk = p_skip; // probability of not observing an event at all + float p_mb = p_bad; // probabilty of observing a bad event + float p_mm_self = p_stay; // probability of observing additional events from this k-mer + float p_mm_next = 1.0f - p_mm_self - p_mk - p_mb; // normal movement from state to state + + // transitions from event split state in previous block + float p_bb = p_bad_self; + float p_bk, p_bm_next, p_bm_self; + p_bk = p_bm_next = p_bm_self = (1.0f - p_bb) / 3; + + // transitions from kmer skip state in previous block + float p_kk = p_skip_self; + float p_km = 1.0f - p_kk; + + // We assign some transition probabilities. I believe this is correct and they don't vary by location in the sequence (why would they) + float lp_mk = log(p_mk); + float lp_mb = log(p_mb); + float lp_mm_self = log(p_mm_self); + float lp_mm_next = log(p_mm_next); + float lp_bb = log(p_bb); + float lp_bk = log(p_bk); + float lp_bm_next = log(p_bm_next); + float lp_bm_self = log(p_bm_self); + float lp_kk = log(p_kk); + float lp_km = log(p_km); + + + // Start filling out the "DP table" + // Each thread is going to work on an individual P-HMM Block + // WRONG - need to use threadIdx & think carefully. we have one thread per block/kmer. each block has 3 states tho. + //int kmerIdx = blockIdx.x; + int curBlockIdx = kmerIdx + 1; // Accounts for fact that we are not working with start block. 
+ int prevBlockIdx = curBlockIdx -1; + int prevBlockOffset = PSR9_NUM_STATES * prevBlockIdx; + int curBlockOffset = PSR9_NUM_STATES * curBlockIdx; + + // the penalty is controlled by the transition probability + float BAD_EVENT_PENALTY = 0.0f; + + for(int row=1; row sequences, std::vector event_sequences, uint32_t alignment_flags){ + // Extract the pore model. + //Let's assume that every event sequence has the same pore model + //event_sequences[0].pore_model. + // These asserts are here during the development phase assert(!sequences.empty()); assert(std::string(sequences[0].get_alphabet()->get_name()) == "nucleotide"); @@ -93,10 +206,15 @@ double scoreKernel(std::vector sequences, std::vector n_rows; //number of rows in the DP table (n_events + 1) std::vector e_starts; //event starts + std::vector event_strides; for(auto e: event_sequences){ uint32_t e_start = e.event_start_idx; e_starts.push_back(e_start); + + uint32_t e_stride = e.event_stride; + event_strides.push_back(e_stride); + uint32_t e_end = e.event_stop_idx; uint32_t n_events = 0; if(e_end > e_start) @@ -107,6 +225,12 @@ double scoreKernel(std::vector sequences, n_rows.push_back(n_events + 1); } + std::vector kmer_ranks(n_kmers); + std::vector kmer_ranks_rc(n_kmers); + for(size_t ki = 0; ki < n_kmers; ++ki) { + kmer_ranks[ki] = sequences[0].get_kmer_rank(ki, k, false); + kmer_ranks_rc[ki] = sequences[0].get_kmer_rank(ki, k, true); + } // Prepare raw data and send it over to the score calculator kernel @@ -115,10 +239,14 @@ double scoreKernel(std::vector sequences, size_t numEventsTotal = 0; //1. 
Count the total number of events across all reads std::vector eventLengths; + std::vector eventsPerBase; for (auto e: event_sequences){ size_t numEvents = e.read->events->size(); + float readEventsPerBase = e.read->events_per_base[e.strand]; eventLengths.push_back(numEvents); + eventsPerBase.push_back(readEventsPerBase); + numEventsTotal += numEvents; } @@ -127,8 +255,10 @@ double scoreKernel(std::vector sequences, size_t eventMeansSize = numEventsTotal * sizeof(float); cudaHostAlloc(&eventMeans, eventMeansSize , cudaHostAllocDefault); + std::vector eventOffsets; size_t offset = 0; for (auto ev: event_sequences){ + eventOffsets.push_back(offset); size_t num_events = ev.read->events->size(); for (int i=0;ievents[0][i].mean; //taking the first element. Not sure what the second one is.. @@ -136,24 +266,93 @@ double scoreKernel(std::vector sequences, offset += num_events; } + int num_states = event_sequences[0].pore_model->states.size(); + std::vector pore_model_level_log_stdv(num_states); + std::vector pore_model_level_mean(num_states); + std::vector pore_model_level_stdv(num_states); + + for(int st=0; ststates[0]; //let's just initially get the params for AAAAAA + pore_model_level_log_stdv[st] = params.level_log_stdv; + pore_model_level_mean[st] = params.level_mean; + pore_model_level_stdv[st] = params.level_stdv; + } - float* devicePtr; - cudaMalloc( (void**)&devicePtr, eventMeansSize); - cudaMemcpy( devicePtr, eventMeans, eventMeansSize, cudaMemcpyHostToDevice ); - dim3 dimBlock( 1, 1 ); - dim3 dimGrid( 1, 1 ); + float* poreModelLevelLogStdvDev; + cudaMalloc( (void**)&poreModelLevelLogStdvDev, pore_model_level_log_stdv.size() * sizeof(float)); + cudaMemcpy( poreModelLevelLogStdvDev, pore_model_level_log_stdv.data(), pore_model_level_log_stdv.size() * sizeof(float), cudaMemcpyHostToDevice ); + + float* poreModelLevelMeanDev; + cudaMalloc( (void**)&poreModelLevelMeanDev, pore_model_level_mean.size() * sizeof(float)); + cudaMemcpy( poreModelLevelMeanDev, 
pore_model_level_mean.data(), pore_model_level_mean.size() * sizeof(float), cudaMemcpyHostToDevice ); + + float* poreModelLevelStdvDev; + cudaMalloc( (void**)&poreModelLevelStdvDev, pore_model_level_stdv.size() * sizeof(float)); + cudaMemcpy( poreModelLevelStdvDev, pore_model_level_stdv.data(), pore_model_level_stdv.size() * sizeof(float), cudaMemcpyHostToDevice ); + + + float* eventsPerBaseDev; + cudaMalloc( (void**)&eventsPerBaseDev, eventsPerBase.size() * sizeof(float)); + cudaMemcpy( eventsPerBaseDev, eventsPerBase.data(), eventsPerBase.size() * sizeof(float), cudaMemcpyHostToDevice ); + + float* eventMeansDev; + cudaMalloc( (void**)&eventMeansDev, eventMeansSize); + cudaMemcpy( eventMeansDev, eventMeans, eventMeansSize, cudaMemcpyHostToDevice ); + + int* numRowsDev; + cudaMalloc( (void**)&numRowsDev, n_rows.size() * sizeof(int)); + cudaMemcpy( numRowsDev, n_rows.data(), n_rows.size() * sizeof(int), cudaMemcpyHostToDevice ); + + int* kmerRanksDev; + int* kmerRanksRCDev; + cudaMalloc( (void**)&kmerRanksDev, kmer_ranks.size() * sizeof(int)); + cudaMalloc( (void**)&kmerRanksRCDev, kmer_ranks_rc.size() * sizeof(int)); + cudaMemcpy( kmerRanksDev, kmer_ranks.data(), kmer_ranks.size() * sizeof(int), cudaMemcpyHostToDevice ); + cudaMemcpy( kmerRanksRCDev, kmer_ranks_rc.data(), kmer_ranks_rc.size() * sizeof(int), cudaMemcpyHostToDevice ); + + int* eventStartsDev; + cudaMalloc( (void**)&eventStartsDev, e_starts.size() * sizeof(int)); + cudaMemcpy( eventStartsDev, e_starts.data(), e_starts.size() * sizeof(int), cudaMemcpyHostToDevice ); + + int* eventStridesDev; + cudaMalloc( (void**)&eventStridesDev, event_strides.size() * sizeof(int)); + cudaMemcpy( eventStridesDev, event_strides.data(), event_strides.size() * sizeof(int), cudaMemcpyHostToDevice ); + + int* eventOffsetsDev; + cudaMalloc( (void**)&eventOffsetsDev, eventOffsets.size() * sizeof(int)); + cudaMemcpy( eventOffsetsDev, eventOffsets.data(), eventOffsets.size() * sizeof(int), cudaMemcpyHostToDevice ); + + + dim3 
dimBlock(num_models); + + int num_blocks = n_states / PSR9_NUM_STATES; + uint32_t num_kmers = num_blocks - 2; // two terminal blocks + + + dim3 dimGrid(num_blocks - 2); // One thread per state, not including Start and Terminal state. float * returnValues; cudaMalloc((void **) &returnValues, sizeof(float) * num_models); //one score per read float * returnedValues; - getScores<<>>(devicePtr, returnValues); - - cudaMemcpy(returnedValues, returnValues, num_models *sizeof(float), cudaMemcpyDeviceToHost ); - - auto r = returnedValues[0]; - return r; + getScores<<>>(eventMeansDev, + eventsPerBaseDev, + numRowsDev, + eventStartsDev, + eventStridesDev, + kmerRanksDev, + kmerRanksRCDev, + eventOffsetsDev, + poreModelLevelLogStdvDev, + poreModelLevelStdvDev, + poreModelLevelMeanDev, + returnValues); + + //cudaMemcpy(returnedValues, returnValues, num_models *sizeof(float), cudaMemcpyDeviceToHost ); + + //auto r = returnedValues[0]; + return 0.0; } std::vector GpuAligner::variantScoresThresholded(std::vector input_variants, @@ -202,4 +401,4 @@ std::vector GpuAligner::variantScoresThresholded(std::vector in std::vector v; v.push_back(base_score); return v; -} \ No newline at end of file +} diff --git a/src/hmm/nanopolish_profile_hmm_r9.inl b/src/hmm/nanopolish_profile_hmm_r9.inl index 76de768f..bd8ce1e6 100644 --- a/src/hmm/nanopolish_profile_hmm_r9.inl +++ b/src/hmm/nanopolish_profile_hmm_r9.inl @@ -285,7 +285,7 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, // Calculate number of blocks // A block of the HMM is a set of states for one kmer - uint32_t num_blocks = output.get_num_columns() / PSR9_NUM_STATES; // num_columns is the number of kmers + uint32_t num_blocks = output.get_num_columns() / PSR9_NUM_STATES; // num_columns is the number of HMM STATES uint32_t last_event_row_idx = output.get_num_rows() - 1; // Precompute the transition probabilites for each kmer block From 8b74a57fcbec1a06f4a9ddb1fc9e0aff605d99dd Mon Sep 17 00:00:00 2001 From: 
Mike Vella Date: Mon, 11 Jun 2018 16:21:05 +0100 Subject: [PATCH 04/80] estimating emission probabilities --- src/cuda_kernels/GpuAligner.cu | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index c0b5b798..819511c1 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -25,15 +25,13 @@ __device__ float lp_match_r9(int rank, float * poreModelLevelLogStdv, float * poreModelLevelStdv, float * poreModelLevelMean){ + float log_inv_sqrt_2pi = log(0.3989422804014327); // no need to calculate this every time. better solutions available.. // STEP 1: GET DRIFT-SCALED LEVEL: float level = mean; //TODO: Do actual drift scaling. this is a cheat - // TODO: STEP 2: Get *scaled* Gaussian from pore model + // TODO: Apply scaling to these 3 model values as is done in the CPP implementation //these can just be pulled from the model - //float gaussian_mean = 0.0; - //float gaussian_stdv = 0.0; - //float gaussian_log_level_stdv = 0.0; float gaussian_mean = poreModelLevelMean[rank]; float gaussian_stdv = poreModelLevelStdv[rank]; float gaussian_log_level_stdv = poreModelLevelLogStdv[rank]; @@ -57,7 +55,13 @@ __global__ void getScores(float * eventData, float * poreModelLevelMean, float * returnValues) { - printf("Entered\n"); + int MAX_STATES=1024; + // kmer probabilities will be stored here + __shared__ float prevProbabilities[MAX_STATES]; + for (int i =0;i Date: Mon, 11 Jun 2018 17:56:08 +0100 Subject: [PATCH 05/80] kermel Executing to completion but incomplete -WIP --- Makefile | 4 +- src/cuda_kernels/GpuAligner.cu | 89 +++++++++++++-------------- src/hmm/nanopolish_profile_hmm_r9.inl | 1 + src/nanopolish_call_variants.cpp | 12 ++-- 4 files changed, 53 insertions(+), 53 deletions(-) diff --git a/Makefile b/Makefile index 27030a55..afe84ea1 100644 --- a/Makefile +++ b/Makefile @@ -9,9 +9,9 @@ SUBDIRS := src src/hmm src/thirdparty 
src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz -CXXFLAGS ?= -g #-O3 +CXXFLAGS ?= -g -O3 CXXFLAGS += -std=c++11 -fopenmp -fsigned-char -CFLAGS ?= -std=c99 #-O3 +CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 819511c1..d369d030 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -4,19 +4,10 @@ #include #include "nanopolish_profile_hmm_r9.h" -__global__ void findSumToN(int *n, int limit) -{ - //printf("HELLO FROM SUM\n"); - int tId = threadIdx.x; - - for (int i=0; i<=(int)log2((double)limit); i++) - { - if (tId%(int)(pow(2.0,(double)(i+1))) == 0){ - if (tId+(int)pow(2.0, (double)i) >= limit) break; - n[tId] += n[tId+(int)pow(2.0, (double)i)]; - } - __syncthreads(); - } +#define MAX_STATES 1024 + +__device__ float logsumexpf(float x, float y){ + return fmax(x, y) + log1pf(expf(-fabsf(y-x))); } //TODO: Implement, inc pore model @@ -38,8 +29,6 @@ __device__ float lp_match_r9(int rank, // Step 3: calculate log-normal PDF float a = (level - gaussian_mean) / gaussian_stdv; // g is the gaussian parameters return log_inv_sqrt_2pi - gaussian_log_level_stdv + (-0.5f * a * a); // log_inv_sqrt_2pi is defined in a comment above - - return 0.1973; } __global__ void getScores(float * eventData, @@ -55,7 +44,6 @@ __global__ void getScores(float * eventData, float * poreModelLevelMean, float * returnValues) { - int MAX_STATES=1024; // kmer probabilities will be stored here __shared__ float prevProbabilities[MAX_STATES]; for (int i =0;i>>(n_d, y); - cudaMemcpy(n, n_d, asize, cudaMemcpyDeviceToHost); - cudaFree (n_d); - return n[0]; -} - -void GpuAligner::setY(int newVal) -{ - y = newVal; - asize = y*sizeof(int); - for (int i=0; i sequences, std::vector event_sequences, uint32_t alignment_flags){ @@ -184,6 +171,7 @@ double scoreKernel(std::vector sequences, //Let's assume that every event sequence has the same pore model 
//event_sequences[0].pore_model. + int num_reads = event_sequences.size(); // These asserts are here during the development phase assert(!sequences.empty()); assert(std::string(sequences[0].get_alphabet()->get_name()) == "nucleotide"); @@ -325,7 +313,7 @@ double scoreKernel(std::vector sequences, cudaMemcpy( eventOffsetsDev, eventOffsets.data(), eventOffsets.size() * sizeof(int), cudaMemcpyHostToDevice ); - dim3 dimBlock(num_models); + dim3 dimBlock(num_reads); int num_blocks = n_states / PSR9_NUM_STATES; uint32_t num_kmers = num_blocks - 2; // two terminal blocks @@ -334,9 +322,11 @@ double scoreKernel(std::vector sequences, dim3 dimGrid(num_blocks - 2); // One thread per state, not including Start and Terminal state. float * returnValues; - cudaMalloc((void **) &returnValues, sizeof(float) * num_models); //one score per read + cudaMalloc((void **) &returnValues, sizeof(float) * num_reads); //one score per read + + //TODO: this should be a cuda memalloc + float* returnedValues = new float[num_reads]; - float * returnedValues; getScores<<>>(eventMeansDev, eventsPerBaseDev, numRowsDev, @@ -350,10 +340,15 @@ double scoreKernel(std::vector sequences, poreModelLevelMeanDev, returnValues); - //cudaMemcpy(returnedValues, returnValues, num_models *sizeof(float), cudaMemcpyDeviceToHost ); + cudaMemcpy(returnedValues, returnValues, num_reads *sizeof(float), cudaMemcpyDeviceToHost); + + float r = 0.0; + for(int i=0; i GpuAligner::variantScoresThresholded(std::vector input_variants, diff --git a/src/hmm/nanopolish_profile_hmm_r9.inl b/src/hmm/nanopolish_profile_hmm_r9.inl index bd8ce1e6..6b06e633 100644 --- a/src/hmm/nanopolish_profile_hmm_r9.inl +++ b/src/hmm/nanopolish_profile_hmm_r9.inl @@ -369,6 +369,7 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, scores.x[HMT_FROM_SOFT] = -INFINITY; output.update_cell(row, curr_block_offset + PSR9_BAD_EVENT, scores, lp_emission_b); + // in cu this is where the shared memory sync on prev states would go. 
// state PSR9_KMER_SKIP scores.x[HMT_FROM_SAME_M] = -INFINITY; scores.x[HMT_FROM_PREV_M] = bt.lp_mk + output.get(row, prev_block_offset + PSR9_MATCH); diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 34d46ddc..dd01261a 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -291,6 +291,7 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali auto scoring = std::chrono::high_resolution_clock::now() - std::chrono::high_resolution_clock::now(); + auto gpu_exec = std::chrono::high_resolution_clock::now() - std::chrono::high_resolution_clock::now(); for(size_t i = region_start; i < region_end; ++i) { @@ -346,12 +347,12 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali alignments.get_reference_substring(contig, calling_start, calling_end)); GpuAligner aligner; - aligner.setY(15); - std::cout << aligner.calculateSum() < scores = aligner.variantScoresThresholded(tmp_variants, test_haplotype, event_sequences, alignment_flags, opt::screen_score_threshold, opt::methylation_types); + auto tf_gpu = std::chrono::high_resolution_clock::now(); + gpu_exec = tf_gpu - t0_gpu; for(const Variant& v : tmp_variants) { auto t0 = std::chrono::high_resolution_clock::now(); @@ -378,8 +379,11 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali auto screening = std::chrono::duration_cast(scoring).count(); + auto gpu_screening = std::chrono::duration_cast(gpu_exec).count(); + std::cout << "FUNCTION TOOK " << duration << "ms" << std::endl; - std::cout << "SCREENING COMPONENT TOOK " << screening << "ms" << std::endl; + std::cout << "SCREENING (CPU) COMPONENT TOOK " << screening << "ms" << std::endl; + std::cout << "SCREENING (GPU) COMPONENT TOOK " << gpu_screening << "ms" << std::endl; From 2fb2b0b415a1bfc78df2186b2bc9169f0fa3bb09 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Tue, 12 Jun 2018 15:47:31 +0100 Subject: [PATCH 06/80] Sending correct sequences to 
GPU --- src/cuda_kernels/GpuAligner.cu | 81 ++++++++++++++++++-------------- src/nanopolish_call_variants.cpp | 3 +- 2 files changed, 48 insertions(+), 36 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index d369d030..7cfb5c11 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -239,8 +239,8 @@ double scoreKernel(std::vector sequences, numEventsTotal += numEvents; } - float * eventMeans; //Allocate a host buffer to store the event means + float * eventMeans; size_t eventMeansSize = numEventsTotal * sizeof(float); cudaHostAlloc(&eventMeans, eventMeansSize , cudaHostAllocDefault); @@ -256,6 +256,7 @@ double scoreKernel(std::vector sequences, } int num_states = event_sequences[0].pore_model->states.size(); + std::vector pore_model_level_log_stdv(num_states); std::vector pore_model_level_mean(num_states); std::vector pore_model_level_stdv(num_states); @@ -270,47 +271,47 @@ double scoreKernel(std::vector sequences, float* poreModelLevelLogStdvDev; cudaMalloc( (void**)&poreModelLevelLogStdvDev, pore_model_level_log_stdv.size() * sizeof(float)); - cudaMemcpy( poreModelLevelLogStdvDev, pore_model_level_log_stdv.data(), pore_model_level_log_stdv.size() * sizeof(float), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( poreModelLevelLogStdvDev, pore_model_level_log_stdv.data(), pore_model_level_log_stdv.size() * sizeof(float), cudaMemcpyHostToDevice ); float* poreModelLevelMeanDev; cudaMalloc( (void**)&poreModelLevelMeanDev, pore_model_level_mean.size() * sizeof(float)); - cudaMemcpy( poreModelLevelMeanDev, pore_model_level_mean.data(), pore_model_level_mean.size() * sizeof(float), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( poreModelLevelMeanDev, pore_model_level_mean.data(), pore_model_level_mean.size() * sizeof(float), cudaMemcpyHostToDevice ); float* poreModelLevelStdvDev; cudaMalloc( (void**)&poreModelLevelStdvDev, pore_model_level_stdv.size() * sizeof(float)); - cudaMemcpy( poreModelLevelStdvDev, 
pore_model_level_stdv.data(), pore_model_level_stdv.size() * sizeof(float), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( poreModelLevelStdvDev, pore_model_level_stdv.data(), pore_model_level_stdv.size() * sizeof(float), cudaMemcpyHostToDevice ); float* eventsPerBaseDev; cudaMalloc( (void**)&eventsPerBaseDev, eventsPerBase.size() * sizeof(float)); - cudaMemcpy( eventsPerBaseDev, eventsPerBase.data(), eventsPerBase.size() * sizeof(float), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( eventsPerBaseDev, eventsPerBase.data(), eventsPerBase.size() * sizeof(float), cudaMemcpyHostToDevice ); float* eventMeansDev; cudaMalloc( (void**)&eventMeansDev, eventMeansSize); - cudaMemcpy( eventMeansDev, eventMeans, eventMeansSize, cudaMemcpyHostToDevice ); + cudaMemcpyAsync( eventMeansDev, eventMeans, eventMeansSize, cudaMemcpyHostToDevice ); //malloc is taking 300us int* numRowsDev; cudaMalloc( (void**)&numRowsDev, n_rows.size() * sizeof(int)); - cudaMemcpy( numRowsDev, n_rows.data(), n_rows.size() * sizeof(int), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( numRowsDev, n_rows.data(), n_rows.size() * sizeof(int), cudaMemcpyHostToDevice ); int* kmerRanksDev; int* kmerRanksRCDev; cudaMalloc( (void**)&kmerRanksDev, kmer_ranks.size() * sizeof(int)); cudaMalloc( (void**)&kmerRanksRCDev, kmer_ranks_rc.size() * sizeof(int)); - cudaMemcpy( kmerRanksDev, kmer_ranks.data(), kmer_ranks.size() * sizeof(int), cudaMemcpyHostToDevice ); - cudaMemcpy( kmerRanksRCDev, kmer_ranks_rc.data(), kmer_ranks_rc.size() * sizeof(int), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( kmerRanksDev, kmer_ranks.data(), kmer_ranks.size() * sizeof(int), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( kmerRanksRCDev, kmer_ranks_rc.data(), kmer_ranks_rc.size() * sizeof(int), cudaMemcpyHostToDevice ); int* eventStartsDev; cudaMalloc( (void**)&eventStartsDev, e_starts.size() * sizeof(int)); - cudaMemcpy( eventStartsDev, e_starts.data(), e_starts.size() * sizeof(int), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( eventStartsDev, 
e_starts.data(), e_starts.size() * sizeof(int), cudaMemcpyHostToDevice ); int* eventStridesDev; cudaMalloc( (void**)&eventStridesDev, event_strides.size() * sizeof(int)); - cudaMemcpy( eventStridesDev, event_strides.data(), event_strides.size() * sizeof(int), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( eventStridesDev, event_strides.data(), event_strides.size() * sizeof(int), cudaMemcpyHostToDevice ); int* eventOffsetsDev; cudaMalloc( (void**)&eventOffsetsDev, eventOffsets.size() * sizeof(int)); - cudaMemcpy( eventOffsetsDev, eventOffsets.data(), eventOffsets.size() * sizeof(int), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( eventOffsetsDev, eventOffsets.data(), eventOffsets.size() * sizeof(int), cudaMemcpyHostToDevice ); dim3 dimBlock(num_reads); @@ -325,7 +326,9 @@ double scoreKernel(std::vector sequences, cudaMalloc((void **) &returnValues, sizeof(float) * num_reads); //one score per read //TODO: this should be a cuda memalloc - float* returnedValues = new float[num_reads]; + float* returnedValues;// = new float[num_reads]; + //size_t eventMeansSize = numEventsTotal * sizeof(float); + cudaHostAlloc(&returnedValues, num_reads * sizeof(float) , cudaHostAllocDefault); getScores<<>>(eventMeansDev, eventsPerBaseDev, @@ -340,14 +343,31 @@ double scoreKernel(std::vector sequences, poreModelLevelMeanDev, returnValues); - cudaMemcpy(returnedValues, returnValues, num_reads *sizeof(float), cudaMemcpyDeviceToHost); + //cudaDeviceSynchronize(); + cudaMemcpyAsync(returnedValues, returnValues, num_reads *sizeof(float), cudaMemcpyDeviceToHost); + + // Free device memory + cudaFree(eventMeansDev); + cudaFree(eventsPerBaseDev); + cudaFree(numRowsDev); + cudaFree(eventStartsDev); + cudaFree(eventStridesDev); + cudaFree(kmerRanksDev); + cudaFree(kmerRanksRCDev); + cudaFree(eventOffsetsDev); + cudaFree(poreModelLevelLogStdvDev); + cudaFree(poreModelLevelStdvDev); + cudaFree(poreModelLevelMeanDev); + + + //Free host memory + cudaFreeHost(eventMeans); float r = 0.0; for(int i=0; i 
GpuAligner::variantScoresThresholded(std::vector in variant_haplotypes[i].apply_variant(input_variants[i]); } - - //variant_haplotype.apply_variant(input_variant); - // Make methylated versions of each input sequence. Once for the base haplotype and once each for each variant std::vector base_sequences = generate_methylated_alternatives(base_haplotype.get_sequence(), methylation_types); - - assert(base_sequences.size() == 1); - - std::vector> methylatedVariantSequences; - for(auto variant: variant_haplotypes) { - std::vector variant_sequences = generate_methylated_alternatives( - variant.get_sequence(), methylation_types); - methylatedVariantSequences.push_back(variant_sequences); - + std::vector> variant_sequences; + for (auto v: variant_haplotypes){ + auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types); + variant_sequences.push_back(variant_sequence); } - //For now let's not worry about methylation - assert(methylatedVariantSequences.size() == numVariants); - for (auto m: methylatedVariantSequences) { - assert(m.size() == 1); - } - //Next we need to get the scores. 
+ assert(base_sequences.size() == 1); // return the sum of the score for the base sequences over all the event sequences double base_score = scoreKernel(base_sequences, event_sequences, alignment_flags); - std::vector v; - v.push_back(base_score); + std::vector v(variant_sequences.size()); + for (int i=0; i generate_candidate_single_base_edits(const AlignmentDB& ali auto start = std::chrono::high_resolution_clock::now(); std::vector out_variants; + std::vector out_variants_gpu; std::string contig = alignments.get_region_contig(); @@ -352,7 +353,7 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali alignment_flags, opt::screen_score_threshold, opt::methylation_types); auto tf_gpu = std::chrono::high_resolution_clock::now(); - gpu_exec = tf_gpu - t0_gpu; + gpu_exec += tf_gpu - t0_gpu; for(const Variant& v : tmp_variants) { auto t0 = std::chrono::high_resolution_clock::now(); From b614313a7a66b68cbf1e2021c87f51b02835e894 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Tue, 12 Jun 2018 16:00:57 +0100 Subject: [PATCH 07/80] Correct grid size --- src/cuda_kernels/GpuAligner.cu | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 7cfb5c11..50400979 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -313,19 +313,15 @@ double scoreKernel(std::vector sequences, cudaMalloc( (void**)&eventOffsetsDev, eventOffsets.size() * sizeof(int)); cudaMemcpyAsync( eventOffsetsDev, eventOffsets.data(), eventOffsets.size() * sizeof(int), cudaMemcpyHostToDevice ); - - dim3 dimBlock(num_reads); - int num_blocks = n_states / PSR9_NUM_STATES; uint32_t num_kmers = num_blocks - 2; // two terminal blocks - - dim3 dimGrid(num_blocks - 2); // One thread per state, not including Start and Terminal state. + dim3 dimBlock(num_blocks - 2); + dim3 dimGrid(1); // One thread per state, not including Start and Terminal state. 
float * returnValues; cudaMalloc((void **) &returnValues, sizeof(float) * num_reads); //one score per read - //TODO: this should be a cuda memalloc float* returnedValues;// = new float[num_reads]; //size_t eventMeansSize = numEventsTotal * sizeof(float); cudaHostAlloc(&returnedValues, num_reads * sizeof(float) , cudaHostAllocDefault); @@ -359,7 +355,6 @@ double scoreKernel(std::vector sequences, cudaFree(poreModelLevelStdvDev); cudaFree(poreModelLevelMeanDev); - //Free host memory cudaFreeHost(eventMeans); From aa43bc92a755e6124327e7aec34f32444e943604 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Fri, 15 Jun 2018 10:14:40 +0100 Subject: [PATCH 08/80] Match state almost working --- Makefile | 4 +- src/cuda_kernels/GpuAligner.cu | 122 +++++++++++++++++++++----- src/hmm/nanopolish_emissions.h | 11 ++- src/hmm/nanopolish_profile_hmm_r9.inl | 41 +++++++-- 4 files changed, 149 insertions(+), 29 deletions(-) diff --git a/Makefile b/Makefile index afe84ea1..199f5d1d 100644 --- a/Makefile +++ b/Makefile @@ -9,9 +9,9 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz -CXXFLAGS ?= -g -O3 +CXXFLAGS ?= -g -Og CXXFLAGS += -std=c++11 -fopenmp -fsigned-char -CFLAGS ?= -std=c99 -O3 +CFLAGS ?= -std=c99 #-O3 CXX ?= g++ CC ?= gcc NVCC = nvcc diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 50400979..4c851014 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -7,7 +7,11 @@ #define MAX_STATES 1024 __device__ float logsumexpf(float x, float y){ - return fmax(x, y) + log1pf(expf(-fabsf(y-x))); + if(x == -INFINITY && y == -INFINITY){ + return -INFINITY; + } + float result = fmax(x, y) + log1pf(expf(-fabsf(y - x))); + return result; } //TODO: Implement, inc pore model @@ -15,7 +19,8 @@ __device__ float lp_match_r9(int rank, float mean, float * poreModelLevelLogStdv, float * poreModelLevelStdv, - float * poreModelLevelMean){ + float * 
poreModelLevelMean, + bool debug = false){ float log_inv_sqrt_2pi = log(0.3989422804014327); // no need to calculate this every time. better solutions available.. @@ -28,7 +33,23 @@ __device__ float lp_match_r9(int rank, float gaussian_log_level_stdv = poreModelLevelLogStdv[rank]; // Step 3: calculate log-normal PDF float a = (level - gaussian_mean) / gaussian_stdv; // g is the gaussian parameters - return log_inv_sqrt_2pi - gaussian_log_level_stdv + (-0.5f * a * a); // log_inv_sqrt_2pi is defined in a comment above + + float emission = log_inv_sqrt_2pi - gaussian_log_level_stdv + (-0.5f * a * a); // log_inv_sqrt_2pi is defined in a comment above + + if (debug == true) { + if (threadIdx.x == 0) { + printf(">GPU: kmer rank is %i\n", rank); + printf(">GPU: level %f\n", level); + printf(">GPU: gaussian mean %f\n", gaussian_mean); + printf(">GPU: gaussian stdv %f\n", gaussian_stdv); + printf(">GPU: gaussian log level stdv %f\n", gaussian_log_level_stdv); + printf(">GPU a: %f\n", a); + printf(">GPU emission: %f\n", emission); + } + } + + return emission; // log_inv_sqrt_2pi is defined in a comment above + } __global__ void getScores(float * eventData, @@ -42,16 +63,16 @@ __global__ void getScores(float * eventData, float * poreModelLevelLogStdv, float * poreModelLevelStdv, float * poreModelLevelMean, - float * returnValues) -{ - // kmer probabilities will be stored here + float * returnValues) { + + // Initialise the prev probability row, which is the row of the DP table + + int n_states = blockDim.x * PSR9_NUM_STATES + 2 * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. 
__shared__ float prevProbabilities[MAX_STATES]; - for (int i =0;iGPU e_start %i\n", e_start); + } int kmerIdx = threadIdx.x; + uint32_t rank = kmer_ranks[kmerIdx]; // lexical rank of a kmer + printf("Kmer idx %i, Rank: %i\n", kmerIdx, rank); float p_stay = 1 - (1 / read_events_per_base); - - //printf("Events per base: %f \n", read_events_per_base); float p_skip = 0.0025; float p_bad = 0.001; float p_bad_self = p_bad; @@ -97,6 +125,8 @@ __global__ void getScores(float * eventData, float lp_kk = log(p_kk); float lp_km = log(p_km); + float lp_sm, lp_ms; + lp_sm = lp_ms = 0.0f; // Start filling out the "DP table" // Each thread is going to work on an individual P-HMM Block @@ -113,13 +143,20 @@ __global__ void getScores(float * eventData, for(int row=1; rowGPU score HMT_FROM_SAME_M is %f\n", HMT_FROM_SAME_M); + printf(">GPU score HMT_FROM_PREV_M is %f\n", HMT_FROM_PREV_M); + printf(">GPU score HMT_FROM_SAME_B is %f\n", HMT_FROM_SAME_B); + printf(">GPU score HMT_FROM_PREV_B is %f\n", HMT_FROM_PREV_B); + printf(">GPU score HMT_FROM_PREV_K is %f\n", HMT_FROM_PREV_K); + } + // m_s is the probability of going from the start state // to this kmer. The start state is (currently) only // allowed to go to the first kmer. If ALLOW_PRE_CLIP @@ -136,17 +195,40 @@ __global__ void getScores(float * eventData, // with a penalty; // TODO: Implemnet the HMT_FROM_SOFT score. this appears needed but I don't yet understand it. 
- // NOW calculate the score + // calculate the score float sum = HMT_FROM_SAME_M; + + sum = logsumexpf(sum, HMT_FROM_SOFT); + if (debug == true){ + printf("Sum1 is : %f\n", sum); + } sum = logsumexpf(sum, HMT_FROM_PREV_M); + if (debug == true){ + printf("Sum2 is : %f\n", sum); + } + sum = logsumexpf(sum, HMT_FROM_SAME_B); sum = logsumexpf(sum, HMT_FROM_PREV_B); + if (debug == true){ + printf("Sum3 is : %f\n", sum); + } + sum = logsumexpf(sum, HMT_FROM_PREV_K); sum += lp_emission_m; + if (debug == true){ + printf("Sum4 is : %f\n", sum); + } __syncthreads(); - prevProbabilities[curBlockIdx + PSR9_MATCH] = sum; + prevProbabilities[curBlockOffset + PSR9_MATCH] = sum; __syncthreads(); + + if ((threadIdx.x == 0) && (row == 1)) { + printf("Number of states is %i\n", n_states); + for (int c = 0; c < n_states; c++) { + printf("GPU> Value for row 1 and col %i is %f\n", c, prevProbabilities[c]); + } + } } @@ -386,10 +468,10 @@ std::vector GpuAligner::variantScoresThresholded(std::vector in std::vector base_sequences = generate_methylated_alternatives(base_haplotype.get_sequence(), methylation_types); std::vector> variant_sequences; - for (auto v: variant_haplotypes){ - auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types); - variant_sequences.push_back(variant_sequence); - } + //for (auto v: variant_haplotypes){ + // auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types); + // variant_sequences.push_back(variant_sequence); + //} assert(base_sequences.size() == 1); diff --git a/src/hmm/nanopolish_emissions.h b/src/hmm/nanopolish_emissions.h index f9e85142..599a24fb 100644 --- a/src/hmm/nanopolish_emissions.h +++ b/src/hmm/nanopolish_emissions.h @@ -58,11 +58,20 @@ inline float log_probability_match_r9(const SquiggleRead& read, const PoreModel& pore_model, uint32_t kmer_rank, uint32_t event_idx, - uint8_t strand) + uint8_t strand, + bool debug = false) { // event level mean, scaled with the drift 
value float level = read.get_drift_scaled_level(event_idx, strand); + GaussianParameters gp = read.get_scaled_gaussian_from_pore_model_state(pore_model, strand, kmer_rank); + if (debug == true) { + printf(">CPU kmer_rank is: %i\n", kmer_rank); + printf(">CPU level is: %f\n", level); + printf(">CPU gaussian mean: %f\n", gp.mean); + printf(">CPU gaussian stdv: %f\n", gp.stdv); + printf(">CPU gaussian log_level_stdv: %f\n", gp.log_stdv); + } float lp = log_normal_pdf(level, gp); return lp; } diff --git a/src/hmm/nanopolish_profile_hmm_r9.inl b/src/hmm/nanopolish_profile_hmm_r9.inl index 6b06e633..d15161fe 100644 --- a/src/hmm/nanopolish_profile_hmm_r9.inl +++ b/src/hmm/nanopolish_profile_hmm_r9.inl @@ -216,7 +216,7 @@ inline std::vector make_pre_flanking(const HMMInputData& data, pre_flank[i] = log(TRANS_CLIP_SELF) + log_probability_background(*data.read, event_idx, data.strand) + // emit from background pre_flank[i - 1]; // this accounts for the transition from the start & to the silent pre - + } return pre_flank; @@ -261,7 +261,7 @@ inline std::vector make_post_flanking(const HMMInputData& data, template inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, const HMMInputData& _data, - const uint32_t, + const uint32_t, //e_start apparently not used by this function uint32_t flags, ProfileHMMOutput& output) { @@ -282,7 +282,8 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, #endif uint32_t e_start = data.event_start_idx; - + + printf(">CPU e_start: %i\n", e_start); // Calculate number of blocks // A block of the HMM is a set of states for one kmer uint32_t num_blocks = output.get_num_columns() / PSR9_NUM_STATES; // num_columns is the number of HMM STATES @@ -301,8 +302,11 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, assert( data.pore_model->states.size() == sequence.get_num_kmer_ranks(k) ); std::vector kmer_ranks(num_kmers); - for(size_t ki = 0; ki < num_kmers; ++ki) - 
kmer_ranks[ki] = sequence.get_kmer_rank(ki, k, data.rc); + for(size_t ki = 0; ki < num_kmers; ++ki) { + int kr = sequence.get_kmer_rank(ki, k, data.rc); + printf("Kmer rank: %i\n", kr); + kmer_ranks[ki] = kr; + } size_t num_events = output.get_num_rows() - 1; @@ -337,7 +341,7 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, // Emission probabilities uint32_t event_idx = e_start + (row - 1) * data.event_stride; uint32_t rank = kmer_ranks[kmer_idx]; - float lp_emission_m = log_probability_match_r9(*data.read, *data.pore_model, rank, event_idx, data.strand); + float lp_emission_m = log_probability_match_r9(*data.read, *data.pore_model, rank, event_idx, data.strand, true); float lp_emission_b = BAD_EVENT_PENALTY; HMMUpdateScores scores; @@ -360,6 +364,20 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, output.update_cell(row, curr_block_offset + PSR9_MATCH, scores, lp_emission_m); + printf("======\n"); + //diagnostics - after match has been applied + if (row == 1) { + auto nc = output.get_num_columns(); + //for (int i = 0; i < nc; i++) { + // printf("CPU> Value for row 0 col %i is %f\n", i, output.get(0, i)); + //} + for (int i = 0; i < nc; i++) { + printf("CPU> Value for row 1 col %i is %f\n", i, output.get(1, i)); + } + } + + + // state PSR9_BAD_EVENT scores.x[HMT_FROM_SAME_M] = bt.lp_mb + output.get(row - 1, curr_block_offset + PSR9_MATCH); scores.x[HMT_FROM_PREV_M] = -INFINITY; // not allowed @@ -369,6 +387,16 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, scores.x[HMT_FROM_SOFT] = -INFINITY; output.update_cell(row, curr_block_offset + PSR9_BAD_EVENT, scores, lp_emission_b); + if ((block == 1) && (row == 1)){ //blcok 1 corresponds to threadIdx 0 on GPU + printf("lp_emission_m is %f\n", lp_emission_m); + printf("PSR9_MATCH is %i\n", PSR9_MATCH); + printf(">CPU score HMT_FROM_SAME_M is %f\n", scores.x[HMT_FROM_SAME_M]); + printf(">CPU score HMT_FROM_PREV_M is %f\n", 
scores.x[HMT_FROM_PREV_M]); + printf(">CPU score HMT_FROM_SAME_B is %f\n", scores.x[HMT_FROM_SAME_B]); + printf(">CPU score HMT_FROM_PREV_B is %f\n", scores.x[HMT_FROM_PREV_B]); + printf(">CPU score HMT_FROM_PREV_K is %f\n", scores.x[HMT_FROM_PREV_K]); + } + // in cu this is where the shared memory sync on prev states would go. // state PSR9_KMER_SKIP scores.x[HMT_FROM_SAME_M] = -INFINITY; @@ -425,6 +453,7 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, #endif } } + return output.get_end(); } From a4dbf437da740ed0ee490833ba95aaea35873c20 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Fri, 15 Jun 2018 17:03:32 +0100 Subject: [PATCH 09/80] All states now being updated, but no terminal kmer or scaling --- src/cuda_kernels/GpuAligner.cu | 107 ++++++++++++++++++++------ src/hmm/nanopolish_profile_hmm_r9.inl | 48 ++++++------ 2 files changed, 105 insertions(+), 50 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 4c851014..3bc3bcdf 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -173,28 +173,6 @@ __global__ void getScores(float * eventData, // with a penalty; float HMT_FROM_SOFT = (kmerIdx == 0 && (event_idx == e_start)) ? lp_sm : -INFINITY; // TODO: Add the pre-flank to this calculation. 
Also flags and HAF_ALLOW_PRE_CLIP - if ((threadIdx.x == 0) && (row == 1)){ - printf("rank %i\n", rank); - printf("event mean %f\n", event_mean); - printf("poreModelLevelLogStdv %f\n", poreModelLevelLogStdv); - printf("poreModelLevelStdv %f\n", poreModelLevelStdv); - printf("poreModelLevelMean %f\n", poreModelLevelMean); - printf("lp_emission_m is %f\n", lp_emission_m); - printf("PSR9_MATCH is %i\n", PSR9_MATCH); - printf(">GPU score HMT_FROM_SAME_M is %f\n", HMT_FROM_SAME_M); - printf(">GPU score HMT_FROM_PREV_M is %f\n", HMT_FROM_PREV_M); - printf(">GPU score HMT_FROM_SAME_B is %f\n", HMT_FROM_SAME_B); - printf(">GPU score HMT_FROM_PREV_B is %f\n", HMT_FROM_PREV_B); - printf(">GPU score HMT_FROM_PREV_K is %f\n", HMT_FROM_PREV_K); - } - - // m_s is the probability of going from the start state - // to this kmer. The start state is (currently) only - // allowed to go to the first kmer. If ALLOW_PRE_CLIP - // is defined, we allow all events before this one to be skipped, - // with a penalty; - // TODO: Implemnet the HMT_FROM_SOFT score. this appears needed but I don't yet understand it. 
- // calculate the score float sum = HMT_FROM_SAME_M; @@ -219,14 +197,93 @@ __global__ void getScores(float * eventData, printf("Sum4 is : %f\n", sum); } + float newMatchScore = sum; + // Here need to calculate the bad event score + + // state PSR9_BAD_EVENT + HMT_FROM_SAME_M = lp_mb + prevProbabilities[curBlockOffset + PSR9_MATCH]; + HMT_FROM_PREV_M = -INFINITY; // not allowed + HMT_FROM_SAME_B = lp_bb + prevProbabilities[curBlockOffset + PSR9_BAD_EVENT]; + HMT_FROM_PREV_B = -INFINITY; + HMT_FROM_PREV_K = -INFINITY; + HMT_FROM_SOFT = -INFINITY; + + sum = HMT_FROM_SAME_M; + sum = logsumexpf(sum, HMT_FROM_PREV_M); + sum = logsumexpf(sum, HMT_FROM_SAME_B); + sum = logsumexpf(sum, HMT_FROM_PREV_B); + sum = logsumexpf(sum, HMT_FROM_PREV_K); + sum = logsumexpf(sum, HMT_FROM_SOFT); + sum += lp_emission_b; + + float newBadEventScore = sum; + + // Write row out + prevProbabilities[curBlockOffset + PSR9_MATCH] = newMatchScore; + prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] = newBadEventScore; __syncthreads(); - prevProbabilities[curBlockOffset + PSR9_MATCH] = sum; + + // state PSR9_KMER_SKIP + HMT_FROM_SAME_M = -INFINITY; + HMT_FROM_PREV_M = lp_mk + prevProbabilities[prevBlockOffset + PSR9_MATCH]; + HMT_FROM_SAME_B = -INFINITY; + HMT_FROM_PREV_B = lp_bk + prevProbabilities[prevBlockOffset + PSR9_BAD_EVENT]; + + HMT_FROM_SOFT = -INFINITY; + + sum = HMT_FROM_SAME_M; + sum = logsumexpf(sum, HMT_FROM_PREV_M); + sum = logsumexpf(sum, HMT_FROM_SAME_B); + sum = logsumexpf(sum, HMT_FROM_PREV_B); + sum = logsumexpf(sum, HMT_FROM_PREV_K); + sum = logsumexpf(sum, HMT_FROM_SOFT); + sum += 0.0;//No emission. redundant. + + float newSkipScore = sum; + + prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] = newSkipScore; __syncthreads(); - if ((threadIdx.x == 0) && (row == 1)) { + //Now need to do the skip-skip transition, which is serial. 
+ if (threadIdx.x == 0){ + for (int blkidx = 2;blkidx <= blockDim.x; blkidx++){ + //calculate the skipscore using the previous + //Current skip score for block blkidx: + float curSkipScore = prevProbabilities[blkidx * PSR9_NUM_STATES + PSR9_KMER_SKIP]; + printf("Current skip score for block %i is %f",blkidx, curSkipScore); + //new score to add - TODO: use the correct lp_kk score + + HMT_FROM_PREV_K = lp_kk + newSkipScore; + newSkipScore = logsumexpf(curSkipScore, HMT_FROM_PREV_K); + //add it + prevProbabilities[blkidx * PSR9_NUM_STATES + PSR9_KMER_SKIP] = newSkipScore; + } + } + + // Now do the end state + __syncthreads(); + + if ((threadIdx.x == 1) && (row == 1)){ + printf("rank %i\n", rank); + printf("event mean %f\n", event_mean); + printf("poreModelLevelLogStdv %f\n", poreModelLevelLogStdv); + printf("poreModelLevelStdv %f\n", poreModelLevelStdv); + printf("poreModelLevelMean %f\n", poreModelLevelMean); + printf("lp_emission_m is %f\n", lp_emission_m); + printf("PSR9_MATCH is %i\n", PSR9_MATCH); + printf(">GPU score HMT_FROM_SAME_M is %f\n", HMT_FROM_SAME_M); + printf(">GPU score HMT_FROM_PREV_M is %f\n", HMT_FROM_PREV_M); + printf(">GPU score HMT_FROM_SAME_B is %f\n", HMT_FROM_SAME_B); + printf(">GPU score HMT_FROM_PREV_B is %f\n", HMT_FROM_PREV_B); + printf(">GPU score HMT_FROM_PREV_K is %f\n", HMT_FROM_PREV_K); + printf(">GPU newSkipScore is %f\n", newSkipScore); + } + + + if ((threadIdx.x == 0) && (row == 3)) { printf("Number of states is %i\n", n_states); for (int c = 0; c < n_states; c++) { - printf("GPU> Value for row 1 and col %i is %f\n", c, prevProbabilities[c]); + printf("GPU> Value for row 3 and col %i is %f\n", c, prevProbabilities[c]); } } } diff --git a/src/hmm/nanopolish_profile_hmm_r9.inl b/src/hmm/nanopolish_profile_hmm_r9.inl index d15161fe..d8738101 100644 --- a/src/hmm/nanopolish_profile_hmm_r9.inl +++ b/src/hmm/nanopolish_profile_hmm_r9.inl @@ -326,6 +326,16 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, // 
Fill in matrix for(uint32_t row = 1; row < output.get_num_rows(); row++) { + printf("======\n"); + //diagnostics - after match and bad event have been applied + if (row == 4) { // row 1 has been computed so we can have a peek + auto nc = output.get_num_columns(); + int rw = 3; + for (int i = 0; i < nc; i++) { + printf("CPU> Value for row %i col %i is %f\n", rw, i, output.get(rw, i)); + } + } + // Skip the first block which is the start state, it was initialized above // Similarily skip the last block, which is calculated in the terminate() function for(uint32_t block = 1; block < num_blocks - 1; block++) { @@ -364,38 +374,16 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, output.update_cell(row, curr_block_offset + PSR9_MATCH, scores, lp_emission_m); - printf("======\n"); - //diagnostics - after match has been applied - if (row == 1) { - auto nc = output.get_num_columns(); - //for (int i = 0; i < nc; i++) { - // printf("CPU> Value for row 0 col %i is %f\n", i, output.get(0, i)); - //} - for (int i = 0; i < nc; i++) { - printf("CPU> Value for row 1 col %i is %f\n", i, output.get(1, i)); - } - } - - - - // state PSR9_BAD_EVENT + // state PSR9_BAD_EVENT scores.x[HMT_FROM_SAME_M] = bt.lp_mb + output.get(row - 1, curr_block_offset + PSR9_MATCH); scores.x[HMT_FROM_PREV_M] = -INFINITY; // not allowed scores.x[HMT_FROM_SAME_B] = bt.lp_bb + output.get(row - 1, curr_block_offset + PSR9_BAD_EVENT); scores.x[HMT_FROM_PREV_B] = -INFINITY; scores.x[HMT_FROM_PREV_K] = -INFINITY; scores.x[HMT_FROM_SOFT] = -INFINITY; + printf("before: %f:\n", output.get(row, curr_block_offset + PSR9_BAD_EVENT)); output.update_cell(row, curr_block_offset + PSR9_BAD_EVENT, scores, lp_emission_b); - - if ((block == 1) && (row == 1)){ //blcok 1 corresponds to threadIdx 0 on GPU - printf("lp_emission_m is %f\n", lp_emission_m); - printf("PSR9_MATCH is %i\n", PSR9_MATCH); - printf(">CPU score HMT_FROM_SAME_M is %f\n", scores.x[HMT_FROM_SAME_M]); - printf(">CPU score 
HMT_FROM_PREV_M is %f\n", scores.x[HMT_FROM_PREV_M]); - printf(">CPU score HMT_FROM_SAME_B is %f\n", scores.x[HMT_FROM_SAME_B]); - printf(">CPU score HMT_FROM_PREV_B is %f\n", scores.x[HMT_FROM_PREV_B]); - printf(">CPU score HMT_FROM_PREV_K is %f\n", scores.x[HMT_FROM_PREV_K]); - } + printf("after: %f:\n", output.get(row, curr_block_offset + PSR9_BAD_EVENT)); // in cu this is where the shared memory sync on prev states would go. // state PSR9_KMER_SKIP @@ -407,6 +395,16 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, scores.x[HMT_FROM_SOFT] = -INFINITY; output.update_cell(row, curr_block_offset + PSR9_KMER_SKIP, scores, 0.0f); // no emission + if ((block == 1) && (row == 1)){ //blcok 1 corresponds to threadIdx 0 on GPU + printf("lp_emission_m is %f\n", lp_emission_m); + printf("PSR9_MATCH is %i\n", PSR9_MATCH); + printf(">CPU score HMT_FROM_SAME_M is %f\n", scores.x[HMT_FROM_SAME_M]); + printf(">CPU score HMT_FROM_PREV_M is %f\n", scores.x[HMT_FROM_PREV_M]); + printf(">CPU score HMT_FROM_SAME_B is %f\n", scores.x[HMT_FROM_SAME_B]); + printf(">CPU score HMT_FROM_PREV_B is %f\n", scores.x[HMT_FROM_PREV_B]); + printf(">CPU score HMT_FROM_PREV_K is %f\n", scores.x[HMT_FROM_PREV_K]); + } + // If POST_CLIP is enabled we allow the last kmer to transition directly // to the end after any event. Otherwise we only allow it from the // last kmer/event match. 
From 46e6ead916c8293ff0f93e197996360d6cf6c125 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Mon, 25 Jun 2018 17:34:34 +0100 Subject: [PATCH 10/80] diagnosing issue --- src/cuda_kernels/GpuAligner.cu | 95 ++++++++++++++++++++++----- src/hmm/nanopolish_emissions.h | 5 +- src/hmm/nanopolish_profile_hmm_r9.inl | 27 ++++---- src/nanopolish_call_variants.cpp | 3 + 4 files changed, 101 insertions(+), 29 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 3bc3bcdf..3cef2919 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -20,17 +20,23 @@ __device__ float lp_match_r9(int rank, float * poreModelLevelLogStdv, float * poreModelLevelStdv, float * poreModelLevelMean, + float scale, + float shift, + float var, + float logVar, bool debug = false){ float log_inv_sqrt_2pi = log(0.3989422804014327); // no need to calculate this every time. better solutions available.. // STEP 1: GET DRIFT-SCALED LEVEL: - float level = mean; //TODO: Do actual drift scaling. 
this is a cheat + float level = mean; // TODO: Apply scaling to these 3 model values as is done in the CPP implementation //these can just be pulled from the model - float gaussian_mean = poreModelLevelMean[rank]; - float gaussian_stdv = poreModelLevelStdv[rank]; - float gaussian_log_level_stdv = poreModelLevelLogStdv[rank]; + + float gaussian_mean = scale * poreModelLevelMean[rank] + shift; + float gaussian_stdv = poreModelLevelStdv[rank] * var; + float gaussian_log_level_stdv = poreModelLevelLogStdv[rank] + logVar; + // Step 3: calculate log-normal PDF float a = (level - gaussian_mean) / gaussian_stdv; // g is the gaussian parameters @@ -63,6 +69,10 @@ __global__ void getScores(float * eventData, float * poreModelLevelLogStdv, float * poreModelLevelStdv, float * poreModelLevelMean, + float * scaleDev, + float * shiftDev, + float * varDev, + float * logVarDev, float * returnValues) { // Initialise the prev probability row, which is the row of the DP table @@ -140,11 +150,18 @@ __global__ void getScores(float * eventData, // the penalty is controlled by the transition probability float BAD_EVENT_PENALTY = 0.0f; + float scale = scaleDev[readIdx]; + float shift = shiftDev[readIdx]; + float var = varDev[readIdx]; + float logVar = logVarDev[readIdx]; + for(int row=1; row lp_emission_m %f\n", lp_emission_m); + printf("GPU> level being used to calculate emission: %f\n", event_mean); + } float lp_emission_b = BAD_EVENT_PENALTY; // Get all the scores for a match @@ -250,7 +276,7 @@ __global__ void getScores(float * eventData, //calculate the skipscore using the previous //Current skip score for block blkidx: float curSkipScore = prevProbabilities[blkidx * PSR9_NUM_STATES + PSR9_KMER_SKIP]; - printf("Current skip score for block %i is %f",blkidx, curSkipScore); + //printf("Current skip score for block %i is %f",blkidx, curSkipScore); //new score to add - TODO: use the correct lp_kk score HMT_FROM_PREV_K = lp_kk + newSkipScore; @@ -266,9 +292,9 @@ __global__ void 
getScores(float * eventData, if ((threadIdx.x == 1) && (row == 1)){ printf("rank %i\n", rank); printf("event mean %f\n", event_mean); - printf("poreModelLevelLogStdv %f\n", poreModelLevelLogStdv); - printf("poreModelLevelStdv %f\n", poreModelLevelStdv); - printf("poreModelLevelMean %f\n", poreModelLevelMean); + printf("poreModelLevelLogStdv %f\n", poreModelLevelLogStdv[0]); + printf("poreModelLevelStdv %f\n", poreModelLevelStdv[0]); + printf("poreModelLevelMean %f\n", poreModelLevelMean[0]); printf("lp_emission_m is %f\n", lp_emission_m); printf("PSR9_MATCH is %i\n", PSR9_MATCH); printf(">GPU score HMT_FROM_SAME_M is %f\n", HMT_FROM_SAME_M); @@ -280,10 +306,10 @@ __global__ void getScores(float * eventData, } - if ((threadIdx.x == 0) && (row == 3)) { + if ((threadIdx.x == 0) && (row == 1)) { printf("Number of states is %i\n", n_states); for (int c = 0; c < n_states; c++) { - printf("GPU> Value for row 3 and col %i is %f\n", c, prevProbabilities[c]); + printf("GPU> Value for row %i and col %i is %f\n",row, c, prevProbabilities[c]); } } } @@ -389,7 +415,9 @@ double scoreKernel(std::vector sequences, eventOffsets.push_back(offset); size_t num_events = ev.read->events->size(); for (int i=0;ievents[0][i].mean; //taking the first element. Not sure what the second one is.. + auto scaled = ev.read->get_drift_scaled_level(i, ev.strand); // send the data in drift scaled + //auto unscaled = ev.read->events[0][i].mean; //taking the first element. Not sure what the second one is.. + eventMeans[offset + i] = scaled; } offset += num_events; } @@ -400,13 +428,41 @@ double scoreKernel(std::vector sequences, std::vector pore_model_level_mean(num_states); std::vector pore_model_level_stdv(num_states); + //TODO: Fix this. 
for(int st=0; ststates[0]; //let's just initially get the params for AAAAAA + auto params = event_sequences[0].pore_model->states[st]; //let's just initially get the params for AAAAAA pore_model_level_log_stdv[st] = params.level_log_stdv; pore_model_level_mean[st] = params.level_mean; pore_model_level_stdv[st] = params.level_stdv; } + std::vector scale(num_reads); + std::vector shift(num_reads); + std::vector var(num_reads); + std::vector log_var(num_reads); + + for (int i=0;iscalings->scale; + shift[i] = event_sequences[i].read->scalings->shift; + var[i] = event_sequences[i].read->scalings->var; + log_var[i] = event_sequences[i].read->scalings->log_var; + } + + float* scaleDev; + float* shiftDev; + float* varDev; + float* logVarDev; + + cudaMalloc( (void**)&scaleDev, scale.size() * sizeof(float)); + cudaMalloc( (void**)&shiftDev, shift.size() * sizeof(float)); + cudaMalloc( (void**)&varDev, var.size() * sizeof(float)); + cudaMalloc( (void**)&logVarDev, log_var.size() * sizeof(float)); + + cudaMemcpyAsync( scaleDev, scale.data(), scale.size() * sizeof(float), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( shiftDev, shift.data(), shift.size() * sizeof(float), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( varDev, var.data(), var.size() * sizeof(float), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( logVarDev, log_var.data(), log_var.size() * sizeof(float), cudaMemcpyHostToDevice ); + float* poreModelLevelLogStdvDev; cudaMalloc( (void**)&poreModelLevelLogStdvDev, pore_model_level_log_stdv.size() * sizeof(float)); @@ -455,8 +511,8 @@ double scoreKernel(std::vector sequences, int num_blocks = n_states / PSR9_NUM_STATES; uint32_t num_kmers = num_blocks - 2; // two terminal blocks - dim3 dimBlock(num_blocks - 2); - dim3 dimGrid(1); // One thread per state, not including Start and Terminal state. + dim3 dimBlock(num_blocks - 2); // One thread per state, not including Start and Terminal state. 
+ dim3 dimGrid(1); // Only looking at first event at the moment float * returnValues; cudaMalloc((void **) &returnValues, sizeof(float) * num_reads); //one score per read @@ -476,6 +532,10 @@ double scoreKernel(std::vector sequences, poreModelLevelLogStdvDev, poreModelLevelStdvDev, poreModelLevelMeanDev, + scaleDev, + shiftDev, + varDev, + logVarDev, returnValues); //cudaDeviceSynchronize(); @@ -493,6 +553,10 @@ double scoreKernel(std::vector sequences, cudaFree(poreModelLevelLogStdvDev); cudaFree(poreModelLevelStdvDev); cudaFree(poreModelLevelMeanDev); + cudaFree(scaleDev); + cudaFree(shiftDev); + cudaFree(varDev); + cudaFree(logVarDev); //Free host memory cudaFreeHost(eventMeans); @@ -525,6 +589,7 @@ std::vector GpuAligner::variantScoresThresholded(std::vector in std::vector base_sequences = generate_methylated_alternatives(base_haplotype.get_sequence(), methylation_types); std::vector> variant_sequences; + //for (auto v: variant_haplotypes){ // auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types); // variant_sequences.push_back(variant_sequence); diff --git a/src/hmm/nanopolish_emissions.h b/src/hmm/nanopolish_emissions.h index 599a24fb..7b5c8108 100644 --- a/src/hmm/nanopolish_emissions.h +++ b/src/hmm/nanopolish_emissions.h @@ -63,9 +63,12 @@ inline float log_probability_match_r9(const SquiggleRead& read, { // event level mean, scaled with the drift value float level = read.get_drift_scaled_level(event_idx, strand); - + if (debug == true){ + printf("Level being used to calculate emission: %f\n", level); + } GaussianParameters gp = read.get_scaled_gaussian_from_pore_model_state(pore_model, strand, kmer_rank); if (debug == true) { + printf(">CPU Strand is: %i\n", strand); printf(">CPU kmer_rank is: %i\n", kmer_rank); printf(">CPU level is: %f\n", level); printf(">CPU gaussian mean: %f\n", gp.mean); diff --git a/src/hmm/nanopolish_profile_hmm_r9.inl b/src/hmm/nanopolish_profile_hmm_r9.inl index d8738101..bc0235c6 100644 
--- a/src/hmm/nanopolish_profile_hmm_r9.inl +++ b/src/hmm/nanopolish_profile_hmm_r9.inl @@ -326,11 +326,11 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, // Fill in matrix for(uint32_t row = 1; row < output.get_num_rows(); row++) { - printf("======\n"); + //printf("======\n"); //diagnostics - after match and bad event have been applied if (row == 4) { // row 1 has been computed so we can have a peek auto nc = output.get_num_columns(); - int rw = 3; + int rw = 1; for (int i = 0; i < nc; i++) { printf("CPU> Value for row %i col %i is %f\n", rw, i, output.get(rw, i)); } @@ -352,6 +352,7 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, uint32_t event_idx = e_start + (row - 1) * data.event_stride; uint32_t rank = kmer_ranks[kmer_idx]; float lp_emission_m = log_probability_match_r9(*data.read, *data.pore_model, rank, event_idx, data.strand, true); + printf("CPU> lp_emission_m %f\n", lp_emission_m); float lp_emission_b = BAD_EVENT_PENALTY; HMMUpdateScores scores; @@ -381,9 +382,9 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, scores.x[HMT_FROM_PREV_B] = -INFINITY; scores.x[HMT_FROM_PREV_K] = -INFINITY; scores.x[HMT_FROM_SOFT] = -INFINITY; - printf("before: %f:\n", output.get(row, curr_block_offset + PSR9_BAD_EVENT)); + //printf("before: %f:\n", output.get(row, curr_block_offset + PSR9_BAD_EVENT)); output.update_cell(row, curr_block_offset + PSR9_BAD_EVENT, scores, lp_emission_b); - printf("after: %f:\n", output.get(row, curr_block_offset + PSR9_BAD_EVENT)); + //printf("after: %f:\n", output.get(row, curr_block_offset + PSR9_BAD_EVENT)); // in cu this is where the shared memory sync on prev states would go. 
// state PSR9_KMER_SKIP @@ -395,15 +396,15 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, scores.x[HMT_FROM_SOFT] = -INFINITY; output.update_cell(row, curr_block_offset + PSR9_KMER_SKIP, scores, 0.0f); // no emission - if ((block == 1) && (row == 1)){ //blcok 1 corresponds to threadIdx 0 on GPU - printf("lp_emission_m is %f\n", lp_emission_m); - printf("PSR9_MATCH is %i\n", PSR9_MATCH); - printf(">CPU score HMT_FROM_SAME_M is %f\n", scores.x[HMT_FROM_SAME_M]); - printf(">CPU score HMT_FROM_PREV_M is %f\n", scores.x[HMT_FROM_PREV_M]); - printf(">CPU score HMT_FROM_SAME_B is %f\n", scores.x[HMT_FROM_SAME_B]); - printf(">CPU score HMT_FROM_PREV_B is %f\n", scores.x[HMT_FROM_PREV_B]); - printf(">CPU score HMT_FROM_PREV_K is %f\n", scores.x[HMT_FROM_PREV_K]); - } + //if ((block == 1) && (row == 1)){ //blcok 1 corresponds to threadIdx 0 on GPU + // printf("lp_emission_m is %f\n", lp_emission_m); + // printf("PSR9_MATCH is %i\n", PSR9_MATCH); + // printf(">CPU score HMT_FROM_SAME_M is %f\n", scores.x[HMT_FROM_SAME_M]); + // printf(">CPU score HMT_FROM_PREV_M is %f\n", scores.x[HMT_FROM_PREV_M]); + // printf(">CPU score HMT_FROM_SAME_B is %f\n", scores.x[HMT_FROM_SAME_B]); + // printf(">CPU score HMT_FROM_PREV_B is %f\n", scores.x[HMT_FROM_PREV_B]); + // printf(">CPU score HMT_FROM_PREV_K is %f\n", scores.x[HMT_FROM_PREV_K]); + //} // If POST_CLIP is enabled we allow the last kmer to transition directly // to the end after any event. Otherwise we only allow it from the diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 97319c76..ec289603 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -349,9 +349,12 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali GpuAligner aligner; auto t0_gpu = std::chrono::high_resolution_clock::now(); + // get the scaled levels. 
+ std::vector scores = aligner.variantScoresThresholded(tmp_variants, test_haplotype, event_sequences, alignment_flags, opt::screen_score_threshold, opt::methylation_types); + auto tf_gpu = std::chrono::high_resolution_clock::now(); gpu_exec += tf_gpu - t0_gpu; From e148c87f54c145cfcfddc2d4d44f4679be98ab94 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Wed, 27 Jun 2018 13:22:24 +0100 Subject: [PATCH 11/80] Dynamic Programming Table the same for GPU and CPU except end --- src/cuda_kernels/GpuAligner.cu | 88 +++++++++++++++++++-------- src/hmm/nanopolish_emissions.h | 1 + src/hmm/nanopolish_profile_hmm_r9.inl | 14 +---- 3 files changed, 66 insertions(+), 37 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 3cef2919..204078a4 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -73,6 +73,7 @@ __global__ void getScores(float * eventData, float * shiftDev, float * varDev, float * logVarDev, + float * preFlankingDev, float * returnValues) { // Initialise the prev probability row, which is the row of the DP table @@ -91,9 +92,6 @@ __global__ void getScores(float * eventData, int e_stride = eventStrides[readIdx]; int e_offset = eventOffsets[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event - //float levelLogStdv = poreModelLevelLogStdv[e_offset]; - //float levelStdv = poreModelLevelStdv[e_offset]; - //float levelMean = poreModelLevelMean[e_offset]; if (threadIdx.x == 0){ printf(">GPU e_start %i\n", e_start); @@ -158,12 +156,12 @@ __global__ void getScores(float * eventData, for(int row=1; row lp_emission_m %f\n", lp_emission_m); - printf("GPU> level being used to calculate emission: %f\n", event_mean); - } + float lp_emission_b = BAD_EVENT_PENALTY; // Get all the scores for a match @@ -192,12 +186,18 @@ __global__ void getScores(float * eventData, float HMT_FROM_PREV_B = lp_bm_next + prevProbabilities[prevBlockOffset + PSR9_BAD_EVENT]; float 
HMT_FROM_PREV_K = lp_km + prevProbabilities[prevBlockOffset + PSR9_KMER_SKIP]; + + // m_s is the probability of going from the start state // to this kmer. The start state is (currently) only // allowed to go to the first kmer. If ALLOW_PRE_CLIP // is defined, we allow all events before this one to be skipped, // with a penalty; - float HMT_FROM_SOFT = (kmerIdx == 0 && (event_idx == e_start)) ? lp_sm : -INFINITY; // TODO: Add the pre-flank to this calculation. Also flags and HAF_ALLOW_PRE_CLIP + float HMT_FROM_SOFT = (kmerIdx == 0 && + (event_idx == e_start || + (HAF_ALLOW_PRE_CLIP))) ? lp_sm + preFlank : -INFINITY; // TEST! TODO: Add the pre-flank to this calculation. Also flags and HAF_ALLOW_PRE_CLIP. For now this is left out and should not have a big effect + + // calculate the score float sum = HMT_FROM_SAME_M; @@ -226,6 +226,18 @@ __global__ void getScores(float * eventData, float newMatchScore = sum; // Here need to calculate the bad event score + if (debug==true){ + printf("GPU> lp_emission_m for row %i and thread %i %f\n", row, threadIdx.x, lp_emission_m); + printf("GPU> level being used to calculate emission for thread 0: %f\n", event_mean); + printf("GPU> match score for row %i and thread %i %f\\n\", row, threadIdx.x", newMatchScore); + printf("GPU> HMT_FROM_SAME_M: %f\n", HMT_FROM_SAME_M); + printf("GPU> HMT_FROM_PREV_M: %f\n", HMT_FROM_PREV_M); + printf("GPU> HMT_FROM_SAME_B: %f\n", HMT_FROM_SAME_B); + printf("GPU> HMT_FROM_PREV_B: %f\n", HMT_FROM_PREV_B); + printf("GPU> HMT_FROM_PREV_K: %f\n", HMT_FROM_PREV_K); + printf("GPU> HMT_FROM_SOFT: %f\n", HMT_FROM_SOFT); + + } // state PSR9_BAD_EVENT HMT_FROM_SAME_M = lp_mb + prevProbabilities[curBlockOffset + PSR9_MATCH]; HMT_FROM_PREV_M = -INFINITY; // not allowed @@ -244,7 +256,7 @@ __global__ void getScores(float * eventData, float newBadEventScore = sum; - // Write row out + // Write row out. prevProbabilities now becomes "current probabilities" for evaluating skips. 
prevProbabilities[curBlockOffset + PSR9_MATCH] = newMatchScore; prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] = newBadEventScore; __syncthreads(); @@ -254,7 +266,6 @@ __global__ void getScores(float * eventData, HMT_FROM_PREV_M = lp_mk + prevProbabilities[prevBlockOffset + PSR9_MATCH]; HMT_FROM_SAME_B = -INFINITY; HMT_FROM_PREV_B = lp_bk + prevProbabilities[prevBlockOffset + PSR9_BAD_EVENT]; - HMT_FROM_SOFT = -INFINITY; sum = HMT_FROM_SAME_M; @@ -273,23 +284,26 @@ __global__ void getScores(float * eventData, //Now need to do the skip-skip transition, which is serial. if (threadIdx.x == 0){ for (int blkidx = 2;blkidx <= blockDim.x; blkidx++){ + auto skipIdx = blkidx * PSR9_NUM_STATES + PSR9_KMER_SKIP; //calculate the skipscore using the previous //Current skip score for block blkidx: - float curSkipScore = prevProbabilities[blkidx * PSR9_NUM_STATES + PSR9_KMER_SKIP]; + float prevSkipScore = prevProbabilities[skipIdx - PSR9_NUM_STATES]; + float curSkipScore = prevProbabilities[skipIdx]; //printf("Current skip score for block %i is %f",blkidx, curSkipScore); //new score to add - TODO: use the correct lp_kk score - HMT_FROM_PREV_K = lp_kk + newSkipScore; + HMT_FROM_PREV_K = lp_kk + prevSkipScore; newSkipScore = logsumexpf(curSkipScore, HMT_FROM_PREV_K); //add it - prevProbabilities[blkidx * PSR9_NUM_STATES + PSR9_KMER_SKIP] = newSkipScore; + prevProbabilities[skipIdx] = newSkipScore; + __syncthreads(); } } // Now do the end state __syncthreads(); - if ((threadIdx.x == 1) && (row == 1)){ + if ((threadIdx.x == 0) && (row == 3)){ printf("rank %i\n", rank); printf("event mean %f\n", event_mean); printf("poreModelLevelLogStdv %f\n", poreModelLevelLogStdv[0]); @@ -306,7 +320,7 @@ __global__ void getScores(float * eventData, } - if ((threadIdx.x == 0) && (row == 1)) { + if ((threadIdx.x == 0) && (row == 3)) { printf("Number of states is %i\n", n_states); for (int c = 0; c < n_states; c++) { printf("GPU> Value for row %i and col %i is %f\n",row, c, prevProbabilities[c]); 
@@ -360,7 +374,10 @@ double scoreKernel(std::vector sequences, std::vector n_rows; //number of rows in the DP table (n_events + 1) std::vector e_starts; //event starts - std::vector event_strides; + std::vector event_strides; + + std::vector> pre_flanks; + std::vector> post_flanks; for(auto e: event_sequences){ uint32_t e_start = e.event_start_idx; @@ -377,6 +394,12 @@ double scoreKernel(std::vector sequences, n_events = e_start - e_end + 1; n_rows.push_back(n_events + 1); + + std::vector pre_flank = make_pre_flanking(e, e_start, n_events); + std::vector post_flank = make_post_flanking(e, e_start, n_events); + + pre_flanks.push_back(pre_flank); + post_flanks.push_back(post_flank); } std::vector kmer_ranks(n_kmers); @@ -404,20 +427,28 @@ double scoreKernel(std::vector sequences, numEventsTotal += numEvents; } - //Allocate a host buffer to store the event means + + //Allocate a host buffer to store the event means, pre and post-flank data float * eventMeans; size_t eventMeansSize = numEventsTotal * sizeof(float); cudaHostAlloc(&eventMeans, eventMeansSize , cudaHostAllocDefault); + //Allocate a host buffer to store the event means, pre and post-flank data + float * preFlankingHost; + cudaHostAlloc(&preFlankingHost, numEventsTotal * sizeof(float) , cudaHostAllocDefault); + std::vector eventOffsets; size_t offset = 0; - for (auto ev: event_sequences){ + for(int j=0;jevents->size(); for (int i=0;iget_drift_scaled_level(i, ev.strand); // send the data in drift scaled + auto event_idx = e_starts[j] + i * event_strides[0]; + auto scaled = ev.read->get_drift_scaled_level(event_idx, ev.strand); // send the data in drift scaled //auto unscaled = ev.read->events[0][i].mean; //taking the first element. Not sure what the second one is.. 
eventMeans[offset + i] = scaled; + preFlankingHost[offset + i] = pre_flanks[j][i]; //also copy over the pre-flanking data, since it has a 1-1 correspondence with events } offset += num_events; } @@ -485,6 +516,10 @@ double scoreKernel(std::vector sequences, cudaMalloc( (void**)&eventMeansDev, eventMeansSize); cudaMemcpyAsync( eventMeansDev, eventMeans, eventMeansSize, cudaMemcpyHostToDevice ); //malloc is taking 300us + float* preFlankingDev; + cudaMalloc( (void**)&preFlankingDev, eventMeansSize); + cudaMemcpyAsync( preFlankingDev, preFlankingHost, eventMeansSize, cudaMemcpyHostToDevice ); //malloc is taking 300us + int* numRowsDev; cudaMalloc( (void**)&numRowsDev, n_rows.size() * sizeof(int)); cudaMemcpyAsync( numRowsDev, n_rows.data(), n_rows.size() * sizeof(int), cudaMemcpyHostToDevice ); @@ -536,6 +571,7 @@ double scoreKernel(std::vector sequences, shiftDev, varDev, logVarDev, + preFlankingDev, returnValues); //cudaDeviceSynchronize(); @@ -557,6 +593,8 @@ double scoreKernel(std::vector sequences, cudaFree(shiftDev); cudaFree(varDev); cudaFree(logVarDev); + cudaFree(preFlankingDev); + //Free host memory cudaFreeHost(eventMeans); diff --git a/src/hmm/nanopolish_emissions.h b/src/hmm/nanopolish_emissions.h index 7b5c8108..6069ac81 100644 --- a/src/hmm/nanopolish_emissions.h +++ b/src/hmm/nanopolish_emissions.h @@ -68,6 +68,7 @@ inline float log_probability_match_r9(const SquiggleRead& read, } GaussianParameters gp = read.get_scaled_gaussian_from_pore_model_state(pore_model, strand, kmer_rank); if (debug == true) { + printf(">Event IDX is: %i\n", event_idx); printf(">CPU Strand is: %i\n", strand); printf(">CPU kmer_rank is: %i\n", kmer_rank); printf(">CPU level is: %f\n", level); diff --git a/src/hmm/nanopolish_profile_hmm_r9.inl b/src/hmm/nanopolish_profile_hmm_r9.inl index bc0235c6..4728b680 100644 --- a/src/hmm/nanopolish_profile_hmm_r9.inl +++ b/src/hmm/nanopolish_profile_hmm_r9.inl @@ -330,7 +330,7 @@ inline float profile_hmm_fill_generic_r9(const 
HMMInputSequence& _sequence, //diagnostics - after match and bad event have been applied if (row == 4) { // row 1 has been computed so we can have a peek auto nc = output.get_num_columns(); - int rw = 1; + int rw = 3; for (int i = 0; i < nc; i++) { printf("CPU> Value for row %i col %i is %f\n", rw, i, output.get(rw, i)); } @@ -372,7 +372,7 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, scores.x[HMT_FROM_SOFT] = (kmer_idx == 0 && (event_idx == e_start || (flags & HAF_ALLOW_PRE_CLIP))) ? lp_sm + pre_flank[row - 1] : -INFINITY; - + printf("lp_emission_m is %f\n", lp_emission_m); output.update_cell(row, curr_block_offset + PSR9_MATCH, scores, lp_emission_m); // state PSR9_BAD_EVENT @@ -396,16 +396,6 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, scores.x[HMT_FROM_SOFT] = -INFINITY; output.update_cell(row, curr_block_offset + PSR9_KMER_SKIP, scores, 0.0f); // no emission - //if ((block == 1) && (row == 1)){ //blcok 1 corresponds to threadIdx 0 on GPU - // printf("lp_emission_m is %f\n", lp_emission_m); - // printf("PSR9_MATCH is %i\n", PSR9_MATCH); - // printf(">CPU score HMT_FROM_SAME_M is %f\n", scores.x[HMT_FROM_SAME_M]); - // printf(">CPU score HMT_FROM_PREV_M is %f\n", scores.x[HMT_FROM_PREV_M]); - // printf(">CPU score HMT_FROM_SAME_B is %f\n", scores.x[HMT_FROM_SAME_B]); - // printf(">CPU score HMT_FROM_PREV_B is %f\n", scores.x[HMT_FROM_PREV_B]); - // printf(">CPU score HMT_FROM_PREV_K is %f\n", scores.x[HMT_FROM_PREV_K]); - //} - // If POST_CLIP is enabled we allow the last kmer to transition directly // to the end after any event. Otherwise we only allow it from the // last kmer/event match. 
From 6dddd4853439d1e99b7d7de6fcf2f56fef9a8491 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Mon, 2 Jul 2018 14:30:57 +0100 Subject: [PATCH 12/80] first two base scores correct, bug for other ones --- src/cuda_kernels/GpuAligner.cu | 174 +++++++++++++------------- src/hmm/nanopolish_profile_hmm_r9.inl | 12 +- src/nanopolish_call_variants.cpp | 4 +- 3 files changed, 98 insertions(+), 92 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 204078a4..aa59da03 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -14,7 +14,6 @@ __device__ float logsumexpf(float x, float y){ return result; } -//TODO: Implement, inc pore model __device__ float lp_match_r9(int rank, float mean, float * poreModelLevelLogStdv, @@ -42,18 +41,6 @@ __device__ float lp_match_r9(int rank, float emission = log_inv_sqrt_2pi - gaussian_log_level_stdv + (-0.5f * a * a); // log_inv_sqrt_2pi is defined in a comment above - if (debug == true) { - if (threadIdx.x == 0) { - printf(">GPU: kmer rank is %i\n", rank); - printf(">GPU: level %f\n", level); - printf(">GPU: gaussian mean %f\n", gaussian_mean); - printf(">GPU: gaussian stdv %f\n", gaussian_stdv); - printf(">GPU: gaussian log level stdv %f\n", gaussian_log_level_stdv); - printf(">GPU a: %f\n", a); - printf(">GPU emission: %f\n", emission); - } - } - return emission; // log_inv_sqrt_2pi is defined in a comment above } @@ -74,15 +61,25 @@ __global__ void getScores(float * eventData, float * varDev, float * logVarDev, float * preFlankingDev, + float * postFlankingDev, float * returnValues) { // Initialise the prev probability row, which is the row of the DP table + int n_kmers = blockDim.x; // Question: How does this deal with the case where the block is bigger than the sequence, such as if one variant is a deletion? + int n_states = n_kmers * PSR9_NUM_STATES + 2 * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. 
+ + //initialise the return value + returnValues[blockIdx.x] = -INFINITY; - int n_states = blockDim.x * PSR9_NUM_STATES + 2 * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. __shared__ float prevProbabilities[MAX_STATES]; - for (int i = 0; i < n_states; i++) { + + // Initialise the previous probabilities + for (int i = 0; i < n_states - PSR9_NUM_STATES; i++) { prevProbabilities[i] = -INFINITY; } + for (int i = n_states - PSR9_NUM_STATES; i < n_states; i++) { + prevProbabilities[i] = 0; // Is this correct? + } //Step 1: calculate transitions. For now we are going to use external params. int readIdx = blockIdx.x; @@ -92,14 +89,8 @@ __global__ void getScores(float * eventData, int e_stride = eventStrides[readIdx]; int e_offset = eventOffsets[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event - - if (threadIdx.x == 0){ - printf(">GPU e_start %i\n", e_start); - } - int kmerIdx = threadIdx.x; uint32_t rank = kmer_ranks[kmerIdx]; // lexical rank of a kmer - printf("Kmer idx %i, Rank: %i\n", kmerIdx, rank); float p_stay = 1 - (1 / read_events_per_base); float p_skip = 0.0025; @@ -158,10 +149,11 @@ __global__ void getScores(float * eventData, int event_idx = e_start + (row - 1) * e_stride; float event_mean = eventData[e_offset + row - 1]; float preFlank = preFlankingDev[e_offset + row - 1]; + float postFlank = postFlankingDev[e_offset + row - 1]; bool debug = false; - if (threadIdx.x == 0 && row == 3){ + if (threadIdx.x == 0 && (row == numRows -1) && blockIdx.x == 0){ debug = true; } @@ -195,49 +187,22 @@ __global__ void getScores(float * eventData, // with a penalty; float HMT_FROM_SOFT = (kmerIdx == 0 && (event_idx == e_start || - (HAF_ALLOW_PRE_CLIP))) ? lp_sm + preFlank : -INFINITY; // TEST! TODO: Add the pre-flank to this calculation. Also flags and HAF_ALLOW_PRE_CLIP. For now this is left out and should not have a big effect - - + (HAF_ALLOW_PRE_CLIP))) ? 
lp_sm + preFlank : -INFINITY; // TODO: Add flag for HAF ALLOW_PRE_CLIP // calculate the score float sum = HMT_FROM_SAME_M; - sum = logsumexpf(sum, HMT_FROM_SOFT); - if (debug == true){ - printf("Sum1 is : %f\n", sum); - } sum = logsumexpf(sum, HMT_FROM_PREV_M); - if (debug == true){ - printf("Sum2 is : %f\n", sum); - } - sum = logsumexpf(sum, HMT_FROM_SAME_B); sum = logsumexpf(sum, HMT_FROM_PREV_B); - if (debug == true){ - printf("Sum3 is : %f\n", sum); - } - sum = logsumexpf(sum, HMT_FROM_PREV_K); sum += lp_emission_m; - if (debug == true){ - printf("Sum4 is : %f\n", sum); - } + float newMatchScore = sum; // Here need to calculate the bad event score - if (debug==true){ - printf("GPU> lp_emission_m for row %i and thread %i %f\n", row, threadIdx.x, lp_emission_m); - printf("GPU> level being used to calculate emission for thread 0: %f\n", event_mean); - printf("GPU> match score for row %i and thread %i %f\\n\", row, threadIdx.x", newMatchScore); - printf("GPU> HMT_FROM_SAME_M: %f\n", HMT_FROM_SAME_M); - printf("GPU> HMT_FROM_PREV_M: %f\n", HMT_FROM_PREV_M); - printf("GPU> HMT_FROM_SAME_B: %f\n", HMT_FROM_SAME_B); - printf("GPU> HMT_FROM_PREV_B: %f\n", HMT_FROM_PREV_B); - printf("GPU> HMT_FROM_PREV_K: %f\n", HMT_FROM_PREV_K); - printf("GPU> HMT_FROM_SOFT: %f\n", HMT_FROM_SOFT); - } // state PSR9_BAD_EVENT HMT_FROM_SAME_M = lp_mb + prevProbabilities[curBlockOffset + PSR9_MATCH]; HMT_FROM_PREV_M = -INFINITY; // not allowed @@ -281,29 +246,52 @@ __global__ void getScores(float * eventData, prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] = newSkipScore; __syncthreads(); - //Now need to do the skip-skip transition, which is serial. + //Now need to do the skip-skip transition, which is serial so for now letting one thread execute it. 
if (threadIdx.x == 0){ for (int blkidx = 2;blkidx <= blockDim.x; blkidx++){ auto skipIdx = blkidx * PSR9_NUM_STATES + PSR9_KMER_SKIP; - //calculate the skipscore using the previous - //Current skip score for block blkidx: float prevSkipScore = prevProbabilities[skipIdx - PSR9_NUM_STATES]; float curSkipScore = prevProbabilities[skipIdx]; - //printf("Current skip score for block %i is %f",blkidx, curSkipScore); - //new score to add - TODO: use the correct lp_kk score - HMT_FROM_PREV_K = lp_kk + prevSkipScore; newSkipScore = logsumexpf(curSkipScore, HMT_FROM_PREV_K); - //add it prevProbabilities[skipIdx] = newSkipScore; __syncthreads(); } } + __syncthreads(); + + int lastKmerIdx = n_kmers -1; + int lastRowIdx = numRows -1; + float end; + // Now do the post-clip transition + if(kmerIdx == lastKmerIdx && ( (HAF_ALLOW_POST_CLIP) || row == lastRowIdx)) { + float lp1 = lp_ms + prevProbabilities[curBlockOffset + PSR9_MATCH] + postFlank; + float lp2 = lp_ms + prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] + postFlank; + float lp3 = lp_ms + prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] + postFlank; + + printf(">GPU Post-clip transition on row %i, read %i, threadIdx is %i\n" + "LP1=%f\n" + "LP2=%f\n" + "LP3=%f\n", + row, + blockIdx.x, + threadIdx.x, + lp1, + lp2, + lp3); + + end = returnValues[blockIdx.x]; + end = logsumexpf(end, lp1); + end = logsumexpf(end, lp2); + end = logsumexpf(end, lp3); + returnValues[blockIdx.x] = end; + } // Now do the end state __syncthreads(); - if ((threadIdx.x == 0) && (row == 3)){ + // DIAGNOSTIC + if (debug == true){ printf("rank %i\n", rank); printf("event mean %f\n", event_mean); printf("poreModelLevelLogStdv %f\n", poreModelLevelLogStdv[0]); @@ -317,10 +305,6 @@ __global__ void getScores(float * eventData, printf(">GPU score HMT_FROM_PREV_B is %f\n", HMT_FROM_PREV_B); printf(">GPU score HMT_FROM_PREV_K is %f\n", HMT_FROM_PREV_K); printf(">GPU newSkipScore is %f\n", newSkipScore); - } - - - if ((threadIdx.x == 0) && (row == 3)) { 
printf("Number of states is %i\n", n_states); for (int c = 0; c < n_states; c++) { printf("GPU> Value for row %i and col %i is %f\n",row, c, prevProbabilities[c]); @@ -328,8 +312,6 @@ __global__ void getScores(float * eventData, } } - - returnValues[blockIdx.x] = 0.356; __syncthreads(); } @@ -342,9 +324,9 @@ GpuAligner::GpuAligner() n[i] = i; } -double scoreKernel(std::vector sequences, - std::vector event_sequences, - uint32_t alignment_flags){ +std::vector scoreKernel(std::vector sequences, + std::vector event_sequences, + uint32_t alignment_flags){ // Extract the pore model. //Let's assume that every event sequence has the same pore model @@ -435,7 +417,9 @@ double scoreKernel(std::vector sequences, //Allocate a host buffer to store the event means, pre and post-flank data float * preFlankingHost; + float * postFlankingHost; cudaHostAlloc(&preFlankingHost, numEventsTotal * sizeof(float) , cudaHostAllocDefault); + cudaHostAlloc(&postFlankingHost, numEventsTotal * sizeof(float) , cudaHostAllocDefault); std::vector eventOffsets; size_t offset = 0; @@ -449,6 +433,7 @@ double scoreKernel(std::vector sequences, //auto unscaled = ev.read->events[0][i].mean; //taking the first element. Not sure what the second one is.. eventMeans[offset + i] = scaled; preFlankingHost[offset + i] = pre_flanks[j][i]; //also copy over the pre-flanking data, since it has a 1-1 correspondence with events + postFlankingHost[offset + i] = post_flanks[j][i]; //also copy over the pre-flanking data, since it has a 1-1 correspondence with events } offset += num_events; } @@ -461,7 +446,7 @@ double scoreKernel(std::vector sequences, //TODO: Fix this. for(int st=0; ststates[st]; //let's just initially get the params for AAAAAA + auto params = event_sequences[0].pore_model->states[st]; //TODO: Is this OK? 
pore_model_level_log_stdv[st] = params.level_log_stdv; pore_model_level_mean[st] = params.level_mean; pore_model_level_stdv[st] = params.level_stdv; @@ -520,6 +505,10 @@ double scoreKernel(std::vector sequences, cudaMalloc( (void**)&preFlankingDev, eventMeansSize); cudaMemcpyAsync( preFlankingDev, preFlankingHost, eventMeansSize, cudaMemcpyHostToDevice ); //malloc is taking 300us + float* postFlankingDev; + cudaMalloc( (void**)&postFlankingDev, eventMeansSize); + cudaMemcpyAsync( postFlankingDev, postFlankingHost, eventMeansSize, cudaMemcpyHostToDevice ); //malloc is taking 300us + int* numRowsDev; cudaMalloc( (void**)&numRowsDev, n_rows.size() * sizeof(int)); cudaMemcpyAsync( numRowsDev, n_rows.data(), n_rows.size() * sizeof(int), cudaMemcpyHostToDevice ); @@ -544,18 +533,18 @@ double scoreKernel(std::vector sequences, cudaMemcpyAsync( eventOffsetsDev, eventOffsets.data(), eventOffsets.size() * sizeof(int), cudaMemcpyHostToDevice ); int num_blocks = n_states / PSR9_NUM_STATES; - uint32_t num_kmers = num_blocks - 2; // two terminal blocks + uint32_t num_kmers = num_blocks - 2; // two terminal blocks. Not currently used but left here for now. dim3 dimBlock(num_blocks - 2); // One thread per state, not including Start and Terminal state. 
- dim3 dimGrid(1); // Only looking at first event at the moment + dim3 dimGrid(num_reads); // let's look at only the first read - float * returnValues; - cudaMalloc((void **) &returnValues, sizeof(float) * num_reads); //one score per read + float * returnValuesDev; + cudaMalloc((void **) &returnValuesDev, sizeof(float) * num_reads); //one score per read - float* returnedValues;// = new float[num_reads]; - //size_t eventMeansSize = numEventsTotal * sizeof(float); + float* returnedValues; cudaHostAlloc(&returnedValues, num_reads * sizeof(float) , cudaHostAllocDefault); + printf("About to run getscores...\n"); getScores<<>>(eventMeansDev, eventsPerBaseDev, numRowsDev, @@ -572,10 +561,11 @@ double scoreKernel(std::vector sequences, varDev, logVarDev, preFlankingDev, - returnValues); + postFlankingDev, + returnValuesDev); - //cudaDeviceSynchronize(); - cudaMemcpyAsync(returnedValues, returnValues, num_reads *sizeof(float), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + cudaMemcpyAsync(returnedValues, returnValuesDev, num_reads *sizeof(float), cudaMemcpyDeviceToHost); // Free device memory cudaFree(eventMeansDev); @@ -594,14 +584,15 @@ double scoreKernel(std::vector sequences, cudaFree(varDev); cudaFree(logVarDev); cudaFree(preFlankingDev); - + cudaFree(postFlankingDev); //Free host memory cudaFreeHost(eventMeans); - float r = 0.0; + //Send all the scores back + std::vector r(num_reads); for(int i=0; i GpuAligner::variantScoresThresholded(std::vector in methylation_types); std::vector> variant_sequences; - //for (auto v: variant_haplotypes){ - // auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types); - // variant_sequences.push_back(variant_sequence); - //} + for (auto v: variant_haplotypes){ + auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types); + variant_sequences.push_back(variant_sequence); + } assert(base_sequences.size() == 1); // return the sum of the score for the base 
sequences over all the event sequences - double base_score = scoreKernel(base_sequences, event_sequences, alignment_flags); + auto base_scores = scoreKernel(base_sequences, event_sequences, alignment_flags); std::vector v(variant_sequences.size()); for (int i=0; i Value for row %i col %i is %f\n", rw, i, output.get(rw, i)); } @@ -399,7 +399,9 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, // If POST_CLIP is enabled we allow the last kmer to transition directly // to the end after any event. Otherwise we only allow it from the // last kmer/event match. + if(kmer_idx == last_kmer_idx && ( (flags & HAF_ALLOW_POST_CLIP) || row == last_event_row_idx)) { + printf(">CPU Post-clip transition on row %i\n", row); float lp1 = lp_ms + output.get(row, curr_block_offset + PSR9_MATCH) + post_flank[row - 1]; float lp2 = lp_ms + output.get(row, curr_block_offset + PSR9_BAD_EVENT) + post_flank[row - 1]; float lp3 = lp_ms + output.get(row, curr_block_offset + PSR9_KMER_SKIP) + post_flank[row - 1]; @@ -407,6 +409,12 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, output.update_end(lp1, row, curr_block_offset + PSR9_MATCH); output.update_end(lp2, row, curr_block_offset + PSR9_BAD_EVENT); output.update_end(lp3, row, curr_block_offset + PSR9_KMER_SKIP); + + printf(">LP1 %f\n", lp1); + printf(">LP2 %f\n", lp2); + printf(">LP3 %f\n", lp3); + printf(">end %f\n", output.get_end()); + } #ifdef DEBUG_LOCAL_ALIGNMENT diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index ec289603..864a8c59 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -352,7 +352,7 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali // get the scaled levels. 
std::vector scores = aligner.variantScoresThresholded(tmp_variants, test_haplotype, event_sequences, - alignment_flags, opt::screen_score_threshold, + alignment_flags, 10,//opt::screen_score_threshold, opt::methylation_types); auto tf_gpu = std::chrono::high_resolution_clock::now(); @@ -364,7 +364,7 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali test_haplotype, event_sequences, alignment_flags, - opt::screen_score_threshold, + 10,//opt::screen_score_threshold, opt::methylation_types); auto t1 = std::chrono::high_resolution_clock::now(); scoring += t1-t0; From 03ffaba469a0c2f9c3803963f289c71cfcdc1a8c Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Tue, 3 Jul 2018 14:36:29 +0100 Subject: [PATCH 13/80] GPU and CPU versions now giving same results --- src/common/nanopolish_variant.cpp | 1 + src/cuda_kernels/GpuAligner.cu | 116 ++++++++++++++++---------- src/hmm/nanopolish_emissions.h | 24 +++--- src/hmm/nanopolish_profile_hmm.cpp | 1 + src/hmm/nanopolish_profile_hmm_r9.inl | 54 +++++++----- src/nanopolish_call_variants.cpp | 5 +- 6 files changed, 122 insertions(+), 79 deletions(-) diff --git a/src/common/nanopolish_variant.cpp b/src/common/nanopolish_variant.cpp index b73a6b2b..bbc5933b 100644 --- a/src/common/nanopolish_variant.cpp +++ b/src/common/nanopolish_variant.cpp @@ -686,6 +686,7 @@ Variant score_variant_thresholded(const Variant& input_variant, if(fabs(total_score) < score_threshold) { // Calculate scores using the base nucleotide model + printf("Working with input %i\n", j); double base_score = profile_hmm_score_set(base_sequences, input[j], alignment_flags); double variant_score = profile_hmm_score_set(variant_sequences, input[j], alignment_flags); diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index aa59da03..ef52b68c 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -87,10 +87,25 @@ __global__ void getScores(float * eventData, int numRows = numRowsPerRead[readIdx]; 
// Number of rows in this DP table. int e_start = eventStarts[readIdx]; // Event start for read int e_stride = eventStrides[readIdx]; + bool rc = false; + if (e_stride == -1){ + rc = true; + } int e_offset = eventOffsets[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event + if(blockIdx.x==2){ // read 2 is an RC read + printf("Block IDX is %i and stride is %i\n", blockIdx.x, e_stride); + } + int kmerIdx = threadIdx.x; - uint32_t rank = kmer_ranks[kmerIdx]; // lexical rank of a kmer + uint32_t rank; + + if (rc == true) { + rank = kmer_ranks_rc[kmerIdx]; + //printf("Using an RC rank of %i\n", rank); + }else{ + rank = kmer_ranks[kmerIdx]; + } float p_stay = 1 - (1 / read_events_per_base); float p_skip = 0.0025; @@ -112,7 +127,7 @@ __global__ void getScores(float * eventData, float p_kk = p_skip_self; float p_km = 1.0f - p_kk; - // We assign some transition probabilities. I believe this is correct and they don't vary by location in the sequence (why would they) + // We assign some transition probabilities. I believe this is correct and they don't vary by location in the sequence float lp_mk = log(p_mk); float lp_mb = log(p_mb); float lp_mm_self = log(p_mm_self); @@ -153,7 +168,7 @@ __global__ void getScores(float * eventData, bool debug = false; - if (threadIdx.x == 0 && (row == numRows -1) && blockIdx.x == 0){ + if (threadIdx.x == 0 && (row == numRows -1) && blockIdx.x == 2){ debug = true; } @@ -189,6 +204,12 @@ __global__ void getScores(float * eventData, (event_idx == e_start || (HAF_ALLOW_PRE_CLIP))) ? lp_sm + preFlank : -INFINITY; // TODO: Add flag for HAF ALLOW_PRE_CLIP + if (blockIdx.x == 2 && threadIdx.x == 0 && row == 2){ + printf("HMT_FROM_SOFT should be (?) 
-5.99 but is in fact %f\n", HMT_FROM_SOFT); + printf("event IDX is %i\n", event_idx); + printf("e_start is %i\n", e_start); + } + // calculate the score float sum = HMT_FROM_SAME_M; sum = logsumexpf(sum, HMT_FROM_SOFT); @@ -269,17 +290,17 @@ __global__ void getScores(float * eventData, float lp1 = lp_ms + prevProbabilities[curBlockOffset + PSR9_MATCH] + postFlank; float lp2 = lp_ms + prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] + postFlank; float lp3 = lp_ms + prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] + postFlank; - - printf(">GPU Post-clip transition on row %i, read %i, threadIdx is %i\n" - "LP1=%f\n" - "LP2=%f\n" - "LP3=%f\n", - row, - blockIdx.x, - threadIdx.x, - lp1, - lp2, - lp3); +// +// printf(">GPU Post-clip transition on row %i, read %i, threadIdx is %i\n" +// "LP1=%f\n" +// "LP2=%f\n" +// "LP3=%f\n", +// row, +// blockIdx.x, +// threadIdx.x, +// lp1, +// lp2, +// lp3); end = returnValues[blockIdx.x]; end = logsumexpf(end, lp1); @@ -290,29 +311,39 @@ __global__ void getScores(float * eventData, // Now do the end state __syncthreads(); - // DIAGNOSTIC - if (debug == true){ - printf("rank %i\n", rank); - printf("event mean %f\n", event_mean); - printf("poreModelLevelLogStdv %f\n", poreModelLevelLogStdv[0]); - printf("poreModelLevelStdv %f\n", poreModelLevelStdv[0]); - printf("poreModelLevelMean %f\n", poreModelLevelMean[0]); - printf("lp_emission_m is %f\n", lp_emission_m); - printf("PSR9_MATCH is %i\n", PSR9_MATCH); - printf(">GPU score HMT_FROM_SAME_M is %f\n", HMT_FROM_SAME_M); - printf(">GPU score HMT_FROM_PREV_M is %f\n", HMT_FROM_PREV_M); - printf(">GPU score HMT_FROM_SAME_B is %f\n", HMT_FROM_SAME_B); - printf(">GPU score HMT_FROM_PREV_B is %f\n", HMT_FROM_PREV_B); - printf(">GPU score HMT_FROM_PREV_K is %f\n", HMT_FROM_PREV_K); - printf(">GPU newSkipScore is %f\n", newSkipScore); - printf("Number of states is %i\n", n_states); - for (int c = 0; c < n_states; c++) { - printf("GPU> Value for row %i and col %i is %f\n",row, c, 
prevProbabilities[c]); - } - } - } + if ((blockIdx.x == 2) && (threadIdx.x == 0)){ +// printf("rank %i\n", rank); +// printf("event mean %f\n", event_mean); +// printf("poreModelLevelLogStdv %f\n", poreModelLevelLogStdv[0]); +// printf("poreModelLevelStdv %f\n", poreModelLevelStdv[0]); +// printf("poreModelLevelMean %f\n", poreModelLevelMean[0]); +// printf("lp_emission_m is %f\n", lp_emission_m); +// printf("PSR9_MATCH is %i\n", PSR9_MATCH); +// printf(">GPU score HMT_FROM_SAME_M is %f\n", HMT_FROM_SAME_M); +// printf(">GPU score HMT_FROM_PREV_M is %f\n", HMT_FROM_PREV_M); +// printf(">GPU score HMT_FROM_SAME_B is %f\n", HMT_FROM_SAME_B); +// printf(">GPU score HMT_FROM_PREV_B is %f\n", HMT_FROM_PREV_B); +// printf(">GPU score HMT_FROM_PREV_K is %f\n", HMT_FROM_PREV_K); +// printf(">GPU newSkipScore is %f\n", newSkipScore); +// printf("Number of states is %i\n", n_states); + for (int c = 0; c < n_states; c++) { + printf("GPU> Value for row %i and col %i is %f\n", row, c, prevProbabilities[c]); + } + printf("HMT_FROM_SOFT = %f\n", HMT_FROM_SOFT); + printf("lp_mk = %f\n", lp_mk); + printf("lp_mb = %f\n", lp_mb); + printf("lp_mm_self = %f\n", lp_mm_self); + printf("lp_mm_next = %f\n", lp_mm_next); + printf("lp_bb = %f\n", lp_bb); + printf("lp_bk = %f\n", lp_bk); + printf("lp_bm_next = %f\n", lp_bm_next); + printf("lp_bm_self = %f\n", lp_bm_self); + printf("lp_kk = %f\n", lp_kk); + printf("lp_km = %f\n", lp_km); - __syncthreads(); + } + } + __syncthreads(); } @@ -426,9 +457,9 @@ std::vector scoreKernel(std::vector sequences, for(int j=0;jevents->size(); + size_t num_events = 100;//TODO: FIX! ev.read->events->size(); for (int i=0;iget_drift_scaled_level(event_idx, ev.strand); // send the data in drift scaled //auto unscaled = ev.read->events[0][i].mean; //taking the first element. Not sure what the second one is.. 
eventMeans[offset + i] = scaled; @@ -458,10 +489,11 @@ std::vector scoreKernel(std::vector sequences, std::vector log_var(num_reads); for (int i=0;iscalings->scale; - shift[i] = event_sequences[i].read->scalings->shift; - var[i] = event_sequences[i].read->scalings->var; - log_var[i] = event_sequences[i].read->scalings->log_var; + auto read = event_sequences[i]; + scale[i] = event_sequences[i].read->scalings[read.strand].scale; + shift[i] = event_sequences[i].read->scalings[read.strand].shift; + var[i] = event_sequences[i].read->scalings[read.strand].var; + log_var[i] = event_sequences[i].read->scalings[read.strand].log_var; } float* scaleDev; diff --git a/src/hmm/nanopolish_emissions.h b/src/hmm/nanopolish_emissions.h index 6069ac81..5f99a410 100644 --- a/src/hmm/nanopolish_emissions.h +++ b/src/hmm/nanopolish_emissions.h @@ -63,19 +63,19 @@ inline float log_probability_match_r9(const SquiggleRead& read, { // event level mean, scaled with the drift value float level = read.get_drift_scaled_level(event_idx, strand); - if (debug == true){ - printf("Level being used to calculate emission: %f\n", level); - } + //if (debug == true){ + // printf("Level being used to calculate emission: %f\n", level); + //} GaussianParameters gp = read.get_scaled_gaussian_from_pore_model_state(pore_model, strand, kmer_rank); - if (debug == true) { - printf(">Event IDX is: %i\n", event_idx); - printf(">CPU Strand is: %i\n", strand); - printf(">CPU kmer_rank is: %i\n", kmer_rank); - printf(">CPU level is: %f\n", level); - printf(">CPU gaussian mean: %f\n", gp.mean); - printf(">CPU gaussian stdv: %f\n", gp.stdv); - printf(">CPU gaussian log_level_stdv: %f\n", gp.log_stdv); - } +// if (debug == true) { +// printf(">Event IDX is: %i\n", event_idx); +// printf(">CPU Strand is: %i\n", strand); +// printf(">CPU kmer_rank is: %i\n", kmer_rank); +// printf(">CPU level is: %f\n", level); +// printf(">CPU gaussian mean: %f\n", gp.mean); +// printf(">CPU gaussian stdv: %f\n", gp.stdv); +// 
printf(">CPU gaussian log_level_stdv: %f\n", gp.log_stdv); +// } float lp = log_normal_pdf(level, gp); return lp; } diff --git a/src/hmm/nanopolish_profile_hmm.cpp b/src/hmm/nanopolish_profile_hmm.cpp index 6d5d0f37..d82ec344 100644 --- a/src/hmm/nanopolish_profile_hmm.cpp +++ b/src/hmm/nanopolish_profile_hmm.cpp @@ -31,6 +31,7 @@ float profile_hmm_score(const HMMInputSequence& sequence, const HMMInputData& da float profile_hmm_score_set(const std::vector& sequences, const HMMInputData& data, const uint32_t flags) { + printf("In profile_hmm_score set function...\n"); assert(!sequences.empty()); assert(std::string(sequences[0].get_alphabet()->get_name()) == "nucleotide"); assert(std::string(data.pore_model->pmalphabet->get_name()) == "nucleotide"); diff --git a/src/hmm/nanopolish_profile_hmm_r9.inl b/src/hmm/nanopolish_profile_hmm_r9.inl index 315618a3..f402fd1c 100644 --- a/src/hmm/nanopolish_profile_hmm_r9.inl +++ b/src/hmm/nanopolish_profile_hmm_r9.inl @@ -283,7 +283,7 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, uint32_t e_start = data.event_start_idx; - printf(">CPU e_start: %i\n", e_start); + //printf(">CPU e_start: %i\n", e_start); // Calculate number of blocks // A block of the HMM is a set of states for one kmer uint32_t num_blocks = output.get_num_columns() / PSR9_NUM_STATES; // num_columns is the number of HMM STATES @@ -303,8 +303,8 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, std::vector kmer_ranks(num_kmers); for(size_t ki = 0; ki < num_kmers; ++ki) { - int kr = sequence.get_kmer_rank(ki, k, data.rc); - printf("Kmer rank: %i\n", kr); + int kr = sequence.get_kmer_rank(ki, k, data.rc); // can * -1 here to see if 3rd is correct + printf(">CPU Kmer rank: %i\n", kr); kmer_ranks[ki] = kr; } @@ -326,16 +326,6 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, // Fill in matrix for(uint32_t row = 1; row < output.get_num_rows(); row++) { - //printf("======\n"); - 
//diagnostics - after match and bad event have been applied - if (row == 29) { // row 1 has been computed so we can have a peek - auto nc = output.get_num_columns(); - int rw = 28; - for (int i = 0; i < nc; i++) { - printf("CPU> Value for row %i col %i is %f\n", rw, i, output.get(rw, i)); - } - } - // Skip the first block which is the start state, it was initialized above // Similarily skip the last block, which is calculated in the terminate() function for(uint32_t block = 1; block < num_blocks - 1; block++) { @@ -352,7 +342,7 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, uint32_t event_idx = e_start + (row - 1) * data.event_stride; uint32_t rank = kmer_ranks[kmer_idx]; float lp_emission_m = log_probability_match_r9(*data.read, *data.pore_model, rank, event_idx, data.strand, true); - printf("CPU> lp_emission_m %f\n", lp_emission_m); + //printf("CPU> lp_emission_m %f\n", lp_emission_m); float lp_emission_b = BAD_EVENT_PENALTY; HMMUpdateScores scores; @@ -364,6 +354,8 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, scores.x[HMT_FROM_PREV_B] = bt.lp_bm_next + output.get(row - 1, prev_block_offset + PSR9_BAD_EVENT); scores.x[HMT_FROM_PREV_K] = bt.lp_km + output.get(row - 1, prev_block_offset + PSR9_KMER_SKIP); + scores.x[HMT_FROM_PREV_B] = bt.lp_bm_next + output.get(row - 1, prev_block_offset + PSR9_BAD_EVENT); + // m_s is the probability of going from the start state // to this kmer. The start state is (currently) only // allowed to go to the first kmer. If ALLOW_PRE_CLIP @@ -372,7 +364,18 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, scores.x[HMT_FROM_SOFT] = (kmer_idx == 0 && (event_idx == e_start || (flags & HAF_ALLOW_PRE_CLIP))) ? 
lp_sm + pre_flank[row - 1] : -INFINITY; - printf("lp_emission_m is %f\n", lp_emission_m); + + if (row == 2) { + printf("Working with matches in row 2\n"); + printf("HMT_FROM_SOFT IS %f\n", scores.x[HMT_FROM_SOFT]); + printf("Strand is %i\n", data.strand); + printf("bt.lp_mm_self %f\n", bt.lp_mm_self); + printf("bt.lp_mm_next %f\n", bt.lp_mm_next); + printf("bt.lp_bm_self %f\n", bt.lp_bm_self); + printf("bt.lp_bm_next %f\n", bt.lp_bm_next); + printf("bt.lp_km %f\n", bt.lp_km); + } + output.update_cell(row, curr_block_offset + PSR9_MATCH, scores, lp_emission_m); // state PSR9_BAD_EVENT @@ -401,7 +404,7 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, // last kmer/event match. if(kmer_idx == last_kmer_idx && ( (flags & HAF_ALLOW_POST_CLIP) || row == last_event_row_idx)) { - printf(">CPU Post-clip transition on row %i\n", row); + //printf(">CPU Post-clip transition on row %i\n", row); float lp1 = lp_ms + output.get(row, curr_block_offset + PSR9_MATCH) + post_flank[row - 1]; float lp2 = lp_ms + output.get(row, curr_block_offset + PSR9_BAD_EVENT) + post_flank[row - 1]; float lp3 = lp_ms + output.get(row, curr_block_offset + PSR9_KMER_SKIP) + post_flank[row - 1]; @@ -410,13 +413,14 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, output.update_end(lp2, row, curr_block_offset + PSR9_BAD_EVENT); output.update_end(lp3, row, curr_block_offset + PSR9_KMER_SKIP); - printf(">LP1 %f\n", lp1); - printf(">LP2 %f\n", lp2); - printf(">LP3 %f\n", lp3); - printf(">end %f\n", output.get_end()); + //printf(">LP1 %f\n", lp1); + //printf(">LP2 %f\n", lp2); + //printf(">LP3 %f\n", lp3); + //printf(">end %f\n", output.get_end()); } + #ifdef DEBUG_LOCAL_ALIGNMENT printf("[%d %d] start: %.2lf pre: %.2lf fm: %.2lf\n", event_idx, kmer_idx, m_s + lp_emission_m, pre_flank[row - 1], output.get(row, curr_block_offset + PSR9_MATCH)); printf("[%d %d] end: %.2lf post: %.2lf\n", event_idx, kmer_idx, lp_end, post_flank[row - 1]); @@ -451,7 
+455,13 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, } } - - return output.get_end(); + for(uint32_t row = 1; row < output.get_num_rows(); row++) { + //for (int col=0; col Value for row %i and col %i is %f\n", row, col, output.get(row, col)); + // } + } + + + return output.get_end(); } diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 864a8c59..037a4a60 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -352,9 +352,8 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali // get the scaled levels. std::vector scores = aligner.variantScoresThresholded(tmp_variants, test_haplotype, event_sequences, - alignment_flags, 10,//opt::screen_score_threshold, + alignment_flags, opt::screen_score_threshold, opt::methylation_types); - auto tf_gpu = std::chrono::high_resolution_clock::now(); gpu_exec += tf_gpu - t0_gpu; @@ -364,7 +363,7 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali test_haplotype, event_sequences, alignment_flags, - 10,//opt::screen_score_threshold, + opt::screen_score_threshold, opt::methylation_types); auto t1 = std::chrono::high_resolution_clock::now(); scoring += t1-t0; From ac82456d3f8fe3e8998c2a62d1fcec8e20742ec3 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Tue, 3 Jul 2018 15:54:02 +0100 Subject: [PATCH 14/80] Removed print statements --- Makefile | 6 +- src/common/nanopolish_variant.cpp | 2 +- src/cuda_kernels/GpuAligner.cu | 84 +++++++++++++-------------- src/hmm/nanopolish_profile_hmm.cpp | 2 +- src/hmm/nanopolish_profile_hmm_r9.inl | 22 +++---- src/nanopolish_call_variants.cpp | 4 +- 6 files changed, 60 insertions(+), 60 deletions(-) diff --git a/Makefile b/Makefile index 199f5d1d..38ff3360 100644 --- a/Makefile +++ b/Makefile @@ -9,13 +9,13 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz -CXXFLAGS ?= -g -Og +CXXFLAGS ?= 
-O3 #-g CXXFLAGS += -std=c++11 -fopenmp -fsigned-char -CFLAGS ?= -std=c99 #-O3 +CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include -g +NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include #-g CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code diff --git a/src/common/nanopolish_variant.cpp b/src/common/nanopolish_variant.cpp index bbc5933b..357c7fae 100644 --- a/src/common/nanopolish_variant.cpp +++ b/src/common/nanopolish_variant.cpp @@ -686,7 +686,7 @@ Variant score_variant_thresholded(const Variant& input_variant, if(fabs(total_score) < score_threshold) { // Calculate scores using the base nucleotide model - printf("Working with input %i\n", j); + //printf("Working with input %i\n", j); double base_score = profile_hmm_score_set(base_sequences, input[j], alignment_flags); double variant_score = profile_hmm_score_set(variant_sequences, input[j], alignment_flags); diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index ef52b68c..75947314 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -93,9 +93,9 @@ __global__ void getScores(float * eventData, } int e_offset = eventOffsets[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event - if(blockIdx.x==2){ // read 2 is an RC read - printf("Block IDX is %i and stride is %i\n", blockIdx.x, e_stride); - } + //if(blockIdx.x==2){ // read 2 is an RC read + // printf("Block IDX is %i and stride is %i\n", blockIdx.x, e_stride); + //} int kmerIdx = threadIdx.x; uint32_t rank; @@ -204,11 +204,11 @@ __global__ void getScores(float * eventData, (event_idx == e_start || (HAF_ALLOW_PRE_CLIP))) ? lp_sm + preFlank : -INFINITY; // TODO: Add flag for HAF ALLOW_PRE_CLIP - if (blockIdx.x == 2 && threadIdx.x == 0 && row == 2){ - printf("HMT_FROM_SOFT should be (?) 
-5.99 but is in fact %f\n", HMT_FROM_SOFT); - printf("event IDX is %i\n", event_idx); - printf("e_start is %i\n", e_start); - } + //if (blockIdx.x == 2 && threadIdx.x == 0 && row == 2){ + // printf("HMT_FROM_SOFT should be (?) -5.99 but is in fact %f\n", HMT_FROM_SOFT); + // printf("event IDX is %i\n", event_idx); + // printf("e_start is %i\n", e_start); + //} // calculate the score float sum = HMT_FROM_SAME_M; @@ -311,37 +311,37 @@ __global__ void getScores(float * eventData, // Now do the end state __syncthreads(); - if ((blockIdx.x == 2) && (threadIdx.x == 0)){ -// printf("rank %i\n", rank); -// printf("event mean %f\n", event_mean); -// printf("poreModelLevelLogStdv %f\n", poreModelLevelLogStdv[0]); -// printf("poreModelLevelStdv %f\n", poreModelLevelStdv[0]); -// printf("poreModelLevelMean %f\n", poreModelLevelMean[0]); -// printf("lp_emission_m is %f\n", lp_emission_m); -// printf("PSR9_MATCH is %i\n", PSR9_MATCH); -// printf(">GPU score HMT_FROM_SAME_M is %f\n", HMT_FROM_SAME_M); -// printf(">GPU score HMT_FROM_PREV_M is %f\n", HMT_FROM_PREV_M); -// printf(">GPU score HMT_FROM_SAME_B is %f\n", HMT_FROM_SAME_B); -// printf(">GPU score HMT_FROM_PREV_B is %f\n", HMT_FROM_PREV_B); -// printf(">GPU score HMT_FROM_PREV_K is %f\n", HMT_FROM_PREV_K); -// printf(">GPU newSkipScore is %f\n", newSkipScore); -// printf("Number of states is %i\n", n_states); - for (int c = 0; c < n_states; c++) { - printf("GPU> Value for row %i and col %i is %f\n", row, c, prevProbabilities[c]); - } - printf("HMT_FROM_SOFT = %f\n", HMT_FROM_SOFT); - printf("lp_mk = %f\n", lp_mk); - printf("lp_mb = %f\n", lp_mb); - printf("lp_mm_self = %f\n", lp_mm_self); - printf("lp_mm_next = %f\n", lp_mm_next); - printf("lp_bb = %f\n", lp_bb); - printf("lp_bk = %f\n", lp_bk); - printf("lp_bm_next = %f\n", lp_bm_next); - printf("lp_bm_self = %f\n", lp_bm_self); - printf("lp_kk = %f\n", lp_kk); - printf("lp_km = %f\n", lp_km); - - } +// if ((blockIdx.x == 2) && (threadIdx.x == 0)){ +//// printf("rank 
%i\n", rank); +//// printf("event mean %f\n", event_mean); +//// printf("poreModelLevelLogStdv %f\n", poreModelLevelLogStdv[0]); +//// printf("poreModelLevelStdv %f\n", poreModelLevelStdv[0]); +//// printf("poreModelLevelMean %f\n", poreModelLevelMean[0]); +//// printf("lp_emission_m is %f\n", lp_emission_m); +//// printf("PSR9_MATCH is %i\n", PSR9_MATCH); +//// printf(">GPU score HMT_FROM_SAME_M is %f\n", HMT_FROM_SAME_M); +//// printf(">GPU score HMT_FROM_PREV_M is %f\n", HMT_FROM_PREV_M); +//// printf(">GPU score HMT_FROM_SAME_B is %f\n", HMT_FROM_SAME_B); +//// printf(">GPU score HMT_FROM_PREV_B is %f\n", HMT_FROM_PREV_B); +//// printf(">GPU score HMT_FROM_PREV_K is %f\n", HMT_FROM_PREV_K); +//// printf(">GPU newSkipScore is %f\n", newSkipScore); +//// printf("Number of states is %i\n", n_states); +// for (int c = 0; c < n_states; c++) { +// printf("GPU> Value for row %i and col %i is %f\n", row, c, prevProbabilities[c]); +// } +// printf("HMT_FROM_SOFT = %f\n", HMT_FROM_SOFT); +// printf("lp_mk = %f\n", lp_mk); +// printf("lp_mb = %f\n", lp_mb); +// printf("lp_mm_self = %f\n", lp_mm_self); +// printf("lp_mm_next = %f\n", lp_mm_next); +// printf("lp_bb = %f\n", lp_bb); +// printf("lp_bk = %f\n", lp_bk); +// printf("lp_bm_next = %f\n", lp_bm_next); +// printf("lp_bm_self = %f\n", lp_bm_self); +// printf("lp_kk = %f\n", lp_kk); +// printf("lp_km = %f\n", lp_km); +// +// } } __syncthreads(); } @@ -576,7 +576,7 @@ std::vector scoreKernel(std::vector sequences, float* returnedValues; cudaHostAlloc(&returnedValues, num_reads * sizeof(float) , cudaHostAllocDefault); - printf("About to run getscores...\n"); + //printf("About to run getscores...\n"); getScores<<>>(eventMeansDev, eventsPerBaseDev, numRowsDev, @@ -663,11 +663,11 @@ std::vector GpuAligner::variantScoresThresholded(std::vector in std::vector v(variant_sequences.size()); for (int i=0; i& sequences, const HMMInputData& data, const uint32_t flags) { - printf("In profile_hmm_score set function...\n"); + 
//printf("In profile_hmm_score set function...\n"); assert(!sequences.empty()); assert(std::string(sequences[0].get_alphabet()->get_name()) == "nucleotide"); assert(std::string(data.pore_model->pmalphabet->get_name()) == "nucleotide"); diff --git a/src/hmm/nanopolish_profile_hmm_r9.inl b/src/hmm/nanopolish_profile_hmm_r9.inl index f402fd1c..001e44f4 100644 --- a/src/hmm/nanopolish_profile_hmm_r9.inl +++ b/src/hmm/nanopolish_profile_hmm_r9.inl @@ -304,7 +304,7 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, std::vector kmer_ranks(num_kmers); for(size_t ki = 0; ki < num_kmers; ++ki) { int kr = sequence.get_kmer_rank(ki, k, data.rc); // can * -1 here to see if 3rd is correct - printf(">CPU Kmer rank: %i\n", kr); + //printf(">CPU Kmer rank: %i\n", kr); kmer_ranks[ki] = kr; } @@ -365,16 +365,16 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, (event_idx == e_start || (flags & HAF_ALLOW_PRE_CLIP))) ? lp_sm + pre_flank[row - 1] : -INFINITY; - if (row == 2) { - printf("Working with matches in row 2\n"); - printf("HMT_FROM_SOFT IS %f\n", scores.x[HMT_FROM_SOFT]); - printf("Strand is %i\n", data.strand); - printf("bt.lp_mm_self %f\n", bt.lp_mm_self); - printf("bt.lp_mm_next %f\n", bt.lp_mm_next); - printf("bt.lp_bm_self %f\n", bt.lp_bm_self); - printf("bt.lp_bm_next %f\n", bt.lp_bm_next); - printf("bt.lp_km %f\n", bt.lp_km); - } + //if (row == 2) { + // printf("Working with matches in row 2\n"); + // printf("HMT_FROM_SOFT IS %f\n", scores.x[HMT_FROM_SOFT]); + // printf("Strand is %i\n", data.strand); + // printf("bt.lp_mm_self %f\n", bt.lp_mm_self); + // printf("bt.lp_mm_next %f\n", bt.lp_mm_next); + // printf("bt.lp_bm_self %f\n", bt.lp_bm_self); + // printf("bt.lp_bm_next %f\n", bt.lp_bm_next); + // printf("bt.lp_km %f\n", bt.lp_km); + //} output.update_cell(row, curr_block_offset + PSR9_MATCH, scores, lp_emission_m); diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 
037a4a60..ee3e6d05 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -352,7 +352,7 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali // get the scaled levels. std::vector scores = aligner.variantScoresThresholded(tmp_variants, test_haplotype, event_sequences, - alignment_flags, opt::screen_score_threshold, + alignment_flags, 100000,//opt::screen_score_threshold, opt::methylation_types); auto tf_gpu = std::chrono::high_resolution_clock::now(); gpu_exec += tf_gpu - t0_gpu; @@ -363,7 +363,7 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali test_haplotype, event_sequences, alignment_flags, - opt::screen_score_threshold, + 100000,//opt::screen_score_threshold, opt::methylation_types); auto t1 = std::chrono::high_resolution_clock::now(); scoring += t1-t0; From 10db85ad60ed15c7af54fd828f4e00a27b4dc6c9 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Wed, 4 Jul 2018 11:33:35 +0100 Subject: [PATCH 15/80] Fixed bug with overly-large host allocations --- Makefile | 4 ++-- src/cuda_kernels/GpuAligner.cu | 14 ++++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index 38ff3360..68fb07a9 100644 --- a/Makefile +++ b/Makefile @@ -9,13 +9,13 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz -CXXFLAGS ?= -O3 #-g +CXXFLAGS ?= -O3 -g CXXFLAGS += -std=c++11 -fopenmp -fsigned-char CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include #-g +NVCCFLAGS ?= -std=c++11 -I. 
-I/usr/local/cuda-9.0/include -g CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 75947314..b375f2e8 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -392,6 +392,7 @@ std::vector scoreKernel(std::vector sequences, std::vector> pre_flanks; std::vector> post_flanks; + int numEventsTotal = 0; for(auto e: event_sequences){ uint32_t e_start = e.event_start_idx; e_starts.push_back(e_start); @@ -407,6 +408,7 @@ std::vector scoreKernel(std::vector sequences, n_events = e_start - e_end + 1; n_rows.push_back(n_events + 1); + numEventsTotal += n_events + 1; // TODO: is +1 necessary? std::vector pre_flank = make_pre_flanking(e, e_start, n_events); std::vector post_flank = make_post_flanking(e, e_start, n_events); @@ -426,18 +428,18 @@ std::vector scoreKernel(std::vector sequences, // Buffer 1: Raw event data and associated starts and stops - size_t numEventsTotal = 0; + // size_t numEventsTotal; //1. Count the total number of events across all reads - std::vector eventLengths; + //std::vector eventLengths; std::vector eventsPerBase; for (auto e: event_sequences){ size_t numEvents = e.read->events->size(); float readEventsPerBase = e.read->events_per_base[e.strand]; - eventLengths.push_back(numEvents); + //eventLengths.push_back(numEvents); eventsPerBase.push_back(readEventsPerBase); - numEventsTotal += numEvents; + //numEventsTotal += numEvents; } @@ -457,7 +459,7 @@ std::vector scoreKernel(std::vector sequences, for(int j=0;jevents->size(); + size_t num_events = n_rows[j];//TODO: is this sometimes causing a segfault? is it correct? 
for (int i=0;iget_drift_scaled_level(event_idx, ev.strand); // send the data in drift scaled @@ -513,7 +515,7 @@ std::vector scoreKernel(std::vector sequences, float* poreModelLevelLogStdvDev; - cudaMalloc( (void**)&poreModelLevelLogStdvDev, pore_model_level_log_stdv.size() * sizeof(float)); + cudaMalloc( (void**)&poreModelLevelLogStdvDev, pore_model_level_log_stdv.size() * sizeof(float)); // for some reason this malloc is slow cudaMemcpyAsync( poreModelLevelLogStdvDev, pore_model_level_log_stdv.data(), pore_model_level_log_stdv.size() * sizeof(float), cudaMemcpyHostToDevice ); float* poreModelLevelMeanDev; From f5c0b4a6d46a73eade0fe01ec04a60c1b16280ed Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Wed, 4 Jul 2018 11:53:32 +0100 Subject: [PATCH 16/80] removed some print statements --- Makefile | 4 +-- src/cuda_kernels/GpuAligner.cu | 52 ---------------------------------- 2 files changed, 2 insertions(+), 54 deletions(-) diff --git a/Makefile b/Makefile index 68fb07a9..ed4eccff 100644 --- a/Makefile +++ b/Makefile @@ -9,13 +9,13 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz -CXXFLAGS ?= -O3 -g +CXXFLAGS ?= -O3 #-g CXXFLAGS += -std=c++11 -fopenmp -fsigned-char CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include -g +NVCCFLAGS ?= -std=c++11 -I. 
-I/usr/local/cuda-9.0/include -O3 #-g CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index b375f2e8..247e7e35 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -93,10 +93,6 @@ __global__ void getScores(float * eventData, } int e_offset = eventOffsets[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event - //if(blockIdx.x==2){ // read 2 is an RC read - // printf("Block IDX is %i and stride is %i\n", blockIdx.x, e_stride); - //} - int kmerIdx = threadIdx.x; uint32_t rank; @@ -204,12 +200,6 @@ __global__ void getScores(float * eventData, (event_idx == e_start || (HAF_ALLOW_PRE_CLIP))) ? lp_sm + preFlank : -INFINITY; // TODO: Add flag for HAF ALLOW_PRE_CLIP - //if (blockIdx.x == 2 && threadIdx.x == 0 && row == 2){ - // printf("HMT_FROM_SOFT should be (?) 
-5.99 but is in fact %f\n", HMT_FROM_SOFT); - // printf("event IDX is %i\n", event_idx); - // printf("e_start is %i\n", e_start); - //} - // calculate the score float sum = HMT_FROM_SAME_M; sum = logsumexpf(sum, HMT_FROM_SOFT); @@ -290,17 +280,6 @@ __global__ void getScores(float * eventData, float lp1 = lp_ms + prevProbabilities[curBlockOffset + PSR9_MATCH] + postFlank; float lp2 = lp_ms + prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] + postFlank; float lp3 = lp_ms + prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] + postFlank; -// -// printf(">GPU Post-clip transition on row %i, read %i, threadIdx is %i\n" -// "LP1=%f\n" -// "LP2=%f\n" -// "LP3=%f\n", -// row, -// blockIdx.x, -// threadIdx.x, -// lp1, -// lp2, -// lp3); end = returnValues[blockIdx.x]; end = logsumexpf(end, lp1); @@ -311,37 +290,6 @@ __global__ void getScores(float * eventData, // Now do the end state __syncthreads(); -// if ((blockIdx.x == 2) && (threadIdx.x == 0)){ -//// printf("rank %i\n", rank); -//// printf("event mean %f\n", event_mean); -//// printf("poreModelLevelLogStdv %f\n", poreModelLevelLogStdv[0]); -//// printf("poreModelLevelStdv %f\n", poreModelLevelStdv[0]); -//// printf("poreModelLevelMean %f\n", poreModelLevelMean[0]); -//// printf("lp_emission_m is %f\n", lp_emission_m); -//// printf("PSR9_MATCH is %i\n", PSR9_MATCH); -//// printf(">GPU score HMT_FROM_SAME_M is %f\n", HMT_FROM_SAME_M); -//// printf(">GPU score HMT_FROM_PREV_M is %f\n", HMT_FROM_PREV_M); -//// printf(">GPU score HMT_FROM_SAME_B is %f\n", HMT_FROM_SAME_B); -//// printf(">GPU score HMT_FROM_PREV_B is %f\n", HMT_FROM_PREV_B); -//// printf(">GPU score HMT_FROM_PREV_K is %f\n", HMT_FROM_PREV_K); -//// printf(">GPU newSkipScore is %f\n", newSkipScore); -//// printf("Number of states is %i\n", n_states); -// for (int c = 0; c < n_states; c++) { -// printf("GPU> Value for row %i and col %i is %f\n", row, c, prevProbabilities[c]); -// } -// printf("HMT_FROM_SOFT = %f\n", HMT_FROM_SOFT); -// printf("lp_mk = %f\n", 
lp_mk); -// printf("lp_mb = %f\n", lp_mb); -// printf("lp_mm_self = %f\n", lp_mm_self); -// printf("lp_mm_next = %f\n", lp_mm_next); -// printf("lp_bb = %f\n", lp_bb); -// printf("lp_bk = %f\n", lp_bk); -// printf("lp_bm_next = %f\n", lp_bm_next); -// printf("lp_bm_self = %f\n", lp_bm_self); -// printf("lp_kk = %f\n", lp_kk); -// printf("lp_km = %f\n", lp_km); -// -// } } __syncthreads(); } From 5a203a40cbb5c3a65cf7123195ca7fa42cfdd4ad Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Wed, 4 Jul 2018 12:57:35 +0100 Subject: [PATCH 17/80] removed some print statements --- src/cuda_kernels/GpuAligner.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 247e7e35..fd5e3706 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -526,7 +526,6 @@ std::vector scoreKernel(std::vector sequences, float* returnedValues; cudaHostAlloc(&returnedValues, num_reads * sizeof(float) , cudaHostAllocDefault); - //printf("About to run getscores...\n"); getScores<<>>(eventMeansDev, eventsPerBaseDev, numRowsDev, From 458a84cc775c43b0f0f2cc06712af8df2e507063 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Wed, 4 Jul 2018 15:11:06 +0100 Subject: [PATCH 18/80] Sharing a lot more memory --- Makefile | 4 +- src/cuda_kernels/GpuAligner.cu | 172 +++++++++++++++++++-------------- 2 files changed, 99 insertions(+), 77 deletions(-) diff --git a/Makefile b/Makefile index ed4eccff..0c08f211 100644 --- a/Makefile +++ b/Makefile @@ -9,13 +9,13 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz -CXXFLAGS ?= -O3 #-g +CXXFLAGS ?= -O3# -g CXXFLAGS += -std=c++11 -fopenmp -fsigned-char CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include -O3 #-g +NVCCFLAGS ?= -std=c++11 -I. 
-I/usr/local/cuda-9.0/include -O3# -g CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index fd5e3706..2be67114 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -303,9 +303,9 @@ GpuAligner::GpuAligner() n[i] = i; } -std::vector scoreKernel(std::vector sequences, - std::vector event_sequences, - uint32_t alignment_flags){ +std::vector> scoreKernel(std::vector sequences, + std::vector event_sequences, + uint32_t alignment_flags){ // Extract the pore model. //Let's assume that every event sequence has the same pore model @@ -324,14 +324,11 @@ std::vector scoreKernel(std::vector sequences, size_t num_models = sequences.size(); double num_model_penalty = log(num_models); - assert(num_models == 1); //this is temporary + assert(num_models != 1); //this is temporary - auto sequence = sequences[0]; // temporary. We are only going to score one sequence against a set of events for now. + //auto sequence = sequences[0]; // temporary. We are only going to score one sequence against a set of events for now. 
const uint32_t k = event_sequences[0].pore_model->k; //k is the kmerity - uint32_t n_kmers = sequence.length() - k + 1; //number of kmers in the sequence - - uint32_t n_states = PSR9_NUM_STATES * (n_kmers + 2); // + 2 for explicit terminal states std::vector n_rows; //number of rows in the DP table (n_events + 1) std::vector e_starts; //event starts @@ -365,13 +362,6 @@ std::vector scoreKernel(std::vector sequences, post_flanks.push_back(post_flank); } - std::vector kmer_ranks(n_kmers); - std::vector kmer_ranks_rc(n_kmers); - for(size_t ki = 0; ki < n_kmers; ++ki) { - kmer_ranks[ki] = sequences[0].get_kmer_rank(ki, k, false); - kmer_ranks_rc[ki] = sequences[0].get_kmer_rank(ki, k, true); - } - // Prepare raw data and send it over to the score calculator kernel // Buffer 1: Raw event data and associated starts and stops @@ -383,11 +373,8 @@ std::vector scoreKernel(std::vector sequences, for (auto e: event_sequences){ size_t numEvents = e.read->events->size(); float readEventsPerBase = e.read->events_per_base[e.strand]; - //eventLengths.push_back(numEvents); eventsPerBase.push_back(readEventsPerBase); - - //numEventsTotal += numEvents; } @@ -461,7 +448,6 @@ std::vector scoreKernel(std::vector sequences, cudaMemcpyAsync( varDev, var.data(), var.size() * sizeof(float), cudaMemcpyHostToDevice ); cudaMemcpyAsync( logVarDev, log_var.data(), log_var.size() * sizeof(float), cudaMemcpyHostToDevice ); - float* poreModelLevelLogStdvDev; cudaMalloc( (void**)&poreModelLevelLogStdvDev, pore_model_level_log_stdv.size() * sizeof(float)); // for some reason this malloc is slow cudaMemcpyAsync( poreModelLevelLogStdvDev, pore_model_level_log_stdv.data(), pore_model_level_log_stdv.size() * sizeof(float), cudaMemcpyHostToDevice ); @@ -495,13 +481,6 @@ std::vector scoreKernel(std::vector sequences, cudaMalloc( (void**)&numRowsDev, n_rows.size() * sizeof(int)); cudaMemcpyAsync( numRowsDev, n_rows.data(), n_rows.size() * sizeof(int), cudaMemcpyHostToDevice ); - int* kmerRanksDev; - int* 
kmerRanksRCDev; - cudaMalloc( (void**)&kmerRanksDev, kmer_ranks.size() * sizeof(int)); - cudaMalloc( (void**)&kmerRanksRCDev, kmer_ranks_rc.size() * sizeof(int)); - cudaMemcpyAsync( kmerRanksDev, kmer_ranks.data(), kmer_ranks.size() * sizeof(int), cudaMemcpyHostToDevice ); - cudaMemcpyAsync( kmerRanksRCDev, kmer_ranks_rc.data(), kmer_ranks_rc.size() * sizeof(int), cudaMemcpyHostToDevice ); - int* eventStartsDev; cudaMalloc( (void**)&eventStartsDev, e_starts.size() * sizeof(int)); cudaMemcpyAsync( eventStartsDev, e_starts.data(), e_starts.size() * sizeof(int), cudaMemcpyHostToDevice ); @@ -514,39 +493,78 @@ std::vector scoreKernel(std::vector sequences, cudaMalloc( (void**)&eventOffsetsDev, eventOffsets.size() * sizeof(int)); cudaMemcpyAsync( eventOffsetsDev, eventOffsets.data(), eventOffsets.size() * sizeof(int), cudaMemcpyHostToDevice ); - int num_blocks = n_states / PSR9_NUM_STATES; - uint32_t num_kmers = num_blocks - 2; // two terminal blocks. Not currently used but left here for now. - - dim3 dimBlock(num_blocks - 2); // One thread per state, not including Start and Terminal state. 
- dim3 dimGrid(num_reads); // let's look at only the first read - float * returnValuesDev; cudaMalloc((void **) &returnValuesDev, sizeof(float) * num_reads); //one score per read float* returnedValues; cudaHostAlloc(&returnedValues, num_reads * sizeof(float) , cudaHostAllocDefault); - getScores<<>>(eventMeansDev, - eventsPerBaseDev, - numRowsDev, - eventStartsDev, - eventStridesDev, - kmerRanksDev, - kmerRanksRCDev, - eventOffsetsDev, - poreModelLevelLogStdvDev, - poreModelLevelStdvDev, - poreModelLevelMeanDev, - scaleDev, - shiftDev, - varDev, - logVarDev, - preFlankingDev, - postFlankingDev, - returnValuesDev); - - cudaDeviceSynchronize(); - cudaMemcpyAsync(returnedValues, returnValuesDev, num_reads *sizeof(float), cudaMemcpyDeviceToHost); + uint8_t num_streams = sequences.size(); + cudaStream_t streams[num_streams]; + //float *data[num_streams]; + + + std::vector> results(sequences.size()); + for (int i =0; i kmer_ranks(n_kmers); + std::vector kmer_ranks_rc(n_kmers); + + for(size_t ki = 0; ki < n_kmers; ++ki) { + kmer_ranks[ki] = sequence.get_kmer_rank(ki, k, false); + kmer_ranks_rc[ki] = sequence.get_kmer_rank(ki, k, true); + } + + int num_blocks = n_states / PSR9_NUM_STATES; + uint32_t num_kmers = num_blocks - 2; // two terminal blocks. Not currently used but left here for now. + + dim3 dimBlock(num_blocks - 2); // One thread per state, not including Start and Terminal state. 
+ dim3 dimGrid(num_reads); // let's look at only the first read + + int *kmerRanksDev; + int *kmerRanksRCDev; + cudaMalloc((void **) &kmerRanksDev, kmer_ranks.size() * sizeof(int)); + cudaMalloc((void **) &kmerRanksRCDev, kmer_ranks_rc.size() * sizeof(int)); + cudaMemcpyAsync(kmerRanksDev, kmer_ranks.data(), kmer_ranks.size() * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpyAsync(kmerRanksRCDev, kmer_ranks_rc.data(), kmer_ranks_rc.size() * sizeof(int), + cudaMemcpyHostToDevice); + + getScores <<< dimGrid, dimBlock, 0>>> (eventMeansDev, + eventsPerBaseDev, + numRowsDev, + eventStartsDev, + eventStridesDev, + kmerRanksDev, + kmerRanksRCDev, + eventOffsetsDev, + poreModelLevelLogStdvDev, + poreModelLevelStdvDev, + poreModelLevelMeanDev, + scaleDev, + shiftDev, + varDev, + logVarDev, + preFlankingDev, + postFlankingDev, + returnValuesDev); + + cudaDeviceSynchronize(); + cudaMemcpyAsync(returnedValues, returnValuesDev, num_reads *sizeof(float), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + + cudaFree(kmerRanksDev); + cudaFree(kmerRanksRCDev); + + //Send all the scores back + //std::vector r(num_reads); + results[i].resize(num_reads); + for(int readIdx=0; readIdx scoreKernel(std::vector sequences, cudaFree(numRowsDev); cudaFree(eventStartsDev); cudaFree(eventStridesDev); - cudaFree(kmerRanksDev); - cudaFree(kmerRanksRCDev); cudaFree(eventOffsetsDev); cudaFree(poreModelLevelLogStdvDev); cudaFree(poreModelLevelStdvDev); @@ -569,14 +585,11 @@ std::vector scoreKernel(std::vector sequences, //Free host memory cudaFreeHost(eventMeans); + cudaFreeHost(preFlankingHost); + cudaFreeHost(postFlankingHost); + cudaFreeHost(returnedValues); - //Send all the scores back - std::vector r(num_reads); - for(int i=0; i GpuAligner::variantScoresThresholded(std::vector input_variants, @@ -596,31 +609,40 @@ std::vector GpuAligner::variantScoresThresholded(std::vector in } // Make methylated versions of each input sequence. 
Once for the base haplotype and once each for each variant - std::vector base_sequences = generate_methylated_alternatives(base_haplotype.get_sequence(), - methylation_types); - std::vector> variant_sequences; + + std::vector sequences; + + HMMInputSequence base_sequence = generate_methylated_alternatives(base_haplotype.get_sequence(), + methylation_types)[0]; //TODO: always 0? + + sequences.push_back(base_sequence); + + //std::vector> variant_sequences; for (auto v: variant_haplotypes){ - auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types); - variant_sequences.push_back(variant_sequence); + auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types)[0]; + sequences.push_back(variant_sequence); } - assert(base_sequences.size() == 1); + //assert(base_sequences.size() == 1); // return the sum of the score for the base sequences over all the event sequences - auto base_scores = scoreKernel(base_sequences, event_sequences, alignment_flags); + //auto base_scores = scoreKernel(base_sequences, event_sequences, alignment_flags); + + std::vector> scores = scoreKernel(sequences, event_sequences, alignment_flags); - std::vector v(variant_sequences.size()); - for (int i=0; i v(numVariants); // Thresholded score for each //(variant_sequences.size()); //TODO: Fix - temporary + uint32_t numScores = scores[0].size(); + for (int variantIndex=0; variantIndex Date: Thu, 5 Jul 2018 17:55:35 +0100 Subject: [PATCH 19/80] Kernel now fast but some numerical errors remain --- Makefile | 8 +- src/cuda_kernels/GpuAligner.cu | 309 +++++++++++++++---------------- src/cuda_kernels/GpuAligner.h | 44 ++++- src/nanopolish_call_variants.cpp | 9 +- 4 files changed, 192 insertions(+), 178 deletions(-) diff --git a/Makefile b/Makefile index 0c08f211..9494ceb5 100644 --- a/Makefile +++ b/Makefile @@ -9,13 +9,13 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali #Basic flags every build needs 
LIBS=-lz -CXXFLAGS ?= -O3# -g +CXXFLAGS ?= -O3 -g CXXFLAGS += -std=c++11 -fopenmp -fsigned-char CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include -O3# -g +NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include -O3 --default-stream per-thread -g -G CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code @@ -69,8 +69,10 @@ EIGEN_INCLUDE=-I./eigen/ # Include the src subdirectories NP_INCLUDE=$(addprefix -I./, $(SUBDIRS)) +CUDA_INCLUDE=-I/usr/local/cuda-9.0/include + # Add include flags -CPPFLAGS += $(H5_INCLUDE) $(HTS_INCLUDE) $(FAST5_INCLUDE) $(NP_INCLUDE) $(EIGEN_INCLUDE) +CPPFLAGS += $(H5_INCLUDE) $(HTS_INCLUDE) $(FAST5_INCLUDE) $(NP_INCLUDE) $(EIGEN_INCLUDE) $(CUDA_INCLUDE) # Main programs to build PROGRAM=nanopolish diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 2be67114..225effd9 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -4,7 +4,7 @@ #include #include "nanopolish_profile_hmm_r9.h" -#define MAX_STATES 1024 +#define MAX_STATES 128 __device__ float logsumexpf(float x, float y){ if(x == -INFINITY && y == -INFINITY){ @@ -65,7 +65,7 @@ __global__ void getScores(float * eventData, float * returnValues) { // Initialise the prev probability row, which is the row of the DP table - int n_kmers = blockDim.x; // Question: How does this deal with the case where the block is bigger than the sequence, such as if one variant is a deletion? + int n_kmers = blockDim.x; int n_states = n_kmers * PSR9_NUM_STATES + 2 * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. 
//initialise the return value @@ -98,7 +98,6 @@ __global__ void getScores(float * eventData, if (rc == true) { rank = kmer_ranks_rc[kmerIdx]; - //printf("Using an RC rank of %i\n", rank); }else{ rank = kmer_ranks[kmerIdx]; } @@ -231,7 +230,7 @@ __global__ void getScores(float * eventData, sum += lp_emission_b; float newBadEventScore = sum; - + __syncthreads(); // Write row out. prevProbabilities now becomes "current probabilities" for evaluating skips. prevProbabilities[curBlockOffset + PSR9_MATCH] = newMatchScore; prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] = newBadEventScore; @@ -295,24 +294,111 @@ __global__ void getScores(float * eventData, } +//Default constructor GpuAligner::GpuAligner() { - y = 20; - asize = y*sizeof(int); - for (int i=0; i> scoreKernel(std::vector sequences, - std::vector event_sequences, - uint32_t alignment_flags){ +//Destructor +GpuAligner::~GpuAligner() { + cudaFree(poreModelLevelMeanDev); + cudaFree(scaleDev); + cudaFree(shiftDev); + cudaFree(varDev); + cudaFree(logVarDev); + cudaFree(eventMeansDev); + cudaFree(eventsPerBaseDev); + cudaFree(numRowsDev); + cudaFree(eventStartsDev); + cudaFree(eventStridesDev); + cudaFree(eventOffsetsDev); + cudaFree(poreModelLevelLogStdvDev); + cudaFree(poreModelLevelStdvDev); + cudaFree(preFlankingDev); + cudaFree(postFlankingDev); - // Extract the pore model. - //Let's assume that every event sequence has the same pore model - //event_sequences[0].pore_model. 
+ cudaFreeHost(eventMeans); + cudaFreeHost(preFlankingHost); + cudaFreeHost(postFlankingHost); - int num_reads = event_sequences.size(); - // These asserts are here during the development phase + int max_num_sequences = 8; // should be a private variable + // Free device and host memory + for (int i =0; i> GpuAligner::scoreKernel(std::vector sequences, + std::vector event_sequences, + uint32_t alignment_flags){ + // pre-running asserts assert(!sequences.empty()); assert(std::string(sequences[0].get_alphabet()->get_name()) == "nucleotide"); for (auto e: event_sequences) { @@ -321,22 +407,21 @@ std::vector> scoreKernel(std::vector seque assert( (e.rc && e.event_stride == -1) || (!e.rc && e.event_stride == 1)); } - size_t num_models = sequences.size(); - double num_model_penalty = log(num_models); - - assert(num_models != 1); //this is temporary - - //auto sequence = sequences[0]; // temporary. We are only going to score one sequence against a set of events for now. - - const uint32_t k = event_sequences[0].pore_model->k; //k is the kmerity + int num_reads = event_sequences.size(); - std::vector n_rows; //number of rows in the DP table (n_events + 1) - std::vector e_starts; //event starts - std::vector event_strides; + // Extract the pore model. + // Assume that every event sequence has the same pore model + // event_sequences[0].pore_model. 
+ const uint32_t k = event_sequences[0].pore_model->k; //k is the length of a kmer + std::vector n_rows; //number of rows in the DP table (n_events + 1) for each read + std::vector e_starts; //event starts in the read for each read + std::vector event_strides; //event strides for each read std::vector> pre_flanks; std::vector> post_flanks; + std::vector eventsPerBase; + //Populate per-read vectors int numEventsTotal = 0; for(auto e: event_sequences){ uint32_t e_start = e.event_start_idx; @@ -352,53 +437,30 @@ std::vector> scoreKernel(std::vector seque else n_events = e_start - e_end + 1; - n_rows.push_back(n_events + 1); - numEventsTotal += n_events + 1; // TODO: is +1 necessary? + // TODO: is a +1 necessary here? + n_rows.push_back(n_events); + numEventsTotal += n_events; std::vector pre_flank = make_pre_flanking(e, e_start, n_events); std::vector post_flank = make_post_flanking(e, e_start, n_events); pre_flanks.push_back(pre_flank); post_flanks.push_back(post_flank); - } - - // Prepare raw data and send it over to the score calculator kernel - - // Buffer 1: Raw event data and associated starts and stops - // size_t numEventsTotal; - //1. 
Count the total number of events across all reads - //std::vector eventLengths; - std::vector eventsPerBase; - for (auto e: event_sequences){ - size_t numEvents = e.read->events->size(); float readEventsPerBase = e.read->events_per_base[e.strand]; - //eventLengths.push_back(numEvents); eventsPerBase.push_back(readEventsPerBase); } - - //Allocate a host buffer to store the event means, pre and post-flank data - float * eventMeans; - size_t eventMeansSize = numEventsTotal * sizeof(float); - cudaHostAlloc(&eventMeans, eventMeansSize , cudaHostAllocDefault); - - //Allocate a host buffer to store the event means, pre and post-flank data - float * preFlankingHost; - float * postFlankingHost; - cudaHostAlloc(&preFlankingHost, numEventsTotal * sizeof(float) , cudaHostAllocDefault); - cudaHostAlloc(&postFlankingHost, numEventsTotal * sizeof(float) , cudaHostAllocDefault); - + //Populate buffers for flanks and scaled means data std::vector eventOffsets; size_t offset = 0; - for(int j=0;jget_drift_scaled_level(event_idx, ev.strand); // send the data in drift scaled - //auto unscaled = ev.read->events[0][i].mean; //taking the first element. Not sure what the second one is.. + auto scaled = e.read->get_drift_scaled_level(event_idx, e.strand); // send the data in drift scaled eventMeans[offset + i] = scaled; preFlankingHost[offset + i] = pre_flanks[j][i]; //also copy over the pre-flanking data, since it has a 1-1 correspondence with events postFlankingHost[offset + i] = post_flanks[j][i]; //also copy over the pre-flanking data, since it has a 1-1 correspondence with events @@ -408,23 +470,22 @@ std::vector> scoreKernel(std::vector seque int num_states = event_sequences[0].pore_model->states.size(); + // Populate pore model buffers std::vector pore_model_level_log_stdv(num_states); std::vector pore_model_level_mean(num_states); std::vector pore_model_level_stdv(num_states); - - //TODO: Fix this. for(int st=0; ststates[st]; //TODO: Is this OK? 
+ auto params = event_sequences[0].pore_model->states[st]; pore_model_level_log_stdv[st] = params.level_log_stdv; pore_model_level_mean[st] = params.level_mean; pore_model_level_stdv[st] = params.level_stdv; } + //Populating read-statistics buffers std::vector scale(num_reads); std::vector shift(num_reads); std::vector var(num_reads); std::vector log_var(num_reads); - for (int i=0;iscalings[read.strand].scale; @@ -433,79 +494,31 @@ std::vector> scoreKernel(std::vector seque log_var[i] = event_sequences[i].read->scalings[read.strand].log_var; } - float* scaleDev; - float* shiftDev; - float* varDev; - float* logVarDev; - - cudaMalloc( (void**)&scaleDev, scale.size() * sizeof(float)); - cudaMalloc( (void**)&shiftDev, shift.size() * sizeof(float)); - cudaMalloc( (void**)&varDev, var.size() * sizeof(float)); - cudaMalloc( (void**)&logVarDev, log_var.size() * sizeof(float)); - + // Copy to the device all buffers shared across kmer sequences. cudaMemcpyAsync( scaleDev, scale.data(), scale.size() * sizeof(float), cudaMemcpyHostToDevice ); cudaMemcpyAsync( shiftDev, shift.data(), shift.size() * sizeof(float), cudaMemcpyHostToDevice ); cudaMemcpyAsync( varDev, var.data(), var.size() * sizeof(float), cudaMemcpyHostToDevice ); cudaMemcpyAsync( logVarDev, log_var.data(), log_var.size() * sizeof(float), cudaMemcpyHostToDevice ); - - float* poreModelLevelLogStdvDev; - cudaMalloc( (void**)&poreModelLevelLogStdvDev, pore_model_level_log_stdv.size() * sizeof(float)); // for some reason this malloc is slow cudaMemcpyAsync( poreModelLevelLogStdvDev, pore_model_level_log_stdv.data(), pore_model_level_log_stdv.size() * sizeof(float), cudaMemcpyHostToDevice ); - - float* poreModelLevelMeanDev; - cudaMalloc( (void**)&poreModelLevelMeanDev, pore_model_level_mean.size() * sizeof(float)); cudaMemcpyAsync( poreModelLevelMeanDev, pore_model_level_mean.data(), pore_model_level_mean.size() * sizeof(float), cudaMemcpyHostToDevice ); - - float* poreModelLevelStdvDev; - cudaMalloc( 
(void**)&poreModelLevelStdvDev, pore_model_level_stdv.size() * sizeof(float)); cudaMemcpyAsync( poreModelLevelStdvDev, pore_model_level_stdv.data(), pore_model_level_stdv.size() * sizeof(float), cudaMemcpyHostToDevice ); - - - float* eventsPerBaseDev; - cudaMalloc( (void**)&eventsPerBaseDev, eventsPerBase.size() * sizeof(float)); cudaMemcpyAsync( eventsPerBaseDev, eventsPerBase.data(), eventsPerBase.size() * sizeof(float), cudaMemcpyHostToDevice ); - - float* eventMeansDev; - cudaMalloc( (void**)&eventMeansDev, eventMeansSize); - cudaMemcpyAsync( eventMeansDev, eventMeans, eventMeansSize, cudaMemcpyHostToDevice ); //malloc is taking 300us - - float* preFlankingDev; - cudaMalloc( (void**)&preFlankingDev, eventMeansSize); - cudaMemcpyAsync( preFlankingDev, preFlankingHost, eventMeansSize, cudaMemcpyHostToDevice ); //malloc is taking 300us - - float* postFlankingDev; - cudaMalloc( (void**)&postFlankingDev, eventMeansSize); - cudaMemcpyAsync( postFlankingDev, postFlankingHost, eventMeansSize, cudaMemcpyHostToDevice ); //malloc is taking 300us - - int* numRowsDev; - cudaMalloc( (void**)&numRowsDev, n_rows.size() * sizeof(int)); + cudaMemcpyAsync( eventMeansDev, eventMeans, numEventsTotal * sizeof(float), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( preFlankingDev, preFlankingHost, numEventsTotal * sizeof(float), cudaMemcpyHostToDevice ); + cudaMemcpyAsync( postFlankingDev, postFlankingHost, numEventsTotal * sizeof(float), cudaMemcpyHostToDevice ); cudaMemcpyAsync( numRowsDev, n_rows.data(), n_rows.size() * sizeof(int), cudaMemcpyHostToDevice ); - - int* eventStartsDev; - cudaMalloc( (void**)&eventStartsDev, e_starts.size() * sizeof(int)); cudaMemcpyAsync( eventStartsDev, e_starts.data(), e_starts.size() * sizeof(int), cudaMemcpyHostToDevice ); - - int* eventStridesDev; - cudaMalloc( (void**)&eventStridesDev, event_strides.size() * sizeof(int)); cudaMemcpyAsync( eventStridesDev, event_strides.data(), event_strides.size() * sizeof(int), cudaMemcpyHostToDevice ); - - int* 
eventOffsetsDev; - cudaMalloc( (void**)&eventOffsetsDev, eventOffsets.size() * sizeof(int)); cudaMemcpyAsync( eventOffsetsDev, eventOffsets.data(), eventOffsets.size() * sizeof(int), cudaMemcpyHostToDevice ); - float * returnValuesDev; - cudaMalloc((void **) &returnValuesDev, sizeof(float) * num_reads); //one score per read - - float* returnedValues; - cudaHostAlloc(&returnedValues, num_reads * sizeof(float) , cudaHostAllocDefault); + uint8_t MAX_NUM_KMERS = 100; - uint8_t num_streams = sequences.size(); - cudaStream_t streams[num_streams]; - //float *data[num_streams]; + for (int i =0; i> results(sequences.size()); - for (int i =0; i> scoreKernel(std::vector seque kmer_ranks_rc[ki] = sequence.get_kmer_rank(ki, k, true); } + assert(kmer_ranks.size() < MAX_NUM_KMERS); + cudaMemcpyAsync(kmerRanksDev, kmer_ranks.data(), kmer_ranks.size() * sizeof(int), + cudaMemcpyHostToDevice); + cudaMemcpyAsync(kmerRanksRCDev, kmer_ranks_rc.data(), kmer_ranks_rc.size() * sizeof(int), + cudaMemcpyHostToDevice); + int num_blocks = n_states / PSR9_NUM_STATES; uint32_t num_kmers = num_blocks - 2; // two terminal blocks. Not currently used but left here for now. dim3 dimBlock(num_blocks - 2); // One thread per state, not including Start and Terminal state. 
dim3 dimGrid(num_reads); // let's look at only the first read - int *kmerRanksDev; - int *kmerRanksRCDev; - cudaMalloc((void **) &kmerRanksDev, kmer_ranks.size() * sizeof(int)); - cudaMalloc((void **) &kmerRanksRCDev, kmer_ranks_rc.size() * sizeof(int)); - cudaMemcpyAsync(kmerRanksDev, kmer_ranks.data(), kmer_ranks.size() * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpyAsync(kmerRanksRCDev, kmer_ranks_rc.data(), kmer_ranks_rc.size() * sizeof(int), - cudaMemcpyHostToDevice); - - getScores <<< dimGrid, dimBlock, 0>>> (eventMeansDev, + getScores <<< dimGrid, dimBlock, 0, streams[i]>>> (eventMeansDev, eventsPerBaseDev, numRowsDev, eventStartsDev, @@ -550,45 +561,19 @@ std::vector> scoreKernel(std::vector seque preFlankingDev, postFlankingDev, returnValuesDev); + } - cudaDeviceSynchronize(); - cudaMemcpyAsync(returnedValues, returnValuesDev, num_reads *sizeof(float), cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); - - cudaFree(kmerRanksDev); - cudaFree(kmerRanksRCDev); + cudaDeviceSynchronize(); - //Send all the scores back - //std::vector r(num_reads); - results[i].resize(num_reads); - for(int readIdx=0; readIdx> results(sequences.size()); + for (int i =0; i +#include +#include #ifndef GPU_ALIGNER_H #define GPU_ALIGNER_H1 @@ -44,18 +46,42 @@ class GpuAligner { public: - int n[20]; - int y; - int asize; - GpuAligner(); - int calculateSum(); - void setY(int); + ~GpuAligner(); std::vector variantScoresThresholded(std::vector tmp_variants, Haplotype haplotype, std::vector event_sequences, - uint32_t alignment_flags, int screen_score_threshold, std::vector methylation_types);// { - //return std::vector(); - //} + uint32_t alignment_flags, int screen_score_threshold, std::vector methylation_types); + + std::vector> scoreKernel(std::vector sequences, + std::vector event_sequences, + uint32_t alignment_flags); +private: + float* poreModelLevelMeanDev; + float* scaleDev; + float* shiftDev; + float* varDev; + float* logVarDev; + float * eventMeans; + float * 
preFlankingHost; + float * postFlankingHost; + int* eventOffsetsDev; + int* eventStridesDev; + int* eventStartsDev; + int* numRowsDev; + float* postFlankingDev; + float* preFlankingDev; + float* eventMeansDev; + float* eventsPerBaseDev; + float* poreModelLevelStdvDev; + float* poreModelLevelLogStdvDev; + // Allocate arrays for storing results, kmerRanksDev and kmerRanksRCDev + + std::vector kmerRanksDevPointers; + std::vector kmerRanksRCDevPointers; + std::vector returnValuesDevResultsPointers; + std::vector returnValuesHostResultsPointers; + + cudaStream_t streams[8]; // TODO 8 should not be hardcoded here }; #endif // GPU_ALIGNER_H \ No newline at end of file diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index ee3e6d05..93f16d9b 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -130,7 +130,7 @@ namespace opt static int min_flanking_sequence = 30; static int max_haplotypes = 1000; static int max_rounds = 50; - static int screen_score_threshold = 100; + static int screen_score_threshold = 1000; static int screen_flanking_sequence = 10; static int debug_alignments = 0; static std::vector methylation_types; @@ -294,6 +294,8 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali auto scoring = std::chrono::high_resolution_clock::now() - std::chrono::high_resolution_clock::now(); auto gpu_exec = std::chrono::high_resolution_clock::now() - std::chrono::high_resolution_clock::now(); + GpuAligner aligner; + for(size_t i = region_start; i < region_end; ++i) { int calling_start = i - opt::screen_flanking_sequence; @@ -347,12 +349,11 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali calling_start, alignments.get_reference_substring(contig, calling_start, calling_end)); - GpuAligner aligner; auto t0_gpu = std::chrono::high_resolution_clock::now(); // get the scaled levels. 
std::vector scores = aligner.variantScoresThresholded(tmp_variants, test_haplotype, event_sequences, - alignment_flags, 100000,//opt::screen_score_threshold, + alignment_flags, opt::screen_score_threshold, opt::methylation_types); auto tf_gpu = std::chrono::high_resolution_clock::now(); gpu_exec += tf_gpu - t0_gpu; @@ -363,7 +364,7 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali test_haplotype, event_sequences, alignment_flags, - 100000,//opt::screen_score_threshold, + opt::screen_score_threshold, opt::methylation_types); auto t1 = std::chrono::high_resolution_clock::now(); scoring += t1-t0; From 348fcf01717c78492e443b51c5e33e9b17210604 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Fri, 6 Jul 2018 15:43:35 +0100 Subject: [PATCH 20/80] Fixed bug which was causing incorrect forward strand results --- Makefile | 4 +- src/cuda_kernels/GpuAligner.cu | 96 ++++++++++++++++++++------- src/hmm/nanopolish_profile_hmm_r9.inl | 11 ++- 3 files changed, 83 insertions(+), 28 deletions(-) diff --git a/Makefile b/Makefile index 9494ceb5..aaf8cbc2 100644 --- a/Makefile +++ b/Makefile @@ -9,13 +9,13 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz -CXXFLAGS ?= -O3 -g +CXXFLAGS ?= -O0 -g CXXFLAGS += -std=c++11 -fopenmp -fsigned-char CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include -O3 --default-stream per-thread -g -G +NVCCFLAGS ?= -std=c++11 -I. 
-I/usr/local/cuda-9.0/include -O0 --default-stream per-thread -g -G CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 225effd9..8f9577b0 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -64,21 +64,37 @@ __global__ void getScores(float * eventData, float * postFlankingDev, float * returnValues) { + bool debug = false; + if(threadIdx.x==0 && blockIdx.x==0){ + debug=true; + } // Initialise the prev probability row, which is the row of the DP table + int n_kmers = blockDim.x; int n_states = n_kmers * PSR9_NUM_STATES + 2 * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. - //initialise the return value + //initialise the return value// Better to do this in a register returnValues[blockIdx.x] = -INFINITY; + __syncthreads(); __shared__ float prevProbabilities[MAX_STATES]; - // Initialise the previous probabilities + // Initialise the previous probabilities - this may not be quite correct as the intialization is different to the C++ version but I don't think it matter for (int i = 0; i < n_states - PSR9_NUM_STATES; i++) { prevProbabilities[i] = -INFINITY; } for (int i = n_states - PSR9_NUM_STATES; i < n_states; i++) { - prevProbabilities[i] = 0; // Is this correct? + prevProbabilities[i] = 0.0f; // Is this correct? + } + + if(debug==true){ + printf("Number of kmers is: %i\n", n_kmers); + printf("n_states is: %i\n", n_states); + printf("***\n"); + printf("Prev probabilities row has been intialised to: \n"); + for (int i = 0; i < n_states; i++) { + printf("Element %i = %f\n", i, prevProbabilities[i]); + } } //Step 1: calculate transitions. For now we are going to use external params. @@ -87,11 +103,12 @@ __global__ void getScores(float * eventData, int numRows = numRowsPerRead[readIdx]; // Number of rows in this DP table. 
int e_start = eventStarts[readIdx]; // Event start for read int e_stride = eventStrides[readIdx]; + int e_offset = eventOffsets[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event + bool rc = false; if (e_stride == -1){ rc = true; } - int e_offset = eventOffsets[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event int kmerIdx = threadIdx.x; uint32_t rank; @@ -154,17 +171,31 @@ __global__ void getScores(float * eventData, float var = varDev[readIdx]; float logVar = logVarDev[readIdx]; - for(int row=1; row> GpuAligner::scoreKernel(std::vectork; //k is the length of a kmer - std::vector n_rows; //number of rows in the DP table (n_events + 1) for each read + std::vector n_rows; //number of rows in the DP table (n_events) for each read std::vector e_starts; //event starts in the read for each read std::vector event_strides; //event strides for each read std::vector> pre_flanks; @@ -468,17 +513,17 @@ std::vector> GpuAligner::scoreKernel(std::vectorstates.size(); - // Populate pore model buffers + // Assume that every event sequence has the same pore model + int num_states = event_sequences[0].pore_model->states.size(); std::vector pore_model_level_log_stdv(num_states); std::vector pore_model_level_mean(num_states); std::vector pore_model_level_stdv(num_states); for(int st=0; ststates[st]; - pore_model_level_log_stdv[st] = params.level_log_stdv; - pore_model_level_mean[st] = params.level_mean; + pore_model_level_log_stdv[st] = params.level_log_stdv; //TODO: I am seeing level log stdv and level stdv return the same value. need to investigate this. 
pore_model_level_stdv[st] = params.level_stdv; + pore_model_level_mean[st] = params.level_mean; } //Populating read-statistics buffers @@ -511,12 +556,13 @@ std::vector> GpuAligner::scoreKernel(std::vector lp_emission_m %f\n", lp_emission_m); + float lp_emission_b = BAD_EVENT_PENALTY; HMMUpdateScores scores; @@ -376,6 +380,11 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, // printf("bt.lp_km %f\n", bt.lp_km); //} + if(row==1 && block == 1) { + printf("CPU> lp_emission_m %f\n", lp_emission_m); + printf("Rank is %i\n", rank); + } + output.update_cell(row, curr_block_offset + PSR9_MATCH, scores, lp_emission_m); // state PSR9_BAD_EVENT From 0719a9b61730ead964824f3d4810302ae70ae502 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Fri, 6 Jul 2018 16:05:07 +0100 Subject: [PATCH 21/80] tidyup --- src/common/nanopolish_variant.cpp | 1 - src/cuda_kernels/GpuAligner.cu | 55 +-------------------------- src/hmm/nanopolish_profile_hmm_r9.inl | 39 ------------------- 3 files changed, 2 insertions(+), 93 deletions(-) diff --git a/src/common/nanopolish_variant.cpp b/src/common/nanopolish_variant.cpp index 357c7fae..b73a6b2b 100644 --- a/src/common/nanopolish_variant.cpp +++ b/src/common/nanopolish_variant.cpp @@ -686,7 +686,6 @@ Variant score_variant_thresholded(const Variant& input_variant, if(fabs(total_score) < score_threshold) { // Calculate scores using the base nucleotide model - //printf("Working with input %i\n", j); double base_score = profile_hmm_score_set(base_sequences, input[j], alignment_flags); double variant_score = profile_hmm_score_set(variant_sequences, input[j], alignment_flags); diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 8f9577b0..82581f04 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -87,16 +87,6 @@ __global__ void getScores(float * eventData, prevProbabilities[i] = 0.0f; // Is this correct? 
} - if(debug==true){ - printf("Number of kmers is: %i\n", n_kmers); - printf("n_states is: %i\n", n_states); - printf("***\n"); - printf("Prev probabilities row has been intialised to: \n"); - for (int i = 0; i < n_states; i++) { - printf("Element %i = %f\n", i, prevProbabilities[i]); - } - } - //Step 1: calculate transitions. For now we are going to use external params. int readIdx = blockIdx.x; float read_events_per_base = readEventsPerBase[readIdx]; @@ -171,15 +161,6 @@ __global__ void getScores(float * eventData, float var = varDev[readIdx]; float logVar = logVarDev[readIdx]; - if (debug==true){ - printf("Number of rows is : %i\n", numRows); - printf("Event data offset is : %i\n", e_offset); - printf("Event start is %i\n", e_start); - printf("Stride: %i\n", e_stride); - printf("RC: %d\n", rc); - printf("First Kmer (should be 6 something and *not* 295) %i\n", kmer_ranks[0]); - } - for(int row=1; row> GpuAligner::scoreKernel(std::vectorCPU e_start: %i\n", e_start); // Calculate number of blocks // A block of the HMM is a set of states for one kmer uint32_t num_blocks = output.get_num_columns() / PSR9_NUM_STATES; // num_columns is the number of HMM STATES @@ -304,7 +303,6 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, std::vector kmer_ranks(num_kmers); for(size_t ki = 0; ki < num_kmers; ++ki) { int kr = sequence.get_kmer_rank(ki, k, data.rc); // can * -1 here to see if 3rd is correct - //printf(">CPU Kmer rank: %i\n", kr); kmer_ranks[ki] = kr; } @@ -326,10 +324,6 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, // Fill in matrix for(uint32_t row = 1; row < output.get_num_rows(); row++) { - for (int col = 0; col< output.get_num_columns();col++){ - printf("Row = %i, col = %i, val = %f\n", row - 1, col, output.get(row -1,col)); - } - // Skip the first block which is the start state, it was initialized above // Similarily skip the last block, which is calculated in the terminate() function for(uint32_t 
block = 1; block < num_blocks - 1; block++) { @@ -369,22 +363,6 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, (event_idx == e_start || (flags & HAF_ALLOW_PRE_CLIP))) ? lp_sm + pre_flank[row - 1] : -INFINITY; - //if (row == 2) { - // printf("Working with matches in row 2\n"); - // printf("HMT_FROM_SOFT IS %f\n", scores.x[HMT_FROM_SOFT]); - // printf("Strand is %i\n", data.strand); - // printf("bt.lp_mm_self %f\n", bt.lp_mm_self); - // printf("bt.lp_mm_next %f\n", bt.lp_mm_next); - // printf("bt.lp_bm_self %f\n", bt.lp_bm_self); - // printf("bt.lp_bm_next %f\n", bt.lp_bm_next); - // printf("bt.lp_km %f\n", bt.lp_km); - //} - - if(row==1 && block == 1) { - printf("CPU> lp_emission_m %f\n", lp_emission_m); - printf("Rank is %i\n", rank); - } - output.update_cell(row, curr_block_offset + PSR9_MATCH, scores, lp_emission_m); // state PSR9_BAD_EVENT @@ -394,11 +372,8 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, scores.x[HMT_FROM_PREV_B] = -INFINITY; scores.x[HMT_FROM_PREV_K] = -INFINITY; scores.x[HMT_FROM_SOFT] = -INFINITY; - //printf("before: %f:\n", output.get(row, curr_block_offset + PSR9_BAD_EVENT)); output.update_cell(row, curr_block_offset + PSR9_BAD_EVENT, scores, lp_emission_b); - //printf("after: %f:\n", output.get(row, curr_block_offset + PSR9_BAD_EVENT)); - // in cu this is where the shared memory sync on prev states would go. // state PSR9_KMER_SKIP scores.x[HMT_FROM_SAME_M] = -INFINITY; scores.x[HMT_FROM_PREV_M] = bt.lp_mk + output.get(row, prev_block_offset + PSR9_MATCH); @@ -413,7 +388,6 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, // last kmer/event match. 
if(kmer_idx == last_kmer_idx && ( (flags & HAF_ALLOW_POST_CLIP) || row == last_event_row_idx)) { - //printf(">CPU Post-clip transition on row %i\n", row); float lp1 = lp_ms + output.get(row, curr_block_offset + PSR9_MATCH) + post_flank[row - 1]; float lp2 = lp_ms + output.get(row, curr_block_offset + PSR9_BAD_EVENT) + post_flank[row - 1]; float lp3 = lp_ms + output.get(row, curr_block_offset + PSR9_KMER_SKIP) + post_flank[row - 1]; @@ -421,12 +395,6 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, output.update_end(lp1, row, curr_block_offset + PSR9_MATCH); output.update_end(lp2, row, curr_block_offset + PSR9_BAD_EVENT); output.update_end(lp3, row, curr_block_offset + PSR9_KMER_SKIP); - - //printf(">LP1 %f\n", lp1); - //printf(">LP2 %f\n", lp2); - //printf(">LP3 %f\n", lp3); - //printf(">end %f\n", output.get_end()); - } @@ -464,13 +432,6 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, } } - for(uint32_t row = 1; row < output.get_num_rows(); row++) { - //for (int col=0; col Value for row %i and col %i is %f\n", row, col, output.get(row, col)); - // } - } - - return output.get_end(); } From 712e0685e1cc33472f223f6a120fd8c6f7b7dd01 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Fri, 6 Jul 2018 17:38:51 +0100 Subject: [PATCH 22/80] some performance improvments --- Makefile | 4 +-- src/cuda_kernels/GpuAligner.cu | 46 ++++++++++++++++++-------------- src/cuda_kernels/GpuAligner.h | 2 ++ src/nanopolish_call_variants.cpp | 3 --- 4 files changed, 30 insertions(+), 25 deletions(-) diff --git a/Makefile b/Makefile index aaf8cbc2..060645e9 100644 --- a/Makefile +++ b/Makefile @@ -9,13 +9,13 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz -CXXFLAGS ?= -O0 -g +CXXFLAGS ?= -O3 # -g CXXFLAGS += -std=c++11 -fopenmp -fsigned-char CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. 
-I/usr/local/cuda-9.0/include -O0 --default-stream per-thread -g -G +NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include -O3 -use_fast_math --default-stream per-thread #-g -G CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 82581f04..0dcabfd0 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -299,6 +299,8 @@ GpuAligner::GpuAligner() int maxEventsPerBase = 100; int totalEvents = maxEventsPerBase * max_num_reads; + poreModelInitialized = false; + cudaMalloc( (void**)&poreModelLevelMeanDev, numModelElements * sizeof(float)); cudaMalloc( (void**)&poreModelLevelLogStdvDev, numModelElements * sizeof(float)); cudaMalloc( (void**)&poreModelLevelStdvDev, numModelElements * sizeof(float)); @@ -493,9 +495,6 @@ std::vector> GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector> results(sequences.size()); - for (int i =0; i GpuAligner::variantScoresThresholded(std::vector in sequences.push_back(base_sequence); - //std::vector> variant_sequences; - for (auto v: variant_haplotypes){ auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types)[0]; sequences.push_back(variant_sequence); } - //assert(base_sequences.size() == 1); - - // return the sum of the score for the base sequences over all the event sequences - //auto base_scores = scoreKernel(base_sequences, event_sequences, alignment_flags); - std::vector> scores = scoreKernel(sequences, event_sequences, alignment_flags); - std::vector v(numVariants); // Thresholded score for each //(variant_sequences.size()); //TODO: Fix - temporary + std::vector v(numVariants); uint32_t numScores = scores[0].size(); - for (int variantIndex=0; variantIndex kmerRanksDevPointers; diff --git 
a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 93f16d9b..ad24b5be 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -280,7 +280,6 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali int region_end, uint32_t alignment_flags) { - std::cout << "CHECKPOINT 13" << std::endl; auto start = std::chrono::high_resolution_clock::now(); std::vector out_variants; @@ -375,8 +374,6 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali } } - std::cout << "CHECKPOINT 14 - Region end - start ength= " << region_end - region_start << std::endl; - auto end = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast( end - start ).count(); From d6be1c617c6a2034120fd3bd35efaf2ae79f4abb Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Mon, 9 Jul 2018 17:18:14 +0100 Subject: [PATCH 23/80] Fix error and tidy up --- Makefile | 4 +- src/common/nanopolish_variant.cpp | 2 +- src/cuda_kernels/GpuAligner.cu | 36 +++++++-------- src/cuda_kernels/GpuAligner.h | 2 +- src/main/nanopolish.cpp | 2 - src/nanopolish_call_variants.cpp | 73 ++++++++++++------------------- 6 files changed, 51 insertions(+), 68 deletions(-) diff --git a/Makefile b/Makefile index 060645e9..5adcd6eb 100644 --- a/Makefile +++ b/Makefile @@ -9,13 +9,13 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz -CXXFLAGS ?= -O3 # -g +CXXFLAGS ?= -O3 CXXFLAGS += -std=c++11 -fopenmp -fsigned-char CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include -O3 -use_fast_math --default-stream per-thread #-g -G +NVCCFLAGS ?= -std=c++11 -I. 
-I/usr/local/cuda-9.0/include -O3 -use_fast_math --default-stream per-thread CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code diff --git a/src/common/nanopolish_variant.cpp b/src/common/nanopolish_variant.cpp index b73a6b2b..725a62ab 100644 --- a/src/common/nanopolish_variant.cpp +++ b/src/common/nanopolish_variant.cpp @@ -664,7 +664,7 @@ std::vector multi_call(VariantGroup& variant_group, // Variant score_variant_thresholded(const Variant& input_variant, Haplotype base_haplotype, - const std::vector& input, // raw reads (I think) + const std::vector& input, const uint32_t alignment_flags, const uint32_t score_threshold, const std::vector& methylation_types) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 0dcabfd0..7489ec7c 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -29,9 +29,6 @@ __device__ float lp_match_r9(int rank, // STEP 1: GET DRIFT-SCALED LEVEL: float level = mean; - // TODO: Apply scaling to these 3 model values as is done in the CPP implementation - //these can just be pulled from the model - float gaussian_mean = scale * poreModelLevelMean[rank] + shift; float gaussian_stdv = poreModelLevelStdv[rank] * var; float gaussian_log_level_stdv = poreModelLevelLogStdv[rank] + logVar; @@ -399,6 +396,7 @@ std::vector> GpuAligner::scoreKernel(std::vectorget_name()) == "nucleotide"); for (auto e: event_sequences) { assert(std::string(e.pore_model->pmalphabet->get_name()) == "nucleotide"); @@ -585,7 +583,7 @@ std::vector> GpuAligner::scoreKernel(std::vector GpuAligner::variantScoresThresholded(std::vector input_variants, +std::vector GpuAligner::variantScoresThresholded(std::vector input_variants, Haplotype base_haplotype, std::vector event_sequences, uint32_t alignment_flags, @@ -606,29 +604,31 @@ std::vector GpuAligner::variantScoresThresholded(std::vector in std::vector sequences; 
HMMInputSequence base_sequence = generate_methylated_alternatives(base_haplotype.get_sequence(), - methylation_types)[0]; //TODO: always 0? + methylation_types)[0]; //TODO: fix for non-zero sequences.push_back(base_sequence); for (auto v: variant_haplotypes){ - auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types)[0]; + auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types)[0]; //TODO: fix for non-zero sequences.push_back(variant_sequence); } - std::vector> scores = scoreKernel(sequences, event_sequences, alignment_flags); - - std::vector v(numVariants); - - uint32_t numScores = scores[0].size(); - for (int variantIndex=0; variantIndex v = input_variants; + + if (!event_sequences.empty()) { + std::vector> scores = scoreKernel(sequences, event_sequences, alignment_flags); + uint32_t numScores = scores[0].size(); + for (int variantIndex = 0; variantIndex < numVariants; variantIndex++) { // index 0 is the base scores + double totalScore = 0.0; + for (int k = 0; k < numScores; k++) { + if (fabs(totalScore) < screen_score_threshold) { + double baseScore = scores[0][k]; + totalScore += (scores[variantIndex + 1][k] - baseScore); + } } + v[variantIndex].quality = totalScore; + v[variantIndex].info = ""; } - v[variantIndex] = totalScore; } return v; diff --git a/src/cuda_kernels/GpuAligner.h b/src/cuda_kernels/GpuAligner.h index 1f804fab..25df67a4 100644 --- a/src/cuda_kernels/GpuAligner.h +++ b/src/cuda_kernels/GpuAligner.h @@ -49,7 +49,7 @@ class GpuAligner GpuAligner(); ~GpuAligner(); - std::vector + std::vector variantScoresThresholded(std::vector tmp_variants, Haplotype haplotype, std::vector event_sequences, uint32_t alignment_flags, int screen_score_threshold, std::vector methylation_types); diff --git a/src/main/nanopolish.cpp b/src/main/nanopolish.cpp index cc6fcab7..d25df269 100644 --- a/src/main/nanopolish.cpp +++ b/src/main/nanopolish.cpp @@ -64,7 +64,6 @@ int main(int argc, char** 
argv) { // Turn off HDF's exception printing, which is generally unhelpful for users H5Eset_auto(0, NULL, NULL); - std::cout << "CHECKPOINT 1\n"; int ret = 0; if(argc <= 1) { @@ -75,7 +74,6 @@ int main(int argc, char** argv) std::string command(argv[1]); auto iter = programs.find(command); if (iter != programs.end()) { - std::cout << "CHECKPOINT 2: " << iter->first <second(argc - 1, argv + 1); } else diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index ad24b5be..1d9f147e 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -134,6 +134,8 @@ namespace opt static int screen_flanking_sequence = 10; static int debug_alignments = 0; static std::vector methylation_types; + static int gpu = 0; + } static const char* shortopts = "r:b:g:t:w:o:e:m:c:d:a:x:q:p:v"; @@ -145,6 +147,7 @@ enum { OPT_HELP = 1, OPT_SNPS_ONLY, OPT_CALC_ALL_SUPPORT, OPT_CONSENSUS, + OPT_GPU, OPT_FIX_HOMOPOLYMERS, OPT_GENOTYPE, OPT_MODELS_FOFN, @@ -181,6 +184,7 @@ static const struct option longopts[] = { { "p-bad", required_argument, NULL, OPT_P_BAD }, { "p-bad-self", required_argument, NULL, OPT_P_BAD_SELF }, { "consensus", required_argument, NULL, OPT_CONSENSUS }, + { "gpu", required_argument, NULL, OPT_GPU }, { "faster", no_argument, NULL, OPT_FASTER }, { "fix-homopolymers", no_argument, NULL, OPT_FIX_HOMOPOLYMERS }, { "calculate-all-support", no_argument, NULL, OPT_CALC_ALL_SUPPORT }, @@ -349,27 +353,30 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali alignments.get_reference_substring(contig, calling_start, calling_end)); auto t0_gpu = std::chrono::high_resolution_clock::now(); - // get the scaled levels. 
- - std::vector scores = aligner.variantScoresThresholded(tmp_variants, test_haplotype, event_sequences, - alignment_flags, opt::screen_score_threshold, - opt::methylation_types); - auto tf_gpu = std::chrono::high_resolution_clock::now(); - gpu_exec += tf_gpu - t0_gpu; - - for(const Variant& v : tmp_variants) { - auto t0 = std::chrono::high_resolution_clock::now(); - Variant scored_variant = score_variant_thresholded(v, - test_haplotype, - event_sequences, - alignment_flags, - opt::screen_score_threshold, - opt::methylation_types); - auto t1 = std::chrono::high_resolution_clock::now(); - scoring += t1-t0; - scored_variant.info = ""; - if(scored_variant.quality > 0) { - out_variants.push_back(scored_variant); + + if (opt::gpu){ + std::vector scoredVariants = aligner.variantScoresThresholded(tmp_variants, test_haplotype, event_sequences, + alignment_flags, opt::screen_score_threshold, + opt::methylation_types); + for (auto variant: scoredVariants){ + if (variant.quality > 0) { + out_variants.push_back(variant); + } + } + } else { + for (const Variant &v : tmp_variants) { + auto t0 = std::chrono::high_resolution_clock::now(); + Variant scored_variant = score_variant_thresholded(v, + test_haplotype, + event_sequences, + alignment_flags, + opt::screen_score_threshold, + opt::methylation_types); + auto t1 = std::chrono::high_resolution_clock::now(); + scored_variant.info = ""; + if (scored_variant.quality > 0) { + out_variants.push_back(scored_variant); + } } } } @@ -382,12 +389,6 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali auto gpu_screening = std::chrono::duration_cast(gpu_exec).count(); - std::cout << "FUNCTION TOOK " << duration << "ms" << std::endl; - std::cout << "SCREENING (CPU) COMPONENT TOOK " << screening << "ms" << std::endl; - std::cout << "SCREENING (GPU) COMPONENT TOOK " << gpu_screening << "ms" << std::endl; - - - return out_variants; } @@ -938,7 +939,6 @@ Haplotype call_variants_for_region(const std::string& contig, int 
region_start, alignments.get_region_start(), alignments.get_reference()); */ - std::cout<<"CHECKPOINT 8 - Data loaded"< candidate_variants; if(opt::candidates_file.empty()) { @@ -947,16 +947,12 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, candidate_variants = read_variants_for_region(opt::candidates_file, contig, region_start, region_end); } - std::cout<<"CHECKPOINT 9 - Candidate variants generated"< single_base_edits = generate_candidate_single_base_edits(alignments, region_start, region_end, alignment_flags); - std::cout<<"CHECKPOINT 11 - Single base edits generated"< dedup_set(candidate_variants.begin(), candidate_variants.end()); @@ -965,17 +961,12 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, std::sort(candidate_variants.begin(), candidate_variants.end(), sortByPosition); } - std::cout<<"CHECKPOINT 10 - Additional candidate variants generated"<> opt::num_threads; break; case 'v': opt::verbose++; break; case OPT_CONSENSUS: arg >> opt::consensus_output; opt::consensus_mode = 1; break; + case OPT_GPU: opt::gpu = 1; break; case OPT_FIX_HOMOPOLYMERS: opt::fix_homopolymers = 1; break; case OPT_EFFORT: arg >> opt::screen_score_threshold; break; case OPT_FASTER: opt::screen_score_threshold = 25; break; @@ -1168,7 +1160,6 @@ int call_variants_main(int argc, char** argv) int end_base; int contig_length = -1; - std::cout << "Checkpoint 3" << std::endl; // If a window has been specified, only call variants/polish in that range if(!opt::window.empty()) { // Parse the window string @@ -1204,8 +1195,6 @@ int call_variants_main(int argc, char** argv) out_fp = stdout; } - std::cout << "Checkpoint 4" << std::endl; - // Build the VCF header std::vector tag_fields; @@ -1240,14 +1229,10 @@ int call_variants_main(int argc, char** argv) Variant::make_vcf_tag_string("FORMAT", "GT", 1, "String", "Genotype")); - std::cout << "Checkpoint 5" << std::endl; - Variant::write_vcf_header(out_fp, tag_fields); Haplotype 
haplotype = call_variants_for_region(contig, start_base, end_base, out_fp); - std::cout << "Checkpoint 6" << std::endl; - if(out_fp != stdout) { fclose(out_fp); } From ca3af6e4b91c4bbeb14b043ce21625d7089db8a9 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Tue, 10 Jul 2018 13:26:42 +0100 Subject: [PATCH 24/80] tidy up --- src/cuda_kernels/GpuAligner.cu | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 7489ec7c..bedff299 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -25,20 +25,18 @@ __device__ float lp_match_r9(int rank, float logVar, bool debug = false){ - float log_inv_sqrt_2pi = log(0.3989422804014327); // no need to calculate this every time. better solutions available.. + float log_inv_sqrt_2pi = log(0.3989422804014327); - // STEP 1: GET DRIFT-SCALED LEVEL: float level = mean; float gaussian_mean = scale * poreModelLevelMean[rank] + shift; float gaussian_stdv = poreModelLevelStdv[rank] * var; float gaussian_log_level_stdv = poreModelLevelLogStdv[rank] + logVar; - // Step 3: calculate log-normal PDF - float a = (level - gaussian_mean) / gaussian_stdv; // g is the gaussian parameters + float a = (level - gaussian_mean) / gaussian_stdv; - float emission = log_inv_sqrt_2pi - gaussian_log_level_stdv + (-0.5f * a * a); // log_inv_sqrt_2pi is defined in a comment above + float emission = log_inv_sqrt_2pi - gaussian_log_level_stdv + (-0.5f * a * a); - return emission; // log_inv_sqrt_2pi is defined in a comment above + return emission; } @@ -288,13 +286,10 @@ __global__ void getScores(float * eventData, } -//Default constructor GpuAligner::GpuAligner() { int numModelElements = 4096; int max_num_reads = 300; - int maxEventsPerBase = 100; - int totalEvents = maxEventsPerBase * max_num_reads; poreModelInitialized = false; @@ -332,9 +327,6 @@ GpuAligner::GpuAligner() returnValuesDevResultsPointers.resize(max_num_sequences); 
returnValuesHostResultsPointers.resize(max_num_sequences); - uint8_t num_streams = max_num_sequences; - - for (int i =0; i> GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector GpuAligner::variantScoresThresholded(std::vector i } return v; -} +} \ No newline at end of file From 27fe62735a67c7e95619ccac2013b3231567f3f6 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Tue, 10 Jul 2018 14:02:14 +0100 Subject: [PATCH 25/80] small performance improvments --- src/cuda_kernels/GpuAligner.cu | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index bedff299..8a2714f7 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -64,13 +64,11 @@ __global__ void getScores(float * eventData, debug=true; } // Initialise the prev probability row, which is the row of the DP table - int n_kmers = blockDim.x; int n_states = n_kmers * PSR9_NUM_STATES + 2 * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. - //initialise the return value// Better to do this in a register - returnValues[blockIdx.x] = -INFINITY; - __syncthreads(); + __shared__ float returnValue; + returnValue = -INFINITY; __shared__ float prevProbabilities[MAX_STATES]; @@ -222,7 +220,7 @@ __global__ void getScores(float * eventData, sum += lp_emission_b; float newBadEventScore = sum; - __syncthreads(); + // Write row out. prevProbabilities now becomes "current probabilities" for evaluating skips. prevProbabilities[curBlockOffset + PSR9_MATCH] = newMatchScore; prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] = newBadEventScore; @@ -241,7 +239,6 @@ __global__ void getScores(float * eventData, sum = logsumexpf(sum, HMT_FROM_PREV_B); sum = logsumexpf(sum, HMT_FROM_PREV_K); sum = logsumexpf(sum, HMT_FROM_SOFT); - sum += 0.0; //No emission. redundant. 
float newSkipScore = sum; @@ -249,20 +246,21 @@ __global__ void getScores(float * eventData, __syncthreads(); //Now need to do the skip-skip transition, which is serial so for now letting one thread execute it. + if (threadIdx.x == 0){ - for (int blkidx=2; blkidx <= blockDim.x; blkidx++){ + int firstBlockIdx = 2; + float prevSkipScore; prevSkipScore = prevProbabilities[(firstBlockIdx - 1) * PSR9_NUM_STATES + PSR9_KMER_SKIP]; + for (int blkidx = firstBlockIdx; blkidx <= blockDim.x; blkidx++){ auto skipIdx = blkidx * PSR9_NUM_STATES + PSR9_KMER_SKIP; - float prevSkipScore = prevProbabilities[skipIdx - PSR9_NUM_STATES]; - float curSkipScore = prevProbabilities[skipIdx]; + float curSkipScore = prevProbabilities[skipIdx + PSR9_KMER_SKIP]; HMT_FROM_PREV_K = lp_kk + prevSkipScore; newSkipScore = logsumexpf(curSkipScore, HMT_FROM_PREV_K); prevProbabilities[skipIdx] = newSkipScore; + prevSkipScore = newSkipScore; __syncthreads(); } } - __syncthreads(); - int lastKmerIdx = n_kmers -1; int lastRowIdx = numRows -1; float end; @@ -272,17 +270,16 @@ __global__ void getScores(float * eventData, float lp2 = lp_ms + prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] + postFlank; float lp3 = lp_ms + prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] + postFlank; - end = returnValues[blockIdx.x]; + end = returnValue; end = logsumexpf(end, lp1); end = logsumexpf(end, lp2); end = logsumexpf(end, lp3); - returnValues[blockIdx.x] = end; + returnValue = end; } - // Now do the end state - __syncthreads(); - } - __syncthreads(); + } + returnValues[blockIdx.x] = returnValue; + __syncthreads(); } From 0e7fdcb68eca7dd2316125abc93dc0913af3e750 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Tue, 10 Jul 2018 14:49:07 +0100 Subject: [PATCH 26/80] tidyup --- src/cuda_kernels/GpuAligner.cu | 74 +++++++++++++++------------------- 1 file changed, 32 insertions(+), 42 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 8a2714f7..c261381f 100644 --- 
a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -22,8 +22,7 @@ __device__ float lp_match_r9(int rank, float scale, float shift, float var, - float logVar, - bool debug = false){ + float logVar){ float log_inv_sqrt_2pi = log(0.3989422804014327); @@ -33,9 +32,7 @@ __device__ float lp_match_r9(int rank, float gaussian_log_level_stdv = poreModelLevelLogStdv[rank] + logVar; float a = (level - gaussian_mean) / gaussian_stdv; - float emission = log_inv_sqrt_2pi - gaussian_log_level_stdv + (-0.5f * a * a); - return emission; } @@ -59,10 +56,6 @@ __global__ void getScores(float * eventData, float * postFlankingDev, float * returnValues) { - bool debug = false; - if(threadIdx.x==0 && blockIdx.x==0){ - debug=true; - } // Initialise the prev probability row, which is the row of the DP table int n_kmers = blockDim.x; int n_states = n_kmers * PSR9_NUM_STATES + 2 * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. @@ -169,8 +162,7 @@ __global__ void getScores(float * eventData, scale, shift, var, - logVar, - debug); + logVar); float lp_emission_b = BAD_EVENT_PENALTY; @@ -189,7 +181,7 @@ __global__ void getScores(float * eventData, // with a penalty; float HMT_FROM_SOFT = (kmerIdx == 0 && (event_idx == e_start || - (HAF_ALLOW_PRE_CLIP))) ? lp_sm + preFlank : -INFINITY; // TODO: Add flag for HAF ALLOW_PRE_CLIP + (HAF_ALLOW_PRE_CLIP))) ? 
lp_sm + preFlank : -INFINITY; // calculate the score float sum = HMT_FROM_SAME_M; @@ -205,19 +197,19 @@ __global__ void getScores(float * eventData, // Calculate the bad event scores // state PSR9_BAD_EVENT HMT_FROM_SAME_M = lp_mb + prevProbabilities[curBlockOffset + PSR9_MATCH]; - HMT_FROM_PREV_M = -INFINITY; // not allowed + HMT_FROM_PREV_M = -INFINITY; HMT_FROM_SAME_B = lp_bb + prevProbabilities[curBlockOffset + PSR9_BAD_EVENT]; HMT_FROM_PREV_B = -INFINITY; HMT_FROM_PREV_K = -INFINITY; HMT_FROM_SOFT = -INFINITY; sum = HMT_FROM_SAME_M; - sum = logsumexpf(sum, HMT_FROM_PREV_M); sum = logsumexpf(sum, HMT_FROM_SAME_B); - sum = logsumexpf(sum, HMT_FROM_PREV_B); - sum = logsumexpf(sum, HMT_FROM_PREV_K); - sum = logsumexpf(sum, HMT_FROM_SOFT); sum += lp_emission_b; + //sum = logsumexpf(sum, HMT_FROM_PREV_B); + //sum = logsumexpf(sum, HMT_FROM_PREV_K); + //sum = logsumexpf(sum, HMT_FROM_SOFT); + //sum = logsumexpf(sum, HMT_FROM_PREV_M); float newBadEventScore = sum; @@ -233,12 +225,12 @@ __global__ void getScores(float * eventData, HMT_FROM_PREV_B = lp_bk + prevProbabilities[prevBlockOffset + PSR9_BAD_EVENT]; HMT_FROM_SOFT = -INFINITY; - sum = HMT_FROM_SAME_M; - sum = logsumexpf(sum, HMT_FROM_PREV_M); - sum = logsumexpf(sum, HMT_FROM_SAME_B); + sum = HMT_FROM_PREV_M; sum = logsumexpf(sum, HMT_FROM_PREV_B); sum = logsumexpf(sum, HMT_FROM_PREV_K); - sum = logsumexpf(sum, HMT_FROM_SOFT); + //sum = logsumexpf(sum, HMT_FROM_SAME_M); + //sum = logsumexpf(sum, HMT_FROM_SAME_B); + //sum = logsumexpf(sum, HMT_FROM_SOFT); float newSkipScore = sum; @@ -246,7 +238,6 @@ __global__ void getScores(float * eventData, __syncthreads(); //Now need to do the skip-skip transition, which is serial so for now letting one thread execute it. 
- if (threadIdx.x == 0){ int firstBlockIdx = 2; float prevSkipScore; prevSkipScore = prevProbabilities[(firstBlockIdx - 1) * PSR9_NUM_STATES + PSR9_KMER_SKIP]; @@ -257,7 +248,6 @@ __global__ void getScores(float * eventData, newSkipScore = logsumexpf(curSkipScore, HMT_FROM_PREV_K); prevProbabilities[skipIdx] = newSkipScore; prevSkipScore = newSkipScore; - __syncthreads(); } } @@ -290,28 +280,28 @@ GpuAligner::GpuAligner() poreModelInitialized = false; - cudaMalloc( (void**)&poreModelLevelMeanDev, numModelElements * sizeof(float)); - cudaMalloc( (void**)&poreModelLevelLogStdvDev, numModelElements * sizeof(float)); - cudaMalloc( (void**)&poreModelLevelStdvDev, numModelElements * sizeof(float)); + cudaMalloc((void**)&poreModelLevelMeanDev, numModelElements * sizeof(float)); + cudaMalloc((void**)&poreModelLevelLogStdvDev, numModelElements * sizeof(float)); + cudaMalloc((void**)&poreModelLevelStdvDev, numModelElements * sizeof(float)); - cudaMalloc( (void**)&scaleDev, max_num_reads * sizeof(float)); - cudaMalloc( (void**)&shiftDev, max_num_reads * sizeof(float)); - cudaMalloc( (void**)&varDev, max_num_reads * sizeof(float)); - cudaMalloc( (void**)&logVarDev, max_num_reads * sizeof(float)); + cudaMalloc((void**)&scaleDev, max_num_reads * sizeof(float)); + cudaMalloc((void**)&shiftDev, max_num_reads * sizeof(float)); + cudaMalloc((void**)&varDev, max_num_reads * sizeof(float)); + cudaMalloc((void**)&logVarDev, max_num_reads * sizeof(float)); cudaMalloc( (void**)&eventsPerBaseDev, max_num_reads * sizeof(float)); int max_n_rows = 100; int maxBuffer = 50000 * sizeof(float); //TODO: allocate more smartly - cudaMalloc( (void**)&numRowsDev, max_n_rows * sizeof(int)); - cudaMalloc( (void**)&eventStartsDev, maxBuffer); - cudaMalloc( (void**)&eventStridesDev, maxBuffer); - cudaMalloc( (void**)&eventOffsetsDev, maxBuffer); + cudaMalloc((void**)&numRowsDev, max_n_rows * sizeof(int)); + cudaMalloc((void**)&eventStartsDev, maxBuffer); + cudaMalloc((void**)&eventStridesDev, maxBuffer); 
+ cudaMalloc((void**)&eventOffsetsDev, maxBuffer); - cudaMalloc( (void**)&eventMeansDev, maxBuffer); - cudaMalloc( (void**)&preFlankingDev, maxBuffer); - cudaMalloc( (void**)&postFlankingDev, maxBuffer); + cudaMalloc((void**)&eventMeansDev, maxBuffer); + cudaMalloc((void**)&preFlankingDev, maxBuffer); + cudaMalloc((void**)&postFlankingDev, maxBuffer); //Allocate a host buffer to store the event means, pre and post-flank data cudaHostAlloc(&eventMeans, maxBuffer , cudaHostAllocDefault); @@ -325,16 +315,16 @@ GpuAligner::GpuAligner() returnValuesHostResultsPointers.resize(max_num_sequences); for (int i =0; i Date: Tue, 10 Jul 2018 15:01:47 +0100 Subject: [PATCH 27/80] Update README.md --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 1821d495..ba7039a6 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,12 @@ # Nanopolish +## GPU acceleration branch - experimental/Work in progress + +This branch adds CUDA-enabled GPU acceleration to the nanopolish consensus improvement algorithm. To try this feature run with the `--gpu` flag e.g: + +../nanopolish variants --consensus polished_gpu.fa -w "tig00000001:200000-230000" -r reads.fasta -b reads.sorted.bam -g draft.fa --threads=8 --gpu=1 + + [![Build Status](https://travis-ci.org/jts/nanopolish.svg?branch=master)](https://travis-ci.org/jts/nanopolish) Software package for signal-level analysis of Oxford Nanopore sequencing data. Nanopolish can calculate an improved consensus sequence for a draft genome assembly, detect base modifications, call SNPs and indels with respect to a reference genome and more (see Nanopolish modules, below). 
From 677c94b2d98159bec20bd53509246288cbf15f23 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Tue, 10 Jul 2018 15:02:00 +0100 Subject: [PATCH 28/80] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ba7039a6..aebd9835 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,9 @@ ## GPU acceleration branch - experimental/Work in progress This branch adds CUDA-enabled GPU acceleration to the nanopolish consensus improvement algorithm. To try this feature run with the `--gpu` flag e.g: - +``` ../nanopolish variants --consensus polished_gpu.fa -w "tig00000001:200000-230000" -r reads.fasta -b reads.sorted.bam -g draft.fa --threads=8 --gpu=1 - +``` [![Build Status](https://travis-ci.org/jts/nanopolish.svg?branch=master)](https://travis-ci.org/jts/nanopolish) From 213b8ebf89898bda16eb7c2a537e57a01085182a Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Tue, 10 Jul 2018 15:02:29 +0100 Subject: [PATCH 29/80] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index aebd9835..9609577b 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ This branch adds CUDA-enabled GPU acceleration to the nanopolish consensus improvement algorithm. 
To try this feature run with the `--gpu` flag e.g: ``` -../nanopolish variants --consensus polished_gpu.fa -w "tig00000001:200000-230000" -r reads.fasta -b reads.sorted.bam -g draft.fa --threads=8 --gpu=1 +nanopolish variants --consensus polished_gpu.fa -w "tig00000001:200000-230000" -r reads.fasta -b reads.sorted.bam -g draft.fa --threads=8 --gpu=1 ``` [![Build Status](https://travis-ci.org/jts/nanopolish.svg?branch=master)](https://travis-ci.org/jts/nanopolish) From 33d3b56419b3c15812b96b570dfa72caf41ba55a Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Tue, 10 Jul 2018 15:09:37 +0100 Subject: [PATCH 30/80] tidup --- src/cuda_kernels/GpuAligner.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index c261381f..e33ea674 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -132,8 +132,6 @@ __global__ void getScores(float * eventData, // Start filling out the "DP table" // Each thread is going to work on an individual P-HMM Block - // WRONG - need to use threadIdx & think carefully. we have one thread per block/kmer. each block has 3 states tho. - //int kmerIdx = blockIdx.x; int curBlockIdx = kmerIdx + 1; // Accounts for fact that we are not working with start block. 
int prevBlockIdx = curBlockIdx -1; int prevBlockOffset = PSR9_NUM_STATES * prevBlockIdx; From dbd79064e498f69eaa480e0e96c4cfb617c86561 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Tue, 10 Jul 2018 15:45:38 +0100 Subject: [PATCH 31/80] typo fix --- src/cuda_kernels/GpuAligner.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cuda_kernels/GpuAligner.h b/src/cuda_kernels/GpuAligner.h index 25df67a4..1a82e492 100644 --- a/src/cuda_kernels/GpuAligner.h +++ b/src/cuda_kernels/GpuAligner.h @@ -41,7 +41,7 @@ #include #ifndef GPU_ALIGNER_H -#define GPU_ALIGNER_H1 +#define GPU_ALIGNER_H class GpuAligner { From 29bf0603ba62279e91f1f4dd5345dcc1e20eaa5e Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Wed, 11 Jul 2018 09:39:49 +0100 Subject: [PATCH 32/80] Storing kmer ranks in one buffer --- src/cuda_kernels/GpuAligner.cu | 63 ++++++++++++++++++++++------------ src/cuda_kernels/GpuAligner.h | 4 ++- 2 files changed, 45 insertions(+), 22 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index e33ea674..6504499f 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -307,11 +307,17 @@ GpuAligner::GpuAligner() cudaHostAlloc(&postFlankingHost, maxBuffer, cudaHostAllocDefault); int max_num_sequences = 8; + int max_sequence_length = 50; kmerRanksDevPointers.resize(max_num_sequences); kmerRanksRCDevPointers.resize(max_num_sequences); returnValuesDevResultsPointers.resize(max_num_sequences); returnValuesHostResultsPointers.resize(max_num_sequences); + // Populate host buffer with kmer ranks + int numKmers = max_sequence_length * max_num_sequences; + cudaHostAlloc(&kmerRanks, numKmers * 2 * sizeof(int), cudaHostAllocDefault); + cudaMalloc((void**)&kmerRanksDev, numKmers * 2 * sizeof(int)); + for (int i =0; i> GpuAligner::scoreKernel(std::vector kmer_ranks(n_kmers); - std::vector kmer_ranks_rc(n_kmers); - - for(size_t ki = 0; ki < n_kmers; ++ki) { - kmer_ranks[ki] = sequence.get_kmer_rank(ki, k, 
false); - kmer_ranks_rc[ki] = sequence.get_kmer_rank(ki, k, true); - } - - assert(kmer_ranks.size() < MAX_NUM_KMERS); - cudaMemcpyAsync(kmerRanksDev, kmer_ranks.data(), kmer_ranks.size() * sizeof(int), - cudaMemcpyHostToDevice, streams[i]); - cudaMemcpyAsync(kmerRanksRCDev, kmer_ranks_rc.data(), kmer_ranks_rc.size() * sizeof(int), - cudaMemcpyHostToDevice, streams[i]); - int num_blocks = n_states / PSR9_NUM_STATES; dim3 dimBlock(num_blocks - 2); // One thread per state, not including Start and Terminal state. @@ -523,8 +543,8 @@ std::vector> GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector> results(sequences.size()); for (size_t i =0; i event_sequences, uint32_t alignment_flags); private: - float* poreModelLevelMeanDev; float* scaleDev; float* shiftDev; float* varDev; @@ -75,6 +74,9 @@ class GpuAligner float* eventsPerBaseDev; float* poreModelLevelStdvDev; float* poreModelLevelLogStdvDev; + float* poreModelLevelMeanDev; + int * kmerRanks; + int * kmerRanksDev; bool poreModelInitialized; // Allocate arrays for storing results, kmerRanksDev and kmerRanksRCDev From 39fec2bfe329011d41483b4af29f579cdd038ae2 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Wed, 11 Jul 2018 11:12:53 +0100 Subject: [PATCH 33/80] fixed a makefile error --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5adcd6eb..5d96b37b 100644 --- a/Makefile +++ b/Makefile @@ -124,7 +124,7 @@ depend: .depend .depend: $(CPP_SRC) $(C_SRC) $(CU_SRC) $(EXE_SRC) $(H5_LIB) $(EIGEN_CHECK) rm -f ./.depend - $(CXX) $(CXXFLAGS) $(CPPFLAGS) $(NVCCFLAGS) $(NVCC) -MM $(CPP_SRC) $(C_SRC) $(CU_SRC) > ./.depend; + $(CXX) $(CXXFLAGS) $(CPPFLAGS) -MM $(CPP_SRC) $(C_SRC) > ./.depend; include .depend From c6414ccd8ac7d78a29558ed4b58868f81db258ab Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 12 Jul 2018 11:38:31 +0100 Subject: [PATCH 34/80] Some simple CUDA API error reporting --- src/cuda_kernels/GpuAligner.cu | 103 
++++++++++++++++----------------- 1 file changed, 50 insertions(+), 53 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 6504499f..b9bb9636 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -6,6 +6,10 @@ #define MAX_STATES 128 +#define EXPAND_TO_STRING(X) #X +#define TO_STRING(X) EXPAND_TO_STRING(X) +#define CU_CHECK_ERR(X) if (X != cudaSuccess){printf("CUDA error: %s at line %s\n", cudaGetErrorString(X), TO_STRING(__LINE__));} + __device__ float logsumexpf(float x, float y){ if(x == -INFINITY && y == -INFINITY){ return -INFINITY; @@ -278,33 +282,28 @@ GpuAligner::GpuAligner() poreModelInitialized = false; - cudaMalloc((void**)&poreModelLevelMeanDev, numModelElements * sizeof(float)); - cudaMalloc((void**)&poreModelLevelLogStdvDev, numModelElements * sizeof(float)); - cudaMalloc((void**)&poreModelLevelStdvDev, numModelElements * sizeof(float)); - - cudaMalloc((void**)&scaleDev, max_num_reads * sizeof(float)); - cudaMalloc((void**)&shiftDev, max_num_reads * sizeof(float)); - cudaMalloc((void**)&varDev, max_num_reads * sizeof(float)); - cudaMalloc((void**)&logVarDev, max_num_reads * sizeof(float)); - - cudaMalloc( (void**)&eventsPerBaseDev, max_num_reads * sizeof(float)); + CU_CHECK_ERR(cudaMalloc((void**)&poreModelLevelMeanDev, numModelElements * sizeof(float))); + CU_CHECK_ERR(cudaMalloc((void**)&poreModelLevelLogStdvDev, numModelElements * sizeof(float))); + CU_CHECK_ERR(cudaMalloc((void**)&poreModelLevelStdvDev, numModelElements * sizeof(float))); + CU_CHECK_ERR(cudaMalloc((void**)&scaleDev, max_num_reads * sizeof(float))); + CU_CHECK_ERR(cudaMalloc((void**)&shiftDev, max_num_reads * sizeof(float))); + CU_CHECK_ERR(cudaMalloc((void**)&varDev, max_num_reads * sizeof(float))); + CU_CHECK_ERR(cudaMalloc((void**)&logVarDev, max_num_reads * sizeof(float))); + CU_CHECK_ERR(cudaMalloc( (void**)&eventsPerBaseDev, max_num_reads * sizeof(float))); int max_n_rows = 100; int maxBuffer = 50000 * 
sizeof(float); //TODO: allocate more smartly - cudaMalloc((void**)&numRowsDev, max_n_rows * sizeof(int)); - cudaMalloc((void**)&eventStartsDev, maxBuffer); - cudaMalloc((void**)&eventStridesDev, maxBuffer); - cudaMalloc((void**)&eventOffsetsDev, maxBuffer); - - cudaMalloc((void**)&eventMeansDev, maxBuffer); - cudaMalloc((void**)&preFlankingDev, maxBuffer); - cudaMalloc((void**)&postFlankingDev, maxBuffer); - - //Allocate a host buffer to store the event means, pre and post-flank data - cudaHostAlloc(&eventMeans, maxBuffer , cudaHostAllocDefault); - cudaHostAlloc(&preFlankingHost, maxBuffer, cudaHostAllocDefault); - cudaHostAlloc(&postFlankingHost, maxBuffer, cudaHostAllocDefault); + CU_CHECK_ERR(cudaMalloc((void**)&numRowsDev, max_n_rows * sizeof(int))); + CU_CHECK_ERR(cudaMalloc((void**)&eventStartsDev, maxBuffer)); + CU_CHECK_ERR(cudaMalloc((void**)&eventStridesDev, maxBuffer)); + CU_CHECK_ERR(cudaMalloc((void**)&eventOffsetsDev, maxBuffer)); + CU_CHECK_ERR(cudaMalloc((void**)&eventMeansDev, maxBuffer)); + CU_CHECK_ERR(cudaMalloc((void**)&preFlankingDev, maxBuffer)); + CU_CHECK_ERR(cudaMalloc((void**)&postFlankingDev, maxBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&eventMeans, maxBuffer , cudaHostAllocDefault)); + CU_CHECK_ERR(cudaHostAlloc(&preFlankingHost, maxBuffer, cudaHostAllocDefault)); + CU_CHECK_ERR(cudaHostAlloc(&postFlankingHost, maxBuffer, cudaHostAllocDefault)); int max_num_sequences = 8; int max_sequence_length = 50; @@ -324,11 +323,10 @@ GpuAligner::GpuAligner() float * returnValuesDev; float * returnedValues; - cudaMalloc((void**)&returnValuesDev, sizeof(float) * max_num_reads); //one score per read - cudaHostAlloc(&returnedValues, max_num_reads * sizeof(float) , cudaHostAllocDefault); - - cudaMalloc((void**)&kmerRanksDev, max_n_rows * sizeof(int)); - cudaMalloc((void**)&kmerRanksRCDev, max_n_rows * sizeof(int)); + CU_CHECK_ERR(cudaMalloc((void**)&returnValuesDev, sizeof(float) * max_num_reads)); //one score per read + 
CU_CHECK_ERR(cudaHostAlloc(&returnedValues, max_num_reads * sizeof(float) , cudaHostAllocDefault)); + CU_CHECK_ERR(cudaMalloc((void**)&kmerRanksDev, max_n_rows * sizeof(int))); + CU_CHECK_ERR(cudaMalloc((void**)&kmerRanksRCDev, max_n_rows * sizeof(int))); kmerRanksDevPointers[i] = kmerRanksDev; kmerRanksRCDevPointers[i] = kmerRanksRCDev; @@ -342,34 +340,33 @@ GpuAligner::GpuAligner() //Destructor GpuAligner::~GpuAligner() { - cudaFree(poreModelLevelMeanDev); - cudaFree(scaleDev); - cudaFree(shiftDev); - cudaFree(varDev); - cudaFree(logVarDev); - cudaFree(eventMeansDev); - cudaFree(eventsPerBaseDev); - cudaFree(numRowsDev); - cudaFree(eventStartsDev); - cudaFree(eventStridesDev); - cudaFree(eventOffsetsDev); - cudaFree(poreModelLevelLogStdvDev); - cudaFree(poreModelLevelStdvDev); - cudaFree(preFlankingDev); - cudaFree(postFlankingDev); - - cudaFreeHost(eventMeans); - cudaFreeHost(preFlankingHost); - cudaFreeHost(postFlankingHost); + CU_CHECK_ERR(cudaFree(poreModelLevelMeanDev)); + CU_CHECK_ERR(cudaFree(scaleDev)); + CU_CHECK_ERR(cudaFree(shiftDev)); + CU_CHECK_ERR(cudaFree(varDev)); + CU_CHECK_ERR(cudaFree(logVarDev)); + CU_CHECK_ERR(cudaFree(eventMeansDev)); + CU_CHECK_ERR(cudaFree(eventsPerBaseDev)); + CU_CHECK_ERR(cudaFree(numRowsDev)); + CU_CHECK_ERR(cudaFree(eventStartsDev)); + CU_CHECK_ERR(cudaFree(eventStridesDev)); + CU_CHECK_ERR(cudaFree(eventOffsetsDev)); + CU_CHECK_ERR(cudaFree(poreModelLevelLogStdvDev)); + CU_CHECK_ERR(cudaFree(poreModelLevelStdvDev)); + CU_CHECK_ERR(cudaFree(preFlankingDev)); + CU_CHECK_ERR(cudaFree(postFlankingDev)); + CU_CHECK_ERR(cudaFree(kmerRanksDev)); + CU_CHECK_ERR(cudaFreeHost(eventMeans)); + CU_CHECK_ERR(cudaFreeHost(preFlankingHost)); + CU_CHECK_ERR(cudaFreeHost(postFlankingHost)); + CU_CHECK_ERR(cudaFreeHost(kmerRanks)); int max_num_sequences = 8; // should be a private variable // Free device and host memory for (int i =0; i GpuAligner::variantScoresThresholded(std::vector i } return v; -} \ No newline at end of file +} From 
188de17f689236dbd198b866c2c970263500a404 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 12 Jul 2018 14:21:46 +0100 Subject: [PATCH 35/80] One buffer for pore model --- src/cuda_kernels/GpuAligner.cu | 66 ++++++++++++++-------------------- src/cuda_kernels/GpuAligner.h | 5 ++- 2 files changed, 31 insertions(+), 40 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index b9bb9636..e12ab8a8 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -20,9 +20,7 @@ __device__ float logsumexpf(float x, float y){ __device__ float lp_match_r9(int rank, float mean, - float * poreModelLevelLogStdv, - float * poreModelLevelStdv, - float * poreModelLevelMean, + float * poreModelDev, float scale, float shift, float var, @@ -31,9 +29,9 @@ __device__ float lp_match_r9(int rank, float log_inv_sqrt_2pi = log(0.3989422804014327); float level = mean; - float gaussian_mean = scale * poreModelLevelMean[rank] + shift; - float gaussian_stdv = poreModelLevelStdv[rank] * var; - float gaussian_log_level_stdv = poreModelLevelLogStdv[rank] + logVar; + float gaussian_mean = scale * poreModelDev[rank * 3] + shift; + float gaussian_stdv = poreModelDev[rank * 3 + 1] * var; + float gaussian_log_level_stdv = poreModelDev[rank * 3 + 2] + logVar; float a = (level - gaussian_mean) / gaussian_stdv; float emission = log_inv_sqrt_2pi - gaussian_log_level_stdv + (-0.5f * a * a); @@ -49,9 +47,7 @@ __global__ void getScores(float * eventData, int * kmer_ranks, int * kmer_ranks_rc, int * eventOffsets, // Offset to use for getting an event IDX for a specific read (read obtained by block IDX) - float * poreModelLevelLogStdv, - float * poreModelLevelStdv, - float * poreModelLevelMean, + float * poreModelDev, float * scaleDev, float * shiftDev, float * varDev, @@ -156,11 +152,9 @@ __global__ void getScores(float * eventData, float preFlank = preFlankingDev[e_offset + row - 1]; float postFlank = postFlankingDev[e_offset + row - 1]; - float 
lp_emission_m = lp_match_r9(rank, + float lp_emission_m = lp_match_r9(rank, event_mean, - poreModelLevelLogStdv, - poreModelLevelStdv, - poreModelLevelMean, + poreModelDev, scale, shift, var, @@ -282,15 +276,15 @@ GpuAligner::GpuAligner() poreModelInitialized = false; - CU_CHECK_ERR(cudaMalloc((void**)&poreModelLevelMeanDev, numModelElements * sizeof(float))); - CU_CHECK_ERR(cudaMalloc((void**)&poreModelLevelLogStdvDev, numModelElements * sizeof(float))); - CU_CHECK_ERR(cudaMalloc((void**)&poreModelLevelStdvDev, numModelElements * sizeof(float))); CU_CHECK_ERR(cudaMalloc((void**)&scaleDev, max_num_reads * sizeof(float))); CU_CHECK_ERR(cudaMalloc((void**)&shiftDev, max_num_reads * sizeof(float))); CU_CHECK_ERR(cudaMalloc((void**)&varDev, max_num_reads * sizeof(float))); CU_CHECK_ERR(cudaMalloc((void**)&logVarDev, max_num_reads * sizeof(float))); CU_CHECK_ERR(cudaMalloc( (void**)&eventsPerBaseDev, max_num_reads * sizeof(float))); + // Allocate Device memory for pore model + CU_CHECK_ERR(cudaMalloc((void**)&poreModelDev, numModelElements * 3 * sizeof(float))); + int max_n_rows = 100; int maxBuffer = 50000 * sizeof(float); //TODO: allocate more smartly @@ -305,6 +299,8 @@ GpuAligner::GpuAligner() CU_CHECK_ERR(cudaHostAlloc(&preFlankingHost, maxBuffer, cudaHostAllocDefault)); CU_CHECK_ERR(cudaHostAlloc(&postFlankingHost, maxBuffer, cudaHostAllocDefault)); + // Allocate host memory for model + CU_CHECK_ERR(cudaHostAlloc(&poreModelHost, numModelElements * sizeof(float) * 3, cudaHostAllocDefault)); int max_num_sequences = 8; int max_sequence_length = 50; kmerRanksDevPointers.resize(max_num_sequences); @@ -340,7 +336,6 @@ GpuAligner::GpuAligner() //Destructor GpuAligner::~GpuAligner() { - CU_CHECK_ERR(cudaFree(poreModelLevelMeanDev)); CU_CHECK_ERR(cudaFree(scaleDev)); CU_CHECK_ERR(cudaFree(shiftDev)); CU_CHECK_ERR(cudaFree(varDev)); @@ -351,15 +346,16 @@ GpuAligner::~GpuAligner() { CU_CHECK_ERR(cudaFree(eventStartsDev)); CU_CHECK_ERR(cudaFree(eventStridesDev)); 
CU_CHECK_ERR(cudaFree(eventOffsetsDev)); - CU_CHECK_ERR(cudaFree(poreModelLevelLogStdvDev)); - CU_CHECK_ERR(cudaFree(poreModelLevelStdvDev)); CU_CHECK_ERR(cudaFree(preFlankingDev)); CU_CHECK_ERR(cudaFree(postFlankingDev)); CU_CHECK_ERR(cudaFree(kmerRanksDev)); + CU_CHECK_ERR(cudaFree(poreModelDev)); + CU_CHECK_ERR(cudaFreeHost(eventMeans)); CU_CHECK_ERR(cudaFreeHost(preFlankingHost)); CU_CHECK_ERR(cudaFreeHost(postFlankingHost)); CU_CHECK_ERR(cudaFreeHost(kmerRanks)); + CU_CHECK_ERR(cudaFreeHost(poreModelHost)); int max_num_sequences = 8; // should be a private variable // Free device and host memory @@ -444,16 +440,6 @@ std::vector> GpuAligner::scoreKernel(std::vectorstates.size(); - std::vector pore_model_level_log_stdv(num_states); - std::vector pore_model_level_mean(num_states); - std::vector pore_model_level_stdv(num_states); - for(int st=0; ststates[st]; - pore_model_level_log_stdv[st] = params.level_log_stdv; //TODO: I am seeing level log stdv and level stdv return the same value. need to investigate this. - pore_model_level_stdv[st] = params.level_stdv; - pore_model_level_mean[st] = params.level_mean; - } - //Populating read-statistics buffers std::vector scale(num_reads); std::vector shift(num_reads); @@ -482,13 +468,17 @@ std::vector> GpuAligner::scoreKernel(std::vectorstates[st]; + poreModelHost[st * poreModelEntriesPerState] = params.level_mean; + poreModelHost[st * poreModelEntriesPerState + 1] = params.level_stdv; + poreModelHost[st * poreModelEntriesPerState + 2] = params.level_log_stdv; + } + // copy over the pore model + cudaMemcpyAsync(poreModelDev, poreModelHost, + poreModelEntriesPerState * 4096 * sizeof(float), cudaMemcpyHostToDevice); // TODO don't hardcode num kmers + poreModelInitialized = true; } //Let's populate a host buffer with all the sequences. 
@@ -543,9 +533,7 @@ std::vector> GpuAligner::scoreKernel(std::vector Date: Thu, 12 Jul 2018 14:59:27 +0100 Subject: [PATCH 36/80] One buffer for pore model --- src/cuda_kernels/GpuAligner.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index e12ab8a8..21cdc0f1 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -20,7 +20,7 @@ __device__ float logsumexpf(float x, float y){ __device__ float lp_match_r9(int rank, float mean, - float * poreModelDev, + const float * poreModelDev, float scale, float shift, float var, @@ -47,7 +47,7 @@ __global__ void getScores(float * eventData, int * kmer_ranks, int * kmer_ranks_rc, int * eventOffsets, // Offset to use for getting an event IDX for a specific read (read obtained by block IDX) - float * poreModelDev, + const float * poreModelDev, float * scaleDev, float * shiftDev, float * varDev, From 9b8f0297c4697e4be2557861b557788c07f14de9 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 12 Jul 2018 16:21:09 +0100 Subject: [PATCH 37/80] Keeping pore model in registers --- src/cuda_kernels/GpuAligner.cu | 83 ++++++++++++++++---------------- src/cuda_kernels/GpuAligner.h | 2 - src/nanopolish_call_variants.cpp | 20 +------- 3 files changed, 43 insertions(+), 62 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 21cdc0f1..830bb3e8 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -20,18 +20,20 @@ __device__ float logsumexpf(float x, float y){ __device__ float lp_match_r9(int rank, float mean, - const float * poreModelDev, + float pore_mean, + float pore_stdv, + float pore_log_level_stdv, float scale, float shift, float var, float logVar){ - float log_inv_sqrt_2pi = log(0.3989422804014327); + float log_inv_sqrt_2pi = logf(0.3989422804014327); float level = mean; - float gaussian_mean = scale * poreModelDev[rank * 3] + shift; - float 
gaussian_stdv = poreModelDev[rank * 3 + 1] * var; - float gaussian_log_level_stdv = poreModelDev[rank * 3 + 2] + logVar; + float gaussian_mean = scale * pore_mean + shift; + float gaussian_stdv = pore_stdv * var; + float gaussian_log_level_stdv = pore_log_level_stdv + logVar; float a = (level - gaussian_mean) / gaussian_stdv; float emission = log_inv_sqrt_2pi - gaussian_log_level_stdv + (-0.5f * a * a); @@ -39,21 +41,20 @@ __device__ float lp_match_r9(int rank, } -__global__ void getScores(float * eventData, - float * readEventsPerBase, - int * numRowsPerRead, - int * eventStarts, - int * eventStrides, - int * kmer_ranks, - int * kmer_ranks_rc, - int * eventOffsets, // Offset to use for getting an event IDX for a specific read (read obtained by block IDX) - const float * poreModelDev, - float * scaleDev, - float * shiftDev, - float * varDev, - float * logVarDev, - float * preFlankingDev, - float * postFlankingDev, +__global__ void getScores(float * const eventData, + float * const readEventsPerBase, + int * const numRowsPerRead, + int * const eventStarts, + int * const eventStrides, + int * const kmerRanks, + int * const eventOffsets, // Offset to use for getting an event IDX for a specific read (read obtained by block IDX) + float * const poreModelDev, + float * const scaleDev, + float * const shiftDev, + float * const varDev, + float * const logVarDev, + float * const preFlankingDev, + float * const postFlankingDev, float * returnValues) { // Initialise the prev probability row, which is the row of the DP table @@ -90,11 +91,16 @@ __global__ void getScores(float * eventData, uint32_t rank; if (rc == true) { - rank = kmer_ranks_rc[kmerIdx]; + rank = kmerRanks[kmerIdx + n_kmers]; }else{ - rank = kmer_ranks[kmerIdx]; + rank = kmerRanks[kmerIdx]; } + float pore_mean = poreModelDev[rank * 3]; + float pore_stdv = poreModelDev[rank * 3 + 1]; + float pore_log_level_stdv = poreModelDev[rank * 3 + 2]; + + float p_stay = 1 - (1 / read_events_per_base); float p_skip = 
0.0025; float p_bad = 0.001; @@ -116,16 +122,16 @@ __global__ void getScores(float * eventData, float p_km = 1.0f - p_kk; // We assign some transition probabilities. I believe this is correct and they don't vary by location in the sequence - float lp_mk = log(p_mk); - float lp_mb = log(p_mb); - float lp_mm_self = log(p_mm_self); - float lp_mm_next = log(p_mm_next); - float lp_bb = log(p_bb); - float lp_bk = log(p_bk); - float lp_bm_next = log(p_bm_next); - float lp_bm_self = log(p_bm_self); - float lp_kk = log(p_kk); - float lp_km = log(p_km); + float lp_mk = logf(p_mk); + float lp_mb = logf(p_mb); + float lp_mm_self = logf(p_mm_self); + float lp_mm_next = logf(p_mm_next); + float lp_bb = logf(p_bb); + float lp_bk = logf(p_bk); + float lp_bm_next = logf(p_bm_next); + float lp_bm_self = logf(p_bm_self); + float lp_kk = logf(p_kk); + float lp_km = logf(p_km); float lp_sm, lp_ms; lp_sm = lp_ms = 0.0f; @@ -152,9 +158,11 @@ __global__ void getScores(float * eventData, float preFlank = preFlankingDev[e_offset + row - 1]; float postFlank = postFlankingDev[e_offset + row - 1]; - float lp_emission_m = lp_match_r9(rank, + float lp_emission_m = lp_match_r9(rank, event_mean, - poreModelDev, + pore_mean, + pore_stdv, + pore_log_level_stdv, scale, shift, var, @@ -304,7 +312,6 @@ GpuAligner::GpuAligner() int max_num_sequences = 8; int max_sequence_length = 50; kmerRanksDevPointers.resize(max_num_sequences); - kmerRanksRCDevPointers.resize(max_num_sequences); returnValuesDevResultsPointers.resize(max_num_sequences); returnValuesHostResultsPointers.resize(max_num_sequences); @@ -315,17 +322,14 @@ GpuAligner::GpuAligner() for (int i =0; i> GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector kmerRanksDevPointers; - std::vector kmerRanksRCDevPointers; std::vector returnValuesDevResultsPointers; std::vector returnValuesHostResultsPointers; diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 
f89dc3ca..33d7e8c9 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -287,19 +287,13 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali int region_end, uint32_t alignment_flags) { - auto start = std::chrono::high_resolution_clock::now(); - + printf("In the outer loop, %i, %i\n",region_start, region_end); std::vector out_variants; - std::vector out_variants_gpu; std::string contig = alignments.get_region_contig(); // Add all positively-scoring single-base changes into the candidate set - - auto scoring = std::chrono::high_resolution_clock::now() - std::chrono::high_resolution_clock::now(); - auto gpu_exec = std::chrono::high_resolution_clock::now() - std::chrono::high_resolution_clock::now(); - GpuAligner aligner; for(size_t i = region_start; i < region_end; ++i) { @@ -355,8 +349,6 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali calling_start, alignments.get_reference_substring(contig, calling_start, calling_end)); - auto t0_gpu = std::chrono::high_resolution_clock::now(); - if (opt::gpu){ std::vector scoredVariants = aligner.variantScoresThresholded(tmp_variants, test_haplotype, event_sequences, alignment_flags, opt::screen_score_threshold, @@ -375,7 +367,6 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali alignment_flags, opt::screen_score_threshold, opt::methylation_types); - auto t1 = std::chrono::high_resolution_clock::now(); scored_variant.info = ""; if (scored_variant.quality > 0) { out_variants.push_back(scored_variant); @@ -383,15 +374,6 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali } } } - - auto end = std::chrono::high_resolution_clock::now(); - - auto duration = std::chrono::duration_cast( end - start ).count(); - - auto screening = std::chrono::duration_cast(scoring).count(); - - auto gpu_screening = std::chrono::duration_cast(gpu_exec).count(); - return out_variants; } From 
20eca32d86cd440d6a5bc9d156cf41ddb7af360d Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 12 Jul 2018 16:24:29 +0100 Subject: [PATCH 38/80] Removed print statement --- src/nanopolish_call_variants.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 33d7e8c9..6a3cd816 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -287,7 +287,6 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali int region_end, uint32_t alignment_flags) { - printf("In the outer loop, %i, %i\n",region_start, region_end); std::vector out_variants; std::string contig = alignments.get_region_contig(); From ca1796f477097b2c93297023cca8bde1b7913cb2 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Fri, 13 Jul 2018 13:30:26 +0100 Subject: [PATCH 39/80] Async kernel invocations for improved occupancy --- src/cuda_kernels/GpuAligner.cu | 32 ++--- src/nanopolish_call_variants.cpp | 222 ++++++++++++++++++++----------- 2 files changed, 163 insertions(+), 91 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 830bb3e8..e31da5b8 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -458,18 +458,18 @@ std::vector> GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector #include - +#include +#include +#include // Macros #define max3(x,y,z) std::max(std::max(x,y), z) @@ -281,97 +283,165 @@ void annotate_with_all_support(std::vector& variants, } } -// Given the input region, calculate all single base edits to the current assembly -std::vector generate_candidate_single_base_edits(const AlignmentDB& alignments, - int region_start, - int region_end, - uint32_t alignment_flags) -{ - std::vector out_variants; - - std::string contig = alignments.get_region_contig(); - // Add all positively-scoring single-base changes into 
the candidate set +void singleLocusBaseEditCandidate(int i, + const AlignmentDB& alignments, + uint32_t alignment_flags, + std::vector &out_variants, + std::string contig, + GpuAligner &aligner, + std::mutex &outVariantsMutex +){ - GpuAligner aligner; + int calling_start = i - opt::screen_flanking_sequence; + int calling_end = i + 1 + opt::screen_flanking_sequence; - for(size_t i = region_start; i < region_end; ++i) { + if(!alignments.are_coordinates_valid(contig, calling_start, calling_end)) { + return; + } - int calling_start = i - opt::screen_flanking_sequence; - int calling_end = i + 1 + opt::screen_flanking_sequence; + std::vector tmp_variants; + for(size_t j = 0; j < 4; ++j) { + // Substitutions + Variant v; + v.ref_name = contig; + v.ref_position = i; + v.ref_seq = alignments.get_reference_substring(contig, i, i); + v.alt_seq = "ACGT"[j]; + + if(v.ref_seq != v.alt_seq) { + tmp_variants.push_back(v); + } - if(!alignments.are_coordinates_valid(contig, calling_start, calling_end)) { - continue; + // Insertions + v.alt_seq = v.ref_seq + "ACGT"[j]; + // ignore insertions of the type "A" -> "AA" as these are redundant + if(v.alt_seq[1] != v.ref_seq[0]) { + tmp_variants.push_back(v); } + } - std::vector tmp_variants; - for(size_t j = 0; j < 4; ++j) { - // Substitutions - Variant v; - v.ref_name = contig; - v.ref_position = i; - v.ref_seq = alignments.get_reference_substring(contig, i, i); - v.alt_seq = "ACGT"[j]; - - if(v.ref_seq != v.alt_seq) { - tmp_variants.push_back(v); - } + // deletion + Variant del; + del.ref_name = contig; + del.ref_position = i - 1; + del.ref_seq = alignments.get_reference_substring(contig, i - 1, i); + del.alt_seq = del.ref_seq[0]; - // Insertions - v.alt_seq = v.ref_seq + "ACGT"[j]; - // ignore insertions of the type "A" -> "AA" as these are redundant - if(v.alt_seq[1] != v.ref_seq[0]) { - tmp_variants.push_back(v); - } - } + // ignore deletions of the type "AA" -> "A" as these are redundant + if(del.alt_seq[0] != del.ref_seq[1]) { + 
tmp_variants.push_back(del); + } - // deletion - Variant del; - del.ref_name = contig; - del.ref_position = i - 1; - del.ref_seq = alignments.get_reference_substring(contig, i - 1, i); - del.alt_seq = del.ref_seq[0]; - - // ignore deletions of the type "AA" -> "A" as these are redundant - if(del.alt_seq[0] != del.ref_seq[1]) { - tmp_variants.push_back(del); + // Screen variants by score + // We do this internally here as it is much faster to get the event sequences + // for the entire window for all variants at this position once, rather than + // for each variant individually + std::vector event_sequences = + alignments.get_event_subsequences(contig, calling_start, calling_end); + + Haplotype test_haplotype(contig, + calling_start, + alignments.get_reference_substring(contig, calling_start, calling_end)); + + if (opt::gpu){ + std::vector scoredVariants = aligner.variantScoresThresholded(tmp_variants, test_haplotype, event_sequences, + alignment_flags, opt::screen_score_threshold, + opt::methylation_types); + for (auto variant: scoredVariants){ + if (variant.quality > 0) { + std::lock_guard lock(outVariantsMutex); + out_variants.push_back(variant); + } + } + } else { + for (const Variant &v : tmp_variants) { + auto t0 = std::chrono::high_resolution_clock::now(); + Variant scored_variant = score_variant_thresholded(v, + test_haplotype, + event_sequences, + alignment_flags, + opt::screen_score_threshold, + opt::methylation_types); + scored_variant.info = ""; + if (scored_variant.quality > 0) { + out_variants.push_back(scored_variant); + } } + } +} - // Screen variants by score - // We do this internally here as it is much faster to get the event sequences - // for the entire window for all variants at this position once, rather than - // for each variant individually - std::vector event_sequences = - alignments.get_event_subsequences(contig, calling_start, calling_end); +// Given the input region, calculate all single base edits to the current assembly +std::vector 
generate_candidate_single_base_edits(const AlignmentDB& alignments, + int region_start, + int region_end, + uint32_t alignment_flags){ + std::vector out_variants; + std::string contig = alignments.get_region_contig(); + std::mutex outVariantsMutex; - Haplotype test_haplotype(contig, - calling_start, - alignments.get_reference_substring(contig, calling_start, calling_end)); + // Add all positively-scoring single-base changes into the candidate set + if (opt::gpu){ + size_t num_workers = 8; + std::vector gpuAligners(num_workers); + + //std::vector workerThreads(num_workers); + std::vector> handles(num_workers); + int nextLocus = region_start; + + //Initialise workers + for (int workerIdx=0; workerIdx scoredVariants = aligner.variantScoresThresholded(tmp_variants, test_haplotype, event_sequences, - alignment_flags, opt::screen_score_threshold, - opt::methylation_types); - for (auto variant: scoredVariants){ - if (variant.quality > 0) { - out_variants.push_back(variant); - } - } - } else { - for (const Variant &v : tmp_variants) { - auto t0 = std::chrono::high_resolution_clock::now(); - Variant scored_variant = score_variant_thresholded(v, - test_haplotype, - event_sequences, - alignment_flags, - opt::screen_score_threshold, - opt::methylation_types); - scored_variant.info = ""; - if (scored_variant.quality > 0) { - out_variants.push_back(scored_variant); + //Round robin the workers until done + while(nextLocus < region_end){ + for (int i = 0; i Date: Mon, 16 Jul 2018 12:00:11 +0100 Subject: [PATCH 40/80] Adding restrict flag to nvcc --- Makefile | 2 +- src/cuda_kernels/GpuAligner.cu | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/Makefile b/Makefile index 5d96b37b..0d86b514 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,7 @@ CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include -O3 -use_fast_math --default-stream per-thread +NVCCFLAGS ?= -std=c++11 -I. 
-I/usr/local/cuda-9.0/include -O3 -use_fast_math --default-stream per-thread -restrict CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index e31da5b8..f12b3a0e 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -210,10 +210,6 @@ __global__ void getScores(float * const eventData, sum = HMT_FROM_SAME_M; sum = logsumexpf(sum, HMT_FROM_SAME_B); sum += lp_emission_b; - //sum = logsumexpf(sum, HMT_FROM_PREV_B); - //sum = logsumexpf(sum, HMT_FROM_PREV_K); - //sum = logsumexpf(sum, HMT_FROM_SOFT); - //sum = logsumexpf(sum, HMT_FROM_PREV_M); float newBadEventScore = sum; @@ -232,9 +228,6 @@ __global__ void getScores(float * const eventData, sum = HMT_FROM_PREV_M; sum = logsumexpf(sum, HMT_FROM_PREV_B); sum = logsumexpf(sum, HMT_FROM_PREV_K); - //sum = logsumexpf(sum, HMT_FROM_SAME_M); - //sum = logsumexpf(sum, HMT_FROM_SAME_B); - //sum = logsumexpf(sum, HMT_FROM_SOFT); float newSkipScore = sum; From 8b020be7c72159ab475789af7ddf7a9660e044e9 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Mon, 16 Jul 2018 16:15:35 +0100 Subject: [PATCH 41/80] transferring means data and pre/post-flanks --- Makefile | 4 +- src/cuda_kernels/GpuAligner.cu | 87 ++++++++++++++++++++++++++++++++ src/cuda_kernels/GpuAligner.h | 9 ++++ src/nanopolish_call_variants.cpp | 2 +- 4 files changed, 99 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 0d86b514..bb339cce 100644 --- a/Makefile +++ b/Makefile @@ -10,12 +10,12 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz CXXFLAGS ?= -O3 -CXXFLAGS += -std=c++11 -fopenmp -fsigned-char +CXXFLAGS += -std=c++11 -fopenmp -fsigned-char -g CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. 
-I/usr/local/cuda-9.0/include -O3 -use_fast_math --default-stream per-thread -restrict +NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include -O3 -use_fast_math --default-stream per-thread -restrict -g CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index f12b3a0e..550b3a9a 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -364,6 +364,75 @@ GpuAligner::~GpuAligner() { } +std::vector>> GpuAligner::scoreKernelMod(std::vector scoreSets, + uint32_t alignment_flags){ + std::vector>> result(scoreSets.size()); + + int numScores = 0; + int numScoreSets = scoreSets.size(); // the number of sequence/read sets to be scored + + std::vector> read_lengths(numScoreSets); + std::vector> e_starts(numScoreSets); + std::vector> event_strides(numScoreSets); + + //Each sequence-event combination is its own thread and requires the following information: + //1. Event offsets (raw data offset) + //2. Sequence offset + //3. Event length (How long it will need to run before computing the score) + //4. Other sequence/event specific data + + // STEP1. Unpack read data. + // STEP2. Unpack sequence data. + // STEP3. Prepare buffers for job (thread) - specific data e.g read lengths, sequence lengths, read and sequence indexes etc. This can also be done on the fly. 
+ + size_t rawReadOffset = 0; + size_t numEventsTotal = 0; + std::vector eventOffsets; //offsets of all the raw reads + + for (int scoreSetIdx=0; scoreSetIdx < numScoreSets; scoreSetIdx++){ + auto &scoreSet = scoreSets[scoreSetIdx]; + + //First unpack per-read data from the scoreSet + for (int eventSequenceIdx=0; eventSequenceIdx < scoreSet.rawData.size();eventSequenceIdx++){ + auto e = scoreSet.rawData[eventSequenceIdx]; + int e_start = e.event_start_idx; + + e_starts[scoreSetIdx].push_back(e_start); + + int e_stride = e.event_stride; + event_strides[scoreSetIdx].push_back(e_stride); + + uint32_t e_end = e.event_stop_idx; + uint32_t n_events = 0; + if(e_end > e_start) + n_events = e_end - e_start + 1; + else + n_events = e_start - e_end + 1; + + read_lengths[scoreSetIdx].push_back(n_events); + numEventsTotal += n_events; + + eventOffsets.push_back(rawReadOffset); + + std::vector pre_flank = make_pre_flanking(e, e_start, n_events); + std::vector post_flank = make_post_flanking(e, e_start, n_events); + + for (int i=0;iget_drift_scaled_level(event_idx, e.strand); // send the data in drift scaled + eventMeans[rawReadOffset + i] = scaled; + + //populate the pre/post-flanking data, since it has a 1-1 correspondence with events + preFlankingHost[rawReadOffset + i] = pre_flank[i]; + postFlankingHost[rawReadOffset + i] = post_flank[i]; + } + rawReadOffset += n_events; + } + } + + return result; +} + std::vector> GpuAligner::scoreKernel(std::vector sequences, std::vector event_sequences, uint32_t alignment_flags){ @@ -560,6 +629,7 @@ std::vector> GpuAligner::scoreKernel(std::vector GpuAligner::variantScoresThresholded(std::vector input_variants, Haplotype base_haplotype, std::vector event_sequences, @@ -594,6 +664,23 @@ std::vector GpuAligner::variantScoresThresholded(std::vector i if (!event_sequences.empty()) { std::vector> scores = scoreKernel(sequences, event_sequences, alignment_flags); + + // Now try it with the new method + ScoreSet s = { + sequences, + 
event_sequences + }; + + std::vector scoreSets(1,s); + + std::vector>> scoresMod = scoreKernelMod(scoreSets, alignment_flags); + +// for (int i=0; i stateSequences; + std::vector rawData; +} ScoreSet; + class GpuAligner { public: @@ -56,6 +62,8 @@ class GpuAligner std::vector> scoreKernel(std::vector sequences, std::vector event_sequences, uint32_t alignment_flags); + std::vector>> scoreKernelMod(std::vector scoreSets, + uint32_t alignment_flags); private: float* scaleDev; float* shiftDev; @@ -89,4 +97,5 @@ class GpuAligner cudaStream_t streams[8]; // TODO 8 should not be hardcoded here }; + #endif // GPU_ALIGNER_H diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 62992bce..2cd5982e 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -382,7 +382,7 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali // Add all positively-scoring single-base changes into the candidate set if (opt::gpu){ - size_t num_workers = 8; + size_t num_workers = 1; std::vector gpuAligners(num_workers); //std::vector workerThreads(num_workers); From 05a8896ec5503ff34e8c15e3f1c6c7d6bae4f569 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 19 Jul 2018 15:45:13 +0100 Subject: [PATCH 42/80] WIP - modifications to kernel for performance improvments --- Makefile | 6 +- src/common/nanopolish_variant.cpp | 2 +- src/cuda_kernels/GpuAligner.cu | 606 ++++++++++++++++++++++---- src/cuda_kernels/GpuAligner.h | 28 +- src/hmm/nanopolish_profile_hmm_r7.inl | 10 +- src/nanopolish_call_variants.cpp | 164 +++---- 6 files changed, 645 insertions(+), 171 deletions(-) diff --git a/Makefile b/Makefile index bb339cce..bf0324da 100644 --- a/Makefile +++ b/Makefile @@ -9,13 +9,13 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz -CXXFLAGS ?= -O3 +CXXFLAGS ?= -O0 CXXFLAGS += -std=c++11 -fopenmp -fsigned-char -g -CFLAGS ?= -std=c99 -O3 +CFLAGS ?= 
-std=c99 -O0 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include -O3 -use_fast_math --default-stream per-thread -restrict -g +NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0/include -O0 -use_fast_math --default-stream per-thread -restrict -g -G CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code diff --git a/src/common/nanopolish_variant.cpp b/src/common/nanopolish_variant.cpp index 725a62ab..b73a6b2b 100644 --- a/src/common/nanopolish_variant.cpp +++ b/src/common/nanopolish_variant.cpp @@ -664,7 +664,7 @@ std::vector multi_call(VariantGroup& variant_group, // Variant score_variant_thresholded(const Variant& input_variant, Haplotype base_haplotype, - const std::vector& input, + const std::vector& input, // raw reads (I think) const uint32_t alignment_flags, const uint32_t score_threshold, const std::vector& methylation_types) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 550b3a9a..7150b3dc 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -4,11 +4,11 @@ #include #include "nanopolish_profile_hmm_r9.h" -#define MAX_STATES 128 +#define MAX_STATES 512 #define EXPAND_TO_STRING(X) #X #define TO_STRING(X) EXPAND_TO_STRING(X) -#define CU_CHECK_ERR(X) if (X != cudaSuccess){printf("CUDA error: %s at line %s\n", cudaGetErrorString(X), TO_STRING(__LINE__));} +#define CU_CHECK_ERR(X) if (X != cudaSuccess){printf("CUDA error: %s at line %s\n", cudaGetErrorString(X), TO_STRING(__LINE__));throw std::runtime_error("CUDA ERROR");} __device__ float logsumexpf(float x, float y){ if(x == -INFINITY && y == -INFINITY){ @@ -41,6 +41,241 @@ __device__ float lp_match_r9(int rank, } + + +__global__ void getScoresMod (float * poreModelDev, + int * readLengthsDev, + int * eventStartsDev, + int * eventStridesDev, + float * eventsPerBaseDev, + float * scaleDev, + float * shiftDev, + float 
* varDev, + float * logVarDev, + int * eventOffsetsDev, + float * eventMeansDev, + float * preFlankingDev, + float * postFlankingDev, + int * sequenceLengthsDev, + int * sequenceOffsetsDev, + int * kmerRanksDev, + int * seqIdxDev, + int * readIdxDev, + float * returnValuesDev){ + + // get buffer indices + int scoreIdx = threadIdx.x; + int readIdx = readIdxDev[scoreIdx]; + int seqIdx = seqIdxDev[scoreIdx]; + + // get read statistics + int numEvents = readLengthsDev[readIdx]; + int readOffset = eventOffsetsDev[readIdx]; + float read_events_per_base = eventsPerBaseDev[readIdx]; + int e_start = eventStartsDev[readIdx]; // Event start for read + int e_stride = eventStridesDev[readIdx]; + int e_offset = eventOffsetsDev[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event + float scale = scaleDev[readIdx]; + float shift = shiftDev[readIdx]; + float var = varDev[readIdx]; + float logVar = logVarDev[readIdx]; + + // get sequence statistics + int numKmers = sequenceLengthsDev[seqIdx]; + + int lastRowIdx = numEvents -1; + int lastKmerIdx = numKmers - 1; + + float returnValue = -INFINITY; //Used to sum over the last column. + float prevProbabilities[MAX_STATES]; + + int numBlocks = numKmers + 2; + int numStates = numBlocks * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. 
+ + // Initialise the prev probabilities vector + for (int i = 0; i < numStates - PSR9_NUM_STATES; i++) { + prevProbabilities[i] = -INFINITY; + } + for (int i = numStates - PSR9_NUM_STATES; i < numStates; i++) { + prevProbabilities[i] = 0.0f; + } + + bool rc = false; + if (e_stride == -1){ + rc = true; + } + + //int kmerIdx = threadIdx.x; + uint32_t rank; + + float p_stay = 1 - (1 / read_events_per_base); + float p_skip = 0.0025; + float p_bad = 0.001; + float p_bad_self = p_bad; + float p_skip_self = 0.3; + float p_mk = p_skip; // probability of not observing an event at all + float p_mb = p_bad; // probabilty of observing a bad event + float p_mm_self = p_stay; // probability of observing additional events from this k-mer + float p_mm_next = 1.0f - p_mm_self - p_mk - p_mb; // normal movement from state to state + // transitions from event split state in previous block + float p_bb = p_bad_self; + float p_bk, p_bm_next, p_bm_self; + p_bk = p_bm_next = p_bm_self = (1.0f - p_bb) / 3; + // transitions from kmer skip state in previous block + float p_kk = p_skip_self; + float p_km = 1.0f - p_kk; + // We assign some transition probabilities. 
I believe this is correct and they don't vary by location in the sequence + float lp_mk = logf(p_mk); + float lp_mb = logf(p_mb); + float lp_mm_self = logf(p_mm_self); + float lp_mm_next = logf(p_mm_next); + float lp_bb = logf(p_bb); + float lp_bk = logf(p_bk); + float lp_bm_next = logf(p_bm_next); + float lp_bm_self = logf(p_bm_self); + float lp_kk = logf(p_kk); + float lp_km = logf(p_km); + float lp_sm, lp_ms; + lp_sm = lp_ms = 0.0f; + + // the penalty is controlled by the transition probability + float BAD_EVENT_PENALTY = 0.0f; + + //Fill out the dynamic programming table + for(int row=1; row>> GpuAligner::scoreKernelMod(std::vector scoreSets, - uint32_t alignment_flags){ - std::vector>> result(scoreSets.size()); - - int numScores = 0; - int numScoreSets = scoreSets.size(); // the number of sequence/read sets to be scored + uint32_t alignment_flags){ - std::vector> read_lengths(numScoreSets); - std::vector> e_starts(numScoreSets); - std::vector> event_strides(numScoreSets); + int numEventsTotal = 0; // The number of events across all scoreSets + int numSequences = 0; // The number of sequences across all scoreSets + int kmerOffset = 0; + int numReads = 0; // The number of reads across all scoreSets + int numBases = 0; + int numScoreSets = scoreSets.size(); - //Each sequence-event combination is its own thread and requires the following information: - //1. Event offsets (raw data offset) - //2. Sequence offset - //3. Event length (How long it will need to run before computing the score) - //4. Other sequence/event specific data - - // STEP1. Unpack read data. - // STEP2. Unpack sequence data. - // STEP3. Prepare buffers for job (thread) - specific data e.g read lengths, sequence lengths, read and sequence indexes etc. This can also be done on the fly. 
- - size_t rawReadOffset = 0; - size_t numEventsTotal = 0; - std::vector eventOffsets; //offsets of all the raw reads + int rawReadOffset = 0; + int globalReadIdx = 0; + int globalSequenceIdx = 0; + int globalScoreIdx = 0; + //Loop over every scoreset, filling out buffers and counters for (int scoreSetIdx=0; scoreSetIdx < numScoreSets; scoreSetIdx++){ auto &scoreSet = scoreSets[scoreSetIdx]; - - //First unpack per-read data from the scoreSet + int firstReadIdxinScoreSet = globalReadIdx; + //Read data for (int eventSequenceIdx=0; eventSequenceIdx < scoreSet.rawData.size();eventSequenceIdx++){ auto e = scoreSet.rawData[eventSequenceIdx]; - int e_start = e.event_start_idx; + numReads++; + + //Read statistics - populate host buffers + scaleHost[globalReadIdx] = e.read->scalings[e.strand].scale; + shiftHost[globalReadIdx] = e.read->scalings[e.strand].shift; + varHost[globalReadIdx] = e.read->scalings[e.strand].var; + logVarHost[globalReadIdx] = e.read->scalings[e.strand].log_var; - e_starts[scoreSetIdx].push_back(e_start); + int e_start = e.event_start_idx; + eventStartsHost[globalReadIdx] = e_start; int e_stride = e.event_stride; - event_strides[scoreSetIdx].push_back(e_stride); + eventStridesHost[globalReadIdx] = e_stride; uint32_t e_end = e.event_stop_idx; - uint32_t n_events = 0; + uint32_t n_events; if(e_end > e_start) n_events = e_end - e_start + 1; else n_events = e_start - e_end + 1; - - read_lengths[scoreSetIdx].push_back(n_events); + readLengthsHost[globalReadIdx] = n_events; numEventsTotal += n_events; - eventOffsets.push_back(rawReadOffset); + eventOffsetsHost[globalReadIdx] = rawReadOffset; + + float readEventsPerBase = e.read->events_per_base[e.strand]; + eventsPerBaseHost[globalReadIdx] = readEventsPerBase; std::vector pre_flank = make_pre_flanking(e, e_start, n_events); std::vector post_flank = make_post_flanking(e, e_start, n_events); @@ -426,7 +700,170 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve preFlankingHost[rawReadOffset + i] = 
pre_flank[i]; postFlankingHost[rawReadOffset + i] = post_flank[i]; } + rawReadOffset += n_events; + globalReadIdx++; + } + //Pore Model + const uint32_t k = scoreSets[0].rawData[0].pore_model->k; //k is the length of a kmer in the pore model + if (poreModelInitialized == false) { + int num_states = scoreSets[0].rawData[0].pore_model->states.size(); + int poreModelEntriesPerState = 3; + for(int st=0; ststates[st]; + poreModelHost[st * poreModelEntriesPerState] = params.level_mean; + poreModelHost[st * poreModelEntriesPerState + 1] = params.level_stdv; + poreModelHost[st * poreModelEntriesPerState + 2] = params.level_log_stdv; + } + // copy over the pore model + CU_CHECK_ERR(cudaMemcpyAsync(poreModelDev, poreModelHost, + poreModelEntriesPerState * 4096 * sizeof(float), cudaMemcpyHostToDevice, streams[0])); // TODO don't hardcode num kmers + poreModelInitialized = true; + } + // Sequences + // Sequences + auto & sequences = scoreSet.stateSequences; + numSequences += sequences.size(); + + for (int i = 0; i>> (poreModelDev, + readLengthsDev, + eventStartsDev, + eventStridesDev, + eventsPerBaseDev, + scaleDev, + shiftDev, + varDev, + logVarDev, + eventOffsetsDev, + eventMeansDev, + preFlankingDev, + postFlankingDev, + sequenceLengthsDev, + sequenceOffsetsDev, + kmerRanksDev, + seqIdxDev, + readIdxDev, + scoresDev); + cudaError_t err = cudaGetLastError(); + + if (err != cudaSuccess) + printf("Errors during kernel execution: %s\n", cudaGetErrorString(err)); + + cudaMemcpyAsync(returnValuesHost, scoresDev, + globalScoreIdx * sizeof(float), cudaMemcpyDeviceToHost, streams[0]); + cudaStreamSynchronize(streams[0]); + + //Unpack results + int k = 0; + std::vector>> result(scoreSets.size()); + + for(int scoreSetIdx=0; scoreSetIdx seqScores; + for (int readIdx=0; readIdx> GpuAligner::scoreKernel(std::vectorstates.size(); //Populating read-statistics buffers std::vector scale(num_reads); @@ -520,19 +955,21 @@ std::vector> GpuAligner::scoreKernel(std::vector> 
GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector>> (eventMeansDev, eventsPerBaseDev, @@ -604,6 +1042,10 @@ std::vector> GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector GpuAligner::variantScoresThresholded(std::vector input_variants, - Haplotype base_haplotype, - std::vector event_sequences, - uint32_t alignment_flags, - int screen_score_threshold, - std::vector methylation_types) { + Haplotype base_haplotype, + std::vector event_sequences, + uint32_t alignment_flags, + int screen_score_threshold, + std::vector methylation_types) { int numVariants = input_variants.size(); std::vector out_variants = input_variants; @@ -675,12 +1117,6 @@ std::vector GpuAligner::variantScoresThresholded(std::vector i std::vector>> scoresMod = scoreKernelMod(scoreSets, alignment_flags); -// for (int i=0; i stateSequences; - std::vector rawData; + std::vector &stateSequences; + std::vector &rawData; } ScoreSet; class GpuAligner @@ -85,6 +85,30 @@ class GpuAligner float* poreModelLevelMeanDev; float* poreModelDev; float* poreModelHost; + int * sequenceOffsetsDev; + + // NEW - for MOD kernel + int * readLengthsHost; + int * eventStartsHost; + int * eventStridesHost; + float * eventsPerBaseHost; + float * scaleHost; + float * shiftHost; + float * varHost; + float * logVarHost; + int * sequenceLengthsHost; + int * eventOffsetsHost; + int * sequenceOffsetsHost; + int * readIdxHost; + int * seqIdxHost; + + int * readLengthsDev; + int * sequenceLengthsDev; + int * readIdxDev; + int * seqIdxDev; + + float * returnValuesHost; + float * scoresDev; int * kmerRanks; int * kmerRanksDev; diff --git a/src/hmm/nanopolish_profile_hmm_r7.inl b/src/hmm/nanopolish_profile_hmm_r7.inl index bf0edd28..3fe4b309 100644 --- a/src/hmm/nanopolish_profile_hmm_r7.inl +++ b/src/hmm/nanopolish_profile_hmm_r7.inl @@ -306,9 +306,13 @@ inline float profile_hmm_fill_generic_r7(const HMMInputSequence& _sequence, assert( 
data.pore_model->states.size() == sequence.get_num_kmer_ranks(k) ); std::vector kmer_ranks(num_kmers); - for(size_t ki = 0; ki < num_kmers; ++ki) - kmer_ranks[ki] = sequence.get_kmer_rank(ki, k, data.rc); - + for(size_t ki = 0; ki < num_kmers; ++ki) { + int rank = sequence.get_kmer_rank(ki, k, data.rc); + if(rank>4096){ + printf("Rank: %i", rank); + } + kmer_ranks[ki] = rank; + } size_t num_events = output.get_num_rows() - 1; std::vector pre_flank = make_pre_flanking_r7(data, parameters, e_start, num_events); diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 2cd5982e..1b01ff2c 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -292,16 +292,16 @@ void singleLocusBaseEditCandidate(int i, GpuAligner &aligner, std::mutex &outVariantsMutex ){ - +try { int calling_start = i - opt::screen_flanking_sequence; int calling_end = i + 1 + opt::screen_flanking_sequence; - if(!alignments.are_coordinates_valid(contig, calling_start, calling_end)) { + if (!alignments.are_coordinates_valid(contig, calling_start, calling_end)) { return; } std::vector tmp_variants; - for(size_t j = 0; j < 4; ++j) { + for (size_t j = 0; j < 4; ++j) { // Substitutions Variant v; v.ref_name = contig; @@ -309,14 +309,14 @@ void singleLocusBaseEditCandidate(int i, v.ref_seq = alignments.get_reference_substring(contig, i, i); v.alt_seq = "ACGT"[j]; - if(v.ref_seq != v.alt_seq) { + if (v.ref_seq != v.alt_seq) { tmp_variants.push_back(v); } // Insertions v.alt_seq = v.ref_seq + "ACGT"[j]; // ignore insertions of the type "A" -> "AA" as these are redundant - if(v.alt_seq[1] != v.ref_seq[0]) { + if (v.alt_seq[1] != v.ref_seq[0]) { tmp_variants.push_back(v); } } @@ -329,7 +329,7 @@ void singleLocusBaseEditCandidate(int i, del.alt_seq = del.ref_seq[0]; // ignore deletions of the type "AA" -> "A" as these are redundant - if(del.alt_seq[0] != del.ref_seq[1]) { + if (del.alt_seq[0] != del.ref_seq[1]) { tmp_variants.push_back(del); } @@ -344,16 
+344,18 @@ void singleLocusBaseEditCandidate(int i, calling_start, alignments.get_reference_substring(contig, calling_start, calling_end)); - if (opt::gpu){ - std::vector scoredVariants = aligner.variantScoresThresholded(tmp_variants, test_haplotype, event_sequences, - alignment_flags, opt::screen_score_threshold, + if (opt::gpu) { + std::vector scoredVariants = aligner.variantScoresThresholded(tmp_variants, test_haplotype, + event_sequences, + alignment_flags, + opt::screen_score_threshold, opt::methylation_types); - for (auto variant: scoredVariants){ - if (variant.quality > 0) { - std::lock_guard lock(outVariantsMutex); - out_variants.push_back(variant); - } - } + for (auto variant: scoredVariants) { + if (variant.quality > 0) { + std::lock_guard lock(outVariantsMutex); + out_variants.push_back(variant); + } + } } else { for (const Variant &v : tmp_variants) { auto t0 = std::chrono::high_resolution_clock::now(); @@ -369,6 +371,9 @@ void singleLocusBaseEditCandidate(int i, } } } +}catch (std::exception &e){ + printf("Exception in thread! 
%s\n", e.what()); +} } // Given the input region, calculate all single base edits to the current assembly @@ -376,74 +381,79 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali int region_start, int region_end, uint32_t alignment_flags){ - std::vector out_variants; - std::string contig = alignments.get_region_contig(); - std::mutex outVariantsMutex; - - // Add all positively-scoring single-base changes into the candidate set - if (opt::gpu){ - size_t num_workers = 1; - std::vector gpuAligners(num_workers); - - //std::vector workerThreads(num_workers); - std::vector> handles(num_workers); - int nextLocus = region_start; - - //Initialise workers - for (int workerIdx=0; workerIdx out_variants; + std::string contig = alignments.get_region_contig(); + std::mutex outVariantsMutex; + + // Add all positively-scoring single-base changes into the candidate set + if (opt::gpu) { + size_t num_workers = 1; + std::vector gpuAligners(num_workers); + + //std::vector workerThreads(num_workers); + std::vector> handles(num_workers); + int nextLocus = region_start; + + //Initialise workers + for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { + auto aligner = std::ref(gpuAligners[workerIdx]); + if (nextLocus < region_end) { + handles[workerIdx] = std::async(std::launch::async, + singleLocusBaseEditCandidate, + nextLocus, + std::ref(alignments), + alignment_flags, + std::ref(out_variants), + std::ref(contig), + aligner, + std::ref(outVariantsMutex)); + nextLocus++; + } + } - //Round robin the workers until done - while(nextLocus < region_end){ - for (int i = 0; i Date: Fri, 20 Jul 2018 13:26:10 +0100 Subject: [PATCH 43/80] Both Kernels giving similar but not identical results --- Makefile | 2 +- src/cuda_kernels/GpuAligner.cu | 143 ++++++++++++++++++--------------- 2 files changed, 77 insertions(+), 68 deletions(-) diff --git a/Makefile b/Makefile index bf0324da..a213f8c8 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,7 @@ CFLAGS ?= -std=c99 -O0 
CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0include -O0 -use_fast_math --default-stream per-thread -restrict -g -G +NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0include -O0 -use_fast_math --default-stream per-thread -restrict -g CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 7150b3dc..030881c2 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -41,8 +41,6 @@ __device__ float lp_match_r9(int rank, } - - __global__ void getScoresMod (float * poreModelDev, int * readLengthsDev, int * eventStartsDev, @@ -63,6 +61,11 @@ __global__ void getScoresMod (float * poreModelDev, int * readIdxDev, float * returnValuesDev){ + bool debug = false; + if ((threadIdx.x == 0) && (blockIdx.x == 0)){ + debug = false; + } + // get buffer indices int scoreIdx = threadIdx.x; int readIdx = readIdxDev[scoreIdx]; @@ -82,6 +85,9 @@ __global__ void getScoresMod (float * poreModelDev, // get sequence statistics int numKmers = sequenceLengthsDev[seqIdx]; + int seqOffset = sequenceOffsetsDev[seqIdx]; + + printf("This is thread %i, seqIdx is %i, readIdx is %i, numKmers is %i, seqOffset is %i\n", threadIdx.x, seqIdx, readIdx, numKmers, seqOffset); int lastRowIdx = numEvents -1; int lastKmerIdx = numKmers - 1; @@ -92,6 +98,13 @@ __global__ void getScoresMod (float * poreModelDev, int numBlocks = numKmers + 2; int numStates = numBlocks * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. 
+ if(debug){ + printf("Kernel 1 >>> Num Kmers is %i\n", numKmers); + printf("Kernel 1 >>> n_states %i\n", numStates); + printf("Kernel 1 >>> num events in read is %i\n", numEvents); + printf("Kernel 1 >>> event offset is %i\n", e_offset); + } + // Initialise the prev probabilities vector for (int i = 0; i < numStates - PSR9_NUM_STATES; i++) { prevProbabilities[i] = -INFINITY; @@ -105,9 +118,6 @@ __global__ void getScoresMod (float * poreModelDev, rc = true; } - //int kmerIdx = threadIdx.x; - uint32_t rank; - float p_stay = 1 - (1 / read_events_per_base); float p_skip = 0.0025; float p_bad = 0.001; @@ -151,21 +161,19 @@ __global__ void getScoresMod (float * poreModelDev, float lp_emission_b = BAD_EVENT_PENALTY; //TODO: Can this be taken out of the inner loop? + //Initialise temp registers + float prevMatch = prevProbabilities[PSR9_MATCH];; + float prevSkip = prevProbabilities[PSR9_KMER_SKIP]; + float prevBad = prevProbabilities[PSR9_BAD_EVENT]; + for (int blkIdx = 1; blkIdx>> Num Kmers is %i\n", n_kmers); + printf("Kernel 0 >>> n_states %i\n", n_states); + printf("Kernel 0 >>> num events in read is %i\n", numRows); + printf("Kernel 0 >>> event offset is is %i\n", e_offset); + } + bool rc = false; if (e_stride == -1){ rc = true; } int kmerIdx = threadIdx.x; - uint32_t rank; - if (rc == true) { - rank = kmerRanks[kmerIdx + n_kmers]; - }else{ - rank = kmerRanks[kmerIdx]; - } + uint32_t rank = kmerRanks[kmerIdx + (n_kmers * rc)]; float pore_mean = poreModelDev[rank * 3]; float pore_stdv = poreModelDev[rank * 3 + 1]; @@ -645,7 +654,6 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve int numSequences = 0; // The number of sequences across all scoreSets int kmerOffset = 0; int numReads = 0; // The number of reads across all scoreSets - int numBases = 0; int numScoreSets = scoreSets.size(); int rawReadOffset = 0; @@ -731,23 +739,24 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve sequenceOffsetsHost[globalSequenceIdx] = kmerOffset; int sequenceLength = 
sequence.length(); - numBases += sequenceLength; - for(size_t ki = 0; ki < sequenceLength; ++ki) { + int numKmers = sequenceLength - k + 1; + + for(size_t ki = 0; ki < numKmers; ++ki) { int rank = sequence.get_kmer_rank(ki, k, false); kmerRanks[ki + kmerOffset] = rank; } //kmerRanksDevPointers[i] = kmerRanksDev + kmerOffset; - kmerOffset += sequenceLength; + kmerOffset += numKmers; - for(size_t ki = 0; ki < sequenceLength; ++ki) { + for(size_t ki = 0; ki < numKmers; ++ki) { int rank = sequence.get_kmer_rank(ki, k, true); kmerRanks[ki + kmerOffset] = rank; } - kmerOffset += sequenceLength; + kmerOffset += numKmers; - sequenceLengthsHost[globalSequenceIdx] = sequenceLength; + sequenceLengthsHost[globalSequenceIdx] = numKmers; // Loop over the raw reads, producing a cartesian product of the two @@ -816,7 +825,7 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve // Launch Kernels - dim3 dimBlock(1); //TODO change back to globalScoreIDx this is only for debugging + dim3 dimBlock(globalScoreIdx); // TODO: divide work into smaller blocks dim3 dimGrid(1); //printf("Launching get scores mod kernel\n"); @@ -987,14 +996,14 @@ std::vector> GpuAligner::scoreKernel(std::vector> GpuAligner::scoreKernel(std::vector GpuAligner::variantScoresThresholded(std::vector i if (!event_sequences.empty()) { std::vector> scores = scoreKernel(sequences, event_sequences, alignment_flags); - + //std::vector> scores; // Now try it with the new method ScoreSet s = { sequences, From 409fb3a0663ae87dea4cb4fb0a1f6657fc135206 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Fri, 20 Jul 2018 21:17:02 +0100 Subject: [PATCH 44/80] Split work into smaller threadBlocks --- Makefile | 8 +- src/cuda_kernels/GpuAligner.cu | 450 ++++++++++++++++--------------- src/nanopolish_call_variants.cpp | 2 +- 3 files changed, 239 insertions(+), 221 deletions(-) diff --git a/Makefile b/Makefile index a213f8c8..5f22a05b 100644 --- a/Makefile +++ b/Makefile @@ -9,13 +9,13 @@ SUBDIRS := src src/hmm src/thirdparty 
src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz -CXXFLAGS ?= -O0 -CXXFLAGS += -std=c++11 -fopenmp -fsigned-char -g -CFLAGS ?= -std=c99 -O0 +CXXFLAGS ?= -O3 +CXXFLAGS += -std=c++11 -fopenmp -fsigned-char #-g +CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0include -O0 -use_fast_math --default-stream per-thread -restrict -g +NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0include -O3 -use_fast_math --default-stream per-thread -restrict #-g CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 030881c2..af84e60c 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -4,7 +4,7 @@ #include #include "nanopolish_profile_hmm_r9.h" -#define MAX_STATES 512 +#define MAX_STATES 256 #define EXPAND_TO_STRING(X) #X #define TO_STRING(X) EXPAND_TO_STRING(X) @@ -59,6 +59,7 @@ __global__ void getScoresMod (float * poreModelDev, int * kmerRanksDev, int * seqIdxDev, int * readIdxDev, + int numScores, float * returnValuesDev){ bool debug = false; @@ -67,213 +68,218 @@ __global__ void getScoresMod (float * poreModelDev, } // get buffer indices - int scoreIdx = threadIdx.x; - int readIdx = readIdxDev[scoreIdx]; - int seqIdx = seqIdxDev[scoreIdx]; - - // get read statistics - int numEvents = readLengthsDev[readIdx]; - int readOffset = eventOffsetsDev[readIdx]; - float read_events_per_base = eventsPerBaseDev[readIdx]; - int e_start = eventStartsDev[readIdx]; // Event start for read - int e_stride = eventStridesDev[readIdx]; - int e_offset = eventOffsetsDev[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event - float scale = scaleDev[readIdx]; - float shift = shiftDev[readIdx]; - float var = varDev[readIdx]; - float logVar = logVarDev[readIdx]; - - // 
get sequence statistics - int numKmers = sequenceLengthsDev[seqIdx]; - int seqOffset = sequenceOffsetsDev[seqIdx]; - - printf("This is thread %i, seqIdx is %i, readIdx is %i, numKmers is %i, seqOffset is %i\n", threadIdx.x, seqIdx, readIdx, numKmers, seqOffset); - - int lastRowIdx = numEvents -1; - int lastKmerIdx = numKmers - 1; - - float returnValue = -INFINITY; //Used to sum over the last column. - float prevProbabilities[MAX_STATES]; - - int numBlocks = numKmers + 2; - int numStates = numBlocks * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. - - if(debug){ - printf("Kernel 1 >>> Num Kmers is %i\n", numKmers); - printf("Kernel 1 >>> n_states %i\n", numStates); - printf("Kernel 1 >>> num events in read is %i\n", numEvents); - printf("Kernel 1 >>> event offset is %i\n", e_offset); - } - - // Initialise the prev probabilities vector - for (int i = 0; i < numStates - PSR9_NUM_STATES; i++) { - prevProbabilities[i] = -INFINITY; - } - for (int i = numStates - PSR9_NUM_STATES; i < numStates; i++) { - prevProbabilities[i] = 0.0f; - } - - bool rc = false; - if (e_stride == -1){ - rc = true; - } - - float p_stay = 1 - (1 / read_events_per_base); - float p_skip = 0.0025; - float p_bad = 0.001; - float p_bad_self = p_bad; - float p_skip_self = 0.3; - float p_mk = p_skip; // probability of not observing an event at all - float p_mb = p_bad; // probabilty of observing a bad event - float p_mm_self = p_stay; // probability of observing additional events from this k-mer - float p_mm_next = 1.0f - p_mm_self - p_mk - p_mb; // normal movement from state to state - // transitions from event split state in previous block - float p_bb = p_bad_self; - float p_bk, p_bm_next, p_bm_self; - p_bk = p_bm_next = p_bm_self = (1.0f - p_bb) / 3; - // transitions from kmer skip state in previous block - float p_kk = p_skip_self; - float p_km = 1.0f - p_kk; - // We assign some transition probabilities. 
I believe this is correct and they don't vary by location in the sequence - float lp_mk = logf(p_mk); - float lp_mb = logf(p_mb); - float lp_mm_self = logf(p_mm_self); - float lp_mm_next = logf(p_mm_next); - float lp_bb = logf(p_bb); - float lp_bk = logf(p_bk); - float lp_bm_next = logf(p_bm_next); - float lp_bm_self = logf(p_bm_self); - float lp_kk = logf(p_kk); - float lp_km = logf(p_km); - float lp_sm, lp_ms; - lp_sm = lp_ms = 0.0f; + int scoreIdx = blockIdx.x * blockDim.x + threadIdx.x; + + if (scoreIdx < numScores) { + + int readIdx = readIdxDev[scoreIdx]; + int seqIdx = seqIdxDev[scoreIdx]; + + // get read statistics + int numEvents = readLengthsDev[readIdx]; + int readOffset = eventOffsetsDev[readIdx]; + float read_events_per_base = eventsPerBaseDev[readIdx]; + int e_start = eventStartsDev[readIdx]; // Event start for read + int e_stride = eventStridesDev[readIdx]; + int e_offset = eventOffsetsDev[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event + float scale = scaleDev[readIdx]; + float shift = shiftDev[readIdx]; + float var = varDev[readIdx]; + float logVar = logVarDev[readIdx]; + + // get sequence statistics + int numKmers = sequenceLengthsDev[seqIdx]; + int seqOffset = sequenceOffsetsDev[seqIdx]; + + int lastRowIdx = numEvents - 1; + int lastKmerIdx = numKmers - 1; + + float returnValue = -INFINITY; //Used to sum over the last column. + float prevProbabilities[MAX_STATES]; + + int numBlocks = numKmers + 2; + int numStates = numBlocks * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. 
+ + if (debug) { + printf("Kernel 1 >>> Num Kmers is %i\n", numKmers); + printf("Kernel 1 >>> n_states %i\n", numStates); + printf("Kernel 1 >>> num events in read is %i\n", numEvents); + printf("Kernel 1 >>> event offset is %i\n", e_offset); + } - // the penalty is controlled by the transition probability - float BAD_EVENT_PENALTY = 0.0f; + // Initialise the prev probabilities vector + for (int i = 0; i < numStates - PSR9_NUM_STATES; i++) { + prevProbabilities[i] = -INFINITY; + } + for (int i = numStates - PSR9_NUM_STATES; i < numStates; i++) { + prevProbabilities[i] = 0.0f; + } - //Fill out the dynamic programming table - for(int row=1; row>> GpuAligner::scoreKernelMod(std::ve globalSequenceIdx++; } - } // All data is now in host buffers - perform memcpys @@ -825,11 +830,13 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve // Launch Kernels - dim3 dimBlock(globalScoreIdx); // TODO: divide work into smaller blocks - dim3 dimGrid(1); + int blockSize = 32; + int numBlocks = (globalScoreIdx + blockSize - 1 ) / blockSize; + dim3 dimBlock(blockSize); + dim3 dimGrid(numBlocks); //printf("Launching get scores mod kernel\n"); - getScoresMod <<< dimGrid, dimBlock, MAX_STATES * sizeof(int), streams[0]>>> (poreModelDev, + getScoresMod <<< dimGrid, dimBlock, 0, streams[0]>>> (poreModelDev, readLengthsDev, eventStartsDev, eventStridesDev, @@ -847,6 +854,7 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve kmerRanksDev, seqIdxDev, readIdxDev, + globalScoreIdx, scoresDev); cudaError_t err = cudaGetLastError(); @@ -866,12 +874,15 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve int numSequences = scoreSet.stateSequences.size(); int numReads = scoreSet.rawData.size(); for (int seqIdx=0; seqIdx seqScores; + + std::vector seqScores(numReads); + for (int readIdx=0; readIdx GpuAligner::variantScoresThresholded(std::vector i std::vector v = input_variants; if (!event_sequences.empty()) { - std::vector> scores = scoreKernel(sequences, event_sequences, alignment_flags); - 
//std::vector> scores; + //std::vector> scores = scoreKernel(sequences, event_sequences, alignment_flags); + // Now try it with the new method ScoreSet s = { sequences, event_sequences }; - std::vector scoreSets(1,s); + std::vector scoreSets; + scoreSets.push_back(s); + //scoreSets.push_back(s); + //scoreSets.push_back(s); + //scoreSets.push_back(s); + //scoreSets.push_back(s); + //scoreSets.push_back(s); - std::vector>> scoresMod = scoreKernelMod(scoreSets, alignment_flags); + auto scoresMod = scoreKernelMod(scoreSets, alignment_flags); + std::vector> scores = scoresMod[0]; uint32_t numScores = scores[0].size(); for (int variantIndex = 0; variantIndex < numVariants; variantIndex++) { // index 0 is the base scores @@ -1138,7 +1156,7 @@ std::vector GpuAligner::variantScoresThresholded(std::vector i v[variantIndex].quality = totalScore; v[variantIndex].info = ""; } - } + } return v; } diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 1b01ff2c..509c98e8 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -388,7 +388,7 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali // Add all positively-scoring single-base changes into the candidate set if (opt::gpu) { - size_t num_workers = 1; + size_t num_workers = 8; std::vector gpuAligners(num_workers); //std::vector workerThreads(num_workers); From 8136fa6888ab297d188e99980f358e817af8a663 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Sat, 21 Jul 2018 01:17:22 +0100 Subject: [PATCH 45/80] New Kernel working in multi-base mode. 
Code needs big refactor and testing --- src/cuda_kernels/GpuAligner.cu | 91 +++++++++------- src/cuda_kernels/GpuAligner.h | 10 +- src/nanopolish_call_variants.cpp | 179 ++++++++++++++++++------------- 3 files changed, 163 insertions(+), 117 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index af84e60c..985c6a95 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -653,7 +653,7 @@ GpuAligner::~GpuAligner() { } -std::vector>> GpuAligner::scoreKernelMod(std::vector scoreSets, +std::vector>> GpuAligner::scoreKernelMod(std::vector &scoreSets, uint32_t alignment_flags){ int numEventsTotal = 0; // The number of events across all scoreSets @@ -669,7 +669,7 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve //Loop over every scoreset, filling out buffers and counters for (int scoreSetIdx=0; scoreSetIdx < numScoreSets; scoreSetIdx++){ - auto &scoreSet = scoreSets[scoreSetIdx]; + auto scoreSet = scoreSets[scoreSetIdx]; int firstReadIdxinScoreSet = globalReadIdx; //Read data for (int eventSequenceIdx=0; eventSequenceIdx < scoreSet.rawData.size();eventSequenceIdx++){ @@ -1092,12 +1092,22 @@ std::vector> GpuAligner::scoreKernel(std::vector GpuAligner::variantScoresThresholded(std::vector input_variants, - Haplotype base_haplotype, - std::vector event_sequences, +std::vector GpuAligner::variantScoresThresholded(std::vector> input_variants_vector, + std::vector base_haplotypes, + std::vector> event_sequences_vector, uint32_t alignment_flags, int screen_score_threshold, std::vector methylation_types) { + int numScoreSets = base_haplotypes.size(); + std::vector scoreSets; + scoreSets.resize(numScoreSets); + + for(int scoreSetIdx=0; scoreSetIdx out_variants = input_variants; @@ -1122,41 +1132,42 @@ std::vector GpuAligner::variantScoresThresholded(std::vector i sequences.push_back(variant_sequence); } - std::vector v = input_variants; - - if (!event_sequences.empty()) { - //std::vector> scores = 
scoreKernel(sequences, event_sequences, alignment_flags); - - // Now try it with the new method - ScoreSet s = { - sequences, - event_sequences - }; - - std::vector scoreSets; - scoreSets.push_back(s); - //scoreSets.push_back(s); - //scoreSets.push_back(s); - //scoreSets.push_back(s); - //scoreSets.push_back(s); - //scoreSets.push_back(s); - - auto scoresMod = scoreKernelMod(scoreSets, alignment_flags); - std::vector> scores = scoresMod[0]; - - uint32_t numScores = scores[0].size(); - for (int variantIndex = 0; variantIndex < numVariants; variantIndex++) { // index 0 is the base scores - double totalScore = 0.0; - for (int k = 0; k < numScores; k++) { - if (fabs(totalScore) < screen_score_threshold) { - double baseScore = scores[0][k]; - totalScore += (scores[variantIndex + 1][k] - baseScore); - } - } - v[variantIndex].quality = totalScore; - v[variantIndex].info = ""; - } + ScoreSet s = { + sequences, + event_sequences + }; + scoreSets[scoreSetIdx] = s; + + } + + std::vector v; + if (!event_sequences_vector.empty()) { + //std::vector> scores = scoreKernel(sequences, event_sequences, alignment_flags); + + auto scoresMod = scoreKernelMod(scoreSets, alignment_flags); + + // results are now ready, need to unpack them + for (int scoreSetIdx=0; scoreSetIdx> scores = scoresMod[scoreSetIdx]; // scores for this candidate, including all variants and base(zeroth) + int numVariants = scores.size() - 1; // subtract one for the base + int numScores = scores[0].size(); + + for (int variantIndex = 0; variantIndex < numVariants; variantIndex++) { // index 0 is the base scores + double totalScore = 0.0; + for (int k = 0; k < numScores; k++) { + if (fabs(totalScore) < screen_score_threshold) { + double baseScore = scores[0][k]; + totalScore += (scores[variantIndex + 1][k] - baseScore); + } + } + // get the old variant: + auto unScoredVariant = input_variants_vector[scoreSetIdx][variantIndex]; + unScoredVariant.quality = totalScore; + unScoredVariant.info = ""; + 
v.push_back(unScoredVariant); + } } - return v; + } + return v; } diff --git a/src/cuda_kernels/GpuAligner.h b/src/cuda_kernels/GpuAligner.h index e586bfdc..d72ce9c2 100644 --- a/src/cuda_kernels/GpuAligner.h +++ b/src/cuda_kernels/GpuAligner.h @@ -45,8 +45,8 @@ //Data to be scored typedef struct { - std::vector &stateSequences; - std::vector &rawData; + std::vector stateSequences; + std::vector rawData; } ScoreSet; class GpuAligner @@ -56,13 +56,15 @@ class GpuAligner ~GpuAligner(); std::vector - variantScoresThresholded(std::vector tmp_variants, Haplotype haplotype, std::vector event_sequences, + variantScoresThresholded(std::vector>, + std::vector, + std::vector>, uint32_t alignment_flags, int screen_score_threshold, std::vector methylation_types); std::vector> scoreKernel(std::vector sequences, std::vector event_sequences, uint32_t alignment_flags); - std::vector>> scoreKernelMod(std::vector scoreSets, + std::vector>> scoreKernelMod(std::vector &scoreSets, uint32_t alignment_flags); private: float* scaleDev; diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 509c98e8..1a7b2e96 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -284,24 +284,27 @@ void annotate_with_all_support(std::vector& variants, } -void singleLocusBaseEditCandidate(int i, - const AlignmentDB& alignments, - uint32_t alignment_flags, - std::vector &out_variants, - std::string contig, - GpuAligner &aligner, - std::mutex &outVariantsMutex -){ -try { - int calling_start = i - opt::screen_flanking_sequence; - int calling_end = i + 1 + opt::screen_flanking_sequence; - - if (!alignments.are_coordinates_valid(contig, calling_start, calling_end)) { +void locusRangeBaseEditCandidate(int start, int end, + const AlignmentDB& alignments, + uint32_t alignment_flags, + std::vector &out_variants, + std::string contig, + GpuAligner &aligner, + std::mutex &outVariantsMutex){ + try { + std::vector> tmp_variants_vector; + std::vector 
haplotypes; + std::vector> event_sequences_vector; + for(int i = start; i<=end; i++){ + int calling_start = i - opt::screen_flanking_sequence; + int calling_end = i + 1 + opt::screen_flanking_sequence; + + if (!alignments.are_coordinates_valid(contig, calling_start, calling_end)) { return; - } + } - std::vector tmp_variants; - for (size_t j = 0; j < 4; ++j) { + std::vector tmp_variants; + for (size_t j = 0; j < 4; ++j) { // Substitutions Variant v; v.ref_name = contig; @@ -310,43 +313,51 @@ try { v.alt_seq = "ACGT"[j]; if (v.ref_seq != v.alt_seq) { - tmp_variants.push_back(v); + tmp_variants.push_back(v); } // Insertions v.alt_seq = v.ref_seq + "ACGT"[j]; // ignore insertions of the type "A" -> "AA" as these are redundant if (v.alt_seq[1] != v.ref_seq[0]) { - tmp_variants.push_back(v); + tmp_variants.push_back(v); } - } + } - // deletion - Variant del; - del.ref_name = contig; - del.ref_position = i - 1; - del.ref_seq = alignments.get_reference_substring(contig, i - 1, i); - del.alt_seq = del.ref_seq[0]; + // deletion + Variant del; + del.ref_name = contig; + del.ref_position = i - 1; + del.ref_seq = alignments.get_reference_substring(contig, i - 1, i); + del.alt_seq = del.ref_seq[0]; - // ignore deletions of the type "AA" -> "A" as these are redundant - if (del.alt_seq[0] != del.ref_seq[1]) { + // ignore deletions of the type "AA" -> "A" as these are redundant + if (del.alt_seq[0] != del.ref_seq[1]) { tmp_variants.push_back(del); + } + + // Screen variants by score + // We do this internally here as it is much faster to get the event sequences + // for the entire window for all variants at this position once, rather than + // for each variant individually + std::vector event_sequences = + alignments.get_event_subsequences(contig, calling_start, calling_end); + + Haplotype test_haplotype(contig, + calling_start, + alignments.get_reference_substring(contig, + calling_start, + calling_end)); + + haplotypes.push_back(test_haplotype); + 
event_sequences_vector.push_back(event_sequences); + tmp_variants_vector.push_back(tmp_variants); } - - // Screen variants by score - // We do this internally here as it is much faster to get the event sequences - // for the entire window for all variants at this position once, rather than - // for each variant individually - std::vector event_sequences = - alignments.get_event_subsequences(contig, calling_start, calling_end); - - Haplotype test_haplotype(contig, - calling_start, - alignments.get_reference_substring(contig, calling_start, calling_end)); - + if (opt::gpu) { - std::vector scoredVariants = aligner.variantScoresThresholded(tmp_variants, test_haplotype, - event_sequences, + std::vector scoredVariants = aligner.variantScoresThresholded(tmp_variants_vector, + haplotypes, + event_sequences_vector, alignment_flags, opt::screen_score_threshold, opt::methylation_types); @@ -357,19 +368,19 @@ try { } } } else { - for (const Variant &v : tmp_variants) { - auto t0 = std::chrono::high_resolution_clock::now(); - Variant scored_variant = score_variant_thresholded(v, - test_haplotype, - event_sequences, - alignment_flags, - opt::screen_score_threshold, - opt::methylation_types); - scored_variant.info = ""; - if (scored_variant.quality > 0) { - out_variants.push_back(scored_variant); - } - } + //for (const Variant &v : tmp_variants) { + // auto t0 = std::chrono::high_resolution_clock::now(); + // Variant scored_variant = score_variant_thresholded(v, + // test_haplotype, + // event_sequences, + // alignment_flags, + // opt::screen_score_threshold, + // opt::methylation_types); + // scored_variant.info = ""; + // if (scored_variant.quality > 0) { + // out_variants.push_back(scored_variant); + // } + //} } }catch (std::exception &e){ printf("Exception in thread! 
%s\n", e.what()); @@ -393,42 +404,64 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali //std::vector workerThreads(num_workers); std::vector> handles(num_workers); - int nextLocus = region_start; + int lociPerWorker = 12; + int nextLocusBegin = region_start; + int nextLocusEnd = region_start; + + //printf("Initialising workers\n"); //Initialise workers for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { auto aligner = std::ref(gpuAligners[workerIdx]); - if (nextLocus < region_end) { + if (nextLocusEnd < region_end) { //TODO: Check this is correct. May be leaving some off at the end. May want to put icrements at start and redo this whole block. handles[workerIdx] = std::async(std::launch::async, - singleLocusBaseEditCandidate, - nextLocus, + locusRangeBaseEditCandidate, + nextLocusBegin, + nextLocusEnd, std::ref(alignments), alignment_flags, std::ref(out_variants), std::ref(contig), aligner, std::ref(outVariantsMutex)); - nextLocus++; + if ((nextLocusEnd + lociPerWorker) < region_end){ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = nextLocusBegin + lociPerWorker - 1; + }else{ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = region_end; + } } } + //printf("Workers initialised\n"); //Round robin the workers until done - while (nextLocus < region_end) { + while (nextLocusEnd < region_end) { for (int i = 0; i < num_workers; i++) { auto status = handles[i].wait_for(std::chrono::microseconds(100)); - if (status == std::future_status::ready && (nextLocus < region_end)) { + //printf("Got status\n"); + if (status == std::future_status::ready && (nextLocusEnd < region_end)) { + //printf("Entering the event loop, locus start is %i and end is %i\n", nextLocusBegin, nextLocusEnd); auto aligner = std::ref(gpuAligners[i]); + //printf("Sending work to a worker\n"); handles[i].get(); handles[i] = std::async(std::launch::async, - singleLocusBaseEditCandidate, - nextLocus, + locusRangeBaseEditCandidate, + nextLocusBegin, + nextLocusEnd, 
std::ref(alignments), alignment_flags, std::ref(out_variants), std::ref(contig), aligner, std::ref(outVariantsMutex)); - nextLocus++; + if ((nextLocusEnd + lociPerWorker) < region_end){ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = nextLocusBegin + lociPerWorker - 1; + }else{ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = region_end; + } } } } @@ -439,15 +472,15 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali } } else { GpuAligner aligner; //TODO: temporary - refactor to get rid of this - for (size_t i = region_start; i < region_end; ++i) { - singleLocusBaseEditCandidate(i, - std::ref(alignments), - alignment_flags, - std::ref(out_variants), - std::ref(contig), - std::ref(aligner), - std::ref(outVariantsMutex)); - } + //for (size_t i = region_start; i < region_end; ++i) { + // singleLocusBaseEditCandidate(i, + // std::ref(alignments), + // alignment_flags, + // std::ref(out_variants), + // std::ref(contig), + // std::ref(aligner), + // std::ref(outVariantsMutex)); + // } return out_variants; } From 9d4323907d6a8634147485fa9e46bff197537ac4 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Mon, 23 Jul 2018 13:17:25 +0100 Subject: [PATCH 46/80] Increased buffer sizes --- src/cuda_kernels/GpuAligner.cu | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 985c6a95..8767a40f 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -8,7 +8,7 @@ #define EXPAND_TO_STRING(X) #X #define TO_STRING(X) EXPAND_TO_STRING(X) -#define CU_CHECK_ERR(X) if (X != cudaSuccess){printf("CUDA error: %s at line %s\n", cudaGetErrorString(X), TO_STRING(__LINE__));throw std::runtime_error("CUDA ERRROR");} +#define CU_CHECK_ERR(X) if (X != cudaSuccess){printf("CUDA error: <<%s>> at line %s\n", cudaGetErrorString(X), TO_STRING(__LINE__));throw std::runtime_error("CUDA ERROR");} __device__ float logsumexpf(float x, float y){ if(x == 
-INFINITY && y == -INFINITY){ @@ -523,10 +523,10 @@ __global__ void getScores(float * const eventData, GpuAligner::GpuAligner() { int numModelElements = 4096; - int max_num_reads = 1000; + int max_num_reads = 5000; int readsSizeBuffer = max_num_reads * sizeof(int); int max_n_rows = 100; - int maxBuffer = 100000 * sizeof(float); //TODO: allocate more smartly + int maxBuffer = 500000 * sizeof(float); //TODO: allocate more smartly int max_num_sequences = 8; int max_sequence_length = 50; @@ -550,13 +550,11 @@ GpuAligner::GpuAligner() CU_CHECK_ERR(cudaMalloc( (void**)&readLengthsDev, readsSizeBuffer)); CU_CHECK_ERR(cudaHostAlloc(&readLengthsHost, readsSizeBuffer, cudaHostAllocDefault)); - // Allocate Device memory for pore model CU_CHECK_ERR(cudaMalloc((void**)&poreModelDev, numModelElements * 3 * sizeof(float))); CU_CHECK_ERR(cudaHostAlloc(&poreModelHost, numModelElements * sizeof(float) * 3, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&numRowsDev, max_n_rows * sizeof(int))); + CU_CHECK_ERR(cudaMalloc((void**)&numRowsDev, readsSizeBuffer * sizeof(int))); CU_CHECK_ERR(cudaMalloc((void**)&eventStartsDev, readsSizeBuffer)); CU_CHECK_ERR(cudaHostAlloc(&eventStartsHost, readsSizeBuffer, cudaHostAllocDefault)); @@ -782,9 +780,6 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve CU_CHECK_ERR(cudaMemcpyAsync(eventStartsDev, eventStartsHost, numReads * sizeof(int), cudaMemcpyHostToDevice, streams[0])); - CU_CHECK_ERR(cudaMemcpyAsync(eventStridesDev, eventStridesHost, - numReads * sizeof(int), cudaMemcpyHostToDevice, streams[0])); - CU_CHECK_ERR(cudaMemcpyAsync(eventsPerBaseDev, eventsPerBaseHost, numReads * sizeof(float), cudaMemcpyHostToDevice, streams[0])); @@ -794,6 +789,9 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve CU_CHECK_ERR(cudaMemcpyAsync(shiftDev, shiftHost, numReads * sizeof(int), cudaMemcpyHostToDevice, streams[0])); + CU_CHECK_ERR(cudaMemcpyAsync(eventStridesDev, eventStridesHost, + numReads * sizeof(int), cudaMemcpyHostToDevice, 
streams[0])); + CU_CHECK_ERR(cudaMemcpyAsync(varDev, varHost, numReads * sizeof(int), cudaMemcpyHostToDevice, streams[0])); From d7f2e3184489e67856118034e4aa4499cb58d1e2 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Mon, 23 Jul 2018 16:12:35 +0100 Subject: [PATCH 47/80] Fixed issue with bases at end not being corrected --- src/nanopolish_call_variants.cpp | 61 ++++++++++++++++---------------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 1a7b2e96..95304c71 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -405,68 +405,69 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali //std::vector workerThreads(num_workers); std::vector> handles(num_workers); - int lociPerWorker = 12; + int lociPerWorker = 12; int nextLocusBegin = region_start; - int nextLocusEnd = region_start; + int nextLocusEnd = nextLocusBegin + lociPerWorker; + bool finished = false; - //printf("Initialising workers\n"); - //Initialise workers for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { auto aligner = std::ref(gpuAligners[workerIdx]); - if (nextLocusEnd < region_end) { //TODO: Check this is correct. May be leaving some off at the end. May want to put icrements at start and redo this whole block. 
+ if (!finished) { + if (nextLocusEnd == region_end) { + finished = true; + } handles[workerIdx] = std::async(std::launch::async, locusRangeBaseEditCandidate, nextLocusBegin, - nextLocusEnd, + nextLocusEnd, std::ref(alignments), alignment_flags, std::ref(out_variants), std::ref(contig), aligner, std::ref(outVariantsMutex)); - if ((nextLocusEnd + lociPerWorker) < region_end){ - nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = nextLocusBegin + lociPerWorker - 1; - }else{ - nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = region_end; - } + if ((nextLocusEnd + lociPerWorker) < region_end){ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = nextLocusBegin + lociPerWorker - 1; + }else{ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = region_end; + } } } - //printf("Workers initialised\n"); //Round robin the workers until done - while (nextLocusEnd < region_end) { + while (!finished) { for (int i = 0; i < num_workers; i++) { auto status = handles[i].wait_for(std::chrono::microseconds(100)); - //printf("Got status\n"); - if (status == std::future_status::ready && (nextLocusEnd < region_end)) { - //printf("Entering the event loop, locus start is %i and end is %i\n", nextLocusBegin, nextLocusEnd); + if (status == std::future_status::ready && (!finished)) { + if (nextLocusEnd == region_end){ + finished = true; + } auto aligner = std::ref(gpuAligners[i]); - //printf("Sending work to a worker\n"); handles[i].get(); handles[i] = std::async(std::launch::async, - locusRangeBaseEditCandidate, - nextLocusBegin, - nextLocusEnd, + locusRangeBaseEditCandidate, + nextLocusBegin, + nextLocusEnd, std::ref(alignments), alignment_flags, std::ref(out_variants), std::ref(contig), aligner, std::ref(outVariantsMutex)); - if ((nextLocusEnd + lociPerWorker) < region_end){ - nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = nextLocusBegin + lociPerWorker - 1; - }else{ - nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = region_end; - } + if ((nextLocusEnd + lociPerWorker) < 
region_end){ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = nextLocusBegin + lociPerWorker - 1; + }else{ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = region_end; + } } } } - //Synchronize the remaining ones + //Block until all workers are complete for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { handles[workerIdx].wait(); } From 18effc5d29f9799d860914cc00eba61acee034e0 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Mon, 23 Jul 2018 16:42:32 +0100 Subject: [PATCH 48/80] 16 workers - better on V100 for now --- src/nanopolish_call_variants.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 95304c71..8f6d2caa 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -399,7 +399,7 @@ std::vector generate_candidate_single_base_edits(const AlignmentDB& ali // Add all positively-scoring single-base changes into the candidate set if (opt::gpu) { - size_t num_workers = 8; + size_t num_workers = 16; std::vector gpuAligners(num_workers); //std::vector workerThreads(num_workers); From 5b09cbc5ee4e824ce84227a80e86e0a1d8b5a04b Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Tue, 24 Jul 2018 16:13:46 +0100 Subject: [PATCH 49/80] Refactor of nanopolish_call_variants.cpp --- src/nanopolish_call_variants.cpp | 419 +++++++++++++++++-------------- 1 file changed, 232 insertions(+), 187 deletions(-) diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 8f6d2caa..fbf033c6 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -283,211 +283,248 @@ void annotate_with_all_support(std::vector& variants, } } +void prepareForBaseEditCandidates(int start, + int end, + const AlignmentDB& alignments, + std::string contig, + std::vector> &tmp_variants_vector, + std::vector &haplotypes, + std::vector> &event_sequences_vector +){ + for(int i = start; i<=end; i++){ + int 
calling_start = i - opt::screen_flanking_sequence; + int calling_end = i + 1 + opt::screen_flanking_sequence; + + if (!alignments.are_coordinates_valid(contig, calling_start, calling_end)) { + return; + } + + std::vector tmp_variants; + for (size_t j = 0; j < 4; ++j) { + // Substitutions + Variant v; + v.ref_name = contig; + v.ref_position = i; + v.ref_seq = alignments.get_reference_substring(contig, i, i); + v.alt_seq = "ACGT"[j]; + + if (v.ref_seq != v.alt_seq) { + tmp_variants.push_back(v); + } + + // Insertions + v.alt_seq = v.ref_seq + "ACGT"[j]; + // ignore insertions of the type "A" -> "AA" as these are redundant + if (v.alt_seq[1] != v.ref_seq[0]) { + tmp_variants.push_back(v); + } + } + + // deletion + Variant del; + del.ref_name = contig; + del.ref_position = i - 1; + del.ref_seq = alignments.get_reference_substring(contig, i - 1, i); + del.alt_seq = del.ref_seq[0]; + + // ignore deletions of the type "AA" -> "A" as these are redundant + if (del.alt_seq[0] != del.ref_seq[1]) { + tmp_variants.push_back(del); + } + + // Screen variants by score + // We do this internally here as it is much faster to get the event sequences + // for the entire window for all variants at this position once, rather than + // for each variant individually + std::vector event_sequences = alignments.get_event_subsequences(contig, calling_start, calling_end); + + Haplotype test_haplotype(contig, + calling_start, + alignments.get_reference_substring(contig, + calling_start, + calling_end)); + + haplotypes.push_back(test_haplotype); + event_sequences_vector.push_back(event_sequences); + tmp_variants_vector.push_back(tmp_variants); + } +} -void locusRangeBaseEditCandidate(int start, int end, - const AlignmentDB& alignments, - uint32_t alignment_flags, - std::vector &out_variants, - std::string contig, - GpuAligner &aligner, - std::mutex &outVariantsMutex){ - try { + +void locusRangeBaseEditCandidateGPU(int start, + int end, + const AlignmentDB& alignments, + uint32_t alignment_flags, 
+ std::vector &out_variants, + std::string contig, + GpuAligner &aligner, + std::mutex &outVariantsMutex) { std::vector> tmp_variants_vector; std::vector haplotypes; std::vector> event_sequences_vector; - for(int i = start; i<=end; i++){ - int calling_start = i - opt::screen_flanking_sequence; - int calling_end = i + 1 + opt::screen_flanking_sequence; - - if (!alignments.are_coordinates_valid(contig, calling_start, calling_end)) { - return; - } - - std::vector tmp_variants; - for (size_t j = 0; j < 4; ++j) { - // Substitutions - Variant v; - v.ref_name = contig; - v.ref_position = i; - v.ref_seq = alignments.get_reference_substring(contig, i, i); - v.alt_seq = "ACGT"[j]; - - if (v.ref_seq != v.alt_seq) { - tmp_variants.push_back(v); - } - // Insertions - v.alt_seq = v.ref_seq + "ACGT"[j]; - // ignore insertions of the type "A" -> "AA" as these are redundant - if (v.alt_seq[1] != v.ref_seq[0]) { - tmp_variants.push_back(v); + prepareForBaseEditCandidates(start, + end, + alignments, + contig, + tmp_variants_vector, + haplotypes, + event_sequences_vector); + + std::vector scoredVariants = aligner.variantScoresThresholded(tmp_variants_vector, + haplotypes, + event_sequences_vector, + alignment_flags, + opt::screen_score_threshold, + opt::methylation_types); + for (auto variant: scoredVariants) { + if (variant.quality > 0) { + std::lock_guard lock(outVariantsMutex); + out_variants.push_back(variant); } - } - - // deletion - Variant del; - del.ref_name = contig; - del.ref_position = i - 1; - del.ref_seq = alignments.get_reference_substring(contig, i - 1, i); - del.alt_seq = del.ref_seq[0]; - - // ignore deletions of the type "AA" -> "A" as these are redundant - if (del.alt_seq[0] != del.ref_seq[1]) { - tmp_variants.push_back(del); - } - - // Screen variants by score - // We do this internally here as it is much faster to get the event sequences - // for the entire window for all variants at this position once, rather than - // for each variant individually - std::vector 
event_sequences = - alignments.get_event_subsequences(contig, calling_start, calling_end); - - Haplotype test_haplotype(contig, - calling_start, - alignments.get_reference_substring(contig, - calling_start, - calling_end)); - - haplotypes.push_back(test_haplotype); - event_sequences_vector.push_back(event_sequences); - tmp_variants_vector.push_back(tmp_variants); } - - if (opt::gpu) { - std::vector scoredVariants = aligner.variantScoresThresholded(tmp_variants_vector, - haplotypes, - event_sequences_vector, - alignment_flags, - opt::screen_score_threshold, - opt::methylation_types); - for (auto variant: scoredVariants) { - if (variant.quality > 0) { - std::lock_guard lock(outVariantsMutex); - out_variants.push_back(variant); + +} + +void locusRangeBaseEditCandidate(int start, + int end, + const AlignmentDB& alignments, + uint32_t alignment_flags, + std::vector &out_variants, + std::string contig) { + std::vector> tmp_variants_vector; + std::vector haplotypes; + std::vector> event_sequences_vector; + + prepareForBaseEditCandidates(start, + end, + alignments, + contig, + tmp_variants_vector, + haplotypes, + event_sequences_vector); + + int numHaplotypes = haplotypes.size(); + for (int haplotypeIDX = 0; haplotypeIDX < numHaplotypes; haplotypeIDX++) { + auto variants = tmp_variants_vector[haplotypeIDX]; + auto test_haplotype = haplotypes[haplotypeIDX]; + auto event_sequences = event_sequences_vector[haplotypeIDX]; + for (const Variant &v : variants) { + auto t0 = std::chrono::high_resolution_clock::now(); + Variant scored_variant = score_variant_thresholded(v, + test_haplotype, + event_sequences, + alignment_flags, + opt::screen_score_threshold, + opt::methylation_types); + scored_variant.info = ""; + if (scored_variant.quality > 0) { + out_variants.push_back(scored_variant); } } - } else { - //for (const Variant &v : tmp_variants) { - // auto t0 = std::chrono::high_resolution_clock::now(); - // Variant scored_variant = score_variant_thresholded(v, - // test_haplotype, 
- // event_sequences, - // alignment_flags, - // opt::screen_score_threshold, - // opt::methylation_types); - // scored_variant.info = ""; - // if (scored_variant.quality > 0) { - // out_variants.push_back(scored_variant); - // } - //} } -}catch (std::exception &e){ - printf("Exception in thread! %s\n", e.what()); -} } -// Given the input region, calculate all single base edits to the current assembly -std::vector generate_candidate_single_base_edits(const AlignmentDB& alignments, - int region_start, - int region_end, - uint32_t alignment_flags){ - try { - std::vector out_variants; - std::string contig = alignments.get_region_contig(); - std::mutex outVariantsMutex; - - // Add all positively-scoring single-base changes into the candidate set - if (opt::gpu) { - size_t num_workers = 16; - std::vector gpuAligners(num_workers); - - //std::vector workerThreads(num_workers); - std::vector> handles(num_workers); - - int lociPerWorker = 12; - int nextLocusBegin = region_start; - int nextLocusEnd = nextLocusBegin + lociPerWorker; - bool finished = false; - - for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { - auto aligner = std::ref(gpuAligners[workerIdx]); - if (!finished) { - if (nextLocusEnd == region_end) { - finished = true; - } - handles[workerIdx] = std::async(std::launch::async, - locusRangeBaseEditCandidate, - nextLocusBegin, - nextLocusEnd, - std::ref(alignments), - alignment_flags, - std::ref(out_variants), - std::ref(contig), - aligner, - std::ref(outVariantsMutex)); - if ((nextLocusEnd + lociPerWorker) < region_end){ - nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = nextLocusBegin + lociPerWorker - 1; - }else{ - nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = region_end; - } - } - } +std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& alignments, + int region_start, + int region_end, + uint32_t alignment_flags){ - //Round robin the workers until done - while (!finished) { - for (int i = 0; i < num_workers; i++) { - 
auto status = handles[i].wait_for(std::chrono::microseconds(100)); - if (status == std::future_status::ready && (!finished)) { - if (nextLocusEnd == region_end){ - finished = true; - } - auto aligner = std::ref(gpuAligners[i]); - handles[i].get(); - handles[i] = std::async(std::launch::async, - locusRangeBaseEditCandidate, - nextLocusBegin, - nextLocusEnd, - std::ref(alignments), - alignment_flags, - std::ref(out_variants), - std::ref(contig), - aligner, - std::ref(outVariantsMutex)); - if ((nextLocusEnd + lociPerWorker) < region_end){ - nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = nextLocusBegin + lociPerWorker - 1; - }else{ - nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = region_end; - } - } - } + std::mutex outVariantsMutex; + std::vector out_variants; + std::string contig = alignments.get_region_contig(); + + // Add all positively-scoring single-base changes into the candidate set + size_t num_workers = 16; + std::vector gpuAligners(num_workers); + + //std::vector workerThreads(num_workers); + std::vector> handles(num_workers); + + int lociPerWorker = 12; + int nextLocusBegin = region_start; + int nextLocusEnd = nextLocusBegin + lociPerWorker; + bool finished = false; + + for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { + auto aligner = std::ref(gpuAligners[workerIdx]); + if (!finished) { + if (nextLocusEnd == region_end) { + finished = true; + } + handles[workerIdx] = std::async(std::launch::async, + locusRangeBaseEditCandidateGPU, + nextLocusBegin, + nextLocusEnd, + std::ref(alignments), + alignment_flags, + std::ref(out_variants), + std::ref(contig), + aligner, + std::ref(outVariantsMutex)); + if ((nextLocusEnd + lociPerWorker) < region_end){ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = nextLocusBegin + lociPerWorker - 1; + }else{ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = region_end; } + } + } - //Block until all workers are complete - for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { - 
handles[workerIdx].wait(); + //Round robin the workers until done + while (!finished) { + for (int i = 0; i < num_workers; i++) { + auto status = handles[i].wait_for(std::chrono::microseconds(100)); + if (status == std::future_status::ready && (!finished)) { + if (nextLocusEnd == region_end){ + finished = true; + } + auto aligner = std::ref(gpuAligners[i]); + handles[i].get(); + handles[i] = std::async(std::launch::async, + locusRangeBaseEditCandidateGPU, + nextLocusBegin, + nextLocusEnd, + std::ref(alignments), + alignment_flags, + std::ref(out_variants), + std::ref(contig), + aligner, + std::ref(outVariantsMutex)); + if ((nextLocusEnd + lociPerWorker) < region_end){ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = nextLocusBegin + lociPerWorker - 1; + }else{ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = region_end; + } } - } else { - GpuAligner aligner; //TODO: temporary - refactor to get rid of this - //for (size_t i = region_start; i < region_end; ++i) { - // singleLocusBaseEditCandidate(i, - // std::ref(alignments), - // alignment_flags, - // std::ref(out_variants), - // std::ref(contig), - // std::ref(aligner), - // std::ref(outVariantsMutex)); - // } - return out_variants; } - catch(std::exception &e){ - printf("Excpetion in calling thread: %s\n", e.what()); + + //Block until all workers are complete + for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { + handles[workerIdx].wait(); } + return out_variants; +} + +// Given the input region, calculate all single base edits to the current assembly +std::vector generate_candidate_single_base_edits(const AlignmentDB& alignments, + int region_start, + int region_end, + uint32_t alignment_flags){ + std::vector out_variants; + std::string contig = alignments.get_region_contig(); + locusRangeBaseEditCandidate(region_start, + region_end, + alignments, + alignment_flags, + out_variants, + std::ref(contig)); + + return out_variants; } // Given the input set of variants, calculate the variants 
that have a positive score @@ -1048,7 +1085,15 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, if(opt::consensus_mode) { // generate single-base edits that have a positive haplotype score - std::vector single_base_edits = generate_candidate_single_base_edits(alignments, region_start, region_end, alignment_flags); + + std::vector single_base_edits; + if(opt::gpu) { + single_base_edits = generate_candidate_single_base_edits_gpu(alignments, region_start, region_end, + alignment_flags); + } else { + single_base_edits = generate_candidate_single_base_edits(alignments, region_start, region_end, + alignment_flags); + } // insert these into the candidate set candidate_variants.insert(candidate_variants.end(), single_base_edits.begin(), single_base_edits.end()); From 68fb38eb38fadc196ba57f19db1879d37e7228b1 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Tue, 24 Jul 2018 16:58:11 +0100 Subject: [PATCH 50/80] Fewer and bigger streams --- src/cuda_kernels/GpuAligner.cu | 22 ++++++++++++---------- src/cuda_kernels/GpuAligner.h | 4 ++++ src/nanopolish_call_variants.cpp | 13 ++++++------- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 8767a40f..5f2f287c 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -522,13 +522,15 @@ __global__ void getScores(float * const eventData, GpuAligner::GpuAligner() { - int numModelElements = 4096; - int max_num_reads = 5000; - int readsSizeBuffer = max_num_reads * sizeof(int); - int max_n_rows = 100; - int maxBuffer = 500000 * sizeof(float); //TODO: allocate more smartly + size_t numModelElements = 4096; + size_t max_reads_per_worker = LOCI_PER_WORKER * MAX_COVERAGE; + int readsSizeBuffer = max_reads_per_worker * sizeof(int); + int maxBuffer = max_reads_per_worker * MAX_SEQUENCE_LENGTH * sizeof(int); //4MB buffer + + //OLD int max_num_sequences = 8; int max_sequence_length = 50; + int max_n_rows 
= 100; poreModelInitialized = false; @@ -544,12 +546,12 @@ GpuAligner::GpuAligner() CU_CHECK_ERR(cudaMalloc((void**)&logVarDev, readsSizeBuffer)); CU_CHECK_ERR(cudaHostAlloc(&logVarHost, readsSizeBuffer, cudaHostAllocDefault)); - CU_CHECK_ERR(cudaMalloc( (void**)&eventsPerBaseDev, maxBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&eventsPerBaseHost, maxBuffer, cudaHostAllocDefault)); - CU_CHECK_ERR(cudaMalloc( (void**)&readLengthsDev, readsSizeBuffer)); CU_CHECK_ERR(cudaHostAlloc(&readLengthsHost, readsSizeBuffer, cudaHostAllocDefault)); + CU_CHECK_ERR(cudaMalloc( (void**)&eventsPerBaseDev, maxBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&eventsPerBaseHost, maxBuffer, cudaHostAllocDefault)); + // Allocate Device memory for pore model CU_CHECK_ERR(cudaMalloc((void**)&poreModelDev, numModelElements * 3 * sizeof(float))); CU_CHECK_ERR(cudaHostAlloc(&poreModelHost, numModelElements * sizeof(float) * 3, cudaHostAllocDefault)); @@ -605,8 +607,8 @@ GpuAligner::GpuAligner() float * returnValuesDev; float * returnedValues; - CU_CHECK_ERR(cudaMalloc((void**)&returnValuesDev, sizeof(float) * max_num_reads)); //one score per read - CU_CHECK_ERR(cudaHostAlloc(&returnedValues, max_num_reads * sizeof(float) , cudaHostAllocDefault)); + CU_CHECK_ERR(cudaMalloc((void**)&returnValuesDev, sizeof(float) * 50)); //one score per read + CU_CHECK_ERR(cudaHostAlloc(&returnedValues, 59 * sizeof(float) , cudaHostAllocDefault)); CU_CHECK_ERR(cudaMalloc((void**)&kmerRanksDev, max_n_rows * sizeof(int))); kmerRanksDevPointers[i] = kmerRanksDev; diff --git a/src/cuda_kernels/GpuAligner.h b/src/cuda_kernels/GpuAligner.h index d72ce9c2..4169ecd3 100644 --- a/src/cuda_kernels/GpuAligner.h +++ b/src/cuda_kernels/GpuAligner.h @@ -43,6 +43,10 @@ #ifndef GPU_ALIGNER_H #define GPU_ALIGNER_H +#define LOCI_PER_WORKER 64 +#define MAX_COVERAGE 500 +#define MAX_SEQUENCE_LENGTH 100 + //Data to be scored typedef struct { std::vector stateSequences; diff --git a/src/nanopolish_call_variants.cpp 
b/src/nanopolish_call_variants.cpp index fbf033c6..4bffbfb4 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -435,15 +435,14 @@ std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& std::string contig = alignments.get_region_contig(); // Add all positively-scoring single-base changes into the candidate set - size_t num_workers = 16; + size_t num_workers = 4; std::vector gpuAligners(num_workers); //std::vector workerThreads(num_workers); std::vector> handles(num_workers); - int lociPerWorker = 12; int nextLocusBegin = region_start; - int nextLocusEnd = nextLocusBegin + lociPerWorker; + int nextLocusEnd = nextLocusBegin + LOCI_PER_WORKER; bool finished = false; for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { @@ -462,9 +461,9 @@ std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& std::ref(contig), aligner, std::ref(outVariantsMutex)); - if ((nextLocusEnd + lociPerWorker) < region_end){ + if ((nextLocusEnd + LOCI_PER_WORKER) < region_end){ nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = nextLocusBegin + lociPerWorker - 1; + nextLocusEnd = nextLocusBegin + LOCI_PER_WORKER - 1; }else{ nextLocusBegin = nextLocusEnd + 1; nextLocusEnd = region_end; @@ -492,9 +491,9 @@ std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& std::ref(contig), aligner, std::ref(outVariantsMutex)); - if ((nextLocusEnd + lociPerWorker) < region_end){ + if ((nextLocusEnd + LOCI_PER_WORKER) < region_end){ nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = nextLocusBegin + lociPerWorker - 1; + nextLocusEnd = nextLocusBegin + LOCI_PER_WORKER - 1; }else{ nextLocusBegin = nextLocusEnd + 1; nextLocusEnd = region_end; From bb69f2e259e618b2fc5c8a102d63bcffcf97e86d Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Wed, 25 Jul 2018 10:19:10 +0100 Subject: [PATCH 51/80] fixing a memory leak --- src/cuda_kernels/GpuAligner.cu | 502 ++----------------------------- src/nanopolish_call_variants.cpp | 
6 +- 2 files changed, 29 insertions(+), 479 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 5f2f287c..078ef203 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -282,243 +282,6 @@ __global__ void getScoresMod (float * poreModelDev, } } -__global__ void getScores(float * const eventData, - float * const readEventsPerBase, - int * const numRowsPerRead, - int * const eventStarts, - int * const eventStrides, - int * const kmerRanks, - int * const eventOffsets, // Offset to use for getting an event IDX for a specific read (read obtained by block IDX) - float * const poreModelDev, - float * const scaleDev, - float * const shiftDev, - float * const varDev, - float * const logVarDev, - float * const preFlankingDev, - float * const postFlankingDev, - float * returnValues) { - - bool debug = false; - if (threadIdx.x == 0 && blockIdx.x == 0){ - debug = false; - } - - // Initialise the prev probability row, which is the row of the DP table - int n_kmers = blockDim.x; - int n_states = n_kmers * PSR9_NUM_STATES + 2 * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. - - - - __shared__ float returnValue; - returnValue = -INFINITY; - - __shared__ float prevProbabilities[MAX_STATES]; - - // Initialise the previous probabilities - this may not be quite correct as the intialization is different to the C++ version but I don't think it matter - for (int i = 0; i < n_states - PSR9_NUM_STATES; i++) { - prevProbabilities[i] = -INFINITY; - } - for (int i = n_states - PSR9_NUM_STATES; i < n_states; i++) { - prevProbabilities[i] = 0.0f; // Is this correct? - } - - //Step 1: calculate transitions. For now we are going to use external params. - int readIdx = blockIdx.x; - float read_events_per_base = readEventsPerBase[readIdx]; - int numRows = numRowsPerRead[readIdx]; // Number of rows in this DP table. 
- int e_start = eventStarts[readIdx]; // Event start for read - int e_stride = eventStrides[readIdx]; - int e_offset = eventOffsets[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event - - if(debug){ - printf("Kernel 0 >>> Num Kmers is %i\n", n_kmers); - printf("Kernel 0 >>> n_states %i\n", n_states); - printf("Kernel 0 >>> num events in read is %i\n", numRows); - printf("Kernel 0 >>> event offset is is %i\n", e_offset); - } - - bool rc = false; - if (e_stride == -1){ - rc = true; - } - - int kmerIdx = threadIdx.x; - - uint32_t rank = kmerRanks[kmerIdx + (n_kmers * rc)]; - - float pore_mean = poreModelDev[rank * 3]; - float pore_stdv = poreModelDev[rank * 3 + 1]; - float pore_log_level_stdv = poreModelDev[rank * 3 + 2]; - - - float p_stay = 1 - (1 / read_events_per_base); - float p_skip = 0.0025; - float p_bad = 0.001; - float p_bad_self = p_bad; - float p_skip_self = 0.3; - - float p_mk = p_skip; // probability of not observing an event at all - float p_mb = p_bad; // probabilty of observing a bad event - float p_mm_self = p_stay; // probability of observing additional events from this k-mer - float p_mm_next = 1.0f - p_mm_self - p_mk - p_mb; // normal movement from state to state - - // transitions from event split state in previous block - float p_bb = p_bad_self; - float p_bk, p_bm_next, p_bm_self; - p_bk = p_bm_next = p_bm_self = (1.0f - p_bb) / 3; - - // transitions from kmer skip state in previous block - float p_kk = p_skip_self; - float p_km = 1.0f - p_kk; - - // We assign some transition probabilities. 
I believe this is correct and they don't vary by location in the sequence - float lp_mk = logf(p_mk); - float lp_mb = logf(p_mb); - float lp_mm_self = logf(p_mm_self); - float lp_mm_next = logf(p_mm_next); - float lp_bb = logf(p_bb); - float lp_bk = logf(p_bk); - float lp_bm_next = logf(p_bm_next); - float lp_bm_self = logf(p_bm_self); - float lp_kk = logf(p_kk); - float lp_km = logf(p_km); - - float lp_sm, lp_ms; - lp_sm = lp_ms = 0.0f; - - // Start filling out the "DP table" - // Each thread is going to work on an individual P-HMM Block - int curBlockIdx = kmerIdx + 1; // Accounts for fact that we are not working with start block. - int prevBlockIdx = curBlockIdx -1; - int prevBlockOffset = PSR9_NUM_STATES * prevBlockIdx; - int curBlockOffset = PSR9_NUM_STATES * curBlockIdx; - - // the penalty is controlled by the transition probability - float BAD_EVENT_PENALTY = 0.0f; - - float scale = scaleDev[readIdx]; - float shift = shiftDev[readIdx]; - float var = varDev[readIdx]; - float logVar = logVarDev[readIdx]; - - for(int row=1; row>> GpuAligner::scoreKernelMod(std::vector &scoreSets, @@ -808,25 +561,25 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve numReads * sizeof(int), cudaMemcpyHostToDevice, streams[0])); // Reads + Flanks - CU_CHECK_ERR(cudaMemcpyAsync( eventMeansDev, eventMeans, numEventsTotal * sizeof(float), cudaMemcpyHostToDevice, streams[0] )); + CU_CHECK_ERR(cudaMemcpyAsync(eventMeansDev, eventMeans, numEventsTotal * sizeof(float), cudaMemcpyHostToDevice, streams[0] )); - CU_CHECK_ERR(cudaMemcpyAsync( preFlankingDev, preFlankingHost, numEventsTotal * sizeof(float), cudaMemcpyHostToDevice, streams[0] )); + CU_CHECK_ERR(cudaMemcpyAsync(preFlankingDev, preFlankingHost, numEventsTotal * sizeof(float), cudaMemcpyHostToDevice, streams[0] )); - CU_CHECK_ERR(cudaMemcpyAsync( postFlankingDev, postFlankingHost, numEventsTotal * sizeof(float), cudaMemcpyHostToDevice, streams[0] )); + CU_CHECK_ERR(cudaMemcpyAsync(postFlankingDev, postFlankingHost, 
numEventsTotal * sizeof(float), cudaMemcpyHostToDevice, streams[0] )); // Sequence statistics - CU_CHECK_ERR(cudaMemcpyAsync( sequenceLengthsDev, sequenceLengthsHost, numSequences * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); + CU_CHECK_ERR(cudaMemcpyAsync(sequenceLengthsDev, sequenceLengthsHost, numSequences * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); // Sequence offsets - CU_CHECK_ERR(cudaMemcpyAsync( sequenceOffsetsDev, sequenceOffsetsHost, numSequences * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); + CU_CHECK_ERR(cudaMemcpyAsync(sequenceOffsetsDev, sequenceOffsetsHost, numSequences * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); // Sequences - CU_CHECK_ERR(cudaMemcpyAsync( kmerRanksDev, kmerRanks, kmerOffset * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); + CU_CHECK_ERR(cudaMemcpyAsync(kmerRanksDev, kmerRanks, kmerOffset * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); // Job details - CU_CHECK_ERR(cudaMemcpyAsync( seqIdxDev, seqIdxHost, globalScoreIdx * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); - CU_CHECK_ERR(cudaMemcpyAsync( readIdxDev, readIdxHost, globalScoreIdx * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); + CU_CHECK_ERR(cudaMemcpyAsync(seqIdxDev, seqIdxHost, globalScoreIdx * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); + CU_CHECK_ERR(cudaMemcpyAsync(readIdxDev, readIdxHost, globalScoreIdx * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); // Launch Kernels @@ -890,207 +643,6 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve return result; } -std::vector> GpuAligner::scoreKernel(std::vector sequences, - std::vector event_sequences, - uint32_t alignment_flags){ - // pre-running asserts - assert(!sequences.empty()); - assert(!event_sequences.empty()); - assert(std::string(sequences[0].get_alphabet()->get_name()) == "nucleotide"); - for (auto e: event_sequences) { - assert(std::string(e.pore_model->pmalphabet->get_name()) == "nucleotide"); - assert(e.read->pore_type == PT_R9); - assert( 
(e.rc && e.event_stride == -1) || (!e.rc && e.event_stride == 1)); - } - - int num_reads = event_sequences.size(); - - const uint32_t k = event_sequences[0].pore_model->k; //k is the length of a kmer - - std::vector n_rows; //number of rows in the DP table (n_events) for each read - std::vector e_starts; //event starts in the read for each read - std::vector event_strides; //event strides for each read - std::vector> pre_flanks; - std::vector> post_flanks; - std::vector eventsPerBase; - - //Populate per-read vectors - int numEventsTotal = 0; - for(auto e: event_sequences){ - uint32_t e_start = e.event_start_idx; - e_starts.push_back(e_start); - - uint32_t e_stride = e.event_stride; - event_strides.push_back(e_stride); - - uint32_t e_end = e.event_stop_idx; - uint32_t n_events = 0; - if(e_end > e_start) - n_events = e_end - e_start + 1; - else - n_events = e_start - e_end + 1; - - n_rows.push_back(n_events); - numEventsTotal += n_events; - - std::vector pre_flank = make_pre_flanking(e, e_start, n_events); - std::vector post_flank = make_post_flanking(e, e_start, n_events); - - pre_flanks.push_back(pre_flank); - post_flanks.push_back(post_flank); - - float readEventsPerBase = e.read->events_per_base[e.strand]; - eventsPerBase.push_back(readEventsPerBase); - } - - //Populate buffers for flanks and scaled means data - std::vector eventOffsets; - size_t offset = 0; - for(int j=0; jget_drift_scaled_level(event_idx, e.strand); // send the data in drift scaled - eventMeans[offset + i] = scaled; - preFlankingHost[offset + i] = pre_flanks[j][i]; //also copy over the pre-flanking data, since it has a 1-1 correspondence with events - postFlankingHost[offset + i] = post_flanks[j][i]; //also copy over the pre-flanking data, since it has a 1-1 correspondence with events - } - offset += num_events; - } - - int num_states = event_sequences[0].pore_model->states.size(); - //Populating read-statistics buffers - std::vector scale(num_reads); - std::vector shift(num_reads); - 
std::vector var(num_reads); - std::vector log_var(num_reads); - for (int i=0;iscalings[read.strand].scale; - shift[i] = event_sequences[i].read->scalings[read.strand].shift; - var[i] = event_sequences[i].read->scalings[read.strand].var; - log_var[i] = event_sequences[i].read->scalings[read.strand].log_var; - } - - // Copy to the device all buffers shared across kmer sequences. - CU_CHECK_ERR(cudaMemcpyAsync( scaleDev, scale.data(), scale.size() * sizeof(float), cudaMemcpyHostToDevice, streams[0] )); - CU_CHECK_ERR(cudaMemcpyAsync( shiftDev, shift.data(), shift.size() * sizeof(float), cudaMemcpyHostToDevice, streams[0])); - CU_CHECK_ERR(cudaMemcpyAsync( varDev, var.data(), var.size() * sizeof(float), cudaMemcpyHostToDevice, streams[0])); - CU_CHECK_ERR(cudaMemcpyAsync( logVarDev, log_var.data(), log_var.size() * sizeof(float), cudaMemcpyHostToDevice, streams[0] )); - CU_CHECK_ERR(cudaMemcpyAsync( eventsPerBaseDev, eventsPerBase.data(), eventsPerBase.size() * sizeof(float), cudaMemcpyHostToDevice, streams[0])); - CU_CHECK_ERR(cudaMemcpyAsync( eventMeansDev, eventMeans, numEventsTotal * sizeof(float), cudaMemcpyHostToDevice, streams[0] )); - CU_CHECK_ERR(cudaMemcpyAsync( preFlankingDev, preFlankingHost, numEventsTotal * sizeof(float), cudaMemcpyHostToDevice, streams[0] )); - CU_CHECK_ERR(cudaMemcpyAsync( postFlankingDev, postFlankingHost, numEventsTotal * sizeof(float), cudaMemcpyHostToDevice, streams[0] )); - CU_CHECK_ERR(cudaMemcpyAsync( numRowsDev, n_rows.data(), n_rows.size() * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); - CU_CHECK_ERR(cudaMemcpyAsync( eventStartsDev, e_starts.data(), e_starts.size() * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); - CU_CHECK_ERR(cudaMemcpyAsync( eventStridesDev, event_strides.data(), event_strides.size() * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); - CU_CHECK_ERR(cudaMemcpyAsync( eventOffsetsDev, eventOffsets.data(), eventOffsets.size() * sizeof(int), cudaMemcpyHostToDevice, streams[0] )); - - // Populate 
pore model buffers - // Assume that every event sequence has the same pore model - if (poreModelInitialized == false) { - int poreModelEntriesPerState = 3; - for(int st=0; ststates[st]; - poreModelHost[st * poreModelEntriesPerState] = params.level_mean; - poreModelHost[st * poreModelEntriesPerState + 1] = params.level_stdv; - poreModelHost[st * poreModelEntriesPerState + 2] = params.level_log_stdv; - } - // copy over the pore model - CU_CHECK_ERR(cudaMemcpyAsync(poreModelDev, poreModelHost, - poreModelEntriesPerState * 4096 * sizeof(float), cudaMemcpyHostToDevice, streams[0])); // TODO don't hardcode num kmers - poreModelInitialized = true; - } - - //Let's populate a host buffer with all the sequences. - size_t numKmers = 0; - for (auto sequence: sequences) { - numKmers += (sequence.length() - k + 1); - } - - size_t kmerOffset = 0; - for (int i = 0; i>> (eventMeansDev, - eventsPerBaseDev, - numRowsDev, - eventStartsDev, - eventStridesDev, - kmerRanksDevPtr, - eventOffsetsDev, - poreModelDev, - scaleDev, - shiftDev, - varDev, - logVarDev, - preFlankingDev, - postFlankingDev, - returnValuesDev); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - printf("Errors during kernel execution: %s\n", cudaGetErrorString(err)); - - } - for (int i = 0; i<8;i++) { - cudaMemcpyAsync(returnValuesHostResultsPointers[i], returnValuesDevResultsPointers[i], - num_reads * sizeof(float), cudaMemcpyDeviceToHost, streams[i]); - } - std::vector> results(sequences.size()); - for (size_t i =0; i GpuAligner::variantScoresThresholded(std::vector> input_variants_vector, std::vector base_haplotypes, diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 4bffbfb4..47cc2272 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -38,7 +38,6 @@ #include "profiler.h" #include "progress.h" #include "stdaln.h" -#include #include #include #include @@ -410,7 +409,6 @@ void locusRangeBaseEditCandidate(int start, auto 
test_haplotype = haplotypes[haplotypeIDX]; auto event_sequences = event_sequences_vector[haplotypeIDX]; for (const Variant &v : variants) { - auto t0 = std::chrono::high_resolution_clock::now(); Variant scored_variant = score_variant_thresholded(v, test_haplotype, event_sequences, @@ -435,7 +433,7 @@ std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& std::string contig = alignments.get_region_contig(); // Add all positively-scoring single-base changes into the candidate set - size_t num_workers = 4; + size_t num_workers = 8; std::vector gpuAligners(num_workers); //std::vector workerThreads(num_workers); @@ -474,7 +472,7 @@ std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& //Round robin the workers until done while (!finished) { for (int i = 0; i < num_workers; i++) { - auto status = handles[i].wait_for(std::chrono::microseconds(100)); + auto status = handles[i].wait_for(std::chrono::microseconds(0)); if (status == std::future_status::ready && (!finished)) { if (nextLocusEnd == region_end){ finished = true; From 2b14a68e21d29de2d53fbab77ede27761ea63f29 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Wed, 25 Jul 2018 16:15:33 +0100 Subject: [PATCH 52/80] 40x coverage --- src/cuda_kernels/GpuAligner.cu | 2 ++ src/cuda_kernels/GpuAligner.h | 5 +---- src/nanopolish_call_variants.cpp | 13 +++++++++---- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 078ef203..aedc4592 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -660,6 +660,8 @@ std::vector GpuAligner::variantScoresThresholded(std::vector out_variants = input_variants; diff --git a/src/cuda_kernels/GpuAligner.h b/src/cuda_kernels/GpuAligner.h index 4169ecd3..d7b8a826 100644 --- a/src/cuda_kernels/GpuAligner.h +++ b/src/cuda_kernels/GpuAligner.h @@ -1,6 +1,3 @@ -// -// Created by mike on 05/06/18. 
-// #include #include "nanopolish_variant.h" #include @@ -43,7 +40,7 @@ #ifndef GPU_ALIGNER_H #define GPU_ALIGNER_H -#define LOCI_PER_WORKER 64 +#define LOCI_PER_WORKER 32 #define MAX_COVERAGE 500 #define MAX_SEQUENCE_LENGTH 100 diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 47cc2272..5c67d82d 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -132,7 +132,8 @@ namespace opt static int min_flanking_sequence = 30; static int max_haplotypes = 1000; static int max_rounds = 50; - static int screen_score_threshold = 1000; + static int screen_score_threshold = 100; + static int max_coverage_gpu = 40; static int screen_flanking_sequence = 10; static int debug_alignments = 0; static std::vector methylation_types; @@ -433,7 +434,7 @@ std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& std::string contig = alignments.get_region_contig(); // Add all positively-scoring single-base changes into the candidate set - size_t num_workers = 8; + size_t num_workers = opt::num_threads; std::vector gpuAligners(num_workers); //std::vector workerThreads(num_workers); @@ -1085,10 +1086,14 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, std::vector single_base_edits; if(opt::gpu) { - single_base_edits = generate_candidate_single_base_edits_gpu(alignments, region_start, region_end, + single_base_edits = generate_candidate_single_base_edits_gpu(alignments, + region_start, + region_end, alignment_flags); } else { - single_base_edits = generate_candidate_single_base_edits(alignments, region_start, region_end, + single_base_edits = generate_candidate_single_base_edits(alignments, + region_start, + region_end, alignment_flags); } // insert these into the candidate set From f4d53cc3e7dea1ee7faf2d17343de0c360a1c456 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 26 Jul 2018 10:10:29 +0100 Subject: [PATCH 53/80] added max coverage --- src/cuda_kernels/GpuAligner.cu 
| 10 ++++++---- src/cuda_kernels/GpuAligner.h | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index aedc4592..c283bc93 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -286,13 +286,13 @@ __global__ void getScoresMod (float * poreModelDev, GpuAligner::GpuAligner() { size_t numModelElements = 4096; - size_t max_reads_per_worker = LOCI_PER_WORKER * MAX_COVERAGE; + size_t max_reads_per_worker = LOCI_PER_WORKER * MAX_COVERAGE * MAX_NUM_VARIANTS_PER_LOCUS; int readsSizeBuffer = max_reads_per_worker * sizeof(int); - int maxBuffer = max_reads_per_worker * MAX_SEQUENCE_LENGTH * sizeof(int); //4MB buffer + int maxBuffer = max_reads_per_worker * MAX_SEQUENCE_LENGTH * sizeof(int); //OLD int max_num_sequences = 1; //TODO can get rid of this - int max_sequence_length = 50; + int max_sequence_length = 100; int max_n_rows = 100; poreModelInitialized = false; @@ -660,7 +660,9 @@ std::vector GpuAligner::variantScoresThresholded(std::vector MAX_COVERAGE) { + event_sequences.resize(MAX_COVERAGE); + } int numVariants = input_variants.size(); diff --git a/src/cuda_kernels/GpuAligner.h b/src/cuda_kernels/GpuAligner.h index d7b8a826..6e94fc62 100644 --- a/src/cuda_kernels/GpuAligner.h +++ b/src/cuda_kernels/GpuAligner.h @@ -41,8 +41,9 @@ #define GPU_ALIGNER_H #define LOCI_PER_WORKER 32 -#define MAX_COVERAGE 500 +#define MAX_COVERAGE 400 #define MAX_SEQUENCE_LENGTH 100 +#define MAX_NUM_VARIANTS_PER_LOCUS 10 //Data to be scored typedef struct { From cf5be6aff9c1e1b80c7864342e359829a978bc90 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 26 Jul 2018 15:30:10 +0100 Subject: [PATCH 54/80] Finding good max coverage to use --- src/cuda_kernels/GpuAligner.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.h b/src/cuda_kernels/GpuAligner.h index 6e94fc62..94989125 100644 --- a/src/cuda_kernels/GpuAligner.h +++ 
b/src/cuda_kernels/GpuAligner.h @@ -41,8 +41,8 @@ #define GPU_ALIGNER_H #define LOCI_PER_WORKER 32 -#define MAX_COVERAGE 400 -#define MAX_SEQUENCE_LENGTH 100 +#define MAX_COVERAGE 300 +#define MAX_SEQUENCE_LENGTH 50 #define MAX_NUM_VARIANTS_PER_LOCUS 10 //Data to be scored From 6e22a85080690bbef4eca1066994763887281982 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 26 Jul 2018 16:52:55 +0100 Subject: [PATCH 55/80] Performance tuning for V100 --- src/cuda_kernels/GpuAligner.h | 1 + src/nanopolish_call_variants.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cuda_kernels/GpuAligner.h b/src/cuda_kernels/GpuAligner.h index 94989125..731f2ed9 100644 --- a/src/cuda_kernels/GpuAligner.h +++ b/src/cuda_kernels/GpuAligner.h @@ -44,6 +44,7 @@ #define MAX_COVERAGE 300 #define MAX_SEQUENCE_LENGTH 50 #define MAX_NUM_VARIANTS_PER_LOCUS 10 +#define MAX_NUM_WORKERS 16 //Data to be scored typedef struct { diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 5c67d82d..3dff1b49 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -434,7 +434,7 @@ std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& std::string contig = alignments.get_region_contig(); // Add all positively-scoring single-base changes into the candidate set - size_t num_workers = opt::num_threads; + size_t num_workers = (opt::num_threads < MAX_NUM_WORKERS) ? 
opt::num_threads : MAX_NUM_WORKERS; std::vector gpuAligners(num_workers); //std::vector workerThreads(num_workers); From e2a35252b92e9e1e8fb8e58b7146e2874130e1a0 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Fri, 27 Jul 2018 11:28:32 +0100 Subject: [PATCH 56/80] set sleep to 100us --- src/nanopolish_call_variants.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 3dff1b49..5d614867 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -473,7 +473,7 @@ std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& //Round robin the workers until done while (!finished) { for (int i = 0; i < num_workers; i++) { - auto status = handles[i].wait_for(std::chrono::microseconds(0)); + auto status = handles[i].wait_for(std::chrono::microseconds(100)); if (status == std::future_status::ready && (!finished)) { if (nextLocusEnd == region_end){ finished = true; From 1adf4b8ad25c79ea9433af7bd1bbe2fc5337503d Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 2 Aug 2018 15:33:05 +0100 Subject: [PATCH 57/80] Merged upstream master --- README.md | 11 ++++-- src/common/nanopolish_common.h | 2 +- src/common/nanopolish_variant.cpp | 11 +++++- src/common/nanopolish_variant.h | 17 ++++++-- src/main/nanopolish.cpp | 4 +- src/nanopolish_call_variants.cpp | 65 +++++++++++++++++++------------ 6 files changed, 74 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index 9609577b..74c9c23b 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,10 @@ nanopolish variants --consensus polished_gpu.fa -w "tig00000001:200000-230000" - Software package for signal-level analysis of Oxford Nanopore sequencing data. Nanopolish can calculate an improved consensus sequence for a draft genome assembly, detect base modifications, call SNPs and indels with respect to a reference genome and more (see Nanopolish modules, below). 
+## Release notes + +* 0.10.1: `nanopolish variants --consensus` now only outputs a VCF file instead of a fasta sequence. The VCF file describes the changes that need to be made to turn the draft sequence into the polished assembly. A new program, `nanopolish vcf2fasta`, is provided to generate the polished genome (this replaces `nanopolish_merge.py`, see usage instructions below). This change is to avoid issues when merging segments that end on repeat boundaries (reported by Michael Wykes and Chris Wright). + ## Dependencies A compiler that supports C++11 is needed to build nanopolish. Development of the code is performed using [gcc-4.8](https://gcc.gnu.org/gcc-4.8/). @@ -43,7 +47,7 @@ When major features have been added or bugs fixed, we will tag and release a new ``` git clone --recursive https://github.com/jts/nanopolish.git cd nanopolish -git checkout v0.7.1 +git checkout v0.9.2 make ``` @@ -52,7 +56,6 @@ make The main subprograms of nanopolish are: ``` -nanopolish extract: extract reads in FASTA or FASTQ format from a directory of FAST5 files nanopolish call-methylation: predict genomic bases that may be methylated nanopolish variants: detect SNPs and indels with respect to a reference genome nanopolish variants --consensus: calculate an improved consensus sequence for a draft genome assembly @@ -89,7 +92,7 @@ Now, we use nanopolish to compute the consensus sequence (the genome is polished ``` python nanopolish_makerange.py draft.fa | parallel --results nanopolish.results -P 8 \ - nanopolish variants --consensus polished.{1}.fa -w {1} -r reads.fa -b reads.sorted.bam -g draft.fa -t 4 --min-candidate-frequency 0.1 + nanopolish variants --consensus -o polished.{1}.vcf -w {1} -r reads.fa -b reads.sorted.bam -g draft.fa -t 4 --min-candidate-frequency 0.1 ``` This command will run the consensus algorithm on eight 50kbp segments of the genome at a time, using 4 threads each. 
Change the ```-P``` and ```--threads``` options as appropriate for the machines you have available. @@ -97,7 +100,7 @@ This command will run the consensus algorithm on eight 50kbp segments of the gen After all polishing jobs are complete, you can merge the individual 50kb segments together back into the final assembly: ``` -python nanopolish_merge.py polished.*.fa > polished_genome.fa +nanopolish vcf2fasta -g draft.fa polished.*.vcf > polished_genome.fa ``` ## Calling Methylation diff --git a/src/common/nanopolish_common.h b/src/common/nanopolish_common.h index 887a5e67..d287ac09 100644 --- a/src/common/nanopolish_common.h +++ b/src/common/nanopolish_common.h @@ -18,7 +18,7 @@ #include "logsum.h" #define PACKAGE_NAME "nanopolish" -#define PACKAGE_VERSION "0.9.2" +#define PACKAGE_VERSION "0.10.1" #define PACKAGE_BUGREPORT "https://github.com/jts/nanopolish/issues" // diff --git a/src/common/nanopolish_variant.cpp b/src/common/nanopolish_variant.cpp index b73a6b2b..902756f3 100644 --- a/src/common/nanopolish_variant.cpp +++ b/src/common/nanopolish_variant.cpp @@ -18,6 +18,13 @@ //#define DEBUG_HAPLOTYPE_SELECTION 1 +std::string Variant::make_vcf_header_key_value(const std::string& key, const std::string& value) +{ + std::stringstream ss; + ss << "##" << key << "=" << value; + return ss.str(); +} + std::string Variant::make_vcf_tag_string(const std::string& tag, const std::string& id, int count, @@ -31,11 +38,11 @@ std::string Variant::make_vcf_tag_string(const std::string& tag, } void Variant::write_vcf_header(FILE* fp, - const std::vector& tag_lines) + const std::vector& header_lines) { fprintf(fp, "##fileformat=VCFv4.2\n"); - for(const std::string& line : tag_lines) { + for(const std::string& line : header_lines) { fprintf(fp, "%s\n", line.c_str()); } fprintf(fp, "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample\n"); diff --git a/src/common/nanopolish_variant.h b/src/common/nanopolish_variant.h index f3f350ba..c9ef3b73 100644 --- 
a/src/common/nanopolish_variant.h +++ b/src/common/nanopolish_variant.h @@ -21,7 +21,9 @@ class AlignmentDB; struct Variant { static void write_vcf_header(FILE* fp, - const std::vector& tag_lines = std::vector()); + const std::vector& header_lines = std::vector()); + + static std::string make_vcf_header_key_value(const std::string& key, const std::string& value); static std::string make_vcf_tag_string(const std::string& tag, const std::string& id, @@ -43,8 +45,8 @@ struct Variant void write_vcf(FILE* fp) const { assert(fp != NULL); - const char* gt_def = genotype.empty() ? NULL : "GT"; - const char* gt_str = genotype.empty() ? NULL : genotype.c_str(); + const char* gt_def = "GT"; + const char* gt_str = genotype.empty() ? "." : genotype.c_str(); fprintf(fp, "%s\t%zu\t%s\t", ref_name.c_str(), ref_position + 1, "."); fprintf(fp, "%s\t%s\t%.1lf\t", ref_seq.c_str(), alt_seq.c_str(), quality); @@ -116,6 +118,15 @@ class VariantKeyComp } }; +class VariantKeyEqualityComp +{ + public: + inline bool operator()(const Variant& a, const Variant& b) + { + return a.key() == b.key(); + } +}; + // Read a collection of variants from a VCF file std::vector read_variants_from_file(const std::string& filename); std::vector read_variants_for_region(const std::string& filename, diff --git a/src/main/nanopolish.cpp b/src/main/nanopolish.cpp index d25df269..459a3e2e 100644 --- a/src/main/nanopolish.cpp +++ b/src/main/nanopolish.cpp @@ -18,6 +18,7 @@ #include "nanopolish_call_methylation.h" #include "nanopolish_scorereads.h" #include "nanopolish_phase_reads.h" +#include "nanopolish_vcf2fasta.h" #include "nanopolish_train_poremodel_from_basecalls.h" int print_usage(int argc, char **argv); @@ -34,7 +35,8 @@ static std::map< std::string, std::function > programs = { {"variants", call_variants_main}, {"methyltrain", methyltrain_main}, {"scorereads", scorereads_main} , - {"phase-reads", phase_reads_main} , + {"phase-reads", phase_reads_main} , + {"vcf2fasta", vcf2fasta_main} , 
{"call-methylation", call_methylation_main} }; diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 5d614867..1d9cf081 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -79,7 +79,7 @@ static const char *CONSENSUS_USAGE_MESSAGE = " --version display version\n" " --help display this help and exit\n" " --snps only call SNPs\n" -" --consensus=FILE run in consensus calling mode and write polished sequence to FILE\n" +" --consensus run in consensus calling mode\n" " --fix-homopolymers run the experimental homopolymer caller\n" " --faster minimize compute time while slightly reducing consensus accuracy\n" " -w, --window=STR find variants in window STR (format: :-)\n" @@ -188,7 +188,7 @@ static const struct option longopts[] = { { "p-skip-self", required_argument, NULL, OPT_P_SKIP_SELF }, { "p-bad", required_argument, NULL, OPT_P_BAD }, { "p-bad-self", required_argument, NULL, OPT_P_BAD_SELF }, - { "consensus", required_argument, NULL, OPT_CONSENSUS }, + { "consensus", no_argument, NULL, OPT_CONSENSUS }, { "gpu", required_argument, NULL, OPT_GPU }, { "faster", no_argument, NULL, OPT_FASTER }, { "fix-homopolymers", no_argument, NULL, OPT_FIX_HOMOPOLYMERS }, @@ -889,6 +889,11 @@ Haplotype fix_homopolymers(const Haplotype& input_haplotype, int size_diff = call - hp_length; std::string contig = fixed_haplotype.get_reference_name(); + Variant v; + v.ref_name = contig; + v.add_info("TotalReads", event_sequences.size()); + v.add_info("AlleleCount", 1); + if(size_diff > 0) { // add a 1bp insertion in this region // the variant might conflict with other variants in the region @@ -896,12 +901,12 @@ Haplotype fix_homopolymers(const Haplotype& input_haplotype, // NB: it is intended that if the call is a 2bp (or greater) insertion // we only insert 1bp (for now) for(size_t k = hap_hp_start; k <= hap_hp_end; ++k) { - Variant v; - v.ref_name = contig; v.ref_position = 
input_haplotype.get_reference_position_for_haplotype_base(k); + if(v.ref_position == std::string::npos) { continue; } + v.ref_seq = fixed_haplotype.substr_by_reference(v.ref_position, v.ref_position).get_sequence(); if(v.ref_seq.size() == 1 && v.ref_seq[0] == hp_base) { v.alt_seq = v.ref_seq + hp_base; @@ -916,10 +921,9 @@ Haplotype fix_homopolymers(const Haplotype& input_haplotype, } else if(size_diff < 0) { // add a 1bp deletion at this position for(size_t k = hap_hp_start; k <= hap_hp_end; ++k) { - Variant v; - v.ref_name = contig; v.ref_position = input_haplotype.get_reference_position_for_haplotype_base(k); v.quality = score; + if(v.ref_position == std::string::npos) { continue; } @@ -1035,14 +1039,12 @@ Haplotype call_haplotype_from_candidates(const AlignmentDB& alignments, // Apply them to the final haplotype for(size_t vi = 0; vi < called_variants.size(); vi++) { derived_haplotype.apply_variant(called_variants[vi]); - called_variants[vi].write_vcf(vcf_out); } } } return derived_haplotype; } - Haplotype call_variants_for_region(const std::string& contig, int region_start, int region_end, FILE* out_fp) { const int BUFFER = opt::min_flanking_sequence + 10; @@ -1167,13 +1169,6 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, called_haplotype = fix_homopolymers(called_haplotype, alignments); } - // write consensus result - FILE* consensus_fp = fopen(opt::consensus_output.c_str(), "w"); - fprintf(consensus_fp, ">%s:%d-%d\n%s\n", contig.c_str(), - alignments.get_region_start(), - alignments.get_region_end(), - called_haplotype.get_sequence().c_str()); - fclose(consensus_fp); } else { // // Calling strategy in reference-based variant calling mode @@ -1210,7 +1205,7 @@ void parse_call_variants_options(int argc, char** argv) case '?': die = true; break; case 't': arg >> opt::num_threads; break; case 'v': opt::verbose++; break; - case OPT_CONSENSUS: arg >> opt::consensus_output; opt::consensus_mode = 1; break; + case OPT_CONSENSUS: 
opt::consensus_mode = 1; break; case OPT_GPU: opt::gpu = 1; break; case OPT_FIX_HOMOPOLYMERS: opt::fix_homopolymers = 1; break; case OPT_EFFORT: arg >> opt::screen_score_threshold; break; @@ -1342,43 +1337,63 @@ int call_variants_main(int argc, char** argv) } // Build the VCF header - std::vector tag_fields; + std::vector header_fields; + + std::stringstream polish_window; + polish_window << contig << ":" << start_base << "-" << end_base; + header_fields.push_back(Variant::make_vcf_header_key_value("nanopolish_window", polish_window.str())); // - tag_fields.push_back( + header_fields.push_back( Variant::make_vcf_tag_string("INFO", "TotalReads", 1, "Integer", "The number of event-space reads used to call the variant")); - tag_fields.push_back( + header_fields.push_back( Variant::make_vcf_tag_string("INFO", "SupportFraction", 1, "Float", "The fraction of event-space reads that support the variant")); - tag_fields.push_back( + header_fields.push_back( Variant::make_vcf_tag_string("INFO", "BaseCalledReadsWithVariant", 1, "Integer", "The number of base-space reads that support the variant")); - tag_fields.push_back( + header_fields.push_back( Variant::make_vcf_tag_string("INFO", "BaseCalledFraction", 1, "Float", "The fraction of base-space reads that support the variant")); - tag_fields.push_back( + header_fields.push_back( Variant::make_vcf_tag_string("INFO", "AlleleCount", 1, "Integer", "The inferred number of copies of the allele")); if(opt::calculate_all_support) { - tag_fields.push_back( + header_fields.push_back( Variant::make_vcf_tag_string("INFO", "SupportFractionByBase", 4, "Integer", "The fraction of reads supporting A,C,G,T at this position")); } - tag_fields.push_back( + header_fields.push_back( Variant::make_vcf_tag_string("FORMAT", "GT", 1, "String", "Genotype")); - Variant::write_vcf_header(out_fp, tag_fields); + Variant::write_vcf_header(out_fp, header_fields); Haplotype haplotype = call_variants_for_region(contig, start_base, end_base, out_fp); + // 
write the consensus result as a fasta file if requested + if(!opt::consensus_output.empty()) { + FILE* consensus_fp = fopen(opt::consensus_output.c_str(), "w"); + fprintf(consensus_fp, ">%s:%d-%d\n%s\n", contig.c_str(), + start_base, + end_base, + haplotype.get_sequence().c_str()); + fclose(consensus_fp); + } + + // write the variants + for(const auto& v : haplotype.get_variants()) { + v.write_vcf(out_fp); + } + + // if(out_fp != stdout) { fclose(out_fp); } From 2856bbb15a790c1d5b862810cd18da869ca4bbb8 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Wed, 15 Aug 2018 15:07:48 +0100 Subject: [PATCH 58/80] Adding files for VCF handling which for some reason are absent --- src/nanopolish_vcf2fasta.cpp | 274 +++++++++++++++++++++++++++++++++++ src/nanopolish_vcf2fasta.h | 14 ++ 2 files changed, 288 insertions(+) create mode 100644 src/nanopolish_vcf2fasta.cpp create mode 100644 src/nanopolish_vcf2fasta.h diff --git a/src/nanopolish_vcf2fasta.cpp b/src/nanopolish_vcf2fasta.cpp new file mode 100644 index 00000000..93187985 --- /dev/null +++ b/src/nanopolish_vcf2fasta.cpp @@ -0,0 +1,274 @@ +//--------------------------------------------------------- +// Copyright 2018 Ontario Institute for Cancer Research +// Written by Jared Simpson (jared.simpson@oicr.on.ca) +//--------------------------------------------------------- +// +// nanopolish_vcf2fasta - write a new genome sequence +// by introducing variants from a set of vcf files +// +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "htslib/faidx.h" +#include "nanopolish_common.h" +#include "nanopolish_variant.h" +#include "nanopolish_eventalign.h" +#include "nanopolish_haplotype.h" + +// +// Getopt +// +#define SUBPROGRAM "vcf2fasta" + +static const char *VCF2FASTA_VERSION_MESSAGE = +SUBPROGRAM " Version " PACKAGE_VERSION "\n" +"Written by Jared Simpson.\n" +"\n" +"Copyright 2018 Ontario Institute for 
Cancer Research\n"; + +static const char *VCF2FASTA_USAGE_MESSAGE = +"Usage: " PACKAGE_NAME " " SUBPROGRAM " -g draft.fa segment1.vcf segment2.vcf ...\n" +"Write a new genome sequence by introducing variants from the input files\n" +"\n" +" -v, --verbose display verbose output\n" +" --version display version\n" +" --help display this help and exit\n" +" -g, --genome=FILE the input genome is in FILE\n" +"\nReport bugs to " PACKAGE_BUGREPORT "\n\n"; + +namespace opt +{ + static unsigned int verbose; + static std::vector input_vcf_files; + static std::string genome_file; +} + +static const char* shortopts = "g:v"; + +enum { OPT_HELP = 1, OPT_VERSION }; + +static const struct option longopts[] = { + { "verbose", no_argument, NULL, 'v' }, + { "help", no_argument, NULL, OPT_HELP }, + { "version", no_argument, NULL, OPT_VERSION }, + { "genome", required_argument, NULL, 'g' }, + { NULL, 0, NULL, 0 } +}; + +void parse_vcf2fasta_options(int argc, char** argv) +{ + bool die = false; + for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { + std::istringstream arg(optarg != NULL ? 
optarg : ""); + switch (c) { + case '?': die = true; break; + case 'v': opt::verbose++; break; + case 'g': arg >> opt::genome_file; break; + case OPT_HELP: + std::cout << VCF2FASTA_USAGE_MESSAGE; + exit(EXIT_SUCCESS); + case OPT_VERSION: + std::cout << VCF2FASTA_VERSION_MESSAGE; + exit(EXIT_SUCCESS); + } + } + + if(opt::genome_file.empty()) { + std::cerr << SUBPROGRAM ": -g/--genome file is required\n"; + die = true; + } + + if (argc - optind < 1) { + std::cerr << SUBPROGRAM ": not enough arguments\n"; + die = true; + } + + if (die) + { + std::cout << "\n" << VCF2FASTA_USAGE_MESSAGE; + exit(EXIT_FAILURE); + } + + for(; optind < argc; ++optind) { + opt::input_vcf_files.push_back(argv[optind]); + } +} + +int vcf2fasta_main(int argc, char** argv) +{ + parse_vcf2fasta_options(argc, argv); + + // Read genome file + faidx_t *fai = fai_load(opt::genome_file.c_str()); + + // Read VCF files and gather variants for each contig and the polishing window coordinates + std::map> variants_by_contig; + std::map>> windows_by_contig; + + for(const auto& filename : opt::input_vcf_files) { + + std::string window_str; + std::vector out; + std::ifstream infile(filename); + std::string line; + while(getline(infile, line)) { + + // parse header + if(line[0] == '#') { + + // check for window coordinates + if(line.find("nanopolish_window") != std::string::npos) { + std::vector fields = split(line, '='); + assert(fields.size() == 2); + window_str = fields[1]; + } + } else { + Variant v(line); + variants_by_contig[v.ref_name].push_back(v); + } + } + + if(window_str.empty()) { + fprintf(stderr, "error: could not detect polishing window from input file %s\n", filename.c_str()); + exit(EXIT_FAILURE); + } + + std::string window_contig; + int window_start, window_end; + parse_region_string(window_str, window_contig, window_start, window_end); + windows_by_contig[window_contig].push_back(std::make_pair(window_start, window_end)); + } + + size_t n_contigs = faidx_nseq(fai); + + for(size_t contig_idx 
= 0; contig_idx < n_contigs; ++contig_idx) { + std::string contig = faidx_iseq(fai, contig_idx); + int contig_length = faidx_seq_len(fai, contig.c_str()); + + // Confirm that all windows on this contig have been polished + bool window_check_ok = true; + auto& windows = windows_by_contig[contig]; + + std::sort(windows.begin(), windows.end()); + if(windows[0].first != 0) { + fprintf(stderr, "error: first %d bases are not covered by a polished window for contig %s.\n", windows[0].first, contig.c_str()); + window_check_ok = false; + } + + for(size_t window_idx = 1; window_idx < windows.size(); ++window_idx) { + int prev_start = windows[window_idx - 1].first; + int prev_end = windows[window_idx - 1].second; + int curr_start = windows[window_idx].first; + int curr_end = windows[window_idx].second; + if(curr_start > prev_end) { + fprintf(stderr, "error: adjacent polishing windows do not overlap (%d-%d and %d-%d)\n", prev_start, prev_end, curr_start, curr_end); + window_check_ok = false; + } + } + + int end_gap = contig_length - windows.back().second; + if(end_gap > 500) { + fprintf(stderr, "error: last %d bases are not covered by a polished window for contig %s.\n", end_gap, contig.c_str()); + window_check_ok = false; + } + + if(!window_check_ok) { + fprintf(stderr, "error: one or more polishing windows are missing. 
Please check that all nanopolish variants --consensus jobs ran to completion\n"); + exit(EXIT_FAILURE); + } + + int length; + char* seq = fai_fetch(fai, contig.c_str(), &length); + if(length < 0) { + fprintf(stderr, "error: could not fetch contig %s\n", contig.c_str()); + exit(EXIT_FAILURE); + } + + auto& variants = variants_by_contig[contig]; + std::sort(variants.begin(), variants.end(), sortByPosition); + + // remove duplicate variants + VariantKeyEqualityComp vkec; + auto last = std::unique(variants.begin(), variants.end(), vkec); + variants.erase(last, variants.end()); + + assert(variants.size() < (1 << 30)); + uint32_t deleted_tag = 1 << 30; + uint32_t variant_tag = 1 << 31; + + // make a vector holding either a literal character or an index to the variant that needs to be applied + std::vector consensus_record(length); + for(size_t i = 0; i < length; ++i) { + consensus_record[i] = seq[i]; + } + + size_t num_skipped = 0; + size_t num_subs = 0; + size_t num_insertions = 0; + size_t num_deletions = 0; + + // update the consensus record according to the variants for this contig + size_t applied_variants = 0; + for(size_t variant_idx = 0; variant_idx < variants.size(); ++variant_idx) { + const Variant& v = variants[variant_idx]; + + // check if the variant record matches the reference sequence + bool matches_ref = true; + for(size_t i = 0; i < v.ref_seq.length(); ++i) { + matches_ref = matches_ref && v.ref_seq[i] == consensus_record[v.ref_position + i]; + } + + if(!matches_ref) { + num_skipped += 1; + continue; + } + + // mark the first base of the reference sequence as a variant and set the index + consensus_record[v.ref_position] = variant_tag | variant_idx; + + // mark the subsequent bases of the reference as deleted + for(size_t i = 1; i < v.ref_seq.length(); ++i) { + consensus_record[v.ref_position + i] = deleted_tag; + } + + num_subs += v.ref_seq.length() == v.alt_seq.length(); + num_insertions += v.ref_seq.length() < v.alt_seq.length(); + num_deletions += 
v.ref_seq.length() > v.alt_seq.length(); + } + + // write out the consensus record + std::string out; + out.reserve(length); + for(size_t i = 0; i < length; ++i) { + uint32_t r = consensus_record[i]; + if(r & variant_tag) { + out.append(variants[r & ~variant_tag].alt_seq); + } else if(r & ~deleted_tag) { + out.append(1, r); + } else { + assert(r & deleted_tag); + } + } + + fprintf(stderr, "[vcf2fasta] rewrote contig %s with %zu subs, %zu ins, %zu dels (%zu skipped)\n", contig.c_str(), num_subs, num_insertions, num_deletions, num_skipped); + fprintf(stdout, ">%s\n%s\n", contig.c_str(), out.c_str()); + + free(seq); + seq = NULL; + } + + return 0; +} diff --git a/src/nanopolish_vcf2fasta.h b/src/nanopolish_vcf2fasta.h new file mode 100644 index 00000000..729cebe8 --- /dev/null +++ b/src/nanopolish_vcf2fasta.h @@ -0,0 +1,14 @@ +//--------------------------------------------------------- +// Copyright 2018 Ontario Institute for Cancer Research +// Written by Jared Simpson (jared.simpson@oicr.on.ca) +//--------------------------------------------------------- +// +// nanopolish_vcf2fasta - write a new genome sequence +// by introducing variants from a set of vcf files +// +#ifndef NANOPOLISH_VCF2FASTA_H +#define NANOPOLISH_VCF2FASTA_H + +int vcf2fasta_main(int argc, char** argv); + +#endif From ca5f7a2967333fc65be976229bd093b23ecd99fe Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 20 Sep 2018 10:50:28 +0100 Subject: [PATCH 59/80] tidying makefile --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 5f22a05b..30897c9b 100644 --- a/Makefile +++ b/Makefile @@ -10,12 +10,12 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali #Basic flags every build needs LIBS=-lz CXXFLAGS ?= -O3 -CXXFLAGS += -std=c++11 -fopenmp -fsigned-char #-g +CXXFLAGS += -std=c++11 -fopenmp -fsigned-char CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. 
-I/usr/local/cuda-9.0include -O3 -use_fast_math --default-stream per-thread -restrict #-g +NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0include -O3 -use_fast_math --default-stream per-thread -restrict CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code From 5ecf0668635aa16fb7cafd1b39969b9178ac11d0 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 20 Sep 2018 11:06:58 +0100 Subject: [PATCH 60/80] tidying --- src/cuda_kernels/GpuAligner.cu | 11 +++-------- src/nanopolish_call_variants.cpp | 3 ++- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index c283bc93..31d50890 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -487,8 +487,6 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve poreModelEntriesPerState * 4096 * sizeof(float), cudaMemcpyHostToDevice, streams[0])); // TODO don't hardcode num kmers poreModelInitialized = true; } - // Sequences - // Sequences auto & sequences = scoreSet.stateSequences; numSequences += sequences.size(); @@ -505,7 +503,7 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve int rank = sequence.get_kmer_rank(ki, k, false); kmerRanks[ki + kmerOffset] = rank; } - //kmerRanksDevPointers[i] = kmerRanksDev + kmerOffset; + kmerOffset += numKmers; for(size_t ki = 0; ki < numKmers; ++ki) { @@ -517,8 +515,7 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve sequenceLengthsHost[globalSequenceIdx] = numKmers; - // Loop over the raw reads, producing a cartesian product of the two - + // Loop over the raw reads, producing a cartesian product of reads and sequences auto numReadsInScoreSet = scoreSet.rawData.size(); for (int r=0; r>> GpuAligner::scoreKernelMod(std::ve if (err != cudaSuccess) printf("Errors during kernel execution: %s\n", cudaGetErrorString(err)); - cudaMemcpyAsync(returnValuesHost, scoresDev, - globalScoreIdx * 
sizeof(float), cudaMemcpyDeviceToHost, streams[0]); + cudaMemcpyAsync(returnValuesHost, scoresDev, globalScoreIdx * sizeof(float), cudaMemcpyDeviceToHost, streams[0]); cudaStreamSynchronize(streams[0]); //Unpack results @@ -699,7 +695,6 @@ std::vector GpuAligner::variantScoresThresholded(std::vector v; if (!event_sequences_vector.empty()) { - //std::vector> scores = scoreKernel(sequences, event_sequences, alignment_flags); auto scoresMod = scoreKernelMod(scoreSets, alignment_flags); diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 1d9cf081..806cde2c 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -444,6 +444,7 @@ std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& int nextLocusEnd = nextLocusBegin + LOCI_PER_WORKER; bool finished = false; + //Initialise the workers for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { auto aligner = std::ref(gpuAligners[workerIdx]); if (!finished) { @@ -470,7 +471,7 @@ std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& } } - //Round robin the workers until done + //Round robin - assigning work to the workers until out of candidates while (!finished) { for (int i = 0; i < num_workers; i++) { auto status = handles[i].wait_for(std::chrono::microseconds(100)); From 075cee3b83cb76568b81632840f391b0c31920d6 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 20 Sep 2018 14:53:36 +0100 Subject: [PATCH 61/80] GPU acceleration of nanopolish consensus --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 74c9c23b..ec1bd330 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,5 @@ # Nanopolish -## GPU acceleration branch - experimental/Work in progress - -This branch adds CUDA-enabled GPU acceleration to the nanopolish consensus improvement algorithm. 
To try this feature run with the `--gpu` flag e.g:
-```
-nanopolish variants --consensus polished_gpu.fa -w "tig00000001:200000-230000" -r reads.fasta -b reads.sorted.bam -g draft.fa --threads=8 --gpu=1
-```
-
 [![Build Status](https://travis-ci.org/jts/nanopolish.svg?branch=master)](https://travis-ci.org/jts/nanopolish)
 
 Software package for signal-level analysis of Oxford Nanopore sequencing data. Nanopolish can calculate an improved consensus sequence for a draft genome assembly, detect base modifications, call SNPs and indels with respect to a reference genome and more (see Nanopolish modules, below).
@@ -119,6 +112,13 @@ Then you can run nanopolish from the image:
 docker run -v /path/to/local/data/data/:/data/ -it :image_id ./nanopolish eventalign -r /data/reads.fa -b /data/alignments.sorted.bam -g /data/ref.fa
 ```
 
+## GPU acceleration
+
+The nanopolish consensus improvement algorithm can be performed faster using CUDA-enabled GPU acceleration. This is an experimental feature; to try it, run with the `--gpu` flag, e.g.:
+```
+nanopolish variants --consensus polished_gpu.fa -w "tig00000001:200000-230000" -r reads.fasta -b reads.sorted.bam -g draft.fa --threads=8 --gpu=1
+```
+
 ## Credits and Thanks
 
 The fast table-driven logsum implementation was provided by Sean Eddy as public domain code. This code was originally part of [hmmer3](http://hmmer.janelia.org/). Nanopolish also includes code from Oxford Nanopore's [scrappie](https://github.com/nanoporetech/scrappie) basecaller. This code is licensed under the MPL. 
From 979475093f9f17fed62ee51f4b6a9d5729b547d9 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 20 Sep 2018 14:55:34 +0100 Subject: [PATCH 62/80] removed spurious comment --- src/common/nanopolish_variant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/nanopolish_variant.cpp b/src/common/nanopolish_variant.cpp index 902756f3..d9fa2498 100644 --- a/src/common/nanopolish_variant.cpp +++ b/src/common/nanopolish_variant.cpp @@ -671,7 +671,7 @@ std::vector multi_call(VariantGroup& variant_group, // Variant score_variant_thresholded(const Variant& input_variant, Haplotype base_haplotype, - const std::vector& input, // raw reads (I think) + const std::vector& input, const uint32_t alignment_flags, const uint32_t score_threshold, const std::vector& methylation_types) From 3fc628eb7fa858981967f0a30cd07080edba2d8e Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 20 Sep 2018 15:04:39 +0100 Subject: [PATCH 63/80] setting indentation to 4 to match rest of nanopolish --- src/cuda_kernels/GpuAligner.cu | 826 ++++++++++++++++----------------- 1 file changed, 413 insertions(+), 413 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 31d50890..7e8ece87 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -12,7 +12,7 @@ __device__ float logsumexpf(float x, float y){ if(x == -INFINITY && y == -INFINITY){ - return -INFINITY; + return -INFINITY; } float result = fmax(x, y) + log1pf(expf(-fabsf(y - x))); return result; @@ -64,7 +64,7 @@ __global__ void getScoresMod (float * poreModelDev, bool debug = false; if ((threadIdx.x == 0) && (blockIdx.x == 0)){ - debug = false; + debug = false; } // get buffer indices @@ -72,213 +72,213 @@ __global__ void getScoresMod (float * poreModelDev, if (scoreIdx < numScores) { - int readIdx = readIdxDev[scoreIdx]; - int seqIdx = seqIdxDev[scoreIdx]; - - // get read statistics - int numEvents = readLengthsDev[readIdx]; - int readOffset = 
eventOffsetsDev[readIdx]; - float read_events_per_base = eventsPerBaseDev[readIdx]; - int e_start = eventStartsDev[readIdx]; // Event start for read - int e_stride = eventStridesDev[readIdx]; - int e_offset = eventOffsetsDev[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event - float scale = scaleDev[readIdx]; - float shift = shiftDev[readIdx]; - float var = varDev[readIdx]; - float logVar = logVarDev[readIdx]; - - // get sequence statistics - int numKmers = sequenceLengthsDev[seqIdx]; - int seqOffset = sequenceOffsetsDev[seqIdx]; - - int lastRowIdx = numEvents - 1; - int lastKmerIdx = numKmers - 1; - - float returnValue = -INFINITY; //Used to sum over the last column. - float prevProbabilities[MAX_STATES]; - - int numBlocks = numKmers + 2; - int numStates = numBlocks * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. - - if (debug) { - printf("Kernel 1 >>> Num Kmers is %i\n", numKmers); - printf("Kernel 1 >>> n_states %i\n", numStates); - printf("Kernel 1 >>> num events in read is %i\n", numEvents); - printf("Kernel 1 >>> event offset is %i\n", e_offset); - } - - // Initialise the prev probabilities vector - for (int i = 0; i < numStates - PSR9_NUM_STATES; i++) { - prevProbabilities[i] = -INFINITY; - } - for (int i = numStates - PSR9_NUM_STATES; i < numStates; i++) { - prevProbabilities[i] = 0.0f; - } - - bool rc = false; - if (e_stride == -1) { - rc = true; - } - - float p_stay = 1 - (1 / read_events_per_base); - float p_skip = 0.0025; - float p_bad = 0.001; - float p_bad_self = p_bad; - float p_skip_self = 0.3; - float p_mk = p_skip; // probability of not observing an event at all - float p_mb = p_bad; // probabilty of observing a bad event - float p_mm_self = p_stay; // probability of observing additional events from this k-mer - float p_mm_next = 1.0f - p_mm_self - p_mk - p_mb; // normal movement from state to state - // transitions from event split state in previous block - float p_bb 
= p_bad_self; - float p_bk, p_bm_next, p_bm_self; - p_bk = p_bm_next = p_bm_self = (1.0f - p_bb) / 3; - // transitions from kmer skip state in previous block - float p_kk = p_skip_self; - float p_km = 1.0f - p_kk; - // We assign some transition probabilities. I believe this is correct and they don't vary by location in the sequence - float lp_mk = logf(p_mk); - float lp_mb = logf(p_mb); - float lp_mm_self = logf(p_mm_self); - float lp_mm_next = logf(p_mm_next); - float lp_bb = logf(p_bb); - float lp_bk = logf(p_bk); - float lp_bm_next = logf(p_bm_next); - float lp_bm_self = logf(p_bm_self); - float lp_kk = logf(p_kk); - float lp_km = logf(p_km); - float lp_sm, lp_ms; - lp_sm = lp_ms = 0.0f; - - // the penalty is controlled by the transition probability - float BAD_EVENT_PENALTY = 0.0f; - - //Fill out the dynamic programming table - for (int row = 1; row < numEvents + 1; row++) {//TODO: check that numRows is correct value. - //row-specific values - int event_idx = e_start + (row - 1) * e_stride; - float eventMean = eventMeansDev[e_offset + row - 1]; - float preFlank = preFlankingDev[e_offset + row - 1]; - float postFlank = postFlankingDev[e_offset + row - 1]; - - float lp_emission_b = BAD_EVENT_PENALTY; //TODO: Can this be taken out of the inner loop? 
- - //Initialise temp registers - float prevMatch = prevProbabilities[PSR9_MATCH];; - float prevSkip = prevProbabilities[PSR9_KMER_SKIP]; - float prevBad = prevProbabilities[PSR9_BAD_EVENT]; - - for (int blkIdx = 1; blkIdx < numBlocks - 1; blkIdx++) { - int curBlockIdx = blkIdx; - int prevBlockIdx = curBlockIdx - 1; - int prevBlockOffset = PSR9_NUM_STATES * prevBlockIdx; - int curBlockOffset = PSR9_NUM_STATES * curBlockIdx; - - int kmerIdx = blkIdx - 1; // because there is a start block with no associated kmer - uint32_t rank = kmerRanksDev[seqOffset + kmerIdx + (numKmers * - rc)]; // TODO understand why this is segfaulting sometimes, why does kmerIdx sometimes exceed 4096 - - float pore_mean = poreModelDev[rank * 3]; - float pore_stdv = poreModelDev[rank * 3 + 1]; - float pore_log_level_stdv = poreModelDev[rank * 3 + 2]; - - float lp_emission_m = lp_match_r9(rank, - eventMean, - pore_mean, - pore_stdv, - pore_log_level_stdv, - scale, - shift, - var, - logVar); - - // Get all the scores for a match - float curMatch = prevProbabilities[curBlockOffset + PSR9_MATCH]; - float curBad = prevProbabilities[curBlockOffset + PSR9_BAD_EVENT]; - float curSkip = prevProbabilities[curBlockOffset + PSR9_KMER_SKIP]; - - float HMT_FROM_SAME_M = lp_mm_self + curMatch; - float HMT_FROM_PREV_M = lp_mm_next + prevMatch; - float HMT_FROM_SAME_B = lp_bm_self + curBad; - float HMT_FROM_PREV_B = lp_bm_next + prevBad; - float HMT_FROM_PREV_K = lp_km + prevSkip; - - // m_s is the probability of going from the start state - // to this kmer. The start state is (currently) only - // allowed to go to the first kmer. If ALLOW_PRE_CLIP - // is defined, we allow all events before this one to be skipped, - // with a penalty; - float HMT_FROM_SOFT = (kmerIdx == 0 && - (event_idx == e_start || - (HAF_ALLOW_PRE_CLIP))) ? 
lp_sm + preFlank : -INFINITY; - - // calculate the score - float sum = HMT_FROM_SAME_M; - sum = logsumexpf(sum, HMT_FROM_SOFT); - sum = logsumexpf(sum, HMT_FROM_PREV_M); - sum = logsumexpf(sum, HMT_FROM_SAME_B); - sum = logsumexpf(sum, HMT_FROM_PREV_B); - sum = logsumexpf(sum, HMT_FROM_PREV_K); - sum += lp_emission_m; - - float newMatchScore = sum; - - // Calculate the bad event scores - // state PSR9_BAD_EVENT - HMT_FROM_SAME_M = lp_mb + curMatch; - HMT_FROM_PREV_M = -INFINITY; - HMT_FROM_SAME_B = lp_bb + prevBad; - HMT_FROM_PREV_B = -INFINITY; - HMT_FROM_PREV_K = -INFINITY; - HMT_FROM_SOFT = -INFINITY; - - sum = HMT_FROM_SAME_M; - sum = logsumexpf(sum, HMT_FROM_SAME_B); - sum += lp_emission_b; - - float newBadEventScore = sum; - - // Write row out. prevProbabilities now becomes "current probabilities" for evaluating skips. - prevProbabilities[curBlockOffset + PSR9_MATCH] = newMatchScore; - prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] = newBadEventScore; - - //Update tmp vars - prevMatch = curMatch; - prevSkip = curSkip; - prevBad = prevBad; - - //Now do the non-skip-skip transition. This relies on the updated vector values. - // state PSR9_KMER_SKIP - HMT_FROM_PREV_M = lp_mk + prevProbabilities[prevBlockOffset + PSR9_MATCH]; - HMT_FROM_PREV_B = lp_bk + prevProbabilities[prevBlockOffset + PSR9_BAD_EVENT]; - HMT_FROM_PREV_K = lp_kk + prevProbabilities[prevBlockOffset + PSR9_KMER_SKIP]; - - sum = HMT_FROM_PREV_M; - sum = logsumexpf(sum, HMT_FROM_PREV_B); - sum = logsumexpf(sum, - HMT_FROM_PREV_K); //TODO - this is in the 'normal' kernel instead of HMT_FROM_PREV_M - is it wrong? 
- sum = logsumexpf(sum, - HMT_FROM_PREV_M); //TODO - assume this should probably be in there, but not in current - - float newSkipScore = sum; - - prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] = newSkipScore; - - //post-clip transition - if (kmerIdx == lastKmerIdx && ((HAF_ALLOW_POST_CLIP) || row == lastRowIdx)) { - float lp1 = lp_ms + prevProbabilities[curBlockOffset + PSR9_MATCH] + postFlank; - float lp2 = lp_ms + prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] + postFlank; - float lp3 = lp_ms + prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] + postFlank; - - float end = returnValue; - end = logsumexpf(end, lp1); - end = logsumexpf(end, lp2); - end = logsumexpf(end, lp3); - returnValue = end; - } - } - } - returnValuesDev[scoreIdx] = returnValue; + int readIdx = readIdxDev[scoreIdx]; + int seqIdx = seqIdxDev[scoreIdx]; + + // get read statistics + int numEvents = readLengthsDev[readIdx]; + int readOffset = eventOffsetsDev[readIdx]; + float read_events_per_base = eventsPerBaseDev[readIdx]; + int e_start = eventStartsDev[readIdx]; // Event start for read + int e_stride = eventStridesDev[readIdx]; + int e_offset = eventOffsetsDev[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event + float scale = scaleDev[readIdx]; + float shift = shiftDev[readIdx]; + float var = varDev[readIdx]; + float logVar = logVarDev[readIdx]; + + // get sequence statistics + int numKmers = sequenceLengthsDev[seqIdx]; + int seqOffset = sequenceOffsetsDev[seqIdx]; + + int lastRowIdx = numEvents - 1; + int lastKmerIdx = numKmers - 1; + + float returnValue = -INFINITY; //Used to sum over the last column. + float prevProbabilities[MAX_STATES]; + + int numBlocks = numKmers + 2; + int numStates = numBlocks * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. 
+ + if (debug) { + printf("Kernel 1 >>> Num Kmers is %i\n", numKmers); + printf("Kernel 1 >>> n_states %i\n", numStates); + printf("Kernel 1 >>> num events in read is %i\n", numEvents); + printf("Kernel 1 >>> event offset is %i\n", e_offset); + } + + // Initialise the prev probabilities vector + for (int i = 0; i < numStates - PSR9_NUM_STATES; i++) { + prevProbabilities[i] = -INFINITY; + } + for (int i = numStates - PSR9_NUM_STATES; i < numStates; i++) { + prevProbabilities[i] = 0.0f; + } + + bool rc = false; + if (e_stride == -1) { + rc = true; + } + + float p_stay = 1 - (1 / read_events_per_base); + float p_skip = 0.0025; + float p_bad = 0.001; + float p_bad_self = p_bad; + float p_skip_self = 0.3; + float p_mk = p_skip; // probability of not observing an event at all + float p_mb = p_bad; // probabilty of observing a bad event + float p_mm_self = p_stay; // probability of observing additional events from this k-mer + float p_mm_next = 1.0f - p_mm_self - p_mk - p_mb; // normal movement from state to state + // transitions from event split state in previous block + float p_bb = p_bad_self; + float p_bk, p_bm_next, p_bm_self; + p_bk = p_bm_next = p_bm_self = (1.0f - p_bb) / 3; + // transitions from kmer skip state in previous block + float p_kk = p_skip_self; + float p_km = 1.0f - p_kk; + // We assign some transition probabilities. 
I believe this is correct and they don't vary by location in the sequence + float lp_mk = logf(p_mk); + float lp_mb = logf(p_mb); + float lp_mm_self = logf(p_mm_self); + float lp_mm_next = logf(p_mm_next); + float lp_bb = logf(p_bb); + float lp_bk = logf(p_bk); + float lp_bm_next = logf(p_bm_next); + float lp_bm_self = logf(p_bm_self); + float lp_kk = logf(p_kk); + float lp_km = logf(p_km); + float lp_sm, lp_ms; + lp_sm = lp_ms = 0.0f; + + // the penalty is controlled by the transition probability + float BAD_EVENT_PENALTY = 0.0f; + + //Fill out the dynamic programming table + for (int row = 1; row < numEvents + 1; row++) {//TODO: check that numRows is correct value. + //row-specific values + int event_idx = e_start + (row - 1) * e_stride; + float eventMean = eventMeansDev[e_offset + row - 1]; + float preFlank = preFlankingDev[e_offset + row - 1]; + float postFlank = postFlankingDev[e_offset + row - 1]; + + float lp_emission_b = BAD_EVENT_PENALTY; //TODO: Can this be taken out of the inner loop? 
+ + //Initialise temp registers + float prevMatch = prevProbabilities[PSR9_MATCH];; + float prevSkip = prevProbabilities[PSR9_KMER_SKIP]; + float prevBad = prevProbabilities[PSR9_BAD_EVENT]; + + for (int blkIdx = 1; blkIdx < numBlocks - 1; blkIdx++) { + int curBlockIdx = blkIdx; + int prevBlockIdx = curBlockIdx - 1; + int prevBlockOffset = PSR9_NUM_STATES * prevBlockIdx; + int curBlockOffset = PSR9_NUM_STATES * curBlockIdx; + + int kmerIdx = blkIdx - 1; // because there is a start block with no associated kmer + uint32_t rank = kmerRanksDev[seqOffset + kmerIdx + (numKmers * + rc)]; // TODO understand why this is segfaulting sometimes, why does kmerIdx sometimes exceed 4096 + + float pore_mean = poreModelDev[rank * 3]; + float pore_stdv = poreModelDev[rank * 3 + 1]; + float pore_log_level_stdv = poreModelDev[rank * 3 + 2]; + + float lp_emission_m = lp_match_r9(rank, + eventMean, + pore_mean, + pore_stdv, + pore_log_level_stdv, + scale, + shift, + var, + logVar); + + // Get all the scores for a match + float curMatch = prevProbabilities[curBlockOffset + PSR9_MATCH]; + float curBad = prevProbabilities[curBlockOffset + PSR9_BAD_EVENT]; + float curSkip = prevProbabilities[curBlockOffset + PSR9_KMER_SKIP]; + + float HMT_FROM_SAME_M = lp_mm_self + curMatch; + float HMT_FROM_PREV_M = lp_mm_next + prevMatch; + float HMT_FROM_SAME_B = lp_bm_self + curBad; + float HMT_FROM_PREV_B = lp_bm_next + prevBad; + float HMT_FROM_PREV_K = lp_km + prevSkip; + + // m_s is the probability of going from the start state + // to this kmer. The start state is (currently) only + // allowed to go to the first kmer. If ALLOW_PRE_CLIP + // is defined, we allow all events before this one to be skipped, + // with a penalty; + float HMT_FROM_SOFT = (kmerIdx == 0 && + (event_idx == e_start || + (HAF_ALLOW_PRE_CLIP))) ? 
lp_sm + preFlank : -INFINITY; + + // calculate the score + float sum = HMT_FROM_SAME_M; + sum = logsumexpf(sum, HMT_FROM_SOFT); + sum = logsumexpf(sum, HMT_FROM_PREV_M); + sum = logsumexpf(sum, HMT_FROM_SAME_B); + sum = logsumexpf(sum, HMT_FROM_PREV_B); + sum = logsumexpf(sum, HMT_FROM_PREV_K); + sum += lp_emission_m; + + float newMatchScore = sum; + + // Calculate the bad event scores + // state PSR9_BAD_EVENT + HMT_FROM_SAME_M = lp_mb + curMatch; + HMT_FROM_PREV_M = -INFINITY; + HMT_FROM_SAME_B = lp_bb + prevBad; + HMT_FROM_PREV_B = -INFINITY; + HMT_FROM_PREV_K = -INFINITY; + HMT_FROM_SOFT = -INFINITY; + + sum = HMT_FROM_SAME_M; + sum = logsumexpf(sum, HMT_FROM_SAME_B); + sum += lp_emission_b; + + float newBadEventScore = sum; + + // Write row out. prevProbabilities now becomes "current probabilities" for evaluating skips. + prevProbabilities[curBlockOffset + PSR9_MATCH] = newMatchScore; + prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] = newBadEventScore; + + //Update tmp vars + prevMatch = curMatch; + prevSkip = curSkip; + prevBad = prevBad; + + //Now do the non-skip-skip transition. This relies on the updated vector values. + // state PSR9_KMER_SKIP + HMT_FROM_PREV_M = lp_mk + prevProbabilities[prevBlockOffset + PSR9_MATCH]; + HMT_FROM_PREV_B = lp_bk + prevProbabilities[prevBlockOffset + PSR9_BAD_EVENT]; + HMT_FROM_PREV_K = lp_kk + prevProbabilities[prevBlockOffset + PSR9_KMER_SKIP]; + + sum = HMT_FROM_PREV_M; + sum = logsumexpf(sum, HMT_FROM_PREV_B); + sum = logsumexpf(sum, + HMT_FROM_PREV_K); //TODO - this is in the 'normal' kernel instead of HMT_FROM_PREV_M - is it wrong? 
+ sum = logsumexpf(sum, + HMT_FROM_PREV_M); //TODO - assume this should probably be in there, but not in current + + float newSkipScore = sum; + + prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] = newSkipScore; + + //post-clip transition + if (kmerIdx == lastKmerIdx && ((HAF_ALLOW_POST_CLIP) || row == lastRowIdx)) { + float lp1 = lp_ms + prevProbabilities[curBlockOffset + PSR9_MATCH] + postFlank; + float lp2 = lp_ms + prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] + postFlank; + float lp3 = lp_ms + prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] + postFlank; + + float end = returnValue; + end = logsumexpf(end, lp1); + end = logsumexpf(end, lp2); + end = logsumexpf(end, lp3); + returnValue = end; + } + } + } + returnValuesDev[scoreIdx] = returnValue; } } @@ -362,7 +362,7 @@ GpuAligner::GpuAligner() returnValuesDevResultsPointers.resize(max_num_sequences); for (int i =0; i>> GpuAligner::scoreKernelMod(std::ve //Loop over every scoreset, filling out buffers and counters for (int scoreSetIdx=0; scoreSetIdx < numScoreSets; scoreSetIdx++){ - auto scoreSet = scoreSets[scoreSetIdx]; - int firstReadIdxinScoreSet = globalReadIdx; - //Read data - for (int eventSequenceIdx=0; eventSequenceIdx < scoreSet.rawData.size();eventSequenceIdx++){ - auto e = scoreSet.rawData[eventSequenceIdx]; - numReads++; - - //Read statistics - populate host buffers - scaleHost[globalReadIdx] = e.read->scalings[e.strand].scale; - shiftHost[globalReadIdx] = e.read->scalings[e.strand].shift; - varHost[globalReadIdx] = e.read->scalings[e.strand].var; - logVarHost[globalReadIdx] = e.read->scalings[e.strand].log_var; - - int e_start = e.event_start_idx; - eventStartsHost[globalReadIdx] = e_start; - - int e_stride = e.event_stride; - eventStridesHost[globalReadIdx] = e_stride; - - uint32_t e_end = e.event_stop_idx; - uint32_t n_events; - if(e_end > e_start) - n_events = e_end - e_start + 1; - else - n_events = e_start - e_end + 1; - readLengthsHost[globalReadIdx] = n_events; - numEventsTotal += 
n_events; - - eventOffsetsHost[globalReadIdx] = rawReadOffset; - - float readEventsPerBase = e.read->events_per_base[e.strand]; - eventsPerBaseHost[globalReadIdx] = readEventsPerBase; - - std::vector pre_flank = make_pre_flanking(e, e_start, n_events); - std::vector post_flank = make_post_flanking(e, e_start, n_events); - - for (int i=0;iget_drift_scaled_level(event_idx, e.strand); // send the data in drift scaled - eventMeans[rawReadOffset + i] = scaled; - - //populate the pre/post-flanking data, since it has a 1-1 correspondence with events - preFlankingHost[rawReadOffset + i] = pre_flank[i]; - postFlankingHost[rawReadOffset + i] = post_flank[i]; - } - - rawReadOffset += n_events; - globalReadIdx++; - } - //Pore Model - const uint32_t k = scoreSets[0].rawData[0].pore_model->k; //k is the length of a kmer in the pore model - if (poreModelInitialized == false) { - int num_states = scoreSets[0].rawData[0].pore_model->states.size(); - int poreModelEntriesPerState = 3; - for(int st=0; ststates[st]; - poreModelHost[st * poreModelEntriesPerState] = params.level_mean; - poreModelHost[st * poreModelEntriesPerState + 1] = params.level_stdv; - poreModelHost[st * poreModelEntriesPerState + 2] = params.level_log_stdv; - } - // copy over the pore model - CU_CHECK_ERR(cudaMemcpyAsync(poreModelDev, poreModelHost, - poreModelEntriesPerState * 4096 * sizeof(float), cudaMemcpyHostToDevice, streams[0])); // TODO don't hardcode num kmers - poreModelInitialized = true; - } - auto & sequences = scoreSet.stateSequences; - numSequences += sequences.size(); - - for (int i = 0; iscalings[e.strand].scale; + shiftHost[globalReadIdx] = e.read->scalings[e.strand].shift; + varHost[globalReadIdx] = e.read->scalings[e.strand].var; + logVarHost[globalReadIdx] = e.read->scalings[e.strand].log_var; + + int e_start = e.event_start_idx; + eventStartsHost[globalReadIdx] = e_start; + + int e_stride = e.event_stride; + eventStridesHost[globalReadIdx] = e_stride; + + uint32_t e_end = e.event_stop_idx; + 
uint32_t n_events; + if(e_end > e_start) + n_events = e_end - e_start + 1; + else + n_events = e_start - e_end + 1; + readLengthsHost[globalReadIdx] = n_events; + numEventsTotal += n_events; + + eventOffsetsHost[globalReadIdx] = rawReadOffset; + + float readEventsPerBase = e.read->events_per_base[e.strand]; + eventsPerBaseHost[globalReadIdx] = readEventsPerBase; + + std::vector pre_flank = make_pre_flanking(e, e_start, n_events); + std::vector post_flank = make_post_flanking(e, e_start, n_events); + + for (int i=0;iget_drift_scaled_level(event_idx, e.strand); // send the data in drift scaled + eventMeans[rawReadOffset + i] = scaled; + + //populate the pre/post-flanking data, since it has a 1-1 correspondence with events + preFlankingHost[rawReadOffset + i] = pre_flank[i]; + postFlankingHost[rawReadOffset + i] = post_flank[i]; + } + + rawReadOffset += n_events; + globalReadIdx++; + } + //Pore Model + const uint32_t k = scoreSets[0].rawData[0].pore_model->k; //k is the length of a kmer in the pore model + if (poreModelInitialized == false) { + int num_states = scoreSets[0].rawData[0].pore_model->states.size(); + int poreModelEntriesPerState = 3; + for(int st=0; ststates[st]; + poreModelHost[st * poreModelEntriesPerState] = params.level_mean; + poreModelHost[st * poreModelEntriesPerState + 1] = params.level_stdv; + poreModelHost[st * poreModelEntriesPerState + 2] = params.level_log_stdv; + } + // copy over the pore model + CU_CHECK_ERR(cudaMemcpyAsync(poreModelDev, poreModelHost, + poreModelEntriesPerState * 4096 * sizeof(float), cudaMemcpyHostToDevice, streams[0])); // TODO don't hardcode num kmers + poreModelInitialized = true; + } + auto & sequences = scoreSet.stateSequences; + numSequences += sequences.size(); + + for (int i = 0; i>> GpuAligner::scoreKernelMod(std::ve //printf("Launching get scores mod kernel\n"); getScoresMod <<< dimGrid, dimBlock, 0, streams[0]>>> (poreModelDev, - readLengthsDev, - eventStartsDev, - eventStridesDev, - eventsPerBaseDev, - 
scaleDev, - shiftDev, - varDev, - logVarDev, - eventOffsetsDev, - eventMeansDev, - preFlankingDev, - postFlankingDev, - sequenceLengthsDev, - sequenceOffsetsDev, - kmerRanksDev, - seqIdxDev, - readIdxDev, - globalScoreIdx, - scoresDev); + readLengthsDev, + eventStartsDev, + eventStridesDev, + eventsPerBaseDev, + scaleDev, + shiftDev, + varDev, + logVarDev, + eventOffsetsDev, + eventMeansDev, + preFlankingDev, + postFlankingDev, + sequenceLengthsDev, + sequenceOffsetsDev, + kmerRanksDev, + seqIdxDev, + readIdxDev, + globalScoreIdx, + scoresDev); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) - printf("Errors during kernel execution: %s\n", cudaGetErrorString(err)); + printf("Errors during kernel execution: %s\n", cudaGetErrorString(err)); cudaMemcpyAsync(returnValuesHost, scoresDev, globalScoreIdx * sizeof(float), cudaMemcpyDeviceToHost, streams[0]); cudaStreamSynchronize(streams[0]); @@ -619,21 +619,21 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve std::vector>> result(scoreSets.size()); for(int scoreSetIdx=0; scoreSetIdx seqScores(numReads); + std::vector seqScores(numReads); - for (int readIdx=0; readIdx GpuAligner::variantScoresThresholded(std::vector methylation_types) { - int numScoreSets = base_haplotypes.size(); - std::vector scoreSets; - scoreSets.resize(numScoreSets); - - for(int scoreSetIdx=0; scoreSetIdx scoreSets; + scoreSets.resize(numScoreSets); - if (event_sequences.size() > MAX_COVERAGE) { - event_sequences.resize(MAX_COVERAGE); - } - - int numVariants = input_variants.size(); + for(int scoreSetIdx=0; scoreSetIdx out_variants = input_variants; - std::vector variant_haplotypes(numVariants, base_haplotype); + auto input_variants = input_variants_vector[scoreSetIdx]; + auto base_haplotype = base_haplotypes[scoreSetIdx]; + auto event_sequences = event_sequences_vector[scoreSetIdx]; - //loop over the vector, applying the variants to the haplotypes - for (int i = 0; i MAX_COVERAGE) { + event_sequences.resize(MAX_COVERAGE); + } - // 
Make methylated versions of each input sequence. Once for the base haplotype and once each for each variant + int numVariants = input_variants.size(); - std::vector sequences; + std::vector out_variants = input_variants; + std::vector variant_haplotypes(numVariants, base_haplotype); - HMMInputSequence base_sequence = generate_methylated_alternatives(base_haplotype.get_sequence(), - methylation_types)[0]; //TODO: fix for non-zero + //loop over the vector, applying the variants to the haplotypes + for (int i = 0; i sequences; - ScoreSet s = { - sequences, - event_sequences - }; + HMMInputSequence base_sequence = generate_methylated_alternatives(base_haplotype.get_sequence(), + methylation_types)[0]; //TODO: fix for non-zero - scoreSets[scoreSetIdx] = s; + sequences.push_back(base_sequence); - } + for (auto v: variant_haplotypes){ + auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types)[0]; //TODO: fix for non-zero + sequences.push_back(variant_sequence); + } - std::vector v; - if (!event_sequences_vector.empty()) { + ScoreSet s = { + sequences, + event_sequences + }; - auto scoresMod = scoreKernelMod(scoreSets, alignment_flags); + scoreSets[scoreSetIdx] = s; - // results are now ready, need to unpack them - for (int scoreSetIdx=0; scoreSetIdx> scores = scoresMod[scoreSetIdx]; // scores for this candidate, including all variants and base(zeroth) - int numVariants = scores.size() - 1; // subtract one for the base - int numScores = scores[0].size(); + } - for (int variantIndex = 0; variantIndex < numVariants; variantIndex++) { // index 0 is the base scores - double totalScore = 0.0; - for (int k = 0; k < numScores; k++) { - if (fabs(totalScore) < screen_score_threshold) { - double baseScore = scores[0][k]; - totalScore += (scores[variantIndex + 1][k] - baseScore); - } + std::vector v; + if (!event_sequences_vector.empty()) { + + auto scoresMod = scoreKernelMod(scoreSets, alignment_flags); + + // results are now ready, need to 
unpack them + for (int scoreSetIdx=0; scoreSetIdx> scores = scoresMod[scoreSetIdx]; // scores for this candidate, including all variants and base(zeroth) + int numVariants = scores.size() - 1; // subtract one for the base + int numScores = scores[0].size(); + + for (int variantIndex = 0; variantIndex < numVariants; variantIndex++) { // index 0 is the base scores + double totalScore = 0.0; + for (int k = 0; k < numScores; k++) { + if (fabs(totalScore) < screen_score_threshold) { + double baseScore = scores[0][k]; + totalScore += (scores[variantIndex + 1][k] - baseScore); + } + } + // get the old variant: + auto unScoredVariant = input_variants_vector[scoreSetIdx][variantIndex]; + unScoredVariant.quality = totalScore; + unScoredVariant.info = ""; + v.push_back(unScoredVariant); + } } - // get the old variant: - auto unScoredVariant = input_variants_vector[scoreSetIdx][variantIndex]; - unScoredVariant.quality = totalScore; - unScoredVariant.info = ""; - v.push_back(unScoredVariant); - } } - } - return v; + return v; } From b2fb309b16dbe5147eb623e6b77b0e3554bd9d71 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 20 Sep 2018 15:06:43 +0100 Subject: [PATCH 64/80] removed some outdated comments --- src/cuda_kernels/GpuAligner.cu | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu index 7e8ece87..36b6378a 100644 --- a/src/cuda_kernels/GpuAligner.cu +++ b/src/cuda_kernels/GpuAligner.cu @@ -154,14 +154,14 @@ __global__ void getScoresMod (float * poreModelDev, float BAD_EVENT_PENALTY = 0.0f; //Fill out the dynamic programming table - for (int row = 1; row < numEvents + 1; row++) {//TODO: check that numRows is correct value. 
+ for (int row = 1; row < numEvents + 1; row++) { //row-specific values int event_idx = e_start + (row - 1) * e_stride; float eventMean = eventMeansDev[e_offset + row - 1]; float preFlank = preFlankingDev[e_offset + row - 1]; float postFlank = postFlankingDev[e_offset + row - 1]; - float lp_emission_b = BAD_EVENT_PENALTY; //TODO: Can this be taken out of the inner loop? + float lp_emission_b = BAD_EVENT_PENALTY; //Initialise temp registers float prevMatch = prevProbabilities[PSR9_MATCH];; @@ -176,7 +176,7 @@ __global__ void getScoresMod (float * poreModelDev, int kmerIdx = blkIdx - 1; // because there is a start block with no associated kmer uint32_t rank = kmerRanksDev[seqOffset + kmerIdx + (numKmers * - rc)]; // TODO understand why this is segfaulting sometimes, why does kmerIdx sometimes exceed 4096 + rc)]; float pore_mean = poreModelDev[rank * 3]; float pore_stdv = poreModelDev[rank * 3 + 1]; @@ -255,10 +255,8 @@ __global__ void getScoresMod (float * poreModelDev, sum = HMT_FROM_PREV_M; sum = logsumexpf(sum, HMT_FROM_PREV_B); - sum = logsumexpf(sum, - HMT_FROM_PREV_K); //TODO - this is in the 'normal' kernel instead of HMT_FROM_PREV_M - is it wrong? 
- sum = logsumexpf(sum, - HMT_FROM_PREV_M); //TODO - assume this should probably be in there, but not in current + sum = logsumexpf(sum, HMT_FROM_PREV_K); + sum = logsumexpf(sum, HMT_FROM_PREV_M); float newSkipScore = sum; @@ -291,7 +289,7 @@ GpuAligner::GpuAligner() int maxBuffer = max_reads_per_worker * MAX_SEQUENCE_LENGTH * sizeof(int); //OLD - int max_num_sequences = 1; //TODO can get rid of this + int max_num_sequences = 1; int max_sequence_length = 100; int max_n_rows = 100; @@ -400,7 +398,7 @@ GpuAligner::~GpuAligner() { CU_CHECK_ERR(cudaFreeHost(seqIdxHost)); CU_CHECK_ERR(cudaFreeHost(readIdxHost)); - int max_num_sequences = 1; //TODO can get rid of this + int max_num_sequences = 1; for (int i =0; i>> GpuAligner::scoreKernelMod(std::ve } // copy over the pore model CU_CHECK_ERR(cudaMemcpyAsync(poreModelDev, poreModelHost, - poreModelEntriesPerState * 4096 * sizeof(float), cudaMemcpyHostToDevice, streams[0])); // TODO don't hardcode num kmers + poreModelEntriesPerState * 4096 * sizeof(float), cudaMemcpyHostToDevice, streams[0])); poreModelInitialized = true; } auto & sequences = scoreSet.stateSequences; @@ -675,12 +673,12 @@ std::vector GpuAligner::variantScoresThresholded(std::vector sequences; HMMInputSequence base_sequence = generate_methylated_alternatives(base_haplotype.get_sequence(), - methylation_types)[0]; //TODO: fix for non-zero + methylation_types)[0]; sequences.push_back(base_sequence); for (auto v: variant_haplotypes){ - auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types)[0]; //TODO: fix for non-zero + auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types)[0]; sequences.push_back(variant_sequence); } From 186ac5dcfbd38e902597105d18a6ebbb7400aa5f Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 20 Sep 2018 15:09:46 +0100 Subject: [PATCH 65/80] removed old debug code --- src/hmm/nanopolish_emissions.h | 13 +------------ 1 file changed, 1 insertion(+), 12 
deletions(-) diff --git a/src/hmm/nanopolish_emissions.h b/src/hmm/nanopolish_emissions.h index 5f99a410..3dca4746 100644 --- a/src/hmm/nanopolish_emissions.h +++ b/src/hmm/nanopolish_emissions.h @@ -63,19 +63,8 @@ inline float log_probability_match_r9(const SquiggleRead& read, { // event level mean, scaled with the drift value float level = read.get_drift_scaled_level(event_idx, strand); - //if (debug == true){ - // printf("Level being used to calculate emission: %f\n", level); - //} + GaussianParameters gp = read.get_scaled_gaussian_from_pore_model_state(pore_model, strand, kmer_rank); -// if (debug == true) { -// printf(">Event IDX is: %i\n", event_idx); -// printf(">CPU Strand is: %i\n", strand); -// printf(">CPU kmer_rank is: %i\n", kmer_rank); -// printf(">CPU level is: %f\n", level); -// printf(">CPU gaussian mean: %f\n", gp.mean); -// printf(">CPU gaussian stdv: %f\n", gp.stdv); -// printf(">CPU gaussian log_level_stdv: %f\n", gp.log_stdv); -// } float lp = log_normal_pdf(level, gp); return lp; } From e823003f9b8a7dcacdf1e7b979e6a8397067b5de Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 20 Sep 2018 15:19:19 +0100 Subject: [PATCH 66/80] removed deprecated code --- src/hmm/nanopolish_profile_hmm.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/hmm/nanopolish_profile_hmm.cpp b/src/hmm/nanopolish_profile_hmm.cpp index 0d9f5167..6d5d0f37 100644 --- a/src/hmm/nanopolish_profile_hmm.cpp +++ b/src/hmm/nanopolish_profile_hmm.cpp @@ -31,7 +31,6 @@ float profile_hmm_score(const HMMInputSequence& sequence, const HMMInputData& da float profile_hmm_score_set(const std::vector& sequences, const HMMInputData& data, const uint32_t flags) { - //printf("In profile_hmm_score set function...\n"); assert(!sequences.empty()); assert(std::string(sequences[0].get_alphabet()->get_name()) == "nucleotide"); assert(std::string(data.pore_model->pmalphabet->get_name()) == "nucleotide"); From 551cd230a037ec888abf569ba3ecfe22f51e5779 Mon Sep 17 00:00:00 2001 From: Mike Vella 
Date: Thu, 20 Sep 2018 15:28:19 +0100 Subject: [PATCH 67/80] removed old debug code --- src/hmm/nanopolish_profile_hmm_r7.inl | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/hmm/nanopolish_profile_hmm_r7.inl b/src/hmm/nanopolish_profile_hmm_r7.inl index 3fe4b309..7f9083e9 100644 --- a/src/hmm/nanopolish_profile_hmm_r7.inl +++ b/src/hmm/nanopolish_profile_hmm_r7.inl @@ -308,9 +308,6 @@ inline float profile_hmm_fill_generic_r7(const HMMInputSequence& _sequence, std::vector kmer_ranks(num_kmers); for(size_t ki = 0; ki < num_kmers; ++ki) { int rank = sequence.get_kmer_rank(ki, k, data.rc); - if(rank>4096){ - printf("Rank: %i", rank); - } kmer_ranks[ki] = rank; } size_t num_events = output.get_num_rows() - 1; From 5d67b61e464a10c04ce006f0863204bd35a9b553 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 20 Sep 2018 15:30:09 +0100 Subject: [PATCH 68/80] revert typo --- src/hmm/nanopolish_profile_hmm_r9.inl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/hmm/nanopolish_profile_hmm_r9.inl b/src/hmm/nanopolish_profile_hmm_r9.inl index 0d90b5c3..c09b4321 100644 --- a/src/hmm/nanopolish_profile_hmm_r9.inl +++ b/src/hmm/nanopolish_profile_hmm_r9.inl @@ -216,7 +216,6 @@ inline std::vector make_pre_flanking(const HMMInputData& data, pre_flank[i] = log(TRANS_CLIP_SELF) + log_probability_background(*data.read, event_idx, data.strand) + // emit from background pre_flank[i - 1]; // this accounts for the transition from the start & to the silent pre - } return pre_flank; From 27f4d5c574bb50031f18208729557734a9e9c6ee Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 20 Sep 2018 15:32:50 +0100 Subject: [PATCH 69/80] Made indentation consistent --- src/main/nanopolish.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/main/nanopolish.cpp b/src/main/nanopolish.cpp index 459a3e2e..d6bcf8d4 100644 --- a/src/main/nanopolish.cpp +++ b/src/main/nanopolish.cpp @@ -54,10 +54,10 @@ int print_usage(int, char **) int print_version(int, char 
**) { static const char *VERSION_MESSAGE = - "nanopolish version " PACKAGE_VERSION "\n" - "Written by Jared Simpson.\n" - "\n" - "Copyright 2015-2017 Ontario Institute for Cancer Research\n"; + "nanopolish version " PACKAGE_VERSION "\n" + "Written by Jared Simpson.\n" + "\n" + "Copyright 2015-2017 Ontario Institute for Cancer Research\n"; std::cout << VERSION_MESSAGE << std::endl; return 0; } @@ -78,7 +78,7 @@ int main(int argc, char** argv) if (iter != programs.end()) { ret = iter->second(argc - 1, argv + 1); } - else + else ret = print_usage( argc - 1, argv + 1); } @@ -92,7 +92,7 @@ int main(int argc, char** argv) extern int g_bad_fast5_file; if(g_total_reads > 0) { fprintf(stderr, "[post-run summaryz] total reads: %d, unparseable: %d, qc fail: %d, could not calibrate: %d, no alignment: %d, bad fast5: %d\n", - g_total_reads, g_unparseable_reads, g_qc_fail_reads, g_failed_calibration_reads, g_failed_alignment_reads, g_bad_fast5_file); + g_total_reads, g_unparseable_reads, g_qc_fail_reads, g_failed_calibration_reads, g_failed_alignment_reads, g_bad_fast5_file); } return ret; } From 585302a9d7e3220c1079876bc6d750f237d01610 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Thu, 20 Sep 2018 15:39:41 +0100 Subject: [PATCH 70/80] fixed indentation --- src/nanopolish_call_variants.cpp | 221 +++++++++++++++---------------- 1 file changed, 110 insertions(+), 111 deletions(-) diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 806cde2c..a5de13d9 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -66,43 +66,43 @@ float g_p_skip, g_p_skip_self, g_p_bad, g_p_bad_self; #define SUBPROGRAM "variants" static const char *CONSENSUS_VERSION_MESSAGE = -SUBPROGRAM " Version " PACKAGE_VERSION "\n" -"Written by Jared Simpson.\n" -"\n" -"Copyright 2015 Ontario Institute for Cancer Research\n"; + SUBPROGRAM " Version " PACKAGE_VERSION "\n" + "Written by Jared Simpson.\n" + "\n" + "Copyright 2015 Ontario Institute for 
Cancer Research\n"; static const char *CONSENSUS_USAGE_MESSAGE = -"Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTIONS] --reads reads.fa --bam alignments.bam --genome genome.fa\n" -"Find SNPs using a signal-level HMM\n" -"\n" -" -v, --verbose display verbose output\n" -" --version display version\n" -" --help display this help and exit\n" -" --snps only call SNPs\n" -" --consensus run in consensus calling mode\n" -" --fix-homopolymers run the experimental homopolymer caller\n" -" --faster minimize compute time while slightly reducing consensus accuracy\n" -" -w, --window=STR find variants in window STR (format: :-)\n" -" -r, --reads=FILE the ONT reads are in fasta FILE\n" -" -b, --bam=FILE the reads aligned to the reference genome are in bam FILE\n" -" -e, --event-bam=FILE the events aligned to the reference genome are in bam FILE\n" -" -g, --genome=FILE the reference genome is in FILE\n" -" -p, --ploidy=NUM the ploidy level of the sequenced genome\n" -" -q --methylation-aware=STR turn on methylation aware polishing and test motifs given in STR (example: -q dcm,dam)\n" -" --genotype=FILE call genotypes for the variants in the vcf FILE\n" -" -o, --outfile=FILE write result to FILE [default: stdout]\n" -" -t, --threads=NUM use NUM threads (default: 1)\n" -" -m, --min-candidate-frequency=F extract candidate variants from the aligned reads when the variant frequency is at least F (default 0.2)\n" -" -d, --min-candidate-depth=D extract candidate variants from the aligned reads when the depth is at least D (default: 20)\n" -" -x, --max-haplotypes=N consider at most N haplotype combinations (default: 1000)\n" -" --min-flanking-sequence=N distance from alignment end to calculate variants (default: 30)\n" -" --max-rounds=N perform N rounds of consensus sequence improvement (default: 50)\n" -" -c, --candidates=VCF read variant candidates from VCF, rather than discovering them from aligned reads\n" -" -a, --alternative-basecalls-bam=FILE if an alternative basecaller was used that 
does not output event annotations\n" -" then use basecalled sequences from FILE. The signal-level events will still be taken from the -b bam.\n" -" --calculate-all-support when making a call, also calculate the support of the 3 other possible bases\n" -" --models-fofn=FILE read alternative k-mer models from FILE\n" -"\nReport bugs to " PACKAGE_BUGREPORT "\n\n"; + "Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTIONS] --reads reads.fa --bam alignments.bam --genome genome.fa\n" + "Find SNPs using a signal-level HMM\n" + "\n" + " -v, --verbose display verbose output\n" + " --version display version\n" + " --help display this help and exit\n" + " --snps only call SNPs\n" + " --consensus run in consensus calling mode\n" + " --fix-homopolymers run the experimental homopolymer caller\n" + " --faster minimize compute time while slightly reducing consensus accuracy\n" + " -w, --window=STR find variants in window STR (format: :-)\n" + " -r, --reads=FILE the ONT reads are in fasta FILE\n" + " -b, --bam=FILE the reads aligned to the reference genome are in bam FILE\n" + " -e, --event-bam=FILE the events aligned to the reference genome are in bam FILE\n" + " -g, --genome=FILE the reference genome is in FILE\n" + " -p, --ploidy=NUM the ploidy level of the sequenced genome\n" + " -q --methylation-aware=STR turn on methylation aware polishing and test motifs given in STR (example: -q dcm,dam)\n" + " --genotype=FILE call genotypes for the variants in the vcf FILE\n" + " -o, --outfile=FILE write result to FILE [default: stdout]\n" + " -t, --threads=NUM use NUM threads (default: 1)\n" + " -m, --min-candidate-frequency=F extract candidate variants from the aligned reads when the variant frequency is at least F (default 0.2)\n" + " -d, --min-candidate-depth=D extract candidate variants from the aligned reads when the depth is at least D (default: 20)\n" + " -x, --max-haplotypes=N consider at most N haplotype combinations (default: 1000)\n" + " --min-flanking-sequence=N distance from 
alignment end to calculate variants (default: 30)\n" + " --max-rounds=N perform N rounds of consensus sequence improvement (default: 50)\n" + " -c, --candidates=VCF read variant candidates from VCF, rather than discovering them from aligned reads\n" + " -a, --alternative-basecalls-bam=FILE if an alternative basecaller was used that does not output event annotations\n" + " then use basecalled sequences from FILE. The signal-level events will still be taken from the -b bam.\n" + " --calculate-all-support when making a call, also calculate the support of the 3 other possible bases\n" + " --models-fofn=FILE read alternative k-mer models from FILE\n" + "\nReport bugs to " PACKAGE_BUGREPORT "\n\n"; namespace opt { @@ -289,8 +289,7 @@ void prepareForBaseEditCandidates(int start, std::string contig, std::vector> &tmp_variants_vector, std::vector &haplotypes, - std::vector> &event_sequences_vector -){ + std::vector> &event_sequences_vector){ for(int i = start; i<=end; i++){ int calling_start = i - opt::screen_flanking_sequence; int calling_end = i + 1 + opt::screen_flanking_sequence; @@ -388,7 +387,7 @@ void locusRangeBaseEditCandidateGPU(int start, void locusRangeBaseEditCandidate(int start, int end, - const AlignmentDB& alignments, + const AlignmentDB& alignments, uint32_t alignment_flags, std::vector &out_variants, std::string contig) { @@ -843,7 +842,7 @@ Haplotype fix_homopolymers(const Haplotype& input_haplotype, duration_likelihoods[var_sequence_length] += log_gamma; } if(opt::verbose > 3) { - fprintf(stderr, "SUM_VAR\t%zu\t%zu\t%d\t%d\t%lu\t%.5lf\t%.2lf\n", ref_hp_start, hp_length, var_sequence_length, call_window, variant_offset_end - variant_offset_start, sum_duration, log_gamma); + fprintf(stderr, "SUM_VAR\t%zu\t%zu\t%d\t%d\t%lu\t%.5lf\t%.2lf\n", ref_hp_start, hp_length, var_sequence_length, call_window, variant_offset_end - variant_offset_start, sum_duration, log_gamma); } } } @@ -960,7 +959,7 @@ Haplotype call_haplotype_from_candidates(const AlignmentDB& 
alignments, size_t end_variant_idx = curr_variant_idx + 1; while(end_variant_idx < candidate_variants.size()) { int distance = candidate_variants[end_variant_idx].ref_position - - candidate_variants[end_variant_idx - 1].ref_position; + candidate_variants[end_variant_idx - 1].ref_position; if(distance > opt::min_distance_between_variants) break; end_variant_idx++; @@ -969,8 +968,8 @@ Haplotype call_haplotype_from_candidates(const AlignmentDB& alignments, size_t num_variants = end_variant_idx - curr_variant_idx; int calling_start = candidate_variants[curr_variant_idx].ref_position - opt::min_flanking_sequence; int calling_end = candidate_variants[end_variant_idx - 1].ref_position + - candidate_variants[end_variant_idx - 1].ref_seq.length() + - opt::min_flanking_sequence; + candidate_variants[end_variant_idx - 1].ref_seq.length() + + opt::min_flanking_sequence; int calling_size = calling_end - calling_start; @@ -1014,7 +1013,7 @@ Haplotype call_haplotype_from_candidates(const AlignmentDB& alignments, } } else { fprintf(stderr, "Warning: %zu variants in span, region not called [%d %d]\n", num_variants, calling_start, calling_end); - } + } // advance to start of next region curr_variant_idx = end_variant_idx; @@ -1070,11 +1069,11 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, fprintf(stderr, "input region: %s\n", alignments.get_reference_substring(contig, region_start - BUFFER, region_end + BUFFER).c_str()); } -/* - Haplotype called_haplotype(alignments.get_region_contig(), - alignments.get_region_start(), - alignments.get_reference()); -*/ + /* + Haplotype called_haplotype(alignments.get_region_contig(), + alignments.get_region_start(), + alignments.get_reference()); + */ // Step 1. 
Discover putative variants across the whole region std::vector candidate_variants; if(opt::candidates_file.empty()) { @@ -1155,10 +1154,10 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, last_round_variant_keys = this_round_variant_keys; if(variant_set_changed) { candidate_variants = expand_variants(alignments, - called_variants, - region_start, - region_end, - alignment_flags); + called_variants, + region_start, + region_end, + alignment_flags); } else { break; @@ -1190,44 +1189,44 @@ void parse_call_variants_options(int argc, char** argv) for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { std::istringstream arg(optarg != NULL ? optarg : ""); switch (c) { - case 'r': arg >> opt::reads_file; break; - case 'g': arg >> opt::genome_file; break; - case 'b': arg >> opt::bam_file; break; - case 'e': arg >> opt::event_bam_file; break; - case 'w': arg >> opt::window; break; - case 'o': arg >> opt::output_file; break; - case 'm': arg >> opt::min_candidate_frequency; break; - case 'd': arg >> opt::min_candidate_depth; break; - case 'x': arg >> opt::max_haplotypes; break; - case 'c': arg >> opt::candidates_file; break; - case 'p': arg >> opt::ploidy; break; - case 'q': arg >> methylation_motifs_str; break; - case 'a': arg >> opt::alternative_basecalls_bam; break; - case '?': die = true; break; - case 't': arg >> opt::num_threads; break; - case 'v': opt::verbose++; break; - case OPT_CONSENSUS: opt::consensus_mode = 1; break; - case OPT_GPU: opt::gpu = 1; break; - case OPT_FIX_HOMOPOLYMERS: opt::fix_homopolymers = 1; break; - case OPT_EFFORT: arg >> opt::screen_score_threshold; break; - case OPT_FASTER: opt::screen_score_threshold = 25; break; - case OPT_MAX_ROUNDS: arg >> opt::max_rounds; break; - case OPT_GENOTYPE: opt::genotype_only = 1; arg >> opt::candidates_file; break; - case OPT_MODELS_FOFN: arg >> opt::models_fofn; break; - case OPT_CALC_ALL_SUPPORT: opt::calculate_all_support = 1; break; - case 
OPT_SNPS_ONLY: opt::snps_only = 1; break; - case OPT_PROGRESS: opt::show_progress = 1; break; - case OPT_P_SKIP: arg >> g_p_skip; break; - case OPT_P_SKIP_SELF: arg >> g_p_skip_self; break; - case OPT_P_BAD: arg >> g_p_bad; break; - case OPT_P_BAD_SELF: arg >> g_p_bad_self; break; - case OPT_MIN_FLANKING_SEQUENCE: arg >> opt::min_flanking_sequence; break; - case OPT_HELP: - std::cout << CONSENSUS_USAGE_MESSAGE; - exit(EXIT_SUCCESS); - case OPT_VERSION: - std::cout << CONSENSUS_VERSION_MESSAGE; - exit(EXIT_SUCCESS); + case 'r': arg >> opt::reads_file; break; + case 'g': arg >> opt::genome_file; break; + case 'b': arg >> opt::bam_file; break; + case 'e': arg >> opt::event_bam_file; break; + case 'w': arg >> opt::window; break; + case 'o': arg >> opt::output_file; break; + case 'm': arg >> opt::min_candidate_frequency; break; + case 'd': arg >> opt::min_candidate_depth; break; + case 'x': arg >> opt::max_haplotypes; break; + case 'c': arg >> opt::candidates_file; break; + case 'p': arg >> opt::ploidy; break; + case 'q': arg >> methylation_motifs_str; break; + case 'a': arg >> opt::alternative_basecalls_bam; break; + case '?': die = true; break; + case 't': arg >> opt::num_threads; break; + case 'v': opt::verbose++; break; + case OPT_CONSENSUS: opt::consensus_mode = 1; break; + case OPT_GPU: opt::gpu = 1; break; + case OPT_FIX_HOMOPOLYMERS: opt::fix_homopolymers = 1; break; + case OPT_EFFORT: arg >> opt::screen_score_threshold; break; + case OPT_FASTER: opt::screen_score_threshold = 25; break; + case OPT_MAX_ROUNDS: arg >> opt::max_rounds; break; + case OPT_GENOTYPE: opt::genotype_only = 1; arg >> opt::candidates_file; break; + case OPT_MODELS_FOFN: arg >> opt::models_fofn; break; + case OPT_CALC_ALL_SUPPORT: opt::calculate_all_support = 1; break; + case OPT_SNPS_ONLY: opt::snps_only = 1; break; + case OPT_PROGRESS: opt::show_progress = 1; break; + case OPT_P_SKIP: arg >> g_p_skip; break; + case OPT_P_SKIP_SELF: arg >> g_p_skip_self; break; + case OPT_P_BAD: arg >> 
g_p_bad; break; + case OPT_P_BAD_SELF: arg >> g_p_bad_self; break; + case OPT_MIN_FLANKING_SEQUENCE: arg >> opt::min_flanking_sequence; break; + case OPT_HELP: + std::cout << CONSENSUS_USAGE_MESSAGE; + exit(EXIT_SUCCESS); + case OPT_VERSION: + std::cout << CONSENSUS_VERSION_MESSAGE; + exit(EXIT_SUCCESS); } } @@ -1281,10 +1280,10 @@ void parse_call_variants_options(int argc, char** argv) } if (die) - { - std::cout << "\n" << CONSENSUS_USAGE_MESSAGE; - exit(EXIT_FAILURE); - } + { + std::cout << "\n" << CONSENSUS_USAGE_MESSAGE; + exit(EXIT_FAILURE); + } } void print_invalid_window_error(int start_base, int end_base) @@ -1346,34 +1345,34 @@ int call_variants_main(int argc, char** argv) // header_fields.push_back( - Variant::make_vcf_tag_string("INFO", "TotalReads", 1, "Integer", - "The number of event-space reads used to call the variant")); + Variant::make_vcf_tag_string("INFO", "TotalReads", 1, "Integer", + "The number of event-space reads used to call the variant")); header_fields.push_back( - Variant::make_vcf_tag_string("INFO", "SupportFraction", 1, "Float", - "The fraction of event-space reads that support the variant")); + Variant::make_vcf_tag_string("INFO", "SupportFraction", 1, "Float", + "The fraction of event-space reads that support the variant")); header_fields.push_back( - Variant::make_vcf_tag_string("INFO", "BaseCalledReadsWithVariant", 1, "Integer", - "The number of base-space reads that support the variant")); + Variant::make_vcf_tag_string("INFO", "BaseCalledReadsWithVariant", 1, "Integer", + "The number of base-space reads that support the variant")); header_fields.push_back( - Variant::make_vcf_tag_string("INFO", "BaseCalledFraction", 1, "Float", - "The fraction of base-space reads that support the variant")); + Variant::make_vcf_tag_string("INFO", "BaseCalledFraction", 1, "Float", + "The fraction of base-space reads that support the variant")); header_fields.push_back( - Variant::make_vcf_tag_string("INFO", "AlleleCount", 1, "Integer", - "The 
inferred number of copies of the allele")); + Variant::make_vcf_tag_string("INFO", "AlleleCount", 1, "Integer", + "The inferred number of copies of the allele")); if(opt::calculate_all_support) { header_fields.push_back( - Variant::make_vcf_tag_string("INFO", "SupportFractionByBase", 4, "Integer", - "The fraction of reads supporting A,C,G,T at this position")); + Variant::make_vcf_tag_string("INFO", "SupportFractionByBase", 4, "Integer", + "The fraction of reads supporting A,C,G,T at this position")); } header_fields.push_back( - Variant::make_vcf_tag_string("FORMAT", "GT", 1, "String", - "Genotype")); + Variant::make_vcf_tag_string("FORMAT", "GT", 1, "String", + "Genotype")); Variant::write_vcf_header(out_fp, header_fields); @@ -1383,9 +1382,9 @@ int call_variants_main(int argc, char** argv) if(!opt::consensus_output.empty()) { FILE* consensus_fp = fopen(opt::consensus_output.c_str(), "w"); fprintf(consensus_fp, ">%s:%d-%d\n%s\n", contig.c_str(), - start_base, - end_base, - haplotype.get_sequence().c_str()); + start_base, + end_base, + haplotype.get_sequence().c_str()); fclose(consensus_fp); } From f3bf3e1f27e986e4ee3c884ab182e81ca7e52d28 Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Wed, 25 Sep 2019 17:38:18 +1000 Subject: [PATCH 71/80] changes to the makefile to get it compiled --- Makefile | 9 ++++----- test/.gitignore | 2 ++ 2 files changed, 6 insertions(+), 5 deletions(-) create mode 100644 test/.gitignore diff --git a/Makefile b/Makefile index dad29e07..0ccb65b5 100644 --- a/Makefile +++ b/Makefile @@ -15,8 +15,8 @@ CFLAGS ?= -std=c99 -O3 CXX ?= g++ CC ?= gcc NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda-9.0include -O3 -use_fast_math --default-stream per-thread -restrict -CURTFLAGS ?= -L/usr/local/cuda-9.0/lib64 -lcudart +NVCCFLAGS ?= -std=c++11 -I. 
-I/usr/local/cuda/include -O3 -use_fast_math --default-stream per-thread -restrict +CURTFLAGS ?= -L/usr/local/cuda/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code HDF5 ?= install @@ -69,7 +69,7 @@ EIGEN_INCLUDE = -I./eigen/ # Include the src subdirectories NP_INCLUDE = $(addprefix -I./, $(SUBDIRS)) -CUDA_INCLUDE=-I/usr/local/cuda-9.0/include +CUDA_INCLUDE=-I/usr/local/cuda/include # Add include flags CPPFLAGS += $(H5_INCLUDE) $(HTS_INCLUDE) $(FAST5_INCLUDE) $(NP_INCLUDE) $(EIGEN_INCLUDE) $(CUDA_INCLUDE) @@ -158,5 +158,4 @@ test: $(TEST_PROGRAM) .PHONY: clean clean: - rm -f $(PROGRAM) $(TEST_PROGRAM) $(CPP_OBJ) $(CU_OBJ) $(C_OBJ) src/main/nanopolish.o src/test/nanopolish_test.o - src/main/nanopolish.o src/test/nanopolish_test.o \ No newline at end of file + rm -f $(PROGRAM) $(TEST_PROGRAM) $(CPP_OBJ) $(CU_OBJ) $(C_OBJ) src/main/nanopolish.o src/test/nanopolish_test.o src/main/nanopolish.o src/test/nanopolish_test.o diff --git a/test/.gitignore b/test/.gitignore new file mode 100644 index 00000000..479a396b --- /dev/null +++ b/test/.gitignore @@ -0,0 +1,2 @@ +ecoli_2kb_region + From e484b291ece556922e2d367a440fe5f01b933880 Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Fri, 27 Sep 2019 19:20:57 +1000 Subject: [PATCH 72/80] cleaned up the make file and added cuda support as an option with minimal changes to the original source --- .travis.yml | 2 +- Makefile | 64 ++++++++++++++++++++------------ src/nanopolish_call_variants.cpp | 20 +++++++--- 3 files changed, 56 insertions(+), 30 deletions(-) diff --git a/.travis.yml b/.travis.yml index ca383521..7b15b855 100644 --- a/.travis.yml +++ b/.travis.yml @@ -43,4 +43,4 @@ script: # to display the log without downloading the raw log on Travis log page. # Travis finishs with error when exceeding the limit of 4 MB of log length. 
- export H5_CFLAGS="-w" - - make nanopolish && make test + - make && make test diff --git a/Makefile b/Makefile index 0ccb65b5..f45fda70 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # # Sub directories containing source code, except for the main programs -SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/alignment src/pore_model src/cuda_kernels +SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/alignment src/pore_model # # Set libraries, paths, flags and options @@ -11,12 +11,9 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali LIBS = -lz CXXFLAGS ?= -g -O3 CXXFLAGS += -std=c++11 -fopenmp -fsigned-char -CFLAGS ?= -std=c99 -O3 +CFLAGS ?= -O3 -std=c99 CXX ?= g++ CC ?= gcc -NVCC = nvcc -NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda/include -O3 -use_fast_math --default-stream per-thread -restrict -CURTFLAGS ?= -L/usr/local/cuda/lib64 -lcudart # Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code HDF5 ?= install @@ -69,17 +66,15 @@ EIGEN_INCLUDE = -I./eigen/ # Include the src subdirectories NP_INCLUDE = $(addprefix -I./, $(SUBDIRS)) -CUDA_INCLUDE=-I/usr/local/cuda/include - # Add include flags -CPPFLAGS += $(H5_INCLUDE) $(HTS_INCLUDE) $(FAST5_INCLUDE) $(NP_INCLUDE) $(EIGEN_INCLUDE) $(CUDA_INCLUDE) +CPPFLAGS += $(H5_INCLUDE) $(HTS_INCLUDE) $(FAST5_INCLUDE) $(NP_INCLUDE) $(EIGEN_INCLUDE) # Main programs to build PROGRAM = nanopolish TEST_PROGRAM = nanopolish_test .PHONY: all -all: $(PROGRAM) $(TEST_PROGRAM) +all: depend $(PROGRAM) # # Build libhts @@ -113,27 +108,50 @@ eigen/INSTALL: # Find the source files by searching subdirectories CPP_SRC := $(foreach dir, $(SUBDIRS), $(wildcard $(dir)/*.cpp)) -CU_SRC := $(foreach dir, $(SUBDIRS), $(wildcard $(dir)/*.cu)) C_SRC := $(foreach dir, $(SUBDIRS), $(wildcard $(dir)/*.c)) EXE_SRC = src/main/nanopolish.cpp src/test/nanopolish_test.cpp # Automatically generated object names 
-CPP_OBJ=$(CPP_SRC:.cpp=.o) -C_OBJ=$(C_SRC:.c=.o) -CU_OBJ=$(CU_SRC:.cu=.o) +CPP_OBJ = $(CPP_SRC:.cpp=.o) +C_OBJ = $(C_SRC:.c=.o) + +ifdef cuda + + NVCC = nvcc + NVCCFLAGS ?= -std=c++11 -I. -I/usr/local/cuda/include -O3 -use_fast_math --default-stream per-thread -restrict + CURTFLAGS ?= -L/usr/local/cuda/lib64 -lcudart + + CUDA_INCLUDE?=-I/usr/local/cuda/include + CPPFLAGS+=$(CUDA_INCLUDE) + CPPFLAGS+=-DHAVE_CUDA=1 + + # Sub directories containing CUDA source code + SUBDIRS+=src/cuda_kernels + # Find the source files by searching subdirectories + CU_SRC := $(foreach dir, $(SUBDIRS), $(wildcard $(dir)/*.cu)) + # Automatically generated object names + CU_OBJ=$(CU_SRC:.cu=.o) + CPP_OBJ+=$(CU_OBJ) + LDFLAGS+=$(CURTFLAGS) .SUFFIXES: .cu +# Compile objects +.cu.o: + $(NVCC) -o $@ -c $(NVCCFLAGS) $(CPPFLAGS) $< + +endif + + + # Generate dependencies .PHONY: depend depend: .depend -.depend: $(CPP_SRC) $(C_SRC) $(CU_SRC) $(EXE_SRC) $(H5_LIB) $(EIGEN_CHECK) +.depend: $(CPP_SRC) $(C_SRC) $(EXE_SRC) $(H5_LIB) $(EIGEN_CHECK) rm -f ./.depend $(CXX) $(CXXFLAGS) $(CPPFLAGS) -MM $(CPP_SRC) $(C_SRC) > ./.depend; -include .depend - # Compile objects .cpp.o: $(CXX) -o $@ -c $(CXXFLAGS) $(CPPFLAGS) -fPIC $< @@ -141,16 +159,13 @@ include .depend .c.o: $(CC) -o $@ -c $(CFLAGS) $(CPPFLAGS) $(H5_INCLUDE) -fPIC $< -.cu.o: - $(NVCC) -o $@ -c $(NVCCFLAGS) $(CPPFLAGS) $< - # Link main executable -$(PROGRAM): src/main/nanopolish.o $(CU_OBJ) $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(EIGEN_CHECK) - $(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $< $(CPP_OBJ) $(CU_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(LIBS) $(LDFLAGS) $(CURTFLAGS) +$(PROGRAM): src/main/nanopolish.o $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(EIGEN_CHECK) + $(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $< $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(LIBS) $(LDFLAGS) # Link test executable -$(TEST_PROGRAM): src/test/nanopolish_test.o $(CPP_OBJ) $(CU_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) - $(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $< $(CPP_OBJ) 
$(CU_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(LIBS) $(LDFLAGS) $(CURTFLAGS) +$(TEST_PROGRAM): src/test/nanopolish_test.o $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) + $(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $< $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(LIBS) $(LDFLAGS) .PHONY: test test: $(TEST_PROGRAM) @@ -158,4 +173,5 @@ test: $(TEST_PROGRAM) .PHONY: clean clean: - rm -f $(PROGRAM) $(TEST_PROGRAM) $(CPP_OBJ) $(CU_OBJ) $(C_OBJ) src/main/nanopolish.o src/test/nanopolish_test.o src/main/nanopolish.o src/test/nanopolish_test.o + rm -f $(PROGRAM) $(TEST_PROGRAM) $(CPP_OBJ) $(C_OBJ) \ + src/main/nanopolish.o src/test/nanopolish_test.o diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 28f86574..90f991b3 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -38,10 +38,12 @@ #include "profiler.h" #include "progress.h" #include "stdaln.h" -#include -#include -#include -#include +#ifdef HAVE_CUDA + #include + #include + #include + #include +#endif // Macros #define max3(x,y,z) std::max(std::max(x,y), z) @@ -349,7 +351,7 @@ void prepareForBaseEditCandidates(int start, } } - +#ifdef HAVE_CUDA void locusRangeBaseEditCandidateGPU(int start, int end, const AlignmentDB& alignments, @@ -384,6 +386,7 @@ void locusRangeBaseEditCandidateGPU(int start, } } +#endif void locusRangeBaseEditCandidate(int start, int end, @@ -423,6 +426,7 @@ void locusRangeBaseEditCandidate(int start, } } +#ifdef HAVE_CUDA std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& alignments, int region_start, int region_end, @@ -507,6 +511,7 @@ std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& } return out_variants; } +#endif // Given the input region, calculate all single base edits to the current assembly std::vector generate_candidate_single_base_edits(const AlignmentDB& alignments, @@ -1087,10 +1092,15 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, std::vector 
single_base_edits; if(opt::gpu) { + #ifdef HAVE_CUDA single_base_edits = generate_candidate_single_base_edits_gpu(alignments, region_start, region_end, alignment_flags); + #else + fprintf(stderr,"Not compiled for CUDA\n"); + exit(1); + #endif } else { single_base_edits = generate_candidate_single_base_edits(alignments, region_start, From f19f9b8d9371870f9a3553e365ab90fa76683fe0 Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Fri, 27 Sep 2019 19:53:24 +1000 Subject: [PATCH 73/80] cleaned up to be consistent with the original code --- src/hmm/nanopolish_emissions.h | 4 +--- src/hmm/nanopolish_profile_hmm_r7.inl | 7 +++---- src/hmm/nanopolish_profile_hmm_r9.cpp | 2 +- src/hmm/nanopolish_profile_hmm_r9.inl | 28 +++++++++++---------------- src/main/nanopolish.cpp | 19 +++++++++--------- 5 files changed, 25 insertions(+), 35 deletions(-) diff --git a/src/hmm/nanopolish_emissions.h b/src/hmm/nanopolish_emissions.h index 3dca4746..f9e85142 100644 --- a/src/hmm/nanopolish_emissions.h +++ b/src/hmm/nanopolish_emissions.h @@ -58,12 +58,10 @@ inline float log_probability_match_r9(const SquiggleRead& read, const PoreModel& pore_model, uint32_t kmer_rank, uint32_t event_idx, - uint8_t strand, - bool debug = false) + uint8_t strand) { // event level mean, scaled with the drift value float level = read.get_drift_scaled_level(event_idx, strand); - GaussianParameters gp = read.get_scaled_gaussian_from_pore_model_state(pore_model, strand, kmer_rank); float lp = log_normal_pdf(level, gp); return lp; diff --git a/src/hmm/nanopolish_profile_hmm_r7.inl b/src/hmm/nanopolish_profile_hmm_r7.inl index 7f9083e9..bf0edd28 100644 --- a/src/hmm/nanopolish_profile_hmm_r7.inl +++ b/src/hmm/nanopolish_profile_hmm_r7.inl @@ -306,10 +306,9 @@ inline float profile_hmm_fill_generic_r7(const HMMInputSequence& _sequence, assert( data.pore_model->states.size() == sequence.get_num_kmer_ranks(k) ); std::vector kmer_ranks(num_kmers); - for(size_t ki = 0; ki < num_kmers; ++ki) { - int rank = 
sequence.get_kmer_rank(ki, k, data.rc); - kmer_ranks[ki] = rank; - } + for(size_t ki = 0; ki < num_kmers; ++ki) + kmer_ranks[ki] = sequence.get_kmer_rank(ki, k, data.rc); + size_t num_events = output.get_num_rows() - 1; std::vector pre_flank = make_pre_flanking_r7(data, parameters, e_start, num_events); diff --git a/src/hmm/nanopolish_profile_hmm_r9.cpp b/src/hmm/nanopolish_profile_hmm_r9.cpp index 1f365ebe..773394a7 100644 --- a/src/hmm/nanopolish_profile_hmm_r9.cpp +++ b/src/hmm/nanopolish_profile_hmm_r9.cpp @@ -46,7 +46,7 @@ float profile_hmm_score_r9(const HMMInputSequence& sequence, const HMMInputData& FloatMatrix fm; allocate_matrix(fm, n_rows, n_states); - profile_hmm_forward_initialize_r9(fm); // what does this do? + profile_hmm_forward_initialize_r9(fm); ProfileHMMForwardOutputR9 output(&fm); diff --git a/src/hmm/nanopolish_profile_hmm_r9.inl b/src/hmm/nanopolish_profile_hmm_r9.inl index c09b4321..71d52aba 100644 --- a/src/hmm/nanopolish_profile_hmm_r9.inl +++ b/src/hmm/nanopolish_profile_hmm_r9.inl @@ -216,6 +216,7 @@ inline std::vector make_pre_flanking(const HMMInputData& data, pre_flank[i] = log(TRANS_CLIP_SELF) + log_probability_background(*data.read, event_idx, data.strand) + // emit from background pre_flank[i - 1]; // this accounts for the transition from the start & to the silent pre + } return pre_flank; @@ -260,7 +261,7 @@ inline std::vector make_post_flanking(const HMMInputData& data, template inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, const HMMInputData& _data, - const uint32_t, //e_start apparently not used by this function + const uint32_t, uint32_t flags, ProfileHMMOutput& output) { @@ -281,10 +282,10 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, #endif uint32_t e_start = data.event_start_idx; - + // Calculate number of blocks // A block of the HMM is a set of states for one kmer - uint32_t num_blocks = output.get_num_columns() / PSR9_NUM_STATES; // num_columns is the number 
of HMM STATES + uint32_t num_blocks = output.get_num_columns() / PSR9_NUM_STATES; uint32_t last_event_row_idx = output.get_num_rows() - 1; // Precompute the transition probabilites for each kmer block @@ -300,10 +301,8 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, assert( data.pore_model->states.size() == sequence.get_num_kmer_ranks(k) ); std::vector kmer_ranks(num_kmers); - for(size_t ki = 0; ki < num_kmers; ++ki) { - int kr = sequence.get_kmer_rank(ki, k, data.rc); // can * -1 here to see if 3rd is correct - kmer_ranks[ki] = kr; - } + for(size_t ki = 0; ki < num_kmers; ++ki) + kmer_ranks[ki] = sequence.get_kmer_rank(ki, k, data.rc); size_t num_events = output.get_num_rows() - 1; @@ -338,8 +337,7 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, // Emission probabilities uint32_t event_idx = e_start + (row - 1) * data.event_stride; uint32_t rank = kmer_ranks[kmer_idx]; - float lp_emission_m = log_probability_match_r9(*data.read, *data.pore_model, rank, event_idx, data.strand, true); - + float lp_emission_m = log_probability_match_r9(*data.read, *data.pore_model, rank, event_idx, data.strand); float lp_emission_b = BAD_EVENT_PENALTY; HMMUpdateScores scores; @@ -351,8 +349,6 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, scores.x[HMT_FROM_PREV_B] = bt.lp_bm_next + output.get(row - 1, prev_block_offset + PSR9_BAD_EVENT); scores.x[HMT_FROM_PREV_K] = bt.lp_km + output.get(row - 1, prev_block_offset + PSR9_KMER_SKIP); - scores.x[HMT_FROM_PREV_B] = bt.lp_bm_next + output.get(row - 1, prev_block_offset + PSR9_BAD_EVENT); - // m_s is the probability of going from the start state // to this kmer. The start state is (currently) only // allowed to go to the first kmer. If ALLOW_PRE_CLIP @@ -361,10 +357,10 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, scores.x[HMT_FROM_SOFT] = (kmer_idx == 0 && (event_idx == e_start || (flags & HAF_ALLOW_PRE_CLIP))) ? 
lp_sm + pre_flank[row - 1] : -INFINITY; - + output.update_cell(row, curr_block_offset + PSR9_MATCH, scores, lp_emission_m); - // state PSR9_BAD_EVENT + // state PSR9_BAD_EVENT scores.x[HMT_FROM_SAME_M] = bt.lp_mb + output.get(row - 1, curr_block_offset + PSR9_MATCH); scores.x[HMT_FROM_PREV_M] = -INFINITY; // not allowed scores.x[HMT_FROM_SAME_B] = bt.lp_bb + output.get(row - 1, curr_block_offset + PSR9_BAD_EVENT); @@ -385,7 +381,6 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, // If POST_CLIP is enabled we allow the last kmer to transition directly // to the end after any event. Otherwise we only allow it from the // last kmer/event match. - if(kmer_idx == last_kmer_idx && ( (flags & HAF_ALLOW_POST_CLIP) || row == last_event_row_idx)) { float lp1 = lp_ms + output.get(row, curr_block_offset + PSR9_MATCH) + post_flank[row - 1]; float lp2 = lp_ms + output.get(row, curr_block_offset + PSR9_BAD_EVENT) + post_flank[row - 1]; @@ -396,7 +391,6 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, output.update_end(lp3, row, curr_block_offset + PSR9_KMER_SKIP); } - #ifdef DEBUG_LOCAL_ALIGNMENT printf("[%d %d] start: %.2lf pre: %.2lf fm: %.2lf\n", event_idx, kmer_idx, m_s + lp_emission_m, pre_flank[row - 1], output.get(row, curr_block_offset + PSR9_MATCH)); printf("[%d %d] end: %.2lf post: %.2lf\n", event_idx, kmer_idx, lp_end, post_flank[row - 1]); @@ -430,7 +424,7 @@ inline float profile_hmm_fill_generic_r9(const HMMInputSequence& _sequence, #endif } } - - return output.get_end(); + + return output.get_end(); } diff --git a/src/main/nanopolish.cpp b/src/main/nanopolish.cpp index 417790ca..3962d79b 100644 --- a/src/main/nanopolish.cpp +++ b/src/main/nanopolish.cpp @@ -56,10 +56,10 @@ int print_usage(int, char **) int print_version(int, char **) { static const char *VERSION_MESSAGE = - "nanopolish version " PACKAGE_VERSION "\n" - "Written by Jared Simpson.\n" - "\n" - "Copyright 2015-2017 Ontario Institute for Cancer 
Research\n"; + "nanopolish version " PACKAGE_VERSION "\n" + "Written by Jared Simpson.\n" + "\n" + "Copyright 2015-2017 Ontario Institute for Cancer Research\n"; std::cout << VERSION_MESSAGE << std::endl; return 0; } @@ -77,10 +77,9 @@ int main(int argc, char** argv) } else { std::string command(argv[1]); auto iter = programs.find(command); - if (iter != programs.end()) { - ret = iter->second(argc - 1, argv + 1); - } - else + if (iter != programs.end()) + ret = iter->second( argc - 1, argv + 1); + else ret = print_usage( argc - 1, argv + 1); } @@ -93,8 +92,8 @@ int main(int argc, char** argv) extern int g_failed_alignment_reads; extern int g_bad_fast5_file; if(g_total_reads > 0) { - fprintf(stderr, "[post-run summaryz] total reads: %d, unparseable: %d, qc fail: %d, could not calibrate: %d, no alignment: %d, bad fast5: %d\n", - g_total_reads, g_unparseable_reads, g_qc_fail_reads, g_failed_calibration_reads, g_failed_alignment_reads, g_bad_fast5_file); + fprintf(stderr, "[post-run summary] total reads: %d, unparseable: %d, qc fail: %d, could not calibrate: %d, no alignment: %d, bad fast5: %d\n", + g_total_reads, g_unparseable_reads, g_qc_fail_reads, g_failed_calibration_reads, g_failed_alignment_reads, g_bad_fast5_file); } return ret; } From 56585975849a46d25bc54adc93c9b8058342f74d Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Sat, 28 Sep 2019 15:23:40 +1000 Subject: [PATCH 74/80] restructured to minimise changes to the original source code --- Makefile | 27 +- cuda.mk | 27 ++ src/cuda_kernels/gpu_call_variants.inl | 191 ++++++++++ src/nanopolish_call_variants.cpp | 475 ++++++++----------------- 4 files changed, 370 insertions(+), 350 deletions(-) create mode 100644 cuda.mk create mode 100644 src/cuda_kernels/gpu_call_variants.inl diff --git a/Makefile b/Makefile index f45fda70..5d3bbbe0 100644 --- a/Makefile +++ b/Makefile @@ -116,34 +116,9 @@ CPP_OBJ = $(CPP_SRC:.cpp=.o) C_OBJ = $(C_SRC:.c=.o) ifdef cuda - - NVCC = nvcc - NVCCFLAGS ?= -std=c++11 -I. 
-I/usr/local/cuda/include -O3 -use_fast_math --default-stream per-thread -restrict - CURTFLAGS ?= -L/usr/local/cuda/lib64 -lcudart - - CUDA_INCLUDE?=-I/usr/local/cuda/include - CPPFLAGS+=$(CUDA_INCLUDE) - CPPFLAGS+=-DHAVE_CUDA=1 - - # Sub directories containing CUDA source code - SUBDIRS+=src/cuda_kernels - # Find the source files by searching subdirectories - CU_SRC := $(foreach dir, $(SUBDIRS), $(wildcard $(dir)/*.cu)) - # Automatically generated object names - CU_OBJ=$(CU_SRC:.cu=.o) - CPP_OBJ+=$(CU_OBJ) - LDFLAGS+=$(CURTFLAGS) - -.SUFFIXES: .cu - -# Compile objects -.cu.o: - $(NVCC) -o $@ -c $(NVCCFLAGS) $(CPPFLAGS) $< - +include cuda.mk endif - - # Generate dependencies .PHONY: depend depend: .depend diff --git a/cuda.mk b/cuda.mk new file mode 100644 index 00000000..57c97bf4 --- /dev/null +++ b/cuda.mk @@ -0,0 +1,27 @@ +#Make file options for CUDA support + +NVCC ?= nvcc +CUDA_ROOT = /usr/local/cuda +CUDA_LIB ?= $(CUDA_ROOT)/lib64 +CUDA_INCLUDE ?= $(CUDA_ROOT)/include +CURTFLAGS = -L$(CUDA_LIB) -lcudart +NVCCFLAGS ?= -std=c++11 -I. 
-I$(CUDA_INCLUDE) -O3 -use_fast_math --default-stream per-thread -restrict + +CPPFLAGS += -I$(CUDA_INCLUDE) +CPPFLAGS += -DHAVE_CUDA=1 + +# Sub directories containing CUDA source code +SUBDIRS += src/cuda_kernels +# Find the source files by searching subdirectories +CU_SRC := $(foreach dir, $(SUBDIRS), $(wildcard $(dir)/*.cu)) +# Automatically generated object names +CU_OBJ = $(CU_SRC:.cu=.o) +CPP_OBJ += $(CU_OBJ) +LDFLAGS += $(CURTFLAGS) + +.SUFFIXES: .cu + +# Compile objects +.cu.o: + $(NVCC) -o $@ -c $(NVCCFLAGS) $(CPPFLAGS) $< + diff --git a/src/cuda_kernels/gpu_call_variants.inl b/src/cuda_kernels/gpu_call_variants.inl new file mode 100644 index 00000000..c5036dcf --- /dev/null +++ b/src/cuda_kernels/gpu_call_variants.inl @@ -0,0 +1,191 @@ +#include +#include +#include +#include + +void prepareForBaseEditCandidates(int start, + int end, + const AlignmentDB& alignments, + std::string contig, + std::vector> &tmp_variants_vector, + std::vector &haplotypes, + std::vector> &event_sequences_vector){ + for(int i = start; i<=end; i++){ + int calling_start = i - opt::screen_flanking_sequence; + int calling_end = i + 1 + opt::screen_flanking_sequence; + + if (!alignments.are_coordinates_valid(contig, calling_start, calling_end)) { + return; + } + + std::vector tmp_variants; + for (size_t j = 0; j < 4; ++j) { + // Substitutions + Variant v; + v.ref_name = contig; + v.ref_position = i; + v.ref_seq = alignments.get_reference_substring(contig, i, i); + v.alt_seq = "ACGT"[j]; + + if (v.ref_seq != v.alt_seq) { + tmp_variants.push_back(v); + } + + // Insertions + v.alt_seq = v.ref_seq + "ACGT"[j]; + // ignore insertions of the type "A" -> "AA" as these are redundant + if (v.alt_seq[1] != v.ref_seq[0]) { + tmp_variants.push_back(v); + } + } + + // deletion + Variant del; + del.ref_name = contig; + del.ref_position = i - 1; + del.ref_seq = alignments.get_reference_substring(contig, i - 1, i); + del.alt_seq = del.ref_seq[0]; + + // ignore deletions of the type "AA" -> "A" as 
these are redundant + if (del.alt_seq[0] != del.ref_seq[1]) { + tmp_variants.push_back(del); + } + + // Screen variants by score + // We do this internally here as it is much faster to get the event sequences + // for the entire window for all variants at this position once, rather than + // for each variant individually + std::vector event_sequences = alignments.get_event_subsequences(contig, calling_start, calling_end); + + Haplotype test_haplotype(contig, + calling_start, + alignments.get_reference_substring(contig, + calling_start, + calling_end)); + + haplotypes.push_back(test_haplotype); + event_sequences_vector.push_back(event_sequences); + tmp_variants_vector.push_back(tmp_variants); + } +} + + +void locusRangeBaseEditCandidateGPU(int start, + int end, + const AlignmentDB& alignments, + uint32_t alignment_flags, + std::vector &out_variants, + std::string contig, + GpuAligner &aligner, + std::mutex &outVariantsMutex) { + std::vector> tmp_variants_vector; + std::vector haplotypes; + std::vector> event_sequences_vector; + + prepareForBaseEditCandidates(start, + end, + alignments, + contig, + tmp_variants_vector, + haplotypes, + event_sequences_vector); + + std::vector scoredVariants = aligner.variantScoresThresholded(tmp_variants_vector, + haplotypes, + event_sequences_vector, + alignment_flags, + opt::screen_score_threshold, + opt::methylation_types); + for (auto variant: scoredVariants) { + if (variant.quality > 0) { + std::lock_guard lock(outVariantsMutex); + out_variants.push_back(variant); + } + } + +} + +std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& alignments, + int region_start, + int region_end, + uint32_t alignment_flags){ + + std::mutex outVariantsMutex; + std::vector out_variants; + std::string contig = alignments.get_region_contig(); + + // Add all positively-scoring single-base changes into the candidate set + size_t num_workers = (opt::num_threads < MAX_NUM_WORKERS) ? 
opt::num_threads : MAX_NUM_WORKERS; + std::vector gpuAligners(num_workers); + + //std::vector workerThreads(num_workers); + std::vector> handles(num_workers); + + int nextLocusBegin = region_start; + int nextLocusEnd = nextLocusBegin + LOCI_PER_WORKER; + bool finished = false; + + //Initialise the workers + for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { + auto aligner = std::ref(gpuAligners[workerIdx]); + if (!finished) { + if (nextLocusEnd == region_end) { + finished = true; + } + handles[workerIdx] = std::async(std::launch::async, + locusRangeBaseEditCandidateGPU, + nextLocusBegin, + nextLocusEnd, + std::ref(alignments), + alignment_flags, + std::ref(out_variants), + std::ref(contig), + aligner, + std::ref(outVariantsMutex)); + if ((nextLocusEnd + LOCI_PER_WORKER) < region_end){ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = nextLocusBegin + LOCI_PER_WORKER - 1; + }else{ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = region_end; + } + } + } + + //Round robin - assigning work to the workers until out of candidates + while (!finished) { + for (int i = 0; i < num_workers; i++) { + auto status = handles[i].wait_for(std::chrono::microseconds(100)); + if (status == std::future_status::ready && (!finished)) { + if (nextLocusEnd == region_end){ + finished = true; + } + auto aligner = std::ref(gpuAligners[i]); + handles[i].get(); + handles[i] = std::async(std::launch::async, + locusRangeBaseEditCandidateGPU, + nextLocusBegin, + nextLocusEnd, + std::ref(alignments), + alignment_flags, + std::ref(out_variants), + std::ref(contig), + aligner, + std::ref(outVariantsMutex)); + if ((nextLocusEnd + LOCI_PER_WORKER) < region_end){ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = nextLocusBegin + LOCI_PER_WORKER - 1; + }else{ + nextLocusBegin = nextLocusEnd + 1; + nextLocusEnd = region_end; + } + } + } + } + + //Block until all workers are complete + for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { + handles[workerIdx].wait(); + } 
+ return out_variants; +} diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp index 90f991b3..3aaf371a 100644 --- a/src/nanopolish_call_variants.cpp +++ b/src/nanopolish_call_variants.cpp @@ -38,12 +38,6 @@ #include "profiler.h" #include "progress.h" #include "stdaln.h" -#ifdef HAVE_CUDA - #include - #include - #include - #include -#endif // Macros #define max3(x,y,z) std::max(std::max(x,y), z) @@ -68,43 +62,43 @@ float g_p_skip, g_p_skip_self, g_p_bad, g_p_bad_self; #define SUBPROGRAM "variants" static const char *CONSENSUS_VERSION_MESSAGE = - SUBPROGRAM " Version " PACKAGE_VERSION "\n" - "Written by Jared Simpson.\n" - "\n" - "Copyright 2015 Ontario Institute for Cancer Research\n"; +SUBPROGRAM " Version " PACKAGE_VERSION "\n" +"Written by Jared Simpson.\n" +"\n" +"Copyright 2015 Ontario Institute for Cancer Research\n"; static const char *CONSENSUS_USAGE_MESSAGE = - "Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTIONS] --reads reads.fa --bam alignments.bam --genome genome.fa\n" - "Find SNPs using a signal-level HMM\n" - "\n" - " -v, --verbose display verbose output\n" - " --version display version\n" - " --help display this help and exit\n" - " --snps only call SNPs\n" - " --consensus run in consensus calling mode\n" - " --fix-homopolymers run the experimental homopolymer caller\n" - " --faster minimize compute time while slightly reducing consensus accuracy\n" - " -w, --window=STR find variants in window STR (format: :-)\n" - " -r, --reads=FILE the ONT reads are in fasta FILE\n" - " -b, --bam=FILE the reads aligned to the reference genome are in bam FILE\n" - " -e, --event-bam=FILE the events aligned to the reference genome are in bam FILE\n" - " -g, --genome=FILE the reference genome is in FILE\n" - " -p, --ploidy=NUM the ploidy level of the sequenced genome\n" - " -q --methylation-aware=STR turn on methylation aware polishing and test motifs given in STR (example: -q dcm,dam)\n" - " --genotype=FILE call genotypes for the variants in 
the vcf FILE\n" - " -o, --outfile=FILE write result to FILE [default: stdout]\n" - " -t, --threads=NUM use NUM threads (default: 1)\n" - " -m, --min-candidate-frequency=F extract candidate variants from the aligned reads when the variant frequency is at least F (default 0.2)\n" - " -d, --min-candidate-depth=D extract candidate variants from the aligned reads when the depth is at least D (default: 20)\n" - " -x, --max-haplotypes=N consider at most N haplotype combinations (default: 1000)\n" - " --min-flanking-sequence=N distance from alignment end to calculate variants (default: 30)\n" - " --max-rounds=N perform N rounds of consensus sequence improvement (default: 50)\n" - " -c, --candidates=VCF read variant candidates from VCF, rather than discovering them from aligned reads\n" - " -a, --alternative-basecalls-bam=FILE if an alternative basecaller was used that does not output event annotations\n" - " then use basecalled sequences from FILE. The signal-level events will still be taken from the -b bam.\n" - " --calculate-all-support when making a call, also calculate the support of the 3 other possible bases\n" - " --models-fofn=FILE read alternative k-mer models from FILE\n" - "\nReport bugs to " PACKAGE_BUGREPORT "\n\n"; +"Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTIONS] --reads reads.fa --bam alignments.bam --genome genome.fa\n" +"Find SNPs using a signal-level HMM\n" +"\n" +" -v, --verbose display verbose output\n" +" --version display version\n" +" --help display this help and exit\n" +" --snps only call SNPs\n" +" --consensus run in consensus calling mode\n" +" --fix-homopolymers run the experimental homopolymer caller\n" +" --faster minimize compute time while slightly reducing consensus accuracy\n" +" -w, --window=STR find variants in window STR (format: :-)\n" +" -r, --reads=FILE the ONT reads are in fasta FILE\n" +" -b, --bam=FILE the reads aligned to the reference genome are in bam FILE\n" +" -e, --event-bam=FILE the events aligned to the reference genome 
are in bam FILE\n" +" -g, --genome=FILE the reference genome is in FILE\n" +" -p, --ploidy=NUM the ploidy level of the sequenced genome\n" +" -q --methylation-aware=STR turn on methylation aware polishing and test motifs given in STR (example: -q dcm,dam)\n" +" --genotype=FILE call genotypes for the variants in the vcf FILE\n" +" -o, --outfile=FILE write result to FILE [default: stdout]\n" +" -t, --threads=NUM use NUM threads (default: 1)\n" +" -m, --min-candidate-frequency=F extract candidate variants from the aligned reads when the variant frequency is at least F (default 0.2)\n" +" -d, --min-candidate-depth=D extract candidate variants from the aligned reads when the depth is at least D (default: 20)\n" +" -x, --max-haplotypes=N consider at most N haplotype combinations (default: 1000)\n" +" --min-flanking-sequence=N distance from alignment end to calculate variants (default: 30)\n" +" --max-rounds=N perform N rounds of consensus sequence improvement (default: 50)\n" +" -c, --candidates=VCF read variant candidates from VCF, rather than discovering them from aligned reads\n" +" -a, --alternative-basecalls-bam=FILE if an alternative basecaller was used that does not output event annotations\n" +" then use basecalled sequences from FILE. 
The signal-level events will still be taken from the -b bam.\n" +" --calculate-all-support when making a call, also calculate the support of the 3 other possible bases\n" +" --models-fofn=FILE read alternative k-mer models from FILE\n" +"\nReport bugs to " PACKAGE_BUGREPORT "\n\n"; namespace opt { @@ -140,7 +134,6 @@ namespace opt static int debug_alignments = 0; static std::vector methylation_types; static int gpu = 0; - } static const char* shortopts = "r:b:g:t:w:o:e:m:c:d:a:x:q:p:v"; @@ -285,23 +278,28 @@ void annotate_with_all_support(std::vector& variants, } } -void prepareForBaseEditCandidates(int start, - int end, - const AlignmentDB& alignments, - std::string contig, - std::vector> &tmp_variants_vector, - std::vector &haplotypes, - std::vector> &event_sequences_vector){ - for(int i = start; i<=end; i++){ +// Given the input region, calculate all single base edits to the current assembly +std::vector generate_candidate_single_base_edits(const AlignmentDB& alignments, + int region_start, + int region_end, + uint32_t alignment_flags) +{ + std::vector out_variants; + + std::string contig = alignments.get_region_contig(); + + // Add all positively-scoring single-base changes into the candidate set + for(size_t i = region_start; i < region_end; ++i) { + int calling_start = i - opt::screen_flanking_sequence; int calling_end = i + 1 + opt::screen_flanking_sequence; - if (!alignments.are_coordinates_valid(contig, calling_start, calling_end)) { - return; + if(!alignments.are_coordinates_valid(contig, calling_start, calling_end)) { + continue; } std::vector tmp_variants; - for (size_t j = 0; j < 4; ++j) { + for(size_t j = 0; j < 4; ++j) { // Substitutions Variant v; v.ref_name = contig; @@ -309,14 +307,14 @@ void prepareForBaseEditCandidates(int start, v.ref_seq = alignments.get_reference_substring(contig, i, i); v.alt_seq = "ACGT"[j]; - if (v.ref_seq != v.alt_seq) { + if(v.ref_seq != v.alt_seq) { tmp_variants.push_back(v); } // Insertions v.alt_seq = v.ref_seq + 
"ACGT"[j]; // ignore insertions of the type "A" -> "AA" as these are redundant - if (v.alt_seq[1] != v.ref_seq[0]) { + if(v.alt_seq[1] != v.ref_seq[0]) { tmp_variants.push_back(v); } } @@ -329,7 +327,7 @@ void prepareForBaseEditCandidates(int start, del.alt_seq = del.ref_seq[0]; // ignore deletions of the type "AA" -> "A" as these are redundant - if (del.alt_seq[0] != del.ref_seq[1]) { + if(del.alt_seq[0] != del.ref_seq[1]) { tmp_variants.push_back(del); } @@ -337,199 +335,29 @@ void prepareForBaseEditCandidates(int start, // We do this internally here as it is much faster to get the event sequences // for the entire window for all variants at this position once, rather than // for each variant individually - std::vector event_sequences = alignments.get_event_subsequences(contig, calling_start, calling_end); + std::vector event_sequences = + alignments.get_event_subsequences(contig, calling_start, calling_end); Haplotype test_haplotype(contig, calling_start, - alignments.get_reference_substring(contig, - calling_start, - calling_end)); - - haplotypes.push_back(test_haplotype); - event_sequences_vector.push_back(event_sequences); - tmp_variants_vector.push_back(tmp_variants); - } -} - -#ifdef HAVE_CUDA -void locusRangeBaseEditCandidateGPU(int start, - int end, - const AlignmentDB& alignments, - uint32_t alignment_flags, - std::vector &out_variants, - std::string contig, - GpuAligner &aligner, - std::mutex &outVariantsMutex) { - std::vector> tmp_variants_vector; - std::vector haplotypes; - std::vector> event_sequences_vector; - - prepareForBaseEditCandidates(start, - end, - alignments, - contig, - tmp_variants_vector, - haplotypes, - event_sequences_vector); - - std::vector scoredVariants = aligner.variantScoresThresholded(tmp_variants_vector, - haplotypes, - event_sequences_vector, - alignment_flags, - opt::screen_score_threshold, - opt::methylation_types); - for (auto variant: scoredVariants) { - if (variant.quality > 0) { - std::lock_guard lock(outVariantsMutex); 
- out_variants.push_back(variant); - } - } - -} -#endif + alignments.get_reference_substring(contig, calling_start, calling_end)); -void locusRangeBaseEditCandidate(int start, - int end, - const AlignmentDB& alignments, - uint32_t alignment_flags, - std::vector &out_variants, - std::string contig) { - std::vector> tmp_variants_vector; - std::vector haplotypes; - std::vector> event_sequences_vector; - - prepareForBaseEditCandidates(start, - end, - alignments, - contig, - tmp_variants_vector, - haplotypes, - event_sequences_vector); - - int numHaplotypes = haplotypes.size(); - for (int haplotypeIDX = 0; haplotypeIDX < numHaplotypes; haplotypeIDX++) { - auto variants = tmp_variants_vector[haplotypeIDX]; - auto test_haplotype = haplotypes[haplotypeIDX]; - auto event_sequences = event_sequences_vector[haplotypeIDX]; - for (const Variant &v : variants) { - Variant scored_variant = score_variant_thresholded(v, - test_haplotype, - event_sequences, - alignment_flags, - opt::screen_score_threshold, - opt::methylation_types); + for(const Variant& v : tmp_variants) { + Variant scored_variant = score_variant_thresholded(v, test_haplotype, event_sequences, alignment_flags, opt::screen_score_threshold, opt::methylation_types); scored_variant.info = ""; - if (scored_variant.quality > 0) { + if(scored_variant.quality > 0) { out_variants.push_back(scored_variant); } } + } + return out_variants; } #ifdef HAVE_CUDA -std::vector generate_candidate_single_base_edits_gpu(const AlignmentDB& alignments, - int region_start, - int region_end, - uint32_t alignment_flags){ - - std::mutex outVariantsMutex; - std::vector out_variants; - std::string contig = alignments.get_region_contig(); - - // Add all positively-scoring single-base changes into the candidate set - size_t num_workers = (opt::num_threads < MAX_NUM_WORKERS) ? 
opt::num_threads : MAX_NUM_WORKERS; - std::vector gpuAligners(num_workers); - - //std::vector workerThreads(num_workers); - std::vector> handles(num_workers); - - int nextLocusBegin = region_start; - int nextLocusEnd = nextLocusBegin + LOCI_PER_WORKER; - bool finished = false; - - //Initialise the workers - for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { - auto aligner = std::ref(gpuAligners[workerIdx]); - if (!finished) { - if (nextLocusEnd == region_end) { - finished = true; - } - handles[workerIdx] = std::async(std::launch::async, - locusRangeBaseEditCandidateGPU, - nextLocusBegin, - nextLocusEnd, - std::ref(alignments), - alignment_flags, - std::ref(out_variants), - std::ref(contig), - aligner, - std::ref(outVariantsMutex)); - if ((nextLocusEnd + LOCI_PER_WORKER) < region_end){ - nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = nextLocusBegin + LOCI_PER_WORKER - 1; - }else{ - nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = region_end; - } - } - } - - //Round robin - assigning work to the workers until out of candidates - while (!finished) { - for (int i = 0; i < num_workers; i++) { - auto status = handles[i].wait_for(std::chrono::microseconds(100)); - if (status == std::future_status::ready && (!finished)) { - if (nextLocusEnd == region_end){ - finished = true; - } - auto aligner = std::ref(gpuAligners[i]); - handles[i].get(); - handles[i] = std::async(std::launch::async, - locusRangeBaseEditCandidateGPU, - nextLocusBegin, - nextLocusEnd, - std::ref(alignments), - alignment_flags, - std::ref(out_variants), - std::ref(contig), - aligner, - std::ref(outVariantsMutex)); - if ((nextLocusEnd + LOCI_PER_WORKER) < region_end){ - nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = nextLocusBegin + LOCI_PER_WORKER - 1; - }else{ - nextLocusBegin = nextLocusEnd + 1; - nextLocusEnd = region_end; - } - } - } - } - - //Block until all workers are complete - for (int workerIdx = 0; workerIdx < num_workers; workerIdx++) { - handles[workerIdx].wait(); - } 
- return out_variants; -} + #include #endif -// Given the input region, calculate all single base edits to the current assembly -std::vector generate_candidate_single_base_edits(const AlignmentDB& alignments, - int region_start, - int region_end, - uint32_t alignment_flags){ - std::vector out_variants; - std::string contig = alignments.get_region_contig(); - locusRangeBaseEditCandidate(region_start, - region_end, - alignments, - alignment_flags, - out_variants, - std::ref(contig)); - - return out_variants; -} - // Given the input set of variants, calculate the variants that have a positive score std::vector screen_variants_by_score(const AlignmentDB& alignments, const std::vector& candidate_variants, @@ -847,7 +675,7 @@ Haplotype fix_homopolymers(const Haplotype& input_haplotype, duration_likelihoods[var_sequence_length] += log_gamma; } if(opt::verbose > 3) { - fprintf(stderr, "SUM_VAR\t%zu\t%zu\t%d\t%d\t%lu\t%.5lf\t%.2lf\n", ref_hp_start, hp_length, var_sequence_length, call_window, variant_offset_end - variant_offset_start, sum_duration, log_gamma); + fprintf(stderr, "SUM_VAR\t%zu\t%zu\t%d\t%d\t%lu\t%.5lf\t%.2lf\n", ref_hp_start, hp_length, var_sequence_length, call_window, variant_offset_end - variant_offset_start, sum_duration, log_gamma); } } } @@ -963,7 +791,7 @@ Haplotype call_haplotype_from_candidates(const AlignmentDB& alignments, size_t end_variant_idx = curr_variant_idx + 1; while(end_variant_idx < candidate_variants.size()) { int distance = candidate_variants[end_variant_idx].ref_position - - candidate_variants[end_variant_idx - 1].ref_position; + candidate_variants[end_variant_idx - 1].ref_position; if(distance > opt::min_distance_between_variants) break; end_variant_idx++; @@ -972,8 +800,8 @@ Haplotype call_haplotype_from_candidates(const AlignmentDB& alignments, size_t num_variants = end_variant_idx - curr_variant_idx; int calling_start = candidate_variants[curr_variant_idx].ref_position - opt::min_flanking_sequence; int calling_end = 
candidate_variants[end_variant_idx - 1].ref_position + - candidate_variants[end_variant_idx - 1].ref_seq.length() + - opt::min_flanking_sequence; + candidate_variants[end_variant_idx - 1].ref_seq.length() + + opt::min_flanking_sequence; int calling_size = calling_end - calling_start; @@ -1017,7 +845,7 @@ Haplotype call_haplotype_from_candidates(const AlignmentDB& alignments, } } else { fprintf(stderr, "Warning: %zu variants in span, region not called [%d %d]\n", num_variants, calling_start, calling_end); - } + } // advance to start of next region curr_variant_idx = end_variant_idx; @@ -1073,11 +901,12 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, fprintf(stderr, "input region: %s\n", alignments.get_reference_substring(contig, region_start - BUFFER, region_end + BUFFER).c_str()); } - /* - Haplotype called_haplotype(alignments.get_region_contig(), - alignments.get_region_start(), - alignments.get_reference()); - */ +/* + Haplotype called_haplotype(alignments.get_region_contig(), + alignments.get_region_start(), + alignments.get_reference()); +*/ + // Step 1. 
Discover putative variants across the whole region std::vector candidate_variants; if(opt::candidates_file.empty()) { @@ -1089,24 +918,20 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, if(opt::consensus_mode) { // generate single-base edits that have a positive haplotype score - std::vector single_base_edits; - if(opt::gpu) { - #ifdef HAVE_CUDA - single_base_edits = generate_candidate_single_base_edits_gpu(alignments, - region_start, - region_end, - alignment_flags); - #else - fprintf(stderr,"Not compiled for CUDA\n"); - exit(1); - #endif - } else { - single_base_edits = generate_candidate_single_base_edits(alignments, - region_start, - region_end, - alignment_flags); + if(opt::gpu==0) { + single_base_edits= generate_candidate_single_base_edits(alignments, region_start, region_end, alignment_flags); + } + else{ + #ifdef HAVE_CUDA + single_base_edits= generate_candidate_single_base_edits_gpu(alignments, region_start, region_end, alignment_flags); + #else + fprintf(stderr,"--gpu option is only effective when compiled with CUDA support\n"); + fprintf(stderr,"Please compile nanopolish by 'make cuda=1'. You need to have CUDA toolkit setup for this."); + exit(EXIT_FAILURE); + #endif } + // insert these into the candidate set candidate_variants.insert(candidate_variants.end(), single_base_edits.begin(), single_base_edits.end()); @@ -1117,6 +942,8 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, std::sort(candidate_variants.begin(), candidate_variants.end(), sortByPosition); } + // Step 2. 
Call variants + Haplotype called_haplotype(alignments.get_region_contig(), alignments.get_region_start(), alignments.get_reference()); @@ -1162,10 +989,10 @@ Haplotype call_variants_for_region(const std::string& contig, int region_start, last_round_variant_keys = this_round_variant_keys; if(variant_set_changed) { candidate_variants = expand_variants(alignments, - called_variants, - region_start, - region_end, - alignment_flags); + called_variants, + region_start, + region_end, + alignment_flags); } else { break; @@ -1196,44 +1023,44 @@ void parse_call_variants_options(int argc, char** argv) for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { std::istringstream arg(optarg != NULL ? optarg : ""); switch (c) { - case 'r': arg >> opt::reads_file; break; - case 'g': arg >> opt::genome_file; break; - case 'b': arg >> opt::bam_file; break; - case 'e': arg >> opt::event_bam_file; break; - case 'w': arg >> opt::window; break; - case 'o': arg >> opt::output_file; break; - case 'm': arg >> opt::min_candidate_frequency; break; - case 'd': arg >> opt::min_candidate_depth; break; - case 'x': arg >> opt::max_haplotypes; break; - case 'c': arg >> opt::candidates_file; break; - case 'p': arg >> opt::ploidy; break; - case 'q': arg >> methylation_motifs_str; break; - case 'a': arg >> opt::alternative_basecalls_bam; break; - case '?': die = true; break; - case 't': arg >> opt::num_threads; break; - case 'v': opt::verbose++; break; - case OPT_CONSENSUS: opt::consensus_mode = 1; break; - case OPT_GPU: opt::gpu = 1; break; - case OPT_FIX_HOMOPOLYMERS: opt::fix_homopolymers = 1; break; - case OPT_EFFORT: arg >> opt::screen_score_threshold; break; - case OPT_FASTER: opt::screen_score_threshold = 25; break; - case OPT_MAX_ROUNDS: arg >> opt::max_rounds; break; - case OPT_GENOTYPE: opt::genotype_only = 1; arg >> opt::candidates_file; break; - case OPT_MODELS_FOFN: arg >> opt::models_fofn; break; - case OPT_CALC_ALL_SUPPORT: opt::calculate_all_support = 1; break; - 
case OPT_SNPS_ONLY: opt::snps_only = 1; break; - case OPT_PROGRESS: opt::show_progress = 1; break; - case OPT_P_SKIP: arg >> g_p_skip; break; - case OPT_P_SKIP_SELF: arg >> g_p_skip_self; break; - case OPT_P_BAD: arg >> g_p_bad; break; - case OPT_P_BAD_SELF: arg >> g_p_bad_self; break; - case OPT_MIN_FLANKING_SEQUENCE: arg >> opt::min_flanking_sequence; break; - case OPT_HELP: - std::cout << CONSENSUS_USAGE_MESSAGE; - exit(EXIT_SUCCESS); - case OPT_VERSION: - std::cout << CONSENSUS_VERSION_MESSAGE; - exit(EXIT_SUCCESS); + case 'r': arg >> opt::reads_file; break; + case 'g': arg >> opt::genome_file; break; + case 'b': arg >> opt::bam_file; break; + case 'e': arg >> opt::event_bam_file; break; + case 'w': arg >> opt::window; break; + case 'o': arg >> opt::output_file; break; + case 'm': arg >> opt::min_candidate_frequency; break; + case 'd': arg >> opt::min_candidate_depth; break; + case 'x': arg >> opt::max_haplotypes; break; + case 'c': arg >> opt::candidates_file; break; + case 'p': arg >> opt::ploidy; break; + case 'q': arg >> methylation_motifs_str; break; + case 'a': arg >> opt::alternative_basecalls_bam; break; + case '?': die = true; break; + case 't': arg >> opt::num_threads; break; + case 'v': opt::verbose++; break; + case OPT_CONSENSUS: opt::consensus_mode = 1; break; + case OPT_GPU: opt::gpu = 1; break; + case OPT_FIX_HOMOPOLYMERS: opt::fix_homopolymers = 1; break; + case OPT_EFFORT: arg >> opt::screen_score_threshold; break; + case OPT_FASTER: opt::screen_score_threshold = 25; break; + case OPT_MAX_ROUNDS: arg >> opt::max_rounds; break; + case OPT_GENOTYPE: opt::genotype_only = 1; arg >> opt::candidates_file; break; + case OPT_MODELS_FOFN: arg >> opt::models_fofn; break; + case OPT_CALC_ALL_SUPPORT: opt::calculate_all_support = 1; break; + case OPT_SNPS_ONLY: opt::snps_only = 1; break; + case OPT_PROGRESS: opt::show_progress = 1; break; + case OPT_P_SKIP: arg >> g_p_skip; break; + case OPT_P_SKIP_SELF: arg >> g_p_skip_self; break; + case OPT_P_BAD: arg 
>> g_p_bad; break; + case OPT_P_BAD_SELF: arg >> g_p_bad_self; break; + case OPT_MIN_FLANKING_SEQUENCE: arg >> opt::min_flanking_sequence; break; + case OPT_HELP: + std::cout << CONSENSUS_USAGE_MESSAGE; + exit(EXIT_SUCCESS); + case OPT_VERSION: + std::cout << CONSENSUS_VERSION_MESSAGE; + exit(EXIT_SUCCESS); } } @@ -1287,10 +1114,10 @@ void parse_call_variants_options(int argc, char** argv) } if (die) - { - std::cout << "\n" << CONSENSUS_USAGE_MESSAGE; - exit(EXIT_FAILURE); - } + { + std::cout << "\n" << CONSENSUS_USAGE_MESSAGE; + exit(EXIT_FAILURE); + } } void print_invalid_window_error(int start_base, int end_base) @@ -1356,34 +1183,34 @@ int call_variants_main(int argc, char** argv) // header_fields.push_back( - Variant::make_vcf_tag_string("INFO", "TotalReads", 1, "Integer", - "The number of event-space reads used to call the variant")); + Variant::make_vcf_tag_string("INFO", "TotalReads", 1, "Integer", + "The number of event-space reads used to call the variant")); header_fields.push_back( - Variant::make_vcf_tag_string("INFO", "SupportFraction", 1, "Float", - "The fraction of event-space reads that support the variant")); + Variant::make_vcf_tag_string("INFO", "SupportFraction", 1, "Float", + "The fraction of event-space reads that support the variant")); header_fields.push_back( - Variant::make_vcf_tag_string("INFO", "BaseCalledReadsWithVariant", 1, "Integer", - "The number of base-space reads that support the variant")); + Variant::make_vcf_tag_string("INFO", "BaseCalledReadsWithVariant", 1, "Integer", + "The number of base-space reads that support the variant")); header_fields.push_back( - Variant::make_vcf_tag_string("INFO", "BaseCalledFraction", 1, "Float", - "The fraction of base-space reads that support the variant")); + Variant::make_vcf_tag_string("INFO", "BaseCalledFraction", 1, "Float", + "The fraction of base-space reads that support the variant")); header_fields.push_back( - Variant::make_vcf_tag_string("INFO", "AlleleCount", 1, "Integer", - "The 
inferred number of copies of the allele")); + Variant::make_vcf_tag_string("INFO", "AlleleCount", 1, "Integer", + "The inferred number of copies of the allele")); if(opt::calculate_all_support) { header_fields.push_back( - Variant::make_vcf_tag_string("INFO", "SupportFractionByBase", 4, "Integer", - "The fraction of reads supporting A,C,G,T at this position")); + Variant::make_vcf_tag_string("INFO", "SupportFractionByBase", 4, "Integer", + "The fraction of reads supporting A,C,G,T at this position")); } header_fields.push_back( - Variant::make_vcf_tag_string("FORMAT", "GT", 1, "String", - "Genotype")); + Variant::make_vcf_tag_string("FORMAT", "GT", 1, "String", + "Genotype")); Variant::write_vcf_header(out_fp, header_fields); @@ -1393,9 +1220,9 @@ int call_variants_main(int argc, char** argv) if(!opt::consensus_output.empty()) { FILE* consensus_fp = fopen(opt::consensus_output.c_str(), "w"); fprintf(consensus_fp, ">%s:%d-%d\n%s\n", contig.c_str(), - start_base, - end_base, - haplotype.get_sequence().c_str()); + start_base, + end_base, + haplotype.get_sequence().c_str()); fclose(consensus_fp); } From 8338b92df9d54de3b2457068df910ab47138a6b6 Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Sat, 28 Sep 2019 15:42:06 +1000 Subject: [PATCH 75/80] make the --gpu more clear --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ebf66275..fc36fc65 100644 --- a/README.md +++ b/README.md @@ -116,10 +116,11 @@ docker run -v /path/to/local/data/data/:/data/ -it :image_id ./nanopolish event ## GPU acceleration -The nanopolish consensus improvement algorithm can be performed faster using CUDA-enabled GPU acceleration. This is an experimental feature, to try this feature run with the `--gpu` flag e.g: +The nanopolish consensus improvement algorithm can be performed faster using CUDA-enabled GPU acceleration. 
This is an experimental feature; to try it, run with the `--gpu=1` flag, e.g.: ``` nanopolish variants --consensus polished_gpu.fa -w "tig00000001:200000-230000" -r reads.fasta -b reads.sorted.bam -g draft.fa --threads=8 --gpu=1 ``` +Note that this feature requires nanopolish to be compiled with `make cuda=1`. You should have the [CUDA toolkit installed and configured](https://docs.nvidia.com/cuda/cuda-quick-start-guide/). If your CUDA installation is not in the default location, you can provide the path to make as `make cuda=1 NVCC=/path/to/nvidia_c_compiler CUDA_LIB=/path/to/cuda/lib CUDA_INCLUDE=/path/to/cuda/include`. ## Credits and Thanks From c05733bbfea00f0b3e375d19f5be637ca3a10bd6 Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Sat, 28 Sep 2019 16:09:03 +1000 Subject: [PATCH 76/80] set to cuda static runtime library --- cuda.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda.mk b/cuda.mk index 57c97bf4..59c91c4f 100644 --- a/cuda.mk +++ b/cuda.mk @@ -4,7 +4,7 @@ NVCC ?= nvcc CUDA_ROOT = /usr/local/cuda CUDA_LIB ?= $(CUDA_ROOT)/lib64 CUDA_INCLUDE ?= $(CUDA_ROOT)/include -CURTFLAGS = -L$(CUDA_LIB) -lcudart +CURTFLAGS = -L$(CUDA_LIB) -lcudart_static -lrt NVCCFLAGS ?= -std=c++11 -I.
-I$(CUDA_INCLUDE) -O3 -use_fast_math --default-stream per-thread -restrict CPPFLAGS += -I$(CUDA_INCLUDE) From 8964db064785eb039b43f23657520680610ef01f Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Sat, 28 Sep 2019 18:30:52 +1000 Subject: [PATCH 77/80] removed .gitignore in test/ --- test/.gitignore | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 test/.gitignore diff --git a/test/.gitignore b/test/.gitignore deleted file mode 100644 index 479a396b..00000000 --- a/test/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -ecoli_2kb_region - From 896b8066e49d9004f0e62aed70d1f966b49bdb3a Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Thu, 3 Oct 2019 21:47:29 +1000 Subject: [PATCH 78/80] add cuda object file to make file clean option --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 1e56cca4..9d7a6974 100644 --- a/Makefile +++ b/Makefile @@ -155,4 +155,5 @@ test: $(TEST_PROGRAM) .PHONY: clean clean: rm -f $(PROGRAM) $(TEST_PROGRAM) $(CPP_OBJ) $(C_OBJ) \ + src/cuda_kernels/GpuAligner.o \ src/main/nanopolish.o src/test/nanopolish_test.o From 09df08dbe6e3aefa2332acf2988f72b6124b72c6 Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Fri, 13 Mar 2020 11:14:14 +1100 Subject: [PATCH 79/80] implementation of the methylation aware polishing option for the GPU --- Makefile | 2 +- cuda.mk | 2 +- src/cuda_kernels/GpuAligner.cu | 722 --------------- src/cuda_kernels/gpu_aligner.cu | 864 ++++++++++++++++++ .../{GpuAligner.h => gpu_aligner.h} | 29 +- src/cuda_kernels/gpu_call_variants.inl | 2 +- src/pore_model/nanopolish_pore_model_set.cpp | 10 + src/pore_model/nanopolish_pore_model_set.h | 10 +- 8 files changed, 909 insertions(+), 732 deletions(-) delete mode 100644 src/cuda_kernels/GpuAligner.cu create mode 100644 src/cuda_kernels/gpu_aligner.cu rename src/cuda_kernels/{GpuAligner.h => gpu_aligner.h} (76%) diff --git a/Makefile b/Makefile index bd5eb566..40d90313 100644 --- a/Makefile +++ b/Makefile @@ -176,5 +176,5
@@ test: $(TEST_PROGRAM) .PHONY: clean clean: rm -f $(PROGRAM) $(TEST_PROGRAM) $(CPP_OBJ) $(C_OBJ) \ - src/cuda_kernels/GpuAligner.o \ + src/cuda_kernels/gpu_aligner.o \ src/main/nanopolish.o src/test/nanopolish_test.o diff --git a/cuda.mk b/cuda.mk index 59c91c4f..50330d3e 100644 --- a/cuda.mk +++ b/cuda.mk @@ -5,7 +5,7 @@ CUDA_ROOT = /usr/local/cuda CUDA_LIB ?= $(CUDA_ROOT)/lib64 CUDA_INCLUDE ?= $(CUDA_ROOT)/include CURTFLAGS = -L$(CUDA_LIB) -lcudart_static -lrt -NVCCFLAGS ?= -std=c++11 -I. -I$(CUDA_INCLUDE) -O3 -use_fast_math --default-stream per-thread -restrict +NVCCFLAGS ?= -g -lineinfo -std=c++11 -I. -I$(CUDA_INCLUDE) -O3 -use_fast_math --default-stream per-thread -restrict CPPFLAGS += -I$(CUDA_INCLUDE) CPPFLAGS += -DHAVE_CUDA=1 diff --git a/src/cuda_kernels/GpuAligner.cu b/src/cuda_kernels/GpuAligner.cu deleted file mode 100644 index 36b6378a..00000000 --- a/src/cuda_kernels/GpuAligner.cu +++ /dev/null @@ -1,722 +0,0 @@ -#include -#include -#include "GpuAligner.h" -#include -#include "nanopolish_profile_hmm_r9.h" - -#define MAX_STATES 256 - -#define EXPAND_TO_STRING(X) #X -#define TO_STRING(X) EXPAND_TO_STRING(X) -#define CU_CHECK_ERR(X) if (X != cudaSuccess){printf("CUDA error: <<%s>> at line %s\n", cudaGetErrorString(X), TO_STRING(__LINE__));throw std::runtime_error("CUDA ERROR");} - -__device__ float logsumexpf(float x, float y){ - if(x == -INFINITY && y == -INFINITY){ - return -INFINITY; - } - float result = fmax(x, y) + log1pf(expf(-fabsf(y - x))); - return result; -} - -__device__ float lp_match_r9(int rank, - float mean, - float pore_mean, - float pore_stdv, - float pore_log_level_stdv, - float scale, - float shift, - float var, - float logVar){ - - float log_inv_sqrt_2pi = logf(0.3989422804014327); - - float level = mean; - float gaussian_mean = scale * pore_mean + shift; - float gaussian_stdv = pore_stdv * var; - float gaussian_log_level_stdv = pore_log_level_stdv + logVar; - - float a = (level - gaussian_mean) / gaussian_stdv; - float emission = 
log_inv_sqrt_2pi - gaussian_log_level_stdv + (-0.5f * a * a); - return emission; - -} - -__global__ void getScoresMod (float * poreModelDev, - int * readLengthsDev, - int * eventStartsDev, - int * eventStridesDev, - float * eventsPerBaseDev, - float * scaleDev, - float * shiftDev, - float * varDev, - float * logVarDev, - int * eventOffsetsDev, - float * eventMeansDev, - float * preFlankingDev, - float * postFlankingDev, - int * sequenceLengthsDev, - int * sequenceOffsetsDev, - int * kmerRanksDev, - int * seqIdxDev, - int * readIdxDev, - int numScores, - float * returnValuesDev){ - - bool debug = false; - if ((threadIdx.x == 0) && (blockIdx.x == 0)){ - debug = false; - } - - // get buffer indices - int scoreIdx = blockIdx.x * blockDim.x + threadIdx.x; - - if (scoreIdx < numScores) { - - int readIdx = readIdxDev[scoreIdx]; - int seqIdx = seqIdxDev[scoreIdx]; - - // get read statistics - int numEvents = readLengthsDev[readIdx]; - int readOffset = eventOffsetsDev[readIdx]; - float read_events_per_base = eventsPerBaseDev[readIdx]; - int e_start = eventStartsDev[readIdx]; // Event start for read - int e_stride = eventStridesDev[readIdx]; - int e_offset = eventOffsetsDev[readIdx]; // Within the event means etc, the offset needed for this block to get a specific event - float scale = scaleDev[readIdx]; - float shift = shiftDev[readIdx]; - float var = varDev[readIdx]; - float logVar = logVarDev[readIdx]; - - // get sequence statistics - int numKmers = sequenceLengthsDev[seqIdx]; - int seqOffset = sequenceOffsetsDev[seqIdx]; - - int lastRowIdx = numEvents - 1; - int lastKmerIdx = numKmers - 1; - - float returnValue = -INFINITY; //Used to sum over the last column. - float prevProbabilities[MAX_STATES]; - - int numBlocks = numKmers + 2; - int numStates = numBlocks * PSR9_NUM_STATES; // 3 blocks per kmer and then 3 each for start and end state. 
- - if (debug) { - printf("Kernel 1 >>> Num Kmers is %i\n", numKmers); - printf("Kernel 1 >>> n_states %i\n", numStates); - printf("Kernel 1 >>> num events in read is %i\n", numEvents); - printf("Kernel 1 >>> event offset is %i\n", e_offset); - } - - // Initialise the prev probabilities vector - for (int i = 0; i < numStates - PSR9_NUM_STATES; i++) { - prevProbabilities[i] = -INFINITY; - } - for (int i = numStates - PSR9_NUM_STATES; i < numStates; i++) { - prevProbabilities[i] = 0.0f; - } - - bool rc = false; - if (e_stride == -1) { - rc = true; - } - - float p_stay = 1 - (1 / read_events_per_base); - float p_skip = 0.0025; - float p_bad = 0.001; - float p_bad_self = p_bad; - float p_skip_self = 0.3; - float p_mk = p_skip; // probability of not observing an event at all - float p_mb = p_bad; // probabilty of observing a bad event - float p_mm_self = p_stay; // probability of observing additional events from this k-mer - float p_mm_next = 1.0f - p_mm_self - p_mk - p_mb; // normal movement from state to state - // transitions from event split state in previous block - float p_bb = p_bad_self; - float p_bk, p_bm_next, p_bm_self; - p_bk = p_bm_next = p_bm_self = (1.0f - p_bb) / 3; - // transitions from kmer skip state in previous block - float p_kk = p_skip_self; - float p_km = 1.0f - p_kk; - // We assign some transition probabilities. 
I believe this is correct and they don't vary by location in the sequence - float lp_mk = logf(p_mk); - float lp_mb = logf(p_mb); - float lp_mm_self = logf(p_mm_self); - float lp_mm_next = logf(p_mm_next); - float lp_bb = logf(p_bb); - float lp_bk = logf(p_bk); - float lp_bm_next = logf(p_bm_next); - float lp_bm_self = logf(p_bm_self); - float lp_kk = logf(p_kk); - float lp_km = logf(p_km); - float lp_sm, lp_ms; - lp_sm = lp_ms = 0.0f; - - // the penalty is controlled by the transition probability - float BAD_EVENT_PENALTY = 0.0f; - - //Fill out the dynamic programming table - for (int row = 1; row < numEvents + 1; row++) { - //row-specific values - int event_idx = e_start + (row - 1) * e_stride; - float eventMean = eventMeansDev[e_offset + row - 1]; - float preFlank = preFlankingDev[e_offset + row - 1]; - float postFlank = postFlankingDev[e_offset + row - 1]; - - float lp_emission_b = BAD_EVENT_PENALTY; - - //Initialise temp registers - float prevMatch = prevProbabilities[PSR9_MATCH];; - float prevSkip = prevProbabilities[PSR9_KMER_SKIP]; - float prevBad = prevProbabilities[PSR9_BAD_EVENT]; - - for (int blkIdx = 1; blkIdx < numBlocks - 1; blkIdx++) { - int curBlockIdx = blkIdx; - int prevBlockIdx = curBlockIdx - 1; - int prevBlockOffset = PSR9_NUM_STATES * prevBlockIdx; - int curBlockOffset = PSR9_NUM_STATES * curBlockIdx; - - int kmerIdx = blkIdx - 1; // because there is a start block with no associated kmer - uint32_t rank = kmerRanksDev[seqOffset + kmerIdx + (numKmers * - rc)]; - - float pore_mean = poreModelDev[rank * 3]; - float pore_stdv = poreModelDev[rank * 3 + 1]; - float pore_log_level_stdv = poreModelDev[rank * 3 + 2]; - - float lp_emission_m = lp_match_r9(rank, - eventMean, - pore_mean, - pore_stdv, - pore_log_level_stdv, - scale, - shift, - var, - logVar); - - // Get all the scores for a match - float curMatch = prevProbabilities[curBlockOffset + PSR9_MATCH]; - float curBad = prevProbabilities[curBlockOffset + PSR9_BAD_EVENT]; - float curSkip = 
prevProbabilities[curBlockOffset + PSR9_KMER_SKIP]; - - float HMT_FROM_SAME_M = lp_mm_self + curMatch; - float HMT_FROM_PREV_M = lp_mm_next + prevMatch; - float HMT_FROM_SAME_B = lp_bm_self + curBad; - float HMT_FROM_PREV_B = lp_bm_next + prevBad; - float HMT_FROM_PREV_K = lp_km + prevSkip; - - // m_s is the probability of going from the start state - // to this kmer. The start state is (currently) only - // allowed to go to the first kmer. If ALLOW_PRE_CLIP - // is defined, we allow all events before this one to be skipped, - // with a penalty; - float HMT_FROM_SOFT = (kmerIdx == 0 && - (event_idx == e_start || - (HAF_ALLOW_PRE_CLIP))) ? lp_sm + preFlank : -INFINITY; - - // calculate the score - float sum = HMT_FROM_SAME_M; - sum = logsumexpf(sum, HMT_FROM_SOFT); - sum = logsumexpf(sum, HMT_FROM_PREV_M); - sum = logsumexpf(sum, HMT_FROM_SAME_B); - sum = logsumexpf(sum, HMT_FROM_PREV_B); - sum = logsumexpf(sum, HMT_FROM_PREV_K); - sum += lp_emission_m; - - float newMatchScore = sum; - - // Calculate the bad event scores - // state PSR9_BAD_EVENT - HMT_FROM_SAME_M = lp_mb + curMatch; - HMT_FROM_PREV_M = -INFINITY; - HMT_FROM_SAME_B = lp_bb + prevBad; - HMT_FROM_PREV_B = -INFINITY; - HMT_FROM_PREV_K = -INFINITY; - HMT_FROM_SOFT = -INFINITY; - - sum = HMT_FROM_SAME_M; - sum = logsumexpf(sum, HMT_FROM_SAME_B); - sum += lp_emission_b; - - float newBadEventScore = sum; - - // Write row out. prevProbabilities now becomes "current probabilities" for evaluating skips. - prevProbabilities[curBlockOffset + PSR9_MATCH] = newMatchScore; - prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] = newBadEventScore; - - //Update tmp vars - prevMatch = curMatch; - prevSkip = curSkip; - prevBad = prevBad; - - //Now do the non-skip-skip transition. This relies on the updated vector values. 
- // state PSR9_KMER_SKIP - HMT_FROM_PREV_M = lp_mk + prevProbabilities[prevBlockOffset + PSR9_MATCH]; - HMT_FROM_PREV_B = lp_bk + prevProbabilities[prevBlockOffset + PSR9_BAD_EVENT]; - HMT_FROM_PREV_K = lp_kk + prevProbabilities[prevBlockOffset + PSR9_KMER_SKIP]; - - sum = HMT_FROM_PREV_M; - sum = logsumexpf(sum, HMT_FROM_PREV_B); - sum = logsumexpf(sum, HMT_FROM_PREV_K); - sum = logsumexpf(sum, HMT_FROM_PREV_M); - - float newSkipScore = sum; - - prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] = newSkipScore; - - //post-clip transition - if (kmerIdx == lastKmerIdx && ((HAF_ALLOW_POST_CLIP) || row == lastRowIdx)) { - float lp1 = lp_ms + prevProbabilities[curBlockOffset + PSR9_MATCH] + postFlank; - float lp2 = lp_ms + prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] + postFlank; - float lp3 = lp_ms + prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] + postFlank; - - float end = returnValue; - end = logsumexpf(end, lp1); - end = logsumexpf(end, lp2); - end = logsumexpf(end, lp3); - returnValue = end; - } - } - } - returnValuesDev[scoreIdx] = returnValue; - } -} - - -GpuAligner::GpuAligner() -{ - size_t numModelElements = 4096; - size_t max_reads_per_worker = LOCI_PER_WORKER * MAX_COVERAGE * MAX_NUM_VARIANTS_PER_LOCUS; - int readsSizeBuffer = max_reads_per_worker * sizeof(int); - int maxBuffer = max_reads_per_worker * MAX_SEQUENCE_LENGTH * sizeof(int); - - //OLD - int max_num_sequences = 1; - int max_sequence_length = 100; - int max_n_rows = 100; - - poreModelInitialized = false; - - CU_CHECK_ERR(cudaMalloc((void**)&scaleDev, readsSizeBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&scaleHost, readsSizeBuffer, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&shiftDev, readsSizeBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&shiftHost, readsSizeBuffer, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&varDev, readsSizeBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&varHost, readsSizeBuffer, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&logVarDev, 
readsSizeBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&logVarHost, readsSizeBuffer, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&readLengthsDev, readsSizeBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&readLengthsHost, readsSizeBuffer, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&eventsPerBaseDev, maxBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&eventsPerBaseHost, maxBuffer, cudaHostAllocDefault)); - - // Allocate Device memory for pore model - CU_CHECK_ERR(cudaMalloc((void**)&poreModelDev, numModelElements * 3 * sizeof(float))); - CU_CHECK_ERR(cudaHostAlloc(&poreModelHost, numModelElements * sizeof(float) * 3, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&eventStartsDev, readsSizeBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&eventStartsHost, readsSizeBuffer, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&eventStridesDev, maxBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&eventStridesHost, maxBuffer, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&eventOffsetsDev, maxBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&eventOffsetsHost, maxBuffer, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&eventMeansDev, maxBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&eventMeans, maxBuffer , cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&preFlankingDev, maxBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&preFlankingHost, maxBuffer, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&postFlankingDev, maxBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&postFlankingHost, maxBuffer, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&sequenceOffsetsDev, maxBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&sequenceOffsetsHost, maxBuffer, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&sequenceLengthsDev, maxBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&sequenceLengthsHost, maxBuffer, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&scoresDev, maxBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&returnValuesHost, maxBuffer, 
cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&seqIdxDev, maxBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&seqIdxHost, maxBuffer, cudaHostAllocDefault)); - - CU_CHECK_ERR(cudaMalloc((void**)&readIdxDev, maxBuffer)); - CU_CHECK_ERR(cudaHostAlloc(&readIdxHost, maxBuffer, cudaHostAllocDefault)); - - int numKmers = max_sequence_length * max_num_sequences; - CU_CHECK_ERR(cudaHostAlloc(&kmerRanks, maxBuffer , cudaHostAllocDefault)); - CU_CHECK_ERR(cudaMalloc((void**)&kmerRanksDev, maxBuffer )); - - // Allocate host memory for model - returnValuesHostResultsPointers.resize(max_num_sequences); - kmerRanksDevPointers.resize(max_num_sequences); - returnValuesDevResultsPointers.resize(max_num_sequences); - - for (int i =0; i>> GpuAligner::scoreKernelMod(std::vector &scoreSets, - uint32_t alignment_flags){ - - int numEventsTotal = 0; // The number of events across all scoreSets - int numSequences = 0; // The number of sequences across all scoreSets - int kmerOffset = 0; - int numReads = 0; // The number of reads across all scoreSets - int numScoreSets = scoreSets.size(); - - int rawReadOffset = 0; - int globalReadIdx = 0; - int globalSequenceIdx = 0; - int globalScoreIdx = 0; - - //Loop over every scoreset, filling out buffers and counters - for (int scoreSetIdx=0; scoreSetIdx < numScoreSets; scoreSetIdx++){ - auto scoreSet = scoreSets[scoreSetIdx]; - int firstReadIdxinScoreSet = globalReadIdx; - //Read data - for (int eventSequenceIdx=0; eventSequenceIdx < scoreSet.rawData.size();eventSequenceIdx++){ - auto e = scoreSet.rawData[eventSequenceIdx]; - numReads++; - - //Read statistics - populate host buffers - scaleHost[globalReadIdx] = e.read->scalings[e.strand].scale; - shiftHost[globalReadIdx] = e.read->scalings[e.strand].shift; - varHost[globalReadIdx] = e.read->scalings[e.strand].var; - logVarHost[globalReadIdx] = e.read->scalings[e.strand].log_var; - - int e_start = e.event_start_idx; - eventStartsHost[globalReadIdx] = e_start; - - int e_stride = e.event_stride; - 
eventStridesHost[globalReadIdx] = e_stride; - - uint32_t e_end = e.event_stop_idx; - uint32_t n_events; - if(e_end > e_start) - n_events = e_end - e_start + 1; - else - n_events = e_start - e_end + 1; - readLengthsHost[globalReadIdx] = n_events; - numEventsTotal += n_events; - - eventOffsetsHost[globalReadIdx] = rawReadOffset; - - float readEventsPerBase = e.read->events_per_base[e.strand]; - eventsPerBaseHost[globalReadIdx] = readEventsPerBase; - - std::vector pre_flank = make_pre_flanking(e, e_start, n_events); - std::vector post_flank = make_post_flanking(e, e_start, n_events); - - for (int i=0;iget_drift_scaled_level(event_idx, e.strand); // send the data in drift scaled - eventMeans[rawReadOffset + i] = scaled; - - //populate the pre/post-flanking data, since it has a 1-1 correspondence with events - preFlankingHost[rawReadOffset + i] = pre_flank[i]; - postFlankingHost[rawReadOffset + i] = post_flank[i]; - } - - rawReadOffset += n_events; - globalReadIdx++; - } - //Pore Model - const uint32_t k = scoreSets[0].rawData[0].pore_model->k; //k is the length of a kmer in the pore model - if (poreModelInitialized == false) { - int num_states = scoreSets[0].rawData[0].pore_model->states.size(); - int poreModelEntriesPerState = 3; - for(int st=0; ststates[st]; - poreModelHost[st * poreModelEntriesPerState] = params.level_mean; - poreModelHost[st * poreModelEntriesPerState + 1] = params.level_stdv; - poreModelHost[st * poreModelEntriesPerState + 2] = params.level_log_stdv; - } - // copy over the pore model - CU_CHECK_ERR(cudaMemcpyAsync(poreModelDev, poreModelHost, - poreModelEntriesPerState * 4096 * sizeof(float), cudaMemcpyHostToDevice, streams[0])); - poreModelInitialized = true; - } - auto & sequences = scoreSet.stateSequences; - numSequences += sequences.size(); - - for (int i = 0; i>> (poreModelDev, - readLengthsDev, - eventStartsDev, - eventStridesDev, - eventsPerBaseDev, - scaleDev, - shiftDev, - varDev, - logVarDev, - eventOffsetsDev, - eventMeansDev, - 
preFlankingDev, - postFlankingDev, - sequenceLengthsDev, - sequenceOffsetsDev, - kmerRanksDev, - seqIdxDev, - readIdxDev, - globalScoreIdx, - scoresDev); - cudaError_t err = cudaGetLastError(); - - if (err != cudaSuccess) - printf("Errors during kernel execution: %s\n", cudaGetErrorString(err)); - - cudaMemcpyAsync(returnValuesHost, scoresDev, globalScoreIdx * sizeof(float), cudaMemcpyDeviceToHost, streams[0]); - cudaStreamSynchronize(streams[0]); - - //Unpack results - int k = 0; - std::vector>> result(scoreSets.size()); - - for(int scoreSetIdx=0; scoreSetIdx seqScores(numReads); - - for (int readIdx=0; readIdx GpuAligner::variantScoresThresholded(std::vector> input_variants_vector, - std::vector base_haplotypes, - std::vector> event_sequences_vector, - uint32_t alignment_flags, - int screen_score_threshold, - std::vector methylation_types) { - int numScoreSets = base_haplotypes.size(); - std::vector scoreSets; - scoreSets.resize(numScoreSets); - - for(int scoreSetIdx=0; scoreSetIdx MAX_COVERAGE) { - event_sequences.resize(MAX_COVERAGE); - } - - int numVariants = input_variants.size(); - - std::vector out_variants = input_variants; - std::vector variant_haplotypes(numVariants, base_haplotype); - - //loop over the vector, applying the variants to the haplotypes - for (int i = 0; i sequences; - - HMMInputSequence base_sequence = generate_methylated_alternatives(base_haplotype.get_sequence(), - methylation_types)[0]; - - sequences.push_back(base_sequence); - - for (auto v: variant_haplotypes){ - auto variant_sequence = generate_methylated_alternatives(v.get_sequence(), methylation_types)[0]; - sequences.push_back(variant_sequence); - } - - ScoreSet s = { - sequences, - event_sequences - }; - - scoreSets[scoreSetIdx] = s; - - } - - std::vector v; - if (!event_sequences_vector.empty()) { - - auto scoresMod = scoreKernelMod(scoreSets, alignment_flags); - - // results are now ready, need to unpack them - for (int scoreSetIdx=0; scoreSetIdx> scores = 
// NOTE(review): this region was recovered from a corrupted patch whose
// "#include <...>" header names were stripped.  The headers below are the
// minimal set the visible code actually uses -- TODO confirm against the repo.
#include <cstdio>
#include <cmath>
#include <stdexcept>
#include "gpu_aligner.h"
#include "nanopolish_profile_hmm_r9.h"

int gpu_aligner_debug = 0;

// Upper bound on the per-thread HMM state vector kept in local memory.
// A score pair is only valid when PSR9_NUM_STATES * (numKmers + 2) <= MAX_STATES.
#define MAX_STATES 256

#define EXPAND_TO_STRING(X) #X
#define TO_STRING(X) EXPAND_TO_STRING(X)

// Check a CUDA runtime call.  The argument is evaluated exactly once: the
// original macro expanded X both in the comparison and inside
// cudaGetErrorString(X), re-issuing the CUDA call on the failure path.
#define CU_CHECK_ERR(X)                                                        \
    do {                                                                       \
        cudaError_t cu_check_err_ = (X);                                       \
        if (cu_check_err_ != cudaSuccess) {                                    \
            printf("CUDA error: <<%s>> at line %s\n",                          \
                   cudaGetErrorString(cu_check_err_), TO_STRING(__LINE__));    \
            throw std::runtime_error("CUDA ERROR");                            \
        }                                                                      \
    } while (0)

// log(exp(x) + exp(y)) computed without overflow.
// Returns -INFINITY when both inputs are -INFINITY (the naive formula would
// produce NaN from (-inf) - (-inf)).
__device__ float logsumexpf(float x, float y)
{
    if (x == -INFINITY && y == -INFINITY) {
        return -INFINITY;
    }
    return fmaxf(x, y) + log1pf(expf(-fabsf(y - x)));
}

// Log-probability of observing an event with drift-scaled level `mean` under
// the Gaussian emission model of one k-mer state, after applying the per-read
// scale/shift/var calibration.
// `rank` is unused here (the caller has already looked the state parameters
// up) but is kept so the signature matches the CPU implementation.
__device__ float lp_match_r9(int rank,
                             float mean,
                             float pore_mean,
                             float pore_stdv,
                             float pore_log_level_stdv,
                             float scale,
                             float shift,
                             float var,
                             float logVar)
{
    float log_inv_sqrt_2pi = logf(0.3989422804014327f); // log(1/sqrt(2*pi))

    float gaussian_mean = scale * pore_mean + shift;
    float gaussian_stdv = pore_stdv * var;
    float gaussian_log_level_stdv = pore_log_level_stdv + logVar;

    float a = (mean - gaussian_mean) / gaussian_stdv;
    return log_inv_sqrt_2pi - gaussian_log_level_stdv + (-0.5f * a * a);
}

// Profile-HMM forward-algorithm scoring kernel (R9 model).
// One thread scores one (read, state-sequence) pair; pairs are flattened into
// the parallel arrays seqIdxDev/readIdxDev of length numScores.  Launch with a
// 1D grid covering numScores threads; out-of-range threads exit immediately.
//
// Inputs (all device pointers, read-only):
//   poreModelDev       - flattened {level_mean, level_stdv, level_log_stdv}
//                        triples; modelOffsetsDev[readIdx] selects the model.
//   readLengths/eventStarts/eventStrides/eventOffsets/eventsPerBase,
//   scale/shift/var/logVar                 - per-read statistics.
//   eventMeansDev, preFlankingDev, postFlankingDev - per-event data, indexed
//                        via eventOffsetsDev[readIdx].
//   sequenceLengthsDev/sequenceOffsetsDev/kmerRanksDev - per-sequence k-mer
//                        ranks; reverse-complement ranks follow at +numKmers.
// Output:
//   returnValuesDev[scoreIdx] - forward log-probability of the pair
//                               (-INFINITY when the sequence is too long for
//                               the fixed per-thread state buffer).
__global__ void getScoresMod(float * poreModelDev,
                             int * readLengthsDev,
                             int * eventStartsDev,
                             int * eventStridesDev,
                             float * eventsPerBaseDev,
                             float * scaleDev,
                             float * shiftDev,
                             float * varDev,
                             float * logVarDev,
                             int * eventOffsetsDev,
                             float * eventMeansDev,
                             int * modelOffsetsDev,
                             float * preFlankingDev,
                             float * postFlankingDev,
                             int * sequenceLengthsDev,
                             int * sequenceOffsetsDev,
                             int * kmerRanksDev,
                             int * seqIdxDev,
                             int * readIdxDev,
                             int numScores,
                             float * returnValuesDev)
{
    const bool debug = false; // flip for single-thread tracing

    // One thread per score pair.
    int scoreIdx = blockIdx.x * blockDim.x + threadIdx.x;
    if (scoreIdx >= numScores) {
        return;
    }

    int readIdx = readIdxDev[scoreIdx];
    int seqIdx = seqIdxDev[scoreIdx];

    // Per-read statistics.
    int numEvents = readLengthsDev[readIdx];
    float read_events_per_base = eventsPerBaseDev[readIdx];
    int e_start = eventStartsDev[readIdx];  // first event index for this read
    int e_stride = eventStridesDev[readIdx]; // +1 forward, -1 reverse
    int e_offset = eventOffsetsDev[readIdx]; // offset of this read's events in the flat buffers
    int m_offset = modelOffsetsDev[readIdx]; // offset of this read's pore model
    float scale = scaleDev[readIdx];
    float shift = shiftDev[readIdx];
    float var = varDev[readIdx];
    float logVar = logVarDev[readIdx];

    // Per-sequence statistics.
    int numKmers = sequenceLengthsDev[seqIdx];
    int seqOffset = sequenceOffsetsDev[seqIdx];

    int lastRowIdx = numEvents - 1;
    int lastKmerIdx = numKmers - 1;

    float returnValue = -INFINITY; // accumulates the end-state sum

    // 3 states per k-mer block, plus one leading (start) and one trailing
    // (end) block.
    int numBlocks = numKmers + 2;
    int numStates = numBlocks * PSR9_NUM_STATES;

    // Guard the fixed-size local state buffer: the original wrote past
    // prevProbabilities[] for long sequences, corrupting thread-local memory.
    float prevProbabilities[MAX_STATES];
    if (numStates > MAX_STATES) {
        returnValuesDev[scoreIdx] = -INFINITY;
        return;
    }

    if (debug) {
        printf("Kernel 1 >>> Num Kmers is %i\n", numKmers);
        printf("Kernel 1 >>> n_states %i\n", numStates);
        printf("Kernel 1 >>> num events in read is %i\n", numEvents);
        printf("Kernel 1 >>> event offset is %i\n", e_offset);
    }

    // Initialise row 0: everything -inf except the trailing block (which is
    // never read back by the sweep below).
    for (int i = 0; i < numStates - PSR9_NUM_STATES; i++) {
        prevProbabilities[i] = -INFINITY;
    }
    for (int i = numStates - PSR9_NUM_STATES; i < numStates; i++) {
        prevProbabilities[i] = 0.0f;
    }

    // A negative stride means the events run 3'->5'; use the
    // reverse-complement half of the k-mer rank table.
    bool rc = (e_stride == -1);

    // Transition probabilities (constant across the sequence).
    float p_stay = 1 - (1 / read_events_per_base);
    float p_skip = 0.0025;
    float p_bad = 0.001;
    float p_bad_self = p_bad;
    float p_skip_self = 0.3;

    float p_mk = p_skip;                              // no event observed for a k-mer
    float p_mb = p_bad;                               // bad event observed
    float p_mm_self = p_stay;                         // extra events from the same k-mer
    float p_mm_next = 1.0f - p_mm_self - p_mk - p_mb; // normal movement
    // transitions out of the bad-event state
    float p_bb = p_bad_self;
    float p_bk, p_bm_next, p_bm_self;
    p_bk = p_bm_next = p_bm_self = (1.0f - p_bb) / 3;
    // transitions out of the kmer-skip state
    float p_kk = p_skip_self;
    float p_km = 1.0f - p_kk;

    float lp_mk = logf(p_mk);
    float lp_mb = logf(p_mb);
    float lp_mm_self = logf(p_mm_self);
    float lp_mm_next = logf(p_mm_next);
    float lp_bb = logf(p_bb);
    float lp_bk = logf(p_bk);
    float lp_bm_next = logf(p_bm_next);
    float lp_bm_self = logf(p_bm_self);
    float lp_kk = logf(p_kk);
    float lp_km = logf(p_km);
    float lp_sm, lp_ms;
    lp_sm = lp_ms = 0.0f;

    // Bad events are penalised through the transition probability only.
    float BAD_EVENT_PENALTY = 0.0f;

    // Fill the dynamic-programming table one event row at a time, updating
    // prevProbabilities in place (left-to-right sweep over k-mer blocks).
    for (int row = 1; row < numEvents + 1; row++) {
        int event_idx = e_start + (row - 1) * e_stride;
        float eventMean = eventMeansDev[e_offset + row - 1];
        float preFlank = preFlankingDev[e_offset + row - 1];
        float postFlank = postFlankingDev[e_offset + row - 1];

        float lp_emission_b = BAD_EVENT_PENALTY;

        // Previous-block values from the PREVIOUS row; block 0 is the start
        // block and is never overwritten during the sweep.
        float prevMatch = prevProbabilities[PSR9_MATCH];
        float prevSkip = prevProbabilities[PSR9_KMER_SKIP];
        float prevBad = prevProbabilities[PSR9_BAD_EVENT];

        for (int blkIdx = 1; blkIdx < numBlocks - 1; blkIdx++) {
            int prevBlockOffset = PSR9_NUM_STATES * (blkIdx - 1);
            int curBlockOffset = PSR9_NUM_STATES * blkIdx;

            int kmerIdx = blkIdx - 1; // start block has no associated k-mer
            uint32_t rank = kmerRanksDev[seqOffset + kmerIdx + (numKmers * rc)];

            float pore_mean = poreModelDev[m_offset + rank * 3];
            float pore_stdv = poreModelDev[m_offset + rank * 3 + 1];
            float pore_log_level_stdv = poreModelDev[m_offset + rank * 3 + 2];

            float lp_emission_m = lp_match_r9(rank,
                                              eventMean,
                                              pore_mean,
                                              pore_stdv,
                                              pore_log_level_stdv,
                                              scale,
                                              shift,
                                              var,
                                              logVar);

            // Previous-row values of the CURRENT block (read before they are
            // overwritten below).
            float curMatch = prevProbabilities[curBlockOffset + PSR9_MATCH];
            float curBad = prevProbabilities[curBlockOffset + PSR9_BAD_EVENT];
            float curSkip = prevProbabilities[curBlockOffset + PSR9_KMER_SKIP];

            // state PSR9_MATCH
            float HMT_FROM_SAME_M = lp_mm_self + curMatch;
            float HMT_FROM_PREV_M = lp_mm_next + prevMatch;
            float HMT_FROM_SAME_B = lp_bm_self + curBad;
            float HMT_FROM_PREV_B = lp_bm_next + prevBad;
            float HMT_FROM_PREV_K = lp_km + prevSkip;

            // m_s is the probability of entering this k-mer from the start
            // state.  The start state may only enter the first k-mer; with
            // pre-clipping allowed, all earlier events may be skipped at a
            // penalty carried by preFlank.
            // NOTE(review): HAF_ALLOW_PRE_CLIP is used as a bare truthy
            // constant here, not as `flags & HAF_ALLOW_PRE_CLIP` -- pre-clip
            // is effectively always enabled.  TODO confirm intent.
            float HMT_FROM_SOFT = (kmerIdx == 0 &&
                                   (event_idx == e_start ||
                                    (HAF_ALLOW_PRE_CLIP))) ? lp_sm + preFlank : -INFINITY;

            float sum = HMT_FROM_SAME_M;
            sum = logsumexpf(sum, HMT_FROM_SOFT);
            sum = logsumexpf(sum, HMT_FROM_PREV_M);
            sum = logsumexpf(sum, HMT_FROM_SAME_B);
            sum = logsumexpf(sum, HMT_FROM_PREV_B);
            sum = logsumexpf(sum, HMT_FROM_PREV_K);
            float newMatchScore = sum + lp_emission_m;

            // state PSR9_BAD_EVENT: reachable only from the same block's
            // match and bad states.
            sum = lp_mb + curMatch;
            sum = logsumexpf(sum, lp_bb + curBad);
            float newBadEventScore = sum + lp_emission_b;

            // Write the row out; prevProbabilities now holds "current row"
            // values for this block, which the skip update below relies on.
            prevProbabilities[curBlockOffset + PSR9_MATCH] = newMatchScore;
            prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] = newBadEventScore;

            // Carry this block's previous-row values to the next iteration.
            // BUG FIX: the original wrote `prevBad = prevBad;`, so
            // HMT_FROM_PREV_B used the stale start-block value for every
            // block in the row.
            prevMatch = curMatch;
            prevSkip = curSkip;
            prevBad = curBad;

            // state PSR9_KMER_SKIP: depends on the just-updated values of the
            // previous block in THIS row.
            // BUG FIX: the original folded HMT_FROM_PREV_M into the
            // logsumexp twice, inflating skip scores by up to log(2).
            HMT_FROM_PREV_M = lp_mk + prevProbabilities[prevBlockOffset + PSR9_MATCH];
            HMT_FROM_PREV_B = lp_bk + prevProbabilities[prevBlockOffset + PSR9_BAD_EVENT];
            HMT_FROM_PREV_K = lp_kk + prevProbabilities[prevBlockOffset + PSR9_KMER_SKIP];

            sum = HMT_FROM_PREV_M;
            sum = logsumexpf(sum, HMT_FROM_PREV_B);
            sum = logsumexpf(sum, HMT_FROM_PREV_K);
            float newSkipScore = sum;

            prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] = newSkipScore;

            // Post-clip transition into the end state.
            // NOTE(review): HAF_ALLOW_POST_CLIP is again a bare truthy
            // constant, so the flag test is always true; also `row ==
            // lastRowIdx` looks off by one for this 1-based row loop (the
            // last row is numEvents).  Both are currently masked by the
            // always-true flag -- TODO confirm against the CPU HMM.
            if (kmerIdx == lastKmerIdx && ((HAF_ALLOW_POST_CLIP) || row == lastRowIdx)) {
                float lp1 = lp_ms + prevProbabilities[curBlockOffset + PSR9_MATCH] + postFlank;
                float lp2 = lp_ms + prevProbabilities[curBlockOffset + PSR9_BAD_EVENT] + postFlank;
                float lp3 = lp_ms + prevProbabilities[curBlockOffset + PSR9_KMER_SKIP] + postFlank;

                float end = returnValue;
                end = logsumexpf(end, lp1);
                end = logsumexpf(end, lp2);
                end = logsumexpf(end, lp3);
                returnValue = end;
            }
        }
    }
    returnValuesDev[scoreIdx] = returnValue;
}
CU_CHECK_ERR(cudaHostAlloc(&logVarHost, readsSizeBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&readLengthsDev, readsSizeBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&readLengthsHost, readsSizeBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&eventsPerBaseDev, maxBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&eventsPerBaseHost, maxBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&eventStartsDev, readsSizeBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&eventStartsHost, readsSizeBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&eventStridesDev, maxBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&eventStridesHost, maxBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&eventOffsetsDev, maxBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&eventOffsetsHost, maxBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&eventMeansDev, maxBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&eventMeans, maxBuffer , cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&modelOffsetsDev, maxBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&modelOffsetsHost, maxBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&preFlankingDev, maxBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&preFlankingHost, maxBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&postFlankingDev, maxBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&postFlankingHost, maxBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&sequenceOffsetsDev, maxBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&sequenceOffsetsHost, maxBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&sequenceLengthsDev, maxBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&sequenceLengthsHost, maxBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&scoresDev, maxBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&returnValuesHost, maxBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&seqIdxDev, maxBuffer)); + 
CU_CHECK_ERR(cudaHostAlloc(&seqIdxHost, maxBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaMalloc((void**)&readIdxDev, maxBuffer)); + CU_CHECK_ERR(cudaHostAlloc(&readIdxHost, maxBuffer, cudaHostAllocDefault)); + + CU_CHECK_ERR(cudaHostAlloc(&kmerRanks, maxBuffer , cudaHostAllocDefault)); + CU_CHECK_ERR(cudaMalloc((void**)&kmerRanksDev, maxBuffer )); + + // + // Allocate Device memory for pore model + // + + // Count the total number of k-mer states across all pore models + int numModelElements = 0; + int numModels = 0; + for(const PoreModel* model : PoreModelSet::get_all_models()) { + numModelElements += model->states.size(); + numModels += 1; + } + //fprintf(stderr, "Initialized %d states from %d models\n", numModelElements, numModels); + int poreModelEntriesPerState = 3; + int totalModelEntries = numModelElements * poreModelEntriesPerState; + CU_CHECK_ERR(cudaMalloc((void**)&poreModelDev, totalModelEntries * sizeof(float))); + CU_CHECK_ERR(cudaHostAlloc(&poreModelHost, totalModelEntries * sizeof(float), cudaHostAllocDefault)); + + // + // Initialize pore model + // + int modelOffset = 0; + for(const PoreModel* model : PoreModelSet::get_all_models()) { + modelToOffsetMap[model] = modelOffset; + fprintf(stderr, "inserted model %s at offset %d\n", PoreModelSet::get_model_key(*model).c_str(), modelOffset); + + int num_states = model->states.size(); + for(int st=0; ststates[st]; + poreModelHost[modelOffset++] = params.level_mean; + poreModelHost[modelOffset++] = params.level_stdv; + poreModelHost[modelOffset++] = params.level_log_stdv; + } + } + + fprintf(stderr, "Initialized %d/%d states from %d models\n", modelOffset, numModelElements, numModels); + assert(modelOffset == totalModelEntries); + + // Allocate host memory for model + returnValuesHostResultsPointers.resize(max_num_sequences); + kmerRanksDevPointers.resize(max_num_sequences); + returnValuesDevResultsPointers.resize(max_num_sequences); + + for (int i =0; i>> GpuAligner::scoreKernelMod(std::vector 
&scoreSets, + uint32_t alignment_flags){ + + int numEventsTotal = 0; // The number of events across all scoreSets + int numSequences = 0; // The number of sequences across all scoreSets + int kmerOffset = 0; + int numReads = 0; // The number of reads across all scoreSets + int numScoreSets = scoreSets.size(); + + int rawReadOffset = 0; + int globalReadIdx = 0; + int globalSequenceIdx = 0; + int globalScoreIdx = 0; + + //Loop over every scoreset, filling out buffers and counters + for (int scoreSetIdx=0; scoreSetIdx < numScoreSets; scoreSetIdx++) { + auto scoreSet = scoreSets[scoreSetIdx]; + int firstReadIdxinScoreSet = globalReadIdx; + + //Read data + for (int eventSequenceIdx=0; eventSequenceIdx < scoreSet.rawData.size(); eventSequenceIdx++) { + auto e = scoreSet.rawData[eventSequenceIdx]; + numReads++; + + //Read statistics - populate host buffers + scaleHost[globalReadIdx] = e.read->scalings[e.strand].scale; + shiftHost[globalReadIdx] = e.read->scalings[e.strand].shift; + varHost[globalReadIdx] = e.read->scalings[e.strand].var; + logVarHost[globalReadIdx] = e.read->scalings[e.strand].log_var; + + int e_start = e.event_start_idx; + eventStartsHost[globalReadIdx] = e_start; + + int e_stride = e.event_stride; + eventStridesHost[globalReadIdx] = e_stride; + + uint32_t e_end = e.event_stop_idx; + uint32_t n_events; + if(e_end > e_start) + n_events = e_end - e_start + 1; + else + n_events = e_start - e_end + 1; + readLengthsHost[globalReadIdx] = n_events; + numEventsTotal += n_events; + + eventOffsetsHost[globalReadIdx] = rawReadOffset; + + float readEventsPerBase = e.read->events_per_base[e.strand]; + eventsPerBaseHost[globalReadIdx] = readEventsPerBase; + + std::vector pre_flank = make_pre_flanking(e, e_start, n_events); + std::vector post_flank = make_post_flanking(e, e_start, n_events); + + for (int i=0;iget_drift_scaled_level(event_idx, e.strand); // send the data in drift scaled + eventMeans[rawReadOffset + i] = scaled; + + //populate the pre/post-flanking data, 
since it has a 1-1 correspondence with events + preFlankingHost[rawReadOffset + i] = pre_flank[i]; + postFlankingHost[rawReadOffset + i] = post_flank[i]; + } + + // look up model offset in the map + const auto& modelOffsetIter = modelToOffsetMap.find(e.pore_model); + assert(modelOffsetIter != modelToOffsetMap.end()); + modelOffsetsHost[globalReadIdx] = modelOffsetIter->second; + + rawReadOffset += n_events; + globalReadIdx++; + } + + auto & sequences = scoreSet.stateSequences; + numSequences += sequences.size(); + + for (int i = 0; ik; + int numKmers = sequenceLength - k + 1; + + for(size_t ki = 0; ki < numKmers; ++ki) { + int rank = sequence.get_kmer_rank(ki, k, false); + kmerRanks[ki + kmerOffset] = rank; + } + + kmerOffset += numKmers; + + for(size_t ki = 0; ki < numKmers; ++ki) { + int rank = sequence.get_kmer_rank(ki, k, true); + kmerRanks[ki + kmerOffset] = rank; + } + + kmerOffset += numKmers; + + sequenceLengthsHost[globalSequenceIdx] = numKmers; + + // Loop over the raw reads, producing a cartesian product of reads and sequences + auto numReadsInScoreSet = scoreSet.rawData.size(); + for (int r=0; r>> (poreModelDev, + readLengthsDev, + eventStartsDev, + eventStridesDev, + eventsPerBaseDev, + scaleDev, + shiftDev, + varDev, + logVarDev, + eventOffsetsDev, + eventMeansDev, + modelOffsetsDev, + preFlankingDev, + postFlankingDev, + sequenceLengthsDev, + sequenceOffsetsDev, + kmerRanksDev, + seqIdxDev, + readIdxDev, + globalScoreIdx, + scoresDev); + cudaError_t err = cudaGetLastError(); + + if (err != cudaSuccess) + printf("Errors during kernel execution: %s\n", cudaGetErrorString(err)); + + cudaMemcpyAsync(returnValuesHost, scoresDev, globalScoreIdx * sizeof(float), cudaMemcpyDeviceToHost, streams[0]); + cudaStreamSynchronize(streams[0]); + + //Unpack results + int k = 0; + std::vector>> result(scoreSets.size()); + + for(int scoreSetIdx=0; scoreSetIdx seqScores(numReads); + + for (int readIdx=0; readIdx GpuAligner::variantScoresThresholded(std::vector> 
input_variants_vector, + std::vector base_haplotypes, + std::vector> event_sequences_vector, + uint32_t alignment_flags, + int screen_score_threshold, + std::vector methylation_types) { + int numScoreSets = base_haplotypes.size(); + std::vector scoreSets; + scoreSets.resize(numScoreSets); + + if(gpu_aligner_debug){ + fprintf(stderr,"Generating variants:\n"); + } + + for(int scoreSetIdx=0; scoreSetIdx MAX_COVERAGE) { + event_sequences.resize(MAX_COVERAGE); + } + + int numVariants = input_variants.size(); + + std::vector out_variants = input_variants; + std::vector variant_haplotypes(numVariants, base_haplotype); + + //loop over the vector, applying the variants to the haplotypes + for (int i = 0; i sequences; + std::vector base_sequence_vector = generate_methylated_alternatives(base_haplotype.get_sequence(),methylation_types); + +#ifdef MULTI_MODEL + std::vector num_models_vector; + std::vector score_offsets_vector; + size_t offset = 0; + size_t num_models = base_sequence_vector.size(); + num_models_vector.push_back(num_models); + score_offsets_vector.push_back(offset); + if(gpu_aligner_debug){ + fprintf(stderr,"num_models_base=%ld,offset_base=%ld\t",num_models,offset); + } + offset += num_models; + for (auto base_sequence: base_sequence_vector){ + sequences.push_back(base_sequence); + } +#else + HMMInputSequence base_sequence = base_sequence_vector[0]; + sequences.push_back(base_sequence); +#endif + + for (auto v: variant_haplotypes){ + auto variant_sequence_vector = generate_methylated_alternatives(v.get_sequence(), methylation_types); +#ifdef MULTI_MODEL + size_t num_models = variant_sequence_vector.size(); + num_models_vector.push_back(num_models); + score_offsets_vector.push_back(offset); + if(gpu_aligner_debug){ + fprintf(stderr,"num_models_var=%ld,offset_var=%ld\t",num_models,offset); + } + offset += num_models; + for (auto variant_sequence: variant_sequence_vector){ + sequences.push_back(variant_sequence); + } +#else + auto variant_sequence = 
variant_sequence_vector[0]; + sequences.push_back(variant_sequence); +#endif + } + + ScoreSet s = { + sequences, + event_sequences +#ifdef MULTI_MODEL + , + num_models_vector, + score_offsets_vector +#endif + }; + + scoreSets[scoreSetIdx] = s; + if(gpu_aligner_debug){ + fprintf(stderr,"\n"); + } + } + if(gpu_aligner_debug){ + fprintf(stderr,"\n"); + } + + std::vector v; + if (!event_sequences_vector.empty()) { + + if(gpu_aligner_debug){ + fprintf(stderr,"Calling scoreKernelMod\n"); + } + auto scoresMod = scoreKernelMod(scoreSets, alignment_flags); + + if(gpu_aligner_debug){ + fprintf(stderr,"Unpacking scores\n"); + } + // results are now ready, need to unpack them + for (int scoreSetIdx=0; scoreSetIdx> scores = scoresMod[scoreSetIdx]; // scores for this candidate, including all variants and base(zeroth) + #ifdef MULTI_MODEL + ScoreSet s = scoreSets[scoreSetIdx]; + int numVariants = s.num_models_vector.size() -1; // subtract one for the base sequence + #else + int numVariants = scores.size() - 1; // subtract one for the base sequence + #endif + int numScores = scores[0].size(); + for (int variantIndex = 0; variantIndex < numVariants; variantIndex++) { // index 0 is the base scores + double totalScore = 0.0; + for (int k = 0; k < numScores; k++) { + if (fabs(totalScore) < screen_score_threshold) { + #ifdef MULTI_MODEL + + //compute the base score based on the base sequences + size_t num_models = s.num_models_vector[0]; + double num_model_penalty = log(num_models); + double score = scores[0][k] - num_model_penalty; + for(size_t seq_idx = 1; seq_idx < num_models; ++seq_idx) { + double alt_score = scores[seq_idx][k] - num_model_penalty; + score = add_logs(score, alt_score); + } + double baseScore = score; + if (k==0 && variantIndex==0 && gpu_aligner_debug){ + fprintf(stderr,"num_models_base=%ld,offset_base=%d\t",num_models,0); + } + + if(variantIndex+1 >= s.num_models_vector.size()){ //a sanity check + fprintf(stderr,"\nAn invalid memory access occured\nscoreSetIdx=%d, 
variantIndex=%d, k=%d, \n",scoreSetIdx,variantIndex,k); + assert(0); + } + + //compute the variant score based on the variant sequences + num_models = s.num_models_vector[variantIndex+1]; + size_t score_offset = s.score_offsets_vector[variantIndex+1]; + num_model_penalty = log(num_models); + score = scores[score_offset][k] - num_model_penalty; + for(size_t seq_idx = 1; seq_idx < num_models; ++seq_idx) { + double alt_score = scores[score_offset + seq_idx][k] - num_model_penalty; + score = add_logs(score, alt_score); + } + double variantScore = score; + if (k==0 && gpu_aligner_debug) { + fprintf(stderr,"num_models_var=%ld,offset_var=%ld\t",num_models,score_offset); + } + + #else + double baseScore = scores[0][k]; + double variantScore = scores[variantIndex + 1][k]; + #endif + totalScore += (variantScore - baseScore); + } + } + // get the old variant: + auto unScoredVariant = input_variants_vector[scoreSetIdx][variantIndex]; + unScoredVariant.quality = totalScore; + unScoredVariant.info = ""; + v.push_back(unScoredVariant); + } + if(gpu_aligner_debug){ + fprintf(stderr,"\n"); + } + } + if(gpu_aligner_debug){ + fprintf(stderr,"\n"); + } + } + return v; +} diff --git a/src/cuda_kernels/GpuAligner.h b/src/cuda_kernels/gpu_aligner.h similarity index 76% rename from src/cuda_kernels/GpuAligner.h rename to src/cuda_kernels/gpu_aligner.h index 731f2ed9..31121128 100644 --- a/src/cuda_kernels/GpuAligner.h +++ b/src/cuda_kernels/gpu_aligner.h @@ -46,29 +46,42 @@ #define MAX_NUM_VARIANTS_PER_LOCUS 10 #define MAX_NUM_WORKERS 16 +#define MULTI_MODEL 1 + //Data to be scored typedef struct { std::vector stateSequences; std::vector rawData; +#ifdef MULTI_MODEL + std::vector num_models_vector; //store the number of models for base sequence and then variant sequences + std::vector score_offsets_vector; //store the offsets based on number of models +#endif } ScoreSet; class GpuAligner { + public: GpuAligner(); ~GpuAligner(); + // GPU version of the candidate-variant scoring function 
std::vector variantScoresThresholded(std::vector>, - std::vector, - std::vector>, - uint32_t alignment_flags, int screen_score_threshold, std::vector methylation_types); + std::vector, + std::vector>, + uint32_t alignment_flags, + int screen_score_threshold, + std::vector methylation_types); std::vector> scoreKernel(std::vector sequences, - std::vector event_sequences, - uint32_t alignment_flags); + std::vector event_sequences, + uint32_t alignment_flags); + std::vector>> scoreKernelMod(std::vector &scoreSets, - uint32_t alignment_flags); + uint32_t alignment_flags); + + private: float* scaleDev; float* shiftDev; @@ -80,6 +93,7 @@ class GpuAligner int* eventOffsetsDev; int* eventStridesDev; int* eventStartsDev; + int* modelOffsetsDev; int* numRowsDev; float* postFlankingDev; float* preFlankingDev; @@ -104,6 +118,7 @@ class GpuAligner int * sequenceLengthsHost; int * eventOffsetsHost; int * sequenceOffsetsHost; + int * modelOffsetsHost; int * readIdxHost; int * seqIdxHost; @@ -123,6 +138,8 @@ class GpuAligner std::vector kmerRanksDevPointers; std::vector returnValuesDevResultsPointers; std::vector returnValuesHostResultsPointers; + + std::map modelToOffsetMap; cudaStream_t streams[8]; // TODO 8 should not be hardcoded here }; diff --git a/src/cuda_kernels/gpu_call_variants.inl b/src/cuda_kernels/gpu_call_variants.inl index c5036dcf..e2358d20 100644 --- a/src/cuda_kernels/gpu_call_variants.inl +++ b/src/cuda_kernels/gpu_call_variants.inl @@ -1,4 +1,4 @@ -#include +#include #include #include #include diff --git a/src/pore_model/nanopolish_pore_model_set.cpp b/src/pore_model/nanopolish_pore_model_set.cpp index 474621dc..c2bee02d 100644 --- a/src/pore_model/nanopolish_pore_model_set.cpp +++ b/src/pore_model/nanopolish_pore_model_set.cpp @@ -126,6 +126,16 @@ const PoreModel* PoreModelSet::get_model_by_key(const std::string& key) } } +std::vector PoreModelSet::get_all_models() +{ + PoreModelSet& model_set = getInstance(); + std::vector out; + for(auto& iter : 
model_set.model_map) { + out.push_back(iter.second); + } + return out; +} + // std::map PoreModelSet::copy_strand_models(const std::string& kit_name, const std::string& alphabet, diff --git a/src/pore_model/nanopolish_pore_model_set.h b/src/pore_model/nanopolish_pore_model_set.h index 63da243f..ee16f02a 100644 --- a/src/pore_model/nanopolish_pore_model_set.h +++ b/src/pore_model/nanopolish_pore_model_set.h @@ -52,9 +52,17 @@ class PoreModelSet const std::string& alphabet, const std::string& strand, size_t k); - + + // + // Get a single model + // static const PoreModel* get_model_by_key(const std::string& key); + // + // Get pointers to all models + // + static std::vector get_all_models(); + // // get all the models for the combination of parameters // From b13ab2d289a987143318125a6cd09d94e03db006 Mon Sep 17 00:00:00 2001 From: Mike Vella Date: Fri, 1 May 2020 14:38:39 +0100 Subject: [PATCH 80/80] Fixed edge case causing segfault when no reads are present in a scoreSet --- src/cuda_kernels/gpu_aligner.cu | 70 ++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 31 deletions(-) diff --git a/src/cuda_kernels/gpu_aligner.cu b/src/cuda_kernels/gpu_aligner.cu index 972086ed..30c19bd1 100644 --- a/src/cuda_kernels/gpu_aligner.cu +++ b/src/cuda_kernels/gpu_aligner.cu @@ -466,7 +466,7 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve auto scoreSet = scoreSets[scoreSetIdx]; int firstReadIdxinScoreSet = globalReadIdx; - //Read data + //Populate host buffers with data from raw reads. for (int eventSequenceIdx=0; eventSequenceIdx < scoreSet.rawData.size(); eventSequenceIdx++) { auto e = scoreSet.rawData[eventSequenceIdx]; numReads++; @@ -521,15 +521,22 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve auto & sequences = scoreSet.stateSequences; numSequences += sequences.size(); - + //Populate host buffers with data from sequences. for (int i = 0; ik; + // If there is no raw data associated with this scoreSet, then a default of k=1 is used. 
+ // The sequence is copied to the device, although it is not actually used since there are no + // raw reads to compute candidates with. + uint32_t k = 1; + if (scoreSet.rawData.size() > 0){ + k = scoreSet.rawData[0].pore_model->k; + } int numKmers = sequenceLength - k + 1; for(size_t ki = 0; ki < numKmers; ++ki) { @@ -548,14 +555,13 @@ std::vector>> GpuAligner::scoreKernelMod(std::ve sequenceLengthsHost[globalSequenceIdx] = numKmers; - // Loop over the raw reads, producing a cartesian product of reads and sequences + // Loop over the raw reads, producing a Cartesian product of reads and sequences auto numReadsInScoreSet = scoreSet.rawData.size(); for (int r=0; r>> GpuAligner::scoreKernelMod(std::ve dim3 dimBlock(blockSize); dim3 dimGrid(numBlocks); - //printf("Launching get scores mod kernel\n"); - getScoresMod <<< dimGrid, dimBlock, 0, streams[0]>>> (poreModelDev, - readLengthsDev, - eventStartsDev, - eventStridesDev, - eventsPerBaseDev, - scaleDev, - shiftDev, - varDev, - logVarDev, - eventOffsetsDev, - eventMeansDev, - modelOffsetsDev, - preFlankingDev, - postFlankingDev, - sequenceLengthsDev, - sequenceOffsetsDev, - kmerRanksDev, - seqIdxDev, - readIdxDev, - globalScoreIdx, - scoresDev); - cudaError_t err = cudaGetLastError(); - - if (err != cudaSuccess) - printf("Errors during kernel execution: %s\n", cudaGetErrorString(err)); + if (globalScoreIdx > 0){ + getScoresMod <<< dimGrid, dimBlock, 0, streams[0]>>> (poreModelDev, + readLengthsDev, + eventStartsDev, + eventStridesDev, + eventsPerBaseDev, + scaleDev, + shiftDev, + varDev, + logVarDev, + eventOffsetsDev, + eventMeansDev, + modelOffsetsDev, + preFlankingDev, + postFlankingDev, + sequenceLengthsDev, + sequenceOffsetsDev, + kmerRanksDev, + seqIdxDev, + readIdxDev, + globalScoreIdx, + scoresDev); + cudaError_t err = cudaGetLastError(); + + if (err != cudaSuccess){ + printf("Errors during GPU kernel execution: %s\n", cudaGetErrorString(err)); + } + } cudaMemcpyAsync(returnValuesHost, scoresDev, 
globalScoreIdx * sizeof(float), cudaMemcpyDeviceToHost, streams[0]); cudaStreamSynchronize(streams[0]);