From 6fb099341789e373d59f41a50311be5e1b064262 Mon Sep 17 00:00:00 2001
From: Yunfei Guo <yunfei.guo@bina.roche.com>
Date: Sat, 11 Feb 2017 23:07:19 -0800
Subject: [PATCH 1/3] refactoring; draft of globalMatching; incomplete
 implementation of max-weighted bipartite matching

---
 .../varsim/tools/evaluation/VCFcompare.java   | 946 +++++++++++++++++-
 1 file changed, 898 insertions(+), 48 deletions(-)

diff --git a/src/main/java/com/bina/varsim/tools/evaluation/VCFcompare.java b/src/main/java/com/bina/varsim/tools/evaluation/VCFcompare.java
index e50cdaac..ded29b47 100644
--- a/src/main/java/com/bina/varsim/tools/evaluation/VCFcompare.java
+++ b/src/main/java/com/bina/varsim/tools/evaluation/VCFcompare.java
@@ -22,11 +22,11 @@
 import com.fasterxml.jackson.databind.ObjectMapper;
 import org.apache.commons.io.FileUtils;
 import org.apache.log4j.Logger;
+import org.jgrapht.Graph;
+import org.jgrapht.graph.SimpleWeightedGraph;
+import org.jgrapht.graph.builder.UndirectedWeightedGraphBuilderBase;
 import org.kohsuke.args4j.Argument;
-import org.kohsuke.args4j.CmdLineException;
-import org.kohsuke.args4j.CmdLineParser;
 import org.kohsuke.args4j.Option;
-import com.bina.varsim.types.ComparisonResultWriter;
 
 import java.io.File;
 import java.io.IOException;
@@ -720,6 +720,13 @@ public void run(String[] args) {
             localMatching();
         }
     }
+
+    /**
+     * match truth and test variants on a local basis, i.e.
+     * do not optimize matching to reduce global mismatches
+     *
+     * this is for preserving legacy behavior
+     */
     private void localMatching() {
 
         // these are the statistics we "ideally" want to collect
@@ -764,40 +771,6 @@ private void localMatching() {
         // load true VCF into interval tree
         log.info("Load Truth VCF");
 
-        /**
-         * This is just for outputting to JSON
-         */
-        class outputClass {
-            CompareParams params;
-            @JsonProperty(value = "num_true_correct")
-            EnumStatsRatioCounter<VariantOverallType> numberOfTrueCorrect;
-
-            outputClass(CompareParams params, EnumStatsRatioCounter<VariantOverallType> numberOfTrueCorrect) {
-                this.params = params;
-                this.numberOfTrueCorrect = numberOfTrueCorrect;
-            }
-
-            outputClass() {
-            }
-
-            public CompareParams getParams() {
-                return params;
-            }
-
-            public void setParams(CompareParams params) {
-                this.params = params;
-            }
-
-            public EnumStatsRatioCounter<VariantOverallType> getNumberOfTrueCorrect() {
-                return numberOfTrueCorrect;
-            }
-
-            public void setNumberOfTrueCorrect(EnumStatsRatioCounter<VariantOverallType> numberOfTrueCorrect) {
-                this.numberOfTrueCorrect = numberOfTrueCorrect;
-            }
-        }
-
-
         outputClass outputBlob = new outputClass();
 
         outputBlob.setParams(new CompareParams());
@@ -1001,7 +974,7 @@ public void setNumberOfTrueCorrect(EnumStatsRatioCounter<VariantOverallType> num
 
                     if (currentVariant.isHom()) {
                         int maxTrueLength = resultComparator.compareVariant(currentVariant, geno.geno[0], validatedTrue);
-                        final DualIdx dualIdx = matchGenotype ? resultComparator.isHomMatch() : resultComparator.isMatch();
+                        final DualIdx dualIdx = matchGenotype ? resultComparator.firstHomMatch() : resultComparator.firstMatch();
                         if (dualIdx.isSplitVariantValid()) {
                             // validated
                             validatedTrue.set(dualIdx.splitVariantIndex);
@@ -1033,7 +1006,7 @@ public void setNumberOfTrueCorrect(EnumStatsRatioCounter<VariantOverallType> num
                             }
                         }
 
-                        final DualIdx dualIdx = matchGenotype ? resultComparator.isHetMatch() : resultComparator.isMatch();
+                        final DualIdx dualIdx = matchGenotype ? resultComparator.firstHetMatch() : resultComparator.firstMatch();
 
                         if (dualIdx.isSplitVariantValid()) {
                             validatedTrue.set(dualIdx.splitVariantIndex);
@@ -1168,7 +1141,625 @@ public void setNumberOfTrueCorrect(EnumStatsRatioCounter<VariantOverallType> num
 
         log.info("Done!"); // used to record the time
     }
-    private void globalMatching() {}
+
+    /**
+     * globalMatching matches truth and test variants
+     * on a global basis, i.e. maximize overall matched
+     * variant pairs.
+     *
+     * some implementation details:
+     * *****************************************************
+     * *****CAUTION: subject to change without notice*******
+     * *****************************************************
+     * true variants and test variants are joined by edges if
+     * they match with each other based on user-specified
+     * criteria. The edges will be assigned weights, i.e.
+     * rankings among all the pairings based on degree of
+     * matching. The variants and pairings are used to construct
+     * an undirected weighted bipartite graph. Solution to
+     * maximum-weighted bipartite matching will be used to
+     * determine final pairing between true and test variants.
+     *
+     * note here, no more than one true/test variants will be
+     * paired with each test/true variant. In other words,
+     * degree of vertices is no more than 1.
+     */
+    private void globalMatching() {
+
+        // these are the statistics we "ideally" want to collect
+        // number of variants correct (either genotype) (for each type)
+        // number homozygous correct (for each type)
+        // number heterozygous correct (for each type)
+        // number homozygous genotype correct (for each type)
+        // number heterozyous genotype correct (for each type)
+
+
+        BedFile intersector = null;
+        boolean bedExists = false;
+        // check if the file exists
+        try {
+            File f = new File(bedFilename);
+            bedExists = f.exists();
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+
+        if (bedExists) {
+            log.info("Using " + bedFilename + " to intersect");
+            intersector = new BedFile(bedFilename);
+        } else {
+            if (excludeTprFromBedFiltering || excludeFdfFromBedFiltering) {
+                log.warn("No BED file specified but used exclude parameters");
+            }
+        }
+
+        Set<String> chrAcceptor = null;
+        //TODO: make chrAcceptor a class
+        if (chromosomeToBeIncluded != null) {
+            chrAcceptor = new HashSet<>(Arrays.asList(chromosomeToBeIncluded.split(",")));
+            log.info("Only accepting chromosomes: " + Arrays.toString(chrAcceptor.toArray()));
+        }
+
+        ConstraintValidator validator = new ConstraintValidator(constraintArgs);
+
+        // Load refernece genome
+        SimpleReference referenceGenome = reference == null ? null : new SimpleReference(reference);
+
+        // load true VCF into interval tree
+        log.info("Load Truth VCF");
+
+        outputClass outputBlob = new outputClass();
+
+        outputBlob.setParams(new CompareParams());
+        outputBlob.getParams().setBedFilename(bedFilename);
+        // TODO: make it output the full list if variants in JSON
+        outputBlob.getParams().setNewVcfFilename(newVcfFilename.get(0));
+        outputBlob.getParams().setOverlapPercent(overlapRatio);
+        outputBlob.getParams().setTrueVcfFilename(trueVcfFilename);
+        outputBlob.getParams().setWiggle(wiggle);
+
+        VCFparser trueVcfParser = new VCFparser(trueVcfFilename, null, false);
+
+        // allow duplicates, this is needed because insertions don't actually take up a location
+        chrSearchTree<ValueInterval1D<Variant>> trueVariantIntervalTree = new chrSearchTree<>(true);
+        int numReadOriginalVariant = 0;
+        int numAddedSplitVariant = 0;
+
+
+        //track TRAID-linked variants
+        Map<String, List<Variant>> traid2composingVariants = new HashMap<>();
+
+        // this is for the original variants
+        // it stores the total length of the original variant in bases
+        // Still check for validation of canonical full variants
+        ArrayList<Integer> validatedTotalLength = new ArrayList<>();
+        ArrayList<Variant> trueVariantsForOutput = new ArrayList<>();
+
+        Set<Variant> canonicalizedTrueVariants = new HashSet<>();
+        Set<Variant> canonicalizedTestVariants = new HashSet<>();
+        List<VariantPair> allCanonicalizedVariantPairs = new ArrayList<>();
+        UndirectedWeightedGraphBuilderBase weightedGraphBuilder = SimpleWeightedGraph.builder(VariantPair.class);
+        Graph<Variant, VariantPair> bipartiteTrueVsTestVariantGraph = null;
+
+        // For each true variant, if the number of bases validated is over a certain threshold
+        // call it correct
+        outputBlob.setNumberOfTrueCorrect(new EnumStatsRatioCounter<VariantOverallType>());
+
+        // For called variants, break down into canonical ones and count based on that
+        // if any called variant overlaps a complex variant or MNP, count it as "complex"
+        // otherwise, simple count them in their canonical forms
+
+        // store true variants as canonical ones, but remember original form
+        while (trueVcfParser.hasMoreInput()) {
+            Variant trueVariant = trueVcfParser.parseLine();
+            if (trueVariant == null ||
+                    (!trueVariant.getGenotypes().isNonRef()) ||
+                    (chrAcceptor != null && !chrAcceptor.contains(trueVariant.getChr().getName()))) {
+                log.info("skip line");
+                continue;
+            }
+
+            VariantOverallType trueVariantOriginalType = trueVariant.getType();
+
+            if (trueVariant.getTraid() != null) {
+                String currentTraid = trueVariant.getTraid();
+                if (traid2composingVariants.containsKey(currentTraid)) {
+                    traid2composingVariants.get(currentTraid).add(trueVariant);
+                    trueVariant = new Variant.Builder().compositions(traid2composingVariants.get(currentTraid)).build();
+                    traid2composingVariants.remove(currentTraid);
+                } else {
+                    traid2composingVariants.put(currentTraid, new ArrayList<Variant>());
+                    traid2composingVariants.get(currentTraid).add(trueVariant);
+                    continue;
+                }
+            }
+            // determine max variant region
+            // when comparing genotypes, we need to individually compare
+            // to make sure they really overlap
+
+            //TODO: remove constructor here (because another copy will be created inside canonicalizeVariant
+            List<Variant> canonicalVariantList = canonicalizeVariant(new Variant(trueVariant));
+
+            int totalLength = canonicalVariantList.stream().mapToInt(v -> v.maxLen()).sum();
+            int maxLength = canonicalVariantList.stream().mapToInt(v -> v.maxLen()).max().getAsInt();
+
+            // add to interval tree
+            for (Variant currentVariant : canonicalVariantList) {
+
+                ChrString chr = currentVariant.getChr();
+                SimpleInterval1D currentVariantInterval = null;
+                try {
+                    currentVariantInterval = currentVariant.getGenotypeUnionVariantInterval();
+                } catch (Exception e) {
+                    e.printStackTrace();
+                    log.error("Original variant: " + trueVariant);
+                    log.error("Bad variant: " + currentVariant);
+                    System.exit(1);
+                }
+                currentVariant.splitVariantIndex = numAddedSplitVariant;
+                currentVariant.wholeVariantIndex = numReadOriginalVariant;
+                currentVariant.originalType = trueVariantOriginalType;
+
+                trueVariantIntervalTree.put(chr, new ValueInterval1D<>(currentVariantInterval, currentVariant));
+                canonicalizedTrueVariants.add(currentVariant);
+                numAddedSplitVariant++;
+            }
+
+            if (totalLength >= Constant.SVLEN && maxLength >= overlapRatio * totalLength && canonicalVariantList.size() > 1) {
+                // in this case we break down the variant into canoical forms since
+                // the original variant was probably a large deletion with a small insertion
+                for (Variant currentVariant : canonicalVariantList) {
+                    int currentLength = currentVariant.maxLen();
+                    validatedTotalLength.add(currentLength);
+                    trueVariantsForOutput.add(currentVariant);
+                    numReadOriginalVariant++;
+                }
+            } else {
+                validatedTotalLength.add(totalLength);
+                trueVariantsForOutput.add(trueVariant);
+                numReadOriginalVariant++;
+            }
+        }
+
+        log.info("Num read:  " + numReadOriginalVariant);
+        log.info("Num added: " + numAddedSplitVariant);
+        log.info("Num nodes: " + trueVariantIntervalTree.size());
+        log.info("Max depth: " + trueVariantIntervalTree.maxDepth());
+
+        // this is for the split variants
+        // set to true if the canonical original variant was validated true
+        BitSet validatedTrue = new BitSet(numAddedSplitVariant);
+
+        // this is for the original variants
+        // count of the number of bases validated for the original variant
+        int[] fullValidatedCount = new int[numReadOriginalVariant];
+
+        // generate the output files
+        try (
+                PrintWriter tpWriter = TP_WRITER.getWriter(outPrefix);
+                PrintWriter unknownTpWriter = UNKNOWN_TP_WRITER.getWriter(outPrefix);
+                PrintWriter fpWriter = FP_WRITER.getWriter(outPrefix);
+                PrintWriter unknownFpWriter = UNKNOWN_FP_WRITER.getWriter(outPrefix);
+                PrintWriter fnWriter = FN_WRITER.getWriter(outPrefix);
+                PrintWriter jsonWriter = JSON_WRITER.getWriter(outPrefix);) {
+
+
+            // for this case we add to false positives if the variant is not validated.
+            // However, do don't add to true positives, those that computed later
+
+            log.info("Load New VCF");
+            int numberOfNewVariants = 0;
+            // iterate over new VCF and collect stats
+
+            traid2composingVariants.clear();
+            MultiResultComparator multiResultComparator = new MultiResultComparator(trueVariantIntervalTree, overlapRatio,
+                    wiggle, ignoreInsertionLength, matchGenotype);
+
+            for (String currentVcfFile : newVcfFilename) {
+                VCFparser newParser = new VCFparser(currentVcfFile, sampleName, excludeFiltered);
+
+                while (newParser.hasMoreInput()) {
+                    Variant variant = newParser.parseLine();
+
+                    //TODO: wrap variant comparison into a method for easier reading
+                    if (variant == null ||
+                            (chrAcceptor != null && !chrAcceptor.contains(variant.getChr().getName()))) {
+                        log.info("skip line");
+                        continue;
+                    }
+
+                    boolean skipFP = false;
+
+                    if (variant.getTraid() != null) {
+                        String currentTraid = variant.getTraid();
+                        if (traid2composingVariants.containsKey(currentTraid)) {
+                            traid2composingVariants.get(currentTraid).add(variant);
+                            variant = new Variant.Builder().compositions(traid2composingVariants.get(currentTraid)).build();
+                            traid2composingVariants.remove(currentTraid);
+                            if (intersector != null) {
+                                for (Variant c : variant.getCompositions()) {
+                                    if (!excludeFdfFromBedFiltering && !intersector.containsEndpoints(c.getChr(), c.getGenotypeUnionAlternativeInterval(), bedEither)) {
+                                        skipFP = true;
+                                    }
+                                }
+                            }
+                        } else {
+                            traid2composingVariants.put(currentTraid, new ArrayList<Variant>());
+                            traid2composingVariants.get(currentTraid).add(variant);
+                            continue;
+                        }
+                    } else {
+                        if (intersector != null) {
+                            if (!excludeFdfFromBedFiltering && !intersector.containsEndpoints(variant.getChr(), variant.getGenotypeUnionAlternativeInterval(), bedEither)) {
+                                skipFP = true;
+                            }
+                        }
+                    }
+
+                    // the overall type of the called variant
+                    VariantOverallType currentVariantType = variant.getType();
+
+                    // if called as complex variant convert to indel+snps
+                    List<Variant> canonicalVariantList = canonicalizeVariant(variant);
+
+                    int totalLength = canonicalVariantList.stream().mapToInt(v -> v.maxLen()).sum();
+                    double validatedLength = 0;
+                    int maxLength = canonicalVariantList.stream().mapToInt(v -> v.maxLen()).max().getAsInt();
+
+                    // split up variants that are basically one big variant and one small one
+                    //maxLength / totalLength >= overlapRatio, true if the longest canonicalized variant
+                    //is longer than certain proportion of sum of lengths of all canonicalized variants.
+                    boolean computeAsSplit = totalLength >= Constant.SVLEN && maxLength >= overlapRatio * totalLength  &&
+                            canonicalVariantList.size() > 1;
+
+                  /*
+                  here we need to figure out all possible pairings between current test variant
+                  and true variants, the pairings will be sorted later based on degree of matching
+                   */
+                    for (Variant currentVariant : canonicalVariantList) {
+                        canonicalizedTestVariants.add(currentVariant);
+                        allCanonicalizedVariantPairs.addAll(multiResultComparator.findAllPairs(currentVariant));
+                    }
+                            for (int i = 0; i < 2; i++) {
+                                byte allele = geno.geno[i];
+                                if (allele > 0) {
+                                    maxTrueLen = Math.max(resultComparator.compareVariant(currentVariant, allele, validatedTrue), maxTrueLen);
+                                }
+                            }
+
+                            final DualIdx dualIdx = matchGenotype ? resultComparator.firstHetMatch() : resultComparator.firstMatch();
+
+                            if (dualIdx.isSplitVariantValid()) {
+                                validatedTrue.set(dualIdx.splitVariantIndex);
+                                fullValidatedCount[dualIdx.wholeVariantIndex] += currentVariant.maxLen(); // this 'should' be overlap len
+                                validatedLength += currentVariant.maxLen();
+                            } else if (computeAsSplit) {
+                                if (!skipFP) {
+                                    outputBlob.getNumberOfTrueCorrect().incFP(currentVariant.getType(), currentVariant.maxLen());
+                                    validator.inc(StatsNamespace.FP, currentVariant.getType(), currentVariant.maxLen());
+                                    if (currentVariant.getType() == VariantOverallType.SNP && currentVariant.maxLen() > 1) {
+                                        log.warn("SNP with bad length: " + currentVariant);
+                                    }
+                                    variant.output(fpWriter);
+                                } else {
+                                    variant.output(unknownFpWriter);
+                                }
+                            }
+                        }
+                    }
+
+                    if (!computeAsSplit && validatedLength < (totalLength * overlapRatio)) {
+                        if (!skipFP) {
+                            // this is a false positive!
+                            outputBlob.getNumberOfTrueCorrect().incFP(currentVariantType, variant.maxLen());
+                            validator.inc(StatsNamespace.FP, currentVariantType, variant.maxLen());
+                            if (currentVariantType == VariantOverallType.SNP && variant.maxLen() > 1) {
+                                log.warn("SNP with bad length: " + variant);
+                            }
+                            variant.output(fpWriter);
+                        } else {
+                            variant.output(unknownFpWriter);
+                        }
+                    }
+
+                    numberOfNewVariants++;
+                }
+            }
+            /**********/
+        /**********/
+        /**********/
+        /**********/
+        /**********/
+        /**********/
+        /**********/
+        /**********/
+        /**********/
+        //perform global matching
+        /**********/
+        /**********/
+        /**********/
+        /**********/
+        /**********/
+        /**********/
+        /**********/
+        /**********/
+        /**********/
+        /**********/
+        /**********/
+        /**********/
+        /**********/
+        /**********/
+        /**********/
+        /**********/
+        /**********/
+        /**********/
+        /**********/
+        /**********/
+//            for (String currentVcfFile : newVcfFilename) {
+//                VCFparser newParser = new VCFparser(currentVcfFile, sampleName, excludeFiltered);
+//
+//                while (newParser.hasMoreInput()) {
+//                    Variant variant = newParser.parseLine();
+//
+//                    //TODO: wrap variant comparison into a method for easier reading
+//                    if (variant == null ||
+//                            (chrAcceptor != null && !chrAcceptor.contains(variant.getChr().getName()))) {
+//                        log.info("skip line");
+//                        continue;
+//                    }
+//
+//                    boolean skipFP = false;
+//
+//                    if (variant.getTraid() != null) {
+//                        String currentTraid = variant.getTraid();
+//                        if (traid2composingVariants.containsKey(currentTraid)) {
+//                            traid2composingVariants.get(currentTraid).add(variant);
+//                            variant = new Variant.Builder().compositions(traid2composingVariants.get(currentTraid)).build();
+//                            traid2composingVariants.remove(currentTraid);
+//                            if (intersector != null) {
+//                                for (Variant c : variant.getCompositions()) {
+//                                    if (!excludeFdfFromBedFiltering && !intersector.containsEndpoints(c.getChr(), c.getGenotypeUnionAlternativeInterval(), bedEither)) {
+//                                        skipFP = true;
+//                                    }
+//                                }
+//                            }
+//                        } else {
+//                            traid2composingVariants.put(currentTraid, new ArrayList<Variant>());
+//                            traid2composingVariants.get(currentTraid).add(variant);
+//                            continue;
+//                        }
+//                    } else {
+//                        if (intersector != null) {
+//                            if (!excludeFdfFromBedFiltering && !intersector.containsEndpoints(variant.getChr(), variant.getGenotypeUnionAlternativeInterval(), bedEither)) {
+//                                skipFP = true;
+//                            }
+//                        }
+//                    }
+//
+//                    // the overall type of the called variant
+//                    VariantOverallType currentVariantType = variant.getType();
+//
+//                    // if called as complex variant convert to indel+snps
+//                    List<Variant> canonicalVariantList = canonicalizeVariant(variant);
+//
+//                    int totalLength = canonicalVariantList.stream().mapToInt(v -> v.maxLen()).sum();
+//                    double validatedLength = 0;
+//                    int maxLength = canonicalVariantList.stream().mapToInt(v -> v.maxLen()).max().getAsInt();
+//
+//                    // split up variants that are basically one big variant and one small one
+//                    //maxLength / totalLength >= overlapRatio, true if the longest canonicalized variant
+//                    //is longer than certain proportion of sum of lengths of all canonicalized variants.
+//                    boolean computeAsSplit = totalLength >= Constant.SVLEN && maxLength >= overlapRatio * totalLength  &&
+//                            canonicalVariantList.size() > 1;
+//
+//                    for (Variant currentVariant : canonicalVariantList) {
+//                        canonicalizedTestVariants.add(currentVariant);
+//                    // get genotype
+//                    Genotypes geno = currentVariant.getGenotypes();
+//                    //note here ResultComparator is created for each canonical variant
+//                    ResultComparator resultComparator = new ResultComparator(trueVariantIntervalTree, overlapRatio, wiggle, ignoreInsertionLength);
+//
+//                    if (currentVariant.isHom()) {
+//                        int maxTrueLength = resultComparator.compareVariant(currentVariant, geno.geno[0], validatedTrue);
+//                        final DualIdx dualIdx = matchGenotype ? resultComparator.isHomMatch() : resultComparator.firstMatch();
+//                        if (dualIdx.isSplitVariantValid()) {
+//                            // validated
+//                            validatedTrue.set(dualIdx.splitVariantIndex);
+//                            fullValidatedCount[dualIdx.wholeVariantIndex] += maxTrueLength;// this 'should' be overlap len
+//                            validatedLength += currentVariant.maxLen();
+//                        } else if (computeAsSplit) {
+//                            if (!skipFP) {
+//                                outputBlob.getNumberOfTrueCorrect().incFP(currentVariant.getType(), variant.maxLen());
+//                                validator.inc(StatsNamespace.FP, currentVariant.getType(), variant.maxLen());
+//                                variant.output(fpWriter);
+//                            } else {
+//                                variant.output(unknownFpWriter);
+//                            }
+//                        }
+//
+//                    } else {
+//                        // het
+//                        //boolean matched = false;
+//                        int maxTrueLen = 0;
+//                        /*
+//                        for heterozygous variants, all genotypes will be checked
+//                        inside compareVariant, all genotypes of overlapping true variants will be checked, too
+//                        so overall all possible combinations of genotype matching will be checked
+//                        */
+//                        for (int i = 0; i < 2; i++) {
+//                            byte allele = geno.geno[i];
+//                            if (allele > 0) {
+//                                maxTrueLen = Math.max(resultComparator.compareVariant(currentVariant, allele, validatedTrue), maxTrueLen);
+//                            }
+//                        }
+//
+//                        final DualIdx dualIdx = matchGenotype ? resultComparator.firstHetMatch() : resultComparator.firstMatch();
+//
+//                        if (dualIdx.isSplitVariantValid()) {
+//                            validatedTrue.set(dualIdx.splitVariantIndex);
+//                            fullValidatedCount[dualIdx.wholeVariantIndex] += currentVariant.maxLen(); // this 'should' be overlap len
+//                            validatedLength += currentVariant.maxLen();
+//                        } else if (computeAsSplit) {
+//                            if (!skipFP) {
+//                                outputBlob.getNumberOfTrueCorrect().incFP(currentVariant.getType(), currentVariant.maxLen());
+//                                validator.inc(StatsNamespace.FP, currentVariant.getType(), currentVariant.maxLen());
+//                                if (currentVariant.getType() == VariantOverallType.SNP && currentVariant.maxLen() > 1) {
+//                                    log.warn("SNP with bad length: " + currentVariant);
+//                                }
+//                                variant.output(fpWriter);
+//                            } else {
+//                                variant.output(unknownFpWriter);
+//                            }
+//                        }
+//                    }
+//                }
+//
+//                if (!computeAsSplit && validatedLength < (totalLength * overlapRatio)) {
+//                    if (!skipFP) {
+//                        // this is a false positive!
+//                        outputBlob.getNumberOfTrueCorrect().incFP(currentVariantType, variant.maxLen());
+//                        validator.inc(StatsNamespace.FP, currentVariantType, variant.maxLen());
+//                        if (currentVariantType == VariantOverallType.SNP && variant.maxLen() > 1) {
+//                            log.warn("SNP with bad length: " + variant);
+//                        }
+//                        variant.output(fpWriter);
+//                    } else {
+//                        variant.output(unknownFpWriter);
+//                    }
+//                }
+//
+//                numberOfNewVariants++;
+//            }
+//        }
+
+            log.info("Num new variants read: " + numberOfNewVariants);
+
+            // read through again and compute for the true variants
+            int numRead2 = 0;
+            for (Variant var : trueVariantsForOutput) {
+
+                boolean isKnown = intersector == null || excludeTprFromBedFiltering;
+
+                if (var.getCompositions() == null) {
+                    isKnown = isKnown || intersector.containsEndpoints(var.getChr(), var.getGenotypeUnionAlternativeInterval(), bedEither);
+                } else {
+                    for (Variant c : var.getCompositions()) {
+                        isKnown = isKnown || intersector.containsEndpoints(c.getChr(), c.getGenotypeUnionAlternativeInterval(), bedEither);
+                    }
+                }
+                if (isKnown) {
+                    int totalLength = validatedTotalLength.get(numRead2);
+                    int validatedLength = fullValidatedCount[numRead2];
+
+                    //if a variant is canonicalized into a few smaller variants, validation
+                    //will be carried out on a per-variant basis. An original variant will
+                    //be considered a match only if the sum of lengths of all its VALIDATED
+                    //canonicalized variants is larger than overlapRatio * totalLength
+                    if (validatedLength >= (overlapRatio * totalLength)) {
+                        // validated
+                        outputBlob.getNumberOfTrueCorrect().incTP(var.getType(), var.maxLen());
+                        validator.inc(StatsNamespace.TP, var.getType(), var.maxLen());
+                        var.output(tpWriter);
+                    } else {
+                        var.output(fnWriter);
+                    }
+
+                    outputBlob.getNumberOfTrueCorrect().incT(var.getType(), var.maxLen());
+                    validator.inc(StatsNamespace.T, var.getType(), var.maxLen());
+                } else {
+                    var.output(unknownTpWriter);
+                }
+                numRead2++;
+            }
+
+            if (numReadOriginalVariant != numRead2) {
+                log.error("Number of variants read are inconsistent: " + numReadOriginalVariant + "," + numRead2);
+            }
+
+            // Compute and update the true negatives here so that we have specificity values
+            if (referenceGenome != null) {
+                final VariantOverallType variantOverallTypes[] = {VariantOverallType.SNP, VariantOverallType.Insertion, VariantOverallType.Deletion};
+                for (final VariantOverallType variantOverallType : variantOverallTypes) {
+                    if (outputBlob.getNumberOfTrueCorrect().getData().containsKey(variantOverallType)) {
+                        outputBlob.getNumberOfTrueCorrect().getData().get(variantOverallType).computeTN((int) referenceGenome.getNumNonNBases());
+                    }
+                }
+            }
+
+
+            // output the stats
+            System.err.println(outputBlob.getNumberOfTrueCorrect());
+
+            ObjectMapper mapper = new ObjectMapper();
+            mapper.configure(JsonGenerator.Feature.AUTO_CLOSE_TARGET, false);
+
+            String jsonStr = "";
+            try {
+                jsonStr = mapper.writeValueAsString(outputBlob);
+                jsonWriter.print(jsonStr);
+            } catch (Exception e) {
+                e.printStackTrace();
+            }
+
+            if (htmlFile != null) {
+                try {
+                    FileUtils.writeStringToFile(new File(outPrefix + "_varcomp.html"),
+                            JSONInserter.insertJSON(FileUtils.readFileToString(htmlFile), jsonStr));
+                } catch (IOException e) {
+                    e.printStackTrace();
+                }
+            }
+
+        } catch (Exception e) {
+            e.printStackTrace();
+            System.exit(1);
+        }
+
+        // Check the validity
+        try {
+            validator.testValidity();
+        } catch (UnsatisfiedConstraintException e) {
+            log.error("A number of constraints were not satisfied:");
+            for (UnsatisfiedConstraintException.valuePair constraint : e.getConstraints()) {
+                log.error(String.format("%.4f : %s", constraint.getActualValue(), constraint.getConstraint()));
+            }
+            System.exit(1);
+        }
+
+        log.info("Done!"); // used to record the time
+    }
+    /**
+     * This is just for outputting to JSON
+     */
+    class outputClass {
+        CompareParams params;
+        @JsonProperty(value = "num_true_correct")
+        EnumStatsRatioCounter<VariantOverallType> numberOfTrueCorrect;
+
+        outputClass(CompareParams params, EnumStatsRatioCounter<VariantOverallType> numberOfTrueCorrect) {
+            this.params = params;
+            this.numberOfTrueCorrect = numberOfTrueCorrect;
+        }
+
+        outputClass() {
+        }
+
+        public CompareParams getParams() {
+            return params;
+        }
+
+        public void setParams(CompareParams params) {
+            this.params = params;
+        }
+
+        public EnumStatsRatioCounter<VariantOverallType> getNumberOfTrueCorrect() {
+            return numberOfTrueCorrect;
+        }
+
+        public void setNumberOfTrueCorrect(EnumStatsRatioCounter<VariantOverallType> numberOfTrueCorrect) {
+            this.numberOfTrueCorrect = numberOfTrueCorrect;
+        }
+    }
 
     class CompareParams {
         @JsonProperty(value = "true_vcf_filename")
@@ -1287,6 +1878,13 @@ public String toString() {
         }
     }
 
+    /**
+     * inner class for finding a matched true variant
+     * only handles 1 matched variant
+     *
+     * main purpose is for backward compatibility
+     * should favor {@link MultiResultComparator}
+     */
     class ResultComparator {
 
         chrSearchTree<ValueInterval1D<Variant>> trueVariantIntervalTree; // true variants
@@ -1310,18 +1908,18 @@ public ResultComparator(chrSearchTree<ValueInterval1D<Variant>> trueVariantInter
         }
 
         /**
-         * return homozygous match (empty if no match)
+         * return first homozygous match (empty if no match)
          * @return
          */
-        public DualIdx isHomMatch() {
+        public DualIdx firstHomMatch() {
             return homozygousMatches.isEmpty() ? new DualIdx() : homozygousMatches.get(0);
         }
 
         /**
-         * return heterozygous match (empty if no match)
+         * return first heterozygous match (empty if no match)
          * @return
          */
-        public DualIdx isHetMatch() {
+        public DualIdx firstHetMatch() {
             List<DualIdx> temp = new ArrayList<>(heterozygousMatches.get(0));
             temp.retainAll(heterozygousMatches.get(1)); //essentially get intersection
             //if there is a match in intersection, return the first one from the intersection
@@ -1352,20 +1950,32 @@ public DualIdx isHetMatch() {
 
         /**
          * determine if there is a match regardless of genotype
-         * @return
+         * @return DualIdx of matched variant
          */
-        public DualIdx isMatch() {
-            DualIdx idx = isHomMatch();
+        public DualIdx firstMatch() {
+            DualIdx idx = firstHomMatch();
             if (idx.isSplitVariantValid()) {
                 return idx;
             }
-            idx = isHetMatch();
+            idx = firstHetMatch();
             if (idx.isSplitVariantValid()) {
                 return idx;
             }
             return idx;
         }
 
+        /**
+         * Only compares one allele at a time
+         * - don't match variants in the bitset
+         * - if match set the bitset
+         *
+         * @param variant       variant we want to compare
+         * @param genotype      allele of the variant to compare
+         * @return The maximum length of all true variants
+         */
+        public int compareVariant(Variant variant, int genotype) {
+            return compareVariant(variant, genotype, new BitSet());
+        }
         /**
          * Only compares one allele at a time
          * - don't match variants in the bitset
@@ -1374,6 +1984,7 @@ public DualIdx isMatch() {
          * @param variant       variant we want to compare
          * @param genotype      allele of the variant to compare
          * @param validated BitSet that records the true variants that have already been validated
+         *                  can be empty BitSet if we don't skip validated ones
          * @return The maximum length of all true variants
          */
         public int compareVariant(Variant variant, int genotype, BitSet validated) {
@@ -1499,4 +2110,243 @@ public int compareVariant(Variant variant, int genotype, BitSet validated) {
         }
     }
 
+    /**
+     * capable of recording multiple matched true variants
+     * should be favored over {@link ResultComparator}
+     */
+    class MultiResultComparator {
+        final chrSearchTree<ValueInterval1D<Variant>> trueVariantIntervalTree; // true variants
+        final double overlapRatio;
+        final int wiggle;
+        final boolean ignoreInsertionLength;
+        final boolean matchGenotype;
+
+        public MultiResultComparator(chrSearchTree<ValueInterval1D<Variant>> trueVariantIntervalTree, double overlapRatio,
+                                     int wiggle, boolean ignoreInsLen, boolean matchGenotype) {
+            this.trueVariantIntervalTree = trueVariantIntervalTree;
+            this.overlapRatio = overlapRatio;
+            this.wiggle = wiggle;
+            this.ignoreInsertionLength = ignoreInsLen;
+            this.matchGenotype = matchGenotype;
+        }
+
+        List<DualIdx> homozygousMatches = new ArrayList<>();
+        List<List<DualIdx>> heterozygousMatches = Arrays.asList(new ArrayList<DualIdx>(), new ArrayList<DualIdx>()); // matches either parent
+        public List<VariantPair> findAllPairs(Variant v) {
+            //boolean overlapComplex;
+            // Results to store
+            // this stores the indexes of the true variants matched
+
+            return new ArrayList<VariantPair>();
+        }
+
+        /**
+         * return first homozygous match (empty if no match)
+         * @return
+         */
+        public DualIdx firstHomMatch() {
+            return homozygousMatches.isEmpty() ? new DualIdx() : homozygousMatches.get(0);
+        }
+
+        /**
+         * return first heterozygous match (empty if no match)
+         * @return
+         */
+        public DualIdx firstHetMatch() {
+            List<DualIdx> temp = new ArrayList<>(heterozygousMatches.get(0));
+            temp.retainAll(heterozygousMatches.get(1)); //essentially get intersection
+            //if there is a match in intersection, return the first one from the intersection
+            if (!temp.isEmpty()) {
+                return temp.get(0);
+              /*
+              if no match in intersection, return the first one from the larger list (the genotype
+              containing more heterozygous matches).
+              since ResultComparator is created for each variant, heterzygousMatches will only
+              store matches for one variant against some true variants. Therefore, I think
+              one of the lists in heterozygousMatches will always be zero if there is no intersection
+              between them. And in such cases, we just return whatever we have in matches.
+
+              an example, compare these two lines:
+              1	993	rs35493185	C	CA	.	.	SVLEN=1	GT	1|0
+              1	993	rs35493185	C	CA	.	.	SVLEN=1	GT	1|0
+
+               */
+            } else if (heterozygousMatches.stream().anyMatch(l -> !l.isEmpty())) {
+                if (heterozygousMatches.get(0).size() > heterozygousMatches.get(1).size()) {
+                    return heterozygousMatches.get(0).get(0);
+                } else {
+                    return heterozygousMatches.get(1).get(0);
+                }
+            }
+            return new DualIdx();
+        }
+
+        /**
+         * determine if there is a match regardless of genotype
+         * @return DualIdx of matched variant
+         */
+        public DualIdx firstMatch() {
+            DualIdx idx = firstHomMatch();
+            if (idx.isSplitVariantValid()) {
+                return idx;
+            }
+            idx = firstHetMatch();
+            if (idx.isSplitVariantValid()) {
+                return idx;
+            }
+            return idx;
+        }
+
+        /**
+         * Only compares one allele at a time
+         * - don't match variants in the bitset
+         * - if match set the bitset
+         *
+         * @param variant       variant we want to compare
+         * @param genotype      allele of the variant to compare
+         * @return The maximum length of all true variants
+         */
+        public int compareVariant(Variant variant, int genotype) {
+            return compareVariant(variant, genotype, new BitSet());
+        }
+        /**
+         * Only compares one allele at a time
+         * - don't match variants in the bitset
+         * - if match set the bitset
+         *
+         * @param variant       variant we want to compare
+         * @param genotype      allele of the variant to compare
+         * @param validated BitSet that records the true variants that have already been validated
+         *                  can be empty BitSet if we don't skip validated ones
+         * @return The maximum length of all true variants
+         */
+        public int compareVariant(Variant variant, int genotype, BitSet validated) {
+            //TODO: consider type to change overlap percent
+            VariantType type = variant.getType(genotype);
+            ChrString chr = variant.getChr();
+            SimpleInterval1D intervalForCompare = variant.getVariantInterval(genotype, ignoreInsertionLength);
+
+            int maxTrueVarianLength = 0;
+
+            // sometimes MNPs are called as SNPs?
+            if (type == VariantType.SNP) {
+                // handle SNPs differently
+                // require SNP content to match
+                Iterable<ValueInterval1D<Variant>> overlaps = trueVariantIntervalTree.getOverlaps(chr, intervalForCompare);
+
+                if (overlaps == null) {
+                    // nothing found
+                    return maxTrueVarianLength;
+                }
+
+                byte alternativeAlleleFirstBase = variant.getAlt(genotype).getSequence()[0];
+
+                int numberOfSnpMatches = 0;
+                for (ValueInterval1D<Variant> trueVariantInterval : overlaps) {
+                    Variant trueVariant = trueVariantInterval.getContent();
+                    boolean hasSnp = false;
+                    int splitVariantIndex = trueVariant.splitVariantIndex;
+                    int wholeVariantIndex = trueVariant.wholeVariantIndex;
+                    overlapComplex = trueVariant.originalType == VariantOverallType.Complex;
+
+                    if (validated.get(splitVariantIndex)) {
+                        // skip ones already validated
+                        continue;
+                    }
+
+                    // check genotype
+                    for (int parent = 0; parent < 2; parent++) {
+                        int allele = trueVariant.getAllele(parent);
+                        //true variant is heterozygous, so only one allele has non-reference sequence
+                        if (trueVariant.isHom() || allele > 0) {
+                            if (trueVariant.getType(allele) == VariantType.SNP
+                                    && variant.getPos() == trueVariant.getPos()) {
+                                if (alternativeAlleleFirstBase == trueVariant.getAlt(allele).getSequence()[0]) {
+                                    if (trueVariant.isHom())
+                                        homozygousMatches.add(new DualIdx(splitVariantIndex, wholeVariantIndex));
+                                    else
+                                        heterozygousMatches.get(parent).add(new DualIdx(splitVariantIndex, wholeVariantIndex));
+                                }
+                                hasSnp = true;
+                            }
+                        }
+                        if (trueVariant.isHom())
+                            break;
+                    }
+
+                    if (hasSnp) {
+                        numberOfSnpMatches++;
+                        maxTrueVarianLength = 1;
+                    }
+                }
+
+                if (numberOfSnpMatches > 1) {
+                    log.info("Something strange, multiple SNP matches in true set: " + numberOfSnpMatches);
+                }
+            } else {
+                // Non-SNPs
+                SimpleInterval1D intervalForCompareWithWiggle = new SimpleInterval1D(intervalForCompare.left - wiggle, intervalForCompare.right + wiggle);
+                Iterable<ValueInterval1D<Variant>> overlaps = trueVariantIntervalTree.getOverlaps(chr, intervalForCompareWithWiggle);
+
+                if (overlaps == null) {
+                    // nothing found
+                    return maxTrueVarianLength;
+                }
+
+                for (ValueInterval1D<Variant> trueVariantInterval : overlaps) {
+                    Variant trueVariant = trueVariantInterval.getContent();
+                    int splitVariantIndex = trueVariant.splitVariantIndex;
+                    int wholeVariantIndex = trueVariant.wholeVariantIndex;
+                    overlapComplex = trueVariant.originalType == VariantOverallType.Complex;
+
+                    if (validated.get(splitVariantIndex)) {
+                        // skip ones already validated
+                        continue;
+                    }
+
+                    //all genotypes in the overlapping true variant will be checked
+                    for (int parent = 0; parent < 2; parent++) {
+                        int allele = trueVariant.getAllele(parent);
+
+                        if (allele == 0 || // reference allele
+                                type != trueVariant.getType(allele) ||
+                                variant.getAlt(allele) == null ||
+                                trueVariant.getAlt(allele) == null) { // need type to be the same
+                            continue;
+                        }
+
+                        Alt.Breakend currentBreakend = variant.getAlt(allele).getBreakend();
+                        Alt.Breakend trueBreakend = trueVariant.getAlt(allele).getBreakend();
+                        boolean overlap = trueVariant.getVariantInterval(allele, ignoreInsertionLength).
+                                intersects(intervalForCompare, overlapRatio, wiggle);
+                        boolean breakendMatch =  trueVariant.getType(allele) == VariantType.Breakend &&
+                                Alt.Breakend.looseEquals(currentBreakend, trueBreakend, overlapRatio, wiggle);
+                        boolean matched = trueVariant.getType(allele) == VariantType.Breakend ? breakendMatch : overlap;
+
+                        if (matched) {
+                            if (trueVariant.isHom()) {
+                                homozygousMatches.add(new DualIdx(splitVariantIndex, wholeVariantIndex));
+                            } else {
+                                heterozygousMatches.get(parent).add(new DualIdx(splitVariantIndex, wholeVariantIndex));
+                            }
+                            maxTrueVarianLength = Math.max(trueVariant.maxLen(allele), maxTrueVarianLength);
+                        }
+                        //for homozygous variants, we only check the first allele
+                        if (trueVariant.isHom())
+                            break;
+                    }
+                }
+
+            }
+
+            return maxTrueVarianLength;
+        }
+    }
+    class VariantPair implements Comparable<VariantPair> {
+        @Override
+        public int compareTo(VariantPair o) {
+            return 0;
+        }
+    }
+
 }

From 4947c22e2e1b913f5008990325745176777adc0d Mon Sep 17 00:00:00 2001
From: Yunfei Guo <yunfei.guo@bina.roche.com>
Date: Sun, 12 Feb 2017 10:37:07 -0800
Subject: [PATCH 2/3] backup

---
 .../varsim/tools/evaluation/VCFcompare.java   | 186 +++++-------------
 1 file changed, 53 insertions(+), 133 deletions(-)

diff --git a/src/main/java/com/bina/varsim/tools/evaluation/VCFcompare.java b/src/main/java/com/bina/varsim/tools/evaluation/VCFcompare.java
index ded29b47..86afb92b 100644
--- a/src/main/java/com/bina/varsim/tools/evaluation/VCFcompare.java
+++ b/src/main/java/com/bina/varsim/tools/evaluation/VCFcompare.java
@@ -23,8 +23,11 @@
 import org.apache.commons.io.FileUtils;
 import org.apache.log4j.Logger;
 import org.jgrapht.Graph;
+import org.jgrapht.alg.interfaces.MatchingAlgorithm;
+import org.jgrapht.alg.matching.MaximumWeightBipartiteMatching;
 import org.jgrapht.graph.SimpleWeightedGraph;
 import org.jgrapht.graph.builder.UndirectedWeightedGraphBuilderBase;
+import org.jgrapht.alg.interfaces.MatchingAlgorithm.Matching;
 import org.kohsuke.args4j.Argument;
 import org.kohsuke.args4j.Option;
 
@@ -1367,146 +1370,39 @@ private void globalMatching() {
                         log.info("skip line");
                         continue;
                     }
-
-                    boolean skipFP = false;
-
-                    if (variant.getTraid() != null) {
-                        String currentTraid = variant.getTraid();
-                        if (traid2composingVariants.containsKey(currentTraid)) {
-                            traid2composingVariants.get(currentTraid).add(variant);
-                            variant = new Variant.Builder().compositions(traid2composingVariants.get(currentTraid)).build();
-                            traid2composingVariants.remove(currentTraid);
-                            if (intersector != null) {
-                                for (Variant c : variant.getCompositions()) {
-                                    if (!excludeFdfFromBedFiltering && !intersector.containsEndpoints(c.getChr(), c.getGenotypeUnionAlternativeInterval(), bedEither)) {
-                                        skipFP = true;
-                                    }
-                                }
-                            }
-                        } else {
-                            traid2composingVariants.put(currentTraid, new ArrayList<Variant>());
-                            traid2composingVariants.get(currentTraid).add(variant);
-                            continue;
-                        }
-                    } else {
-                        if (intersector != null) {
-                            if (!excludeFdfFromBedFiltering && !intersector.containsEndpoints(variant.getChr(), variant.getGenotypeUnionAlternativeInterval(), bedEither)) {
-                                skipFP = true;
-                            }
-                        }
-                    }
-
-                    // the overall type of the called variant
-                    VariantOverallType currentVariantType = variant.getType();
-
                     // if called as complex variant convert to indel+snps
-                    List<Variant> canonicalVariantList = canonicalizeVariant(variant);
-
-                    int totalLength = canonicalVariantList.stream().mapToInt(v -> v.maxLen()).sum();
-                    double validatedLength = 0;
-                    int maxLength = canonicalVariantList.stream().mapToInt(v -> v.maxLen()).max().getAsInt();
-
-                    // split up variants that are basically one big variant and one small one
-                    //maxLength / totalLength >= overlapRatio, true if the longest canonicalized variant
-                    //is longer than certain proportion of sum of lengths of all canonicalized variants.
-                    boolean computeAsSplit = totalLength >= Constant.SVLEN && maxLength >= overlapRatio * totalLength  &&
-                            canonicalVariantList.size() > 1;
-
                   /*
                   here we need to figure out all possible pairings between current test variant
-                  and true variants, the pairings will be sorted later based on degree of matching
+                  and true variants, the pairings will be sorted later based on quality of matching
+
+                  we will examine all possible pairings regardless of TPR/FDR BED files
+                  because we consider global maximal matching as the ground truth
                    */
-                    for (Variant currentVariant : canonicalVariantList) {
+                    for (Variant currentVariant : canonicalizeVariant(variant)) {
                         canonicalizedTestVariants.add(currentVariant);
                         allCanonicalizedVariantPairs.addAll(multiResultComparator.findAllPairs(currentVariant));
                     }
-                            for (int i = 0; i < 2; i++) {
-                                byte allele = geno.geno[i];
-                                if (allele > 0) {
-                                    maxTrueLen = Math.max(resultComparator.compareVariant(currentVariant, allele, validatedTrue), maxTrueLen);
-                                }
-                            }
-
-                            final DualIdx dualIdx = matchGenotype ? resultComparator.firstHetMatch() : resultComparator.firstMatch();
-
-                            if (dualIdx.isSplitVariantValid()) {
-                                validatedTrue.set(dualIdx.splitVariantIndex);
-                                fullValidatedCount[dualIdx.wholeVariantIndex] += currentVariant.maxLen(); // this 'should' be overlap len
-                                validatedLength += currentVariant.maxLen();
-                            } else if (computeAsSplit) {
-                                if (!skipFP) {
-                                    outputBlob.getNumberOfTrueCorrect().incFP(currentVariant.getType(), currentVariant.maxLen());
-                                    validator.inc(StatsNamespace.FP, currentVariant.getType(), currentVariant.maxLen());
-                                    if (currentVariant.getType() == VariantOverallType.SNP && currentVariant.maxLen() > 1) {
-                                        log.warn("SNP with bad length: " + currentVariant);
-                                    }
-                                    variant.output(fpWriter);
-                                } else {
-                                    variant.output(unknownFpWriter);
-                                }
-                            }
-                        }
-                    }
-
-                    if (!computeAsSplit && validatedLength < (totalLength * overlapRatio)) {
-                        if (!skipFP) {
-                            // this is a false positive!
-                            outputBlob.getNumberOfTrueCorrect().incFP(currentVariantType, variant.maxLen());
-                            validator.inc(StatsNamespace.FP, currentVariantType, variant.maxLen());
-                            if (currentVariantType == VariantOverallType.SNP && variant.maxLen() > 1) {
-                                log.warn("SNP with bad length: " + variant);
-                            }
-                            variant.output(fpWriter);
-                        } else {
-                            variant.output(unknownFpWriter);
-                        }
-                    }
-
                     numberOfNewVariants++;
                 }
             }
-            /**********/
-        /**********/
-        /**********/
-        /**********/
-        /**********/
-        /**********/
-        /**********/
-        /**********/
-        /**********/
-        //perform global matching
-        /**********/
-        /**********/
-        /**********/
-        /**********/
-        /**********/
-        /**********/
-        /**********/
-        /**********/
-        /**********/
-        /**********/
-        /**********/
-        /**********/
-        /**********/
-        /**********/
-        /**********/
-        /**********/
-        /**********/
-        /**********/
-        /**********/
-        /**********/
-//            for (String currentVcfFile : newVcfFilename) {
-//                VCFparser newParser = new VCFparser(currentVcfFile, sampleName, excludeFiltered);
-//
-//                while (newParser.hasMoreInput()) {
-//                    Variant variant = newParser.parseLine();
-//
-//                    //TODO: wrap variant comparison into a method for easier reading
-//                    if (variant == null ||
-//                            (chrAcceptor != null && !chrAcceptor.contains(variant.getChr().getName()))) {
-//                        log.info("skip line");
-//                        continue;
-//                    }
+            /*
+            now we have all possible pairings of true variants and test variants
+            let's sort them and construct weighted bipartite graph
+             */
+            VanillaVariantPairComparator vanillaVariantPairComparator = new VanillaVariantPairComparator();
+            Collections.sort(allCanonicalizedVariantPairs, vanillaVariantPairComparator);
+            int ranking = 0;
+            for (VariantPair p : allCanonicalizedVariantPairs) {
+                weightedGraphBuilder.addEdge(p.getTrueVariant(), p.getTestVariant(), ranking);
+                ranking++;
+            }
+            MaximumWeightBipartiteMatching maximumWeightBipartiteMatching = new MaximumWeightBipartiteMatching(weightedGraphBuilder.build(),
+                    canonicalizedTrueVariants, canonicalizedTestVariants);
+            //perform global matching
+          Matching<VariantPair> matching = maximumWeightBipartiteMatching.computeMatching();
+            for (VariantPair p : matching.getEdges()) {
+
+            }
 //
 //                    boolean skipFP = false;
 //
@@ -2342,11 +2238,35 @@ public int compareVariant(Variant variant, int genotype, BitSet validated) {
             return maxTrueVarianLength;
         }
     }
-    class VariantPair implements Comparable<VariantPair> {
+
+    /**
+     * store a pair of matched variants
+     * can perform detailed matching (homo, hetero)
+     */
+    class VariantPair {
+        final Variant trueVariant;
+        final Variant testVariant;
+        public VariantPair(Variant trueVariant, Variant testVariant) {
+            this.trueVariant = trueVariant;
+            this.testVariant = testVariant;
+        }
+        public Variant getTrueVariant() {
+            return trueVariant;
+        }
+        public Variant getTestVariant() {
+            return testVariant;
+        }
+    }
+
+    /**
+     * determine how pairs of matched variants are sorted
+     * the better the match is, the ranking should be larger
+     * (not higher)
+     */
+    class VanillaVariantPairComparator implements Comparator<VariantPair> {
         @Override
-        public int compareTo(VariantPair o) {
+        public int compare(VariantPair o1, VariantPair o2) {
             return 0;
         }
     }
-
 }

From 078cd9a1f1be4ac5202efb8ae778108841c5d359 Mon Sep 17 00:00:00 2001
From: Yunfei Guo <yunfei.guo@bina.roche.com>
Date: Mon, 13 Feb 2017 12:45:16 -0800
Subject: [PATCH 3/3] backup

---
 .../varsim/tools/evaluation/VCFcompare.java   | 301 ++++++++++--------
 1 file changed, 163 insertions(+), 138 deletions(-)

diff --git a/src/main/java/com/bina/varsim/tools/evaluation/VCFcompare.java b/src/main/java/com/bina/varsim/tools/evaluation/VCFcompare.java
index 86afb92b..4fe050b2 100644
--- a/src/main/java/com/bina/varsim/tools/evaluation/VCFcompare.java
+++ b/src/main/java/com/bina/varsim/tools/evaluation/VCFcompare.java
@@ -1230,7 +1230,7 @@ private void globalMatching() {
 
 
         //track TRAID-linked variants
-        Map<String, List<Variant>> traid2composingVariants = new HashMap<>();
+        Map<String, List<Variant>> traid2composingTrueVariants = new HashMap<>();
 
         // this is for the original variants
         // it stores the total length of the original variant in bases
@@ -1238,8 +1238,10 @@ private void globalMatching() {
         ArrayList<Integer> validatedTotalLength = new ArrayList<>();
         ArrayList<Variant> trueVariantsForOutput = new ArrayList<>();
 
-        Set<Variant> canonicalizedTrueVariants = new HashSet<>();
-        Set<Variant> canonicalizedTestVariants = new HashSet<>();
+        List<Variant> uncanonicalizedTrueVariants = new ArrayList<>();
+        List<Variant> uncanonicalizedTestVariants = new ArrayList<>();
+        List<Variant> canonicalizedTrueVariants = new ArrayList<>();
+        List<Variant> canonicalizedTestVariants = new ArrayList<>();
         List<VariantPair> allCanonicalizedVariantPairs = new ArrayList<>();
         UndirectedWeightedGraphBuilderBase weightedGraphBuilder = SimpleWeightedGraph.builder(VariantPair.class);
         Graph<Variant, VariantPair> bipartiteTrueVsTestVariantGraph = null;
@@ -1266,13 +1268,16 @@ private void globalMatching() {
 
             if (trueVariant.getTraid() != null) {
                 String currentTraid = trueVariant.getTraid();
-                if (traid2composingVariants.containsKey(currentTraid)) {
-                    traid2composingVariants.get(currentTraid).add(trueVariant);
-                    trueVariant = new Variant.Builder().compositions(traid2composingVariants.get(currentTraid)).build();
-                    traid2composingVariants.remove(currentTraid);
+                if (traid2composingTrueVariants.containsKey(currentTraid)) {
+                    traid2composingTrueVariants.get(currentTraid).add(trueVariant);
+                    /*!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!*/
+                    /*note here the trueVariant has been replaced here*/
+                    /*!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!*/
+                    trueVariant = new Variant.Builder().compositions(traid2composingTrueVariants.get(currentTraid)).build();
+                    traid2composingTrueVariants.remove(currentTraid);
                 } else {
-                    traid2composingVariants.put(currentTraid, new ArrayList<Variant>());
-                    traid2composingVariants.get(currentTraid).add(trueVariant);
+                    traid2composingTrueVariants.put(currentTraid, new ArrayList<Variant>());
+                    traid2composingTrueVariants.get(currentTraid).add(trueVariant);
                     continue;
                 }
             }
@@ -1315,11 +1320,13 @@ private void globalMatching() {
                     int currentLength = currentVariant.maxLen();
                     validatedTotalLength.add(currentLength);
                     trueVariantsForOutput.add(currentVariant);
+                    uncanonicalizedTrueVariants.add(currentVariant);
                     numReadOriginalVariant++;
                 }
             } else {
                 validatedTotalLength.add(totalLength);
                 trueVariantsForOutput.add(trueVariant);
+                uncanonicalizedTrueVariants.add(trueVariant);
                 numReadOriginalVariant++;
             }
         }
@@ -1351,10 +1358,11 @@ private void globalMatching() {
             // However, do don't add to true positives, those that computed later
 
             log.info("Load New VCF");
-            int numberOfNewVariants = 0;
+            int numReadOriginalTestVariant = 0;
+            int numAddedSplitTestVariant = 0;
             // iterate over new VCF and collect stats
 
-            traid2composingVariants.clear();
+            Map<String, List<Variant>> traid2composingTestVariants = new HashMap<>();
             MultiResultComparator multiResultComparator = new MultiResultComparator(trueVariantIntervalTree, overlapRatio,
                     wiggle, ignoreInsertionLength, matchGenotype);
 
@@ -1370,6 +1378,18 @@ private void globalMatching() {
                         log.info("skip line");
                         continue;
                     }
+                    if (variant.getTraid() != null) {
+                        String currentTraid = variant.getTraid();
+                        if (traid2composingTestVariants.containsKey(currentTraid)) {
+                            traid2composingTestVariants.get(currentTraid).add(variant);
+                            variant = new Variant.Builder().compositions(traid2composingTestVariants.get(currentTraid)).build();
+                            traid2composingTestVariants.remove(currentTraid);
+                        } else {
+                            traid2composingTestVariants.put(currentTraid, new ArrayList<Variant>());
+                            traid2composingTestVariants.get(currentTraid).add(variant);
+                            continue;
+                        }
+                    }
                     // if called as complex variant convert to indel+snps
                   /*
                   here we need to figure out all possible pairings between current test variant
@@ -1379,10 +1399,15 @@ private void globalMatching() {
                   because we consider global maximal matching as the ground truth
                    */
                     for (Variant currentVariant : canonicalizeVariant(variant)) {
+                        currentVariant.splitVariantIndex = numAddedSplitTestVariant;
+                        currentVariant.wholeVariantIndex = numReadOriginalTestVariant;
+                        currentVariant.originalType = variant.getType();
+
                         canonicalizedTestVariants.add(currentVariant);
                         allCanonicalizedVariantPairs.addAll(multiResultComparator.findAllPairs(currentVariant));
                     }
-                    numberOfNewVariants++;
+                    uncanonicalizedTestVariants.add(variant);
+                    numReadOriginalTestVariant++;
                 }
             }
             /*
@@ -1401,134 +1426,134 @@ private void globalMatching() {
             //perform global matching
           Matching<VariantPair> matching = maximumWeightBipartiteMatching.computeMatching();
             for (VariantPair p : matching.getEdges()) {
+              Variant trueVariant = p.getTrueVariant();
+                Variant testVariant = p.getTestVariant();
+                boolean skipFP = false;
+
+                if (testVariant.getTraid() != null) {
+                    String currentTraid = testVariant.getTraid();
+                    if (traid2composingVariants.containsKey(currentTraid)) {
+                        traid2composingVariants.get(currentTraid).add(testVariant);
+                        variant = new Variant.Builder().compositions(traid2composingVariants.get(currentTraid)).build();
+                        traid2composingVariants.remove(currentTraid);
+                        if (intersector != null) {
+                            for (Variant c : variant.getCompositions()) {
+                                if (!excludeFdfFromBedFiltering && !intersector.containsEndpoints(c.getChr(), c.getGenotypeUnionAlternativeInterval(), bedEither)) {
+                                    skipFP = true;
+                                }
+                            }
+                        }
+                    } else {
+                        traid2composingVariants.put(currentTraid, new ArrayList<Variant>());
+                        traid2composingVariants.get(currentTraid).add(variant);
+                        continue;
+                    }
+                } else {
+                    if (intersector != null) {
+                        if (!excludeFdfFromBedFiltering && !intersector.containsEndpoints(variant.getChr(), variant.getGenotypeUnionAlternativeInterval(), bedEither)) {
+                            skipFP = true;
+                        }
+                    }
+                }
+
+                // the overall type of the called variant
+                VariantOverallType currentVariantType = variant.getType();
+
+                // if called as complex variant convert to indel+snps
+                List<Variant> canonicalVariantList = canonicalizeVariant(variant);
+
+                int totalLength = canonicalVariantList.stream().mapToInt(v -> v.maxLen()).sum();
+                double validatedLength = 0;
+                int maxLength = canonicalVariantList.stream().mapToInt(v -> v.maxLen()).max().getAsInt();
+
+                // split up variants that are basically one big variant and one small one
+                //maxLength / totalLength >= overlapRatio, true if the longest canonicalized variant
+                //is longer than certain proportion of sum of lengths of all canonicalized variants.
+                boolean computeAsSplit = totalLength >= Constant.SVLEN && maxLength >= overlapRatio * totalLength  &&
+                        canonicalVariantList.size() > 1;
+
+                for (Variant currentVariant : canonicalVariantList) {
+                    canonicalizedTestVariants.add(currentVariant);
+                    // get genotype
+                    Genotypes geno = currentVariant.getGenotypes();
+                    //note here ResultComparator is created for each canonical variant
+                    ResultComparator resultComparator = new ResultComparator(trueVariantIntervalTree, overlapRatio, wiggle, ignoreInsertionLength);
+
+                    if (currentVariant.isHom()) {
+                        int maxTrueLength = resultComparator.compareVariant(currentVariant, geno.geno[0], validatedTrue);
+                        final DualIdx dualIdx = matchGenotype ? resultComparator.isHomMatch() : resultComparator.firstMatch();
+                        if (dualIdx.isSplitVariantValid()) {
+                            // validated
+                            validatedTrue.set(dualIdx.splitVariantIndex);
+                            fullValidatedCount[dualIdx.wholeVariantIndex] += maxTrueLength;// this 'should' be overlap len
+                            validatedLength += currentVariant.maxLen();
+                        } else if (computeAsSplit) {
+                            if (!skipFP) {
+                                outputBlob.getNumberOfTrueCorrect().incFP(currentVariant.getType(), variant.maxLen());
+                                validator.inc(StatsNamespace.FP, currentVariant.getType(), variant.maxLen());
+                                variant.output(fpWriter);
+                            } else {
+                                variant.output(unknownFpWriter);
+                            }
+                        }
+
+                    } else {
+                        // het
+                        //boolean matched = false;
+                        int maxTrueLen = 0;
+                        /*
+                        for heterozygous variants, all genotypes will be checked
+                        inside compareVariant, all genotypes of overlapping true variants will be checked, too
+                        so overall all possible combinations of genotype matching will be checked
+                        */
+                        for (int i = 0; i < 2; i++) {
+                            byte allele = geno.geno[i];
+                            if (allele > 0) {
+                                maxTrueLen = Math.max(resultComparator.compareVariant(currentVariant, allele, validatedTrue), maxTrueLen);
+                            }
+                        }
 
+                        final DualIdx dualIdx = matchGenotype ? resultComparator.firstHetMatch() : resultComparator.firstMatch();
+
+                        if (dualIdx.isSplitVariantValid()) {
+                            validatedTrue.set(dualIdx.splitVariantIndex);
+                            fullValidatedCount[dualIdx.wholeVariantIndex] += currentVariant.maxLen(); // this 'should' be overlap len
+                            validatedLength += currentVariant.maxLen();
+                        } else if (computeAsSplit) {
+                            if (!skipFP) {
+                                outputBlob.getNumberOfTrueCorrect().incFP(currentVariant.getType(), currentVariant.maxLen());
+                                validator.inc(StatsNamespace.FP, currentVariant.getType(), currentVariant.maxLen());
+                                if (currentVariant.getType() == VariantOverallType.SNP && currentVariant.maxLen() > 1) {
+                                    log.warn("SNP with bad length: " + currentVariant);
+                                }
+                                variant.output(fpWriter);
+                            } else {
+                                variant.output(unknownFpWriter);
+                            }
+                        }
+                    }
+                }
+
+                if (!computeAsSplit && validatedLength < (totalLength * overlapRatio)) {
+                    if (!skipFP) {
+                        // this is a false positive!
+                        outputBlob.getNumberOfTrueCorrect().incFP(currentVariantType, variant.maxLen());
+                        validator.inc(StatsNamespace.FP, currentVariantType, variant.maxLen());
+                        if (currentVariantType == VariantOverallType.SNP && variant.maxLen() > 1) {
+                            log.warn("SNP with bad length: " + variant);
+                        }
+                        variant.output(fpWriter);
+                    } else {
+                        variant.output(unknownFpWriter);
+                    }
+                }
+
+                numberOfNewVariants++;
             }
-//
-//                    boolean skipFP = false;
-//
-//                    if (variant.getTraid() != null) {
-//                        String currentTraid = variant.getTraid();
-//                        if (traid2composingVariants.containsKey(currentTraid)) {
-//                            traid2composingVariants.get(currentTraid).add(variant);
-//                            variant = new Variant.Builder().compositions(traid2composingVariants.get(currentTraid)).build();
-//                            traid2composingVariants.remove(currentTraid);
-//                            if (intersector != null) {
-//                                for (Variant c : variant.getCompositions()) {
-//                                    if (!excludeFdfFromBedFiltering && !intersector.containsEndpoints(c.getChr(), c.getGenotypeUnionAlternativeInterval(), bedEither)) {
-//                                        skipFP = true;
-//                                    }
-//                                }
-//                            }
-//                        } else {
-//                            traid2composingVariants.put(currentTraid, new ArrayList<Variant>());
-//                            traid2composingVariants.get(currentTraid).add(variant);
-//                            continue;
-//                        }
-//                    } else {
-//                        if (intersector != null) {
-//                            if (!excludeFdfFromBedFiltering && !intersector.containsEndpoints(variant.getChr(), variant.getGenotypeUnionAlternativeInterval(), bedEither)) {
-//                                skipFP = true;
-//                            }
-//                        }
-//                    }
-//
-//                    // the overall type of the called variant
-//                    VariantOverallType currentVariantType = variant.getType();
-//
-//                    // if called as complex variant convert to indel+snps
-//                    List<Variant> canonicalVariantList = canonicalizeVariant(variant);
-//
-//                    int totalLength = canonicalVariantList.stream().mapToInt(v -> v.maxLen()).sum();
-//                    double validatedLength = 0;
-//                    int maxLength = canonicalVariantList.stream().mapToInt(v -> v.maxLen()).max().getAsInt();
-//
-//                    // split up variants that are basically one big variant and one small one
-//                    //maxLength / totalLength >= overlapRatio, true if the longest canonicalized variant
-//                    //is longer than certain proportion of sum of lengths of all canonicalized variants.
-//                    boolean computeAsSplit = totalLength >= Constant.SVLEN && maxLength >= overlapRatio * totalLength  &&
-//                            canonicalVariantList.size() > 1;
-//
-//                    for (Variant currentVariant : canonicalVariantList) {
-//                        canonicalizedTestVariants.add(currentVariant);
-//                    // get genotype
-//                    Genotypes geno = currentVariant.getGenotypes();
-//                    //note here ResultComparator is created for each canonical variant
-//                    ResultComparator resultComparator = new ResultComparator(trueVariantIntervalTree, overlapRatio, wiggle, ignoreInsertionLength);
-//
-//                    if (currentVariant.isHom()) {
-//                        int maxTrueLength = resultComparator.compareVariant(currentVariant, geno.geno[0], validatedTrue);
-//                        final DualIdx dualIdx = matchGenotype ? resultComparator.isHomMatch() : resultComparator.firstMatch();
-//                        if (dualIdx.isSplitVariantValid()) {
-//                            // validated
-//                            validatedTrue.set(dualIdx.splitVariantIndex);
-//                            fullValidatedCount[dualIdx.wholeVariantIndex] += maxTrueLength;// this 'should' be overlap len
-//                            validatedLength += currentVariant.maxLen();
-//                        } else if (computeAsSplit) {
-//                            if (!skipFP) {
-//                                outputBlob.getNumberOfTrueCorrect().incFP(currentVariant.getType(), variant.maxLen());
-//                                validator.inc(StatsNamespace.FP, currentVariant.getType(), variant.maxLen());
-//                                variant.output(fpWriter);
-//                            } else {
-//                                variant.output(unknownFpWriter);
-//                            }
-//                        }
-//
-//                    } else {
-//                        // het
-//                        //boolean matched = false;
-//                        int maxTrueLen = 0;
-//                        /*
-//                        for heterozygous variants, all genotypes will be checked
-//                        inside compareVariant, all genotypes of overlapping true variants will be checked, too
-//                        so overall all possible combinations of genotype matching will be checked
-//                        */
-//                        for (int i = 0; i < 2; i++) {
-//                            byte allele = geno.geno[i];
-//                            if (allele > 0) {
-//                                maxTrueLen = Math.max(resultComparator.compareVariant(currentVariant, allele, validatedTrue), maxTrueLen);
-//                            }
-//                        }
-//
-//                        final DualIdx dualIdx = matchGenotype ? resultComparator.firstHetMatch() : resultComparator.firstMatch();
-//
-//                        if (dualIdx.isSplitVariantValid()) {
-//                            validatedTrue.set(dualIdx.splitVariantIndex);
-//                            fullValidatedCount[dualIdx.wholeVariantIndex] += currentVariant.maxLen(); // this 'should' be overlap len
-//                            validatedLength += currentVariant.maxLen();
-//                        } else if (computeAsSplit) {
-//                            if (!skipFP) {
-//                                outputBlob.getNumberOfTrueCorrect().incFP(currentVariant.getType(), currentVariant.maxLen());
-//                                validator.inc(StatsNamespace.FP, currentVariant.getType(), currentVariant.maxLen());
-//                                if (currentVariant.getType() == VariantOverallType.SNP && currentVariant.maxLen() > 1) {
-//                                    log.warn("SNP with bad length: " + currentVariant);
-//                                }
-//                                variant.output(fpWriter);
-//                            } else {
-//                                variant.output(unknownFpWriter);
-//                            }
-//                        }
-//                    }
-//                }
-//
-//                if (!computeAsSplit && validatedLength < (totalLength * overlapRatio)) {
-//                    if (!skipFP) {
-//                        // this is a false positive!
-//                        outputBlob.getNumberOfTrueCorrect().incFP(currentVariantType, variant.maxLen());
-//                        validator.inc(StatsNamespace.FP, currentVariantType, variant.maxLen());
-//                        if (currentVariantType == VariantOverallType.SNP && variant.maxLen() > 1) {
-//                            log.warn("SNP with bad length: " + variant);
-//                        }
-//                        variant.output(fpWriter);
-//                    } else {
-//                        variant.output(unknownFpWriter);
-//                    }
-//                }
-//
-//                numberOfNewVariants++;
-//            }
-//        }
-
-            log.info("Num new variants read: " + numberOfNewVariants);
+        }
+            }
+
+            log.info("Num new variants read: " + numReadOriginalTestVariant);
 
             // read through again and compute for the true variants
             int numRead2 = 0;