
Commit

responded to review comments and added some tests
jamesemery committed Feb 4, 2022
1 parent b234ee3 commit 5bfa0d3
Showing 7 changed files with 139 additions and 53 deletions.
@@ -149,7 +149,6 @@ public static void finalizeRegion(final AssemblyRegion region,
readsToUse.sort(new ReadCoordinateComparator(readsHeader));
hardClippedReadsToUse.sort(new ReadCoordinateComparator(readsHeader));


// handle overlapping read pairs from the same fragment
if (correctOverlappingBaseQualities) {
cleanOverlappingReadPairs(readsToUse, samplesList, readsHeader, true, OptionalInt.empty(), OptionalInt.empty());
@@ -321,7 +320,7 @@ public static AssemblyResultSet assembleReads(final AssemblyRegion region,
}

if (!forcedPileupAlleles.isEmpty()) {
processPileupAlleles(region, forcedPileupAlleles, argumentCollection.pileupDetectionArgs.snpAdajacentToAssemblyIndel, argumentCollection.maxMnpDistance, aligner, refHaplotype, assemblyResultSet, argumentCollection.getHaplotypeToReferenceSWParameters());
processPileupAlleles(region, forcedPileupAlleles, argumentCollection.pileupDetectionArgs.snpAdajacentToAssemblyIndel, argumentCollection.maxMnpDistance, aligner, refHaplotype, assemblyResultSet, argumentCollection.pileupDetectionArgs.numHaplotypesToIterate, argumentCollection.pileupDetectionArgs.filteringKmerSize, argumentCollection.getHaplotypeToReferenceSWParameters());
}


@@ -347,7 +346,8 @@ public static AssemblyResultSet assembleReads(final AssemblyRegion region,
@VisibleForTesting
static void processPileupAlleles(final AssemblyRegion region, final List<VariantContext> givenAlleles, final int maxMnpDistance,
final int snpAdjacentToIndelLimit, final SmithWatermanAligner aligner, final Haplotype refHaplotype,
final AssemblyResultSet assemblyResultSet, final SWParameters haplotypeToReferenceSWParameters) {
final AssemblyResultSet assemblyResultSet, final int numHaplotypesPerIteration, final int hapFilteringKmerSize,
final SWParameters haplotypeToReferenceSWParameters) {
final int assemblyRegionStart = region.getPaddedSpan().getStart();
final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
final Map<Integer, VariantContext> assembledVariants = assemblyResultSet.getVariationEvents(maxMnpDistance).stream()
@@ -356,33 +356,23 @@ static void processPileupAlleles(final AssemblyRegion region, final List<Variant
.collect(Collectors.groupingBy(VariantContext::getStart, Collectors.collectingAndThen(Collectors.toList(), AssemblyBasedCallerUtils::makeMergedVariantContext))).values();

Set<Haplotype> baseHaplotypes = new TreeSet<>();
//Todo BG testing to see if limiting the initial list of haplotypes resolves the issue of variants not getting called in clustered regions
baseHaplotypes.addAll(assemblyResultSet.getHaplotypeList().stream()
.sorted(Comparator.comparingInt((Haplotype hap) -> hap.isReference() ? 1 : 0).thenComparingDouble(hap -> hap.getScore()).reversed())
.limit(NUM_HAPLOTYPES_TO_INJECT_FORCE_CALLING_ALLELES_INTO)
.collect(Collectors.toList()));

Map<Kmer, Integer> kmerReadCounts = getKmerReadCounts(region.getHardClippedPileupReads(), 10);
//TODO it's unclear whether the correct answer here is to use the hardclipped pileup reads (which we used in generating the pileup alleles for specificity reasons)
//TODO or if it would be more accurate to use the softclipped bases here in filtering down the haplotypes. I suspect the latter but I will evaluate later.
Map<Kmer, Integer> kmerReadCounts = getKmerReadCounts(region.getHardClippedPileupReads(), hapFilteringKmerSize);

// Remove SNPs that are too close to assembled indels.
final List<VariantContext> givenAllelesFiltered = givenAlleles.stream().filter(vc -> vc.isIndel() || assembledIndels.stream().noneMatch(indel -> vc.withinDistanceOf(indel, snpAdjacentToIndelLimit))).collect(Collectors.toList());

for (final VariantContext givenVC : givenAllelesFiltered) {
//TODO BG Question - What does the assembledVC do?
final VariantContext assembledVC = assembledVariants.get(givenVC.getStart());
final int givenVCRefLength = givenVC.getReference().length();
final Allele longerRef = (assembledVC == null || givenVCRefLength > assembledVC.getReference().length()) ? givenVC.getReference() : assembledVC.getReference();
final List<Allele> unassembledGivenAlleles;
if (assembledVC == null) {
unassembledGivenAlleles = givenVC.getAlternateAlleles();
} else {
// map all alleles to the longest common reference
final Set<Allele> assembledAlleleSet = new HashSet<>(longerRef.length() == assembledVC.getReference().length() ? assembledVC.getAlternateAlleles() :
ReferenceConfidenceVariantContextMerger.remapAlleles(assembledVC, longerRef));
final Set<Allele> givenAlleleSet = new HashSet<>(longerRef.length() == givenVCRefLength ? givenVC.getAlternateAlleles() :
ReferenceConfidenceVariantContextMerger.remapAlleles(givenVC, longerRef));
unassembledGivenAlleles = givenAlleleSet.stream().filter(a -> !assembledAlleleSet.contains(a)).collect(Collectors.toList());
}
final List<Allele> unassembledGivenAlleles = getAllelesNotPresentInAssembly(givenVC, assembledVC, givenVCRefLength, longerRef);

final List<Allele> unassembledNonSymbolicAlleles = unassembledGivenAlleles.stream().filter(a -> {
final byte[] bases = a.getBases();
@@ -401,7 +391,6 @@ static void processPileupAlleles(final AssemblyRegion region, final List<Variant
insertedHaplotype.setCigar(cigar);
insertedHaplotype.setGenomeLocation(refHaplotype.getGenomeLocation());
insertedHaplotype.setAlignmentStartHapwrtRef(activeRegionStart);
// insertedHaplotype.addVariantContext(givenVC);

// and add to our internal list so we get haplotypes that contain all given alleles
// do we want a flag to control this behavior
@@ -412,8 +401,7 @@ static void processPileupAlleles(final AssemblyRegion region, final List<Variant
}
}

// TODO: convert 5 into adv. argument
baseHaplotypes.addAll(filterPileupHaplotypes(newPileupHaplotypes, kmerReadCounts, 5));
baseHaplotypes.addAll(filterPileupHaplotypes(newPileupHaplotypes, kmerReadCounts, numHaplotypesPerIteration, hapFilteringKmerSize));

}
baseHaplotypes.forEach(assemblyResultSet::add);
@@ -432,17 +420,7 @@ static void addGivenAlleles(final int assemblyRegionStart, final List<VariantCon
final VariantContext assembledVC = assembledVariants.get(givenVC.getStart());
final int givenVCRefLength = givenVC.getReference().length();
final Allele longerRef = (assembledVC == null || givenVCRefLength > assembledVC.getReference().length()) ? givenVC.getReference() : assembledVC.getReference();
final List<Allele> unassembledGivenAlleles;
if (assembledVC == null) {
unassembledGivenAlleles = givenVC.getAlternateAlleles();
} else {
// map all alleles to the longest common reference
final Set<Allele> assembledAlleleSet = new HashSet<>(longerRef.length() == assembledVC.getReference().length() ? assembledVC.getAlternateAlleles() :
ReferenceConfidenceVariantContextMerger.remapAlleles(assembledVC, longerRef));
final Set<Allele> givenAlleleSet = new HashSet<>(longerRef.length() == givenVCRefLength ? givenVC.getAlternateAlleles() :
ReferenceConfidenceVariantContextMerger.remapAlleles(givenVC, longerRef));
unassembledGivenAlleles = givenAlleleSet.stream().filter(a -> !assembledAlleleSet.contains(a)).collect(Collectors.toList());
}
final List<Allele> unassembledGivenAlleles = getAllelesNotPresentInAssembly(givenVC, assembledVC, givenVCRefLength, longerRef);

final List<Allele> unassembledNonSymbolicAlleles = unassembledGivenAlleles.stream().filter(a -> {
final byte[] bases = a.getBases();
@@ -476,6 +454,20 @@ static void addGivenAlleles(final int assemblyRegionStart, final List<VariantCon
assemblyResultSet.regenerateVariationEvents(maxMnpDistance);
}

private static List<Allele> getAllelesNotPresentInAssembly(VariantContext givenVC, VariantContext assembledVC, int givenVCRefLength, Allele longerRef) {
List<Allele> unassembledGivenAlleles;
if (assembledVC == null) {
unassembledGivenAlleles = givenVC.getAlternateAlleles();
} else {
// map all alleles to the longest common reference
final Set<Allele> assembledAlleleSet = new HashSet<>(longerRef.length() == assembledVC.getReference().length() ? assembledVC.getAlternateAlleles() :
ReferenceConfidenceVariantContextMerger.remapAlleles(assembledVC, longerRef));
final Set<Allele> givenAlleleSet = new HashSet<>(longerRef.length() == givenVCRefLength ? givenVC.getAlternateAlleles() :
ReferenceConfidenceVariantContextMerger.remapAlleles(givenVC, longerRef));
unassembledGivenAlleles = givenAlleleSet.stream().filter(a -> !assembledAlleleSet.contains(a)).collect(Collectors.toList());
}
return unassembledGivenAlleles;
}
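As a rough, self-contained sketch of the set-difference idea behind getAllelesNotPresentInAssembly: the version below uses plain strings instead of htsjdk Allele objects and deliberately skips the reference-remapping step, so all names are illustrative only and not part of this diff.

import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

public class UnassembledAlleleSketch {
    // Hypothetical stand-in: alleles are plain strings and remapping to a common reference is omitted.
    static List<String> allelesNotPresentInAssembly(final List<String> givenAlts, final List<String> assembledAlts) {
        if (assembledAlts == null) {
            return givenAlts;                        // nothing was assembled at this site, keep every given allele
        }
        final Set<String> assembled = new HashSet<>(assembledAlts);
        return givenAlts.stream()
                .filter(a -> !assembled.contains(a)) // keep only alleles assembly did not already produce
                .collect(Collectors.toList());
    }

    public static void main(String[] args) {
        // Assembly already produced C at this site; pileup detection proposed C and G, so only G is "unassembled".
        System.out.println(allelesNotPresentInAssembly(List.of("C", "G"), List.of("C"))); // prints [G]
    }
}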

/**
* Returns a map of kmer -> count of total unique occurrences across all of the provided reads. This is a necessary step
@@ -492,56 +484,56 @@ static Map<Kmer, Integer> getKmerReadCounts(final List<GATKRead> hardClippedPil
return kmerReadCounts;
}
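A minimal sketch of the per-read-unique kmer counting described in the javadoc above, using plain strings in place of GATK's GATKRead and Kmer types; the exact counting semantics in GATK are assumed here, not verified.

import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class KmerReadCountSketch {
    // For each kmer, count how many reads contain it at least once (duplicates within a single read count once).
    static Map<String, Integer> kmerReadCounts(final List<String> reads, final int kmerSize) {
        final Map<String, Integer> counts = new HashMap<>();
        for (final String read : reads) {
            final Set<String> kmersInRead = new HashSet<>();
            for (int i = 0; i + kmerSize <= read.length(); i++) {
                kmersInRead.add(read.substring(i, i + kmerSize));       // de-duplicate within this read
            }
            kmersInRead.forEach(k -> counts.merge(k, 1, Integer::sum)); // +1 per read that contains the kmer
        }
        return counts;
    }

    public static void main(String[] args) {
        System.out.println(kmerReadCounts(List.of("ACGTACGT", "ACGTTTTT"), 4));
    }
}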

/**
* Method for filtering the combinatorial expansion of the number of extra artificial haplotypes that have been created down to
* a more tractable number. It works in the following way:
* - For each kmer in kmerReadCounts, find haplotypes that also contain that kmer and increment each found
* haplotype's score once per supported kmer.
* - Once this is done, select the top numPileupHaplotypes haplotypes from the list as the most probable.
*
* @return A set of artificial haplotypes limited to at most numPileupHaplotypes
*/
@VisibleForTesting
static Set<Haplotype> filterPileupHaplotypes(final List<Haplotype> onlyNewHaplotypes,
final Map<Kmer, Integer> kmerReadCounts,
final int numPileupHaplotypes) {
// TODO AH & BG add the following code to check for quality of the SNPs
final int numPileupHaplotypes,
final int kmerSize ) {
// // make sure we're supposed to look for high entropy
// if ( lookForMismatchEntropy &&
// pileup.getNumberOfElements() >= minReadsAtLocus &&
// (double)mismatchQualities / (double)totalQualities >= mismatchThreshold )
// hasPointEvent = true;

int kmerSize = 10;
// List<Haplotype> onlyNewHaplotypes = new ArrayList<>();
// onlyNewHaplotypes.addAll(assembledAndNewHaplotypes);
// onlyNewHaplotypes.removeAll(originalAssemblyHaplotypes);

// int numPileupHaplotypes = 10;
// AH & BG filter
// get reads from assemblyResultSet.regionForGenotyping.reads[x].samRecord mReadBases and mBaseQualities and kmerize them.
// Map<kmer, Integer> # times Kmer in all the reads
// for each kmer in reads - find hapotypes that contain it and
// filter haplotypes that don't have 10% coverage of read kmers
// check if finalizeRegion methods is already applied and it removes the softclipped bases
// Map<Kmer, Integer> kmerReadCounts = new HashMap<>();
// hardClippedPileupReads.forEach(read -> kmerizeAndCountOccurences(read.getBases(), kmerSize, kmerReadCounts));

// get haplotypes from assemblyResultSet and kmerize. for each haplotype create a set of kmers.
// for each haplotype, look up the kmers in the read-map and sum thee counts fo the haplotype score
// create a Map<Haplytope, Score>
LinkedHashMap<Haplotype, Integer> haplotypeScores = new LinkedHashMap<>();
for (Haplotype haplotype : onlyNewHaplotypes) {
// TODO this code might use some normalizing based on haplotype length in the future
Set<Kmer> hapKmers = kmerizeBytes(haplotype.getBases(), kmerSize);
int hapKmerCount = 0;
for(Kmer kmer : hapKmers) {
hapKmerCount += kmerReadCounts.containsKey(kmer) ? 1 : 0;
}

haplotypeScores.put(haplotype, hapKmerCount);
// reuse score to save kmer count for debugging
haplotype.setScore(hapKmerCount);
}

Map<Haplotype,Integer> sortedHaplotypeScores =
Map<Haplotype,Integer> topScoringHaplotypes =
haplotypeScores.entrySet().stream()
.sorted(Collections.reverseOrder(Map.Entry.comparingByValue()))
.limit(numPileupHaplotypes)
.collect(Collectors.toMap(
Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e1, LinkedHashMap::new));

return sortedHaplotypeScores.keySet();
return topScoringHaplotypes.keySet();
}
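A minimal sketch of the scoring-and-selection step the javadoc above describes, again on plain strings rather than Haplotype/Kmer objects; ties between equally scored haplotypes fall back to encounter order here, which is an assumption rather than something this diff guarantees.

import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

public class PileupHaplotypeFilterSketch {
    // Score each candidate by how many of its distinct kmers appear in the read kmer counts, then keep the top N.
    static Set<String> filterByKmerSupport(final List<String> haplotypes, final Map<String, Integer> kmerReadCounts,
                                           final int numToKeep, final int kmerSize) {
        final Map<String, Integer> scores = new LinkedHashMap<>();
        for (final String hap : haplotypes) {
            final Set<String> hapKmers = new HashSet<>();
            for (int i = 0; i + kmerSize <= hap.length(); i++) {
                hapKmers.add(hap.substring(i, i + kmerSize));
            }
            // +1 for every distinct haplotype kmer that is supported by at least one read
            final int supported = (int) hapKmers.stream().filter(kmerReadCounts::containsKey).count();
            scores.put(hap, supported);
        }
        return scores.entrySet().stream()
                .sorted(Collections.reverseOrder(Map.Entry.comparingByValue()))
                .limit(numToKeep)
                .map(Map.Entry::getKey)
                .collect(Collectors.toCollection(LinkedHashSet::new));
    }
}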

/** A utility method that increments a counter-map
@@ -771,7 +771,7 @@ private List<VariantContext> referenceModelForNoVariation(final AssemblyRegion r
//TODO - why can't the activeRegion manage its own one-time finalization and filtering?
//TODO - perhaps we can remove the last parameter of this method and the three lines below?
if ( needsToBeFinalized ) {
AssemblyBasedCallerUtils.finalizeRegion(region, hcArgs.assemblerArgs.errorCorrectReads, hcArgs.dontUseSoftClippedBases, minTailQuality, readsHeader, samplesList, ! hcArgs.doNotCorrectOverlappingBaseQualities, hcArgs.softClipLowQualityEnds, true);
AssemblyBasedCallerUtils.finalizeRegion(region, hcArgs.assemblerArgs.errorCorrectReads, hcArgs.dontUseSoftClippedBases, minTailQuality, readsHeader, samplesList, ! hcArgs.doNotCorrectOverlappingBaseQualities, hcArgs.softClipLowQualityEnds, hcArgs.pileupDetectionArgs.usePileupDetection);
}
filterNonPassingReads(region);

@@ -48,6 +48,18 @@ public final class PileupDetectionArgumentCollection {
@Argument(fullName= PILEUP_DETECTION_ENABLE_INDELS, doc = "Pileup Detection: If enabled, pileup detection code will attempt to detect indels missing from assembly. (Requires `--pileup-detection` argument)", optional = true)
public boolean detectIndels = false;

@Advanced
@Hidden
@Argument(fullName= "num-artificial-haplotypes-to-add-per-allele", doc = "Pileup Detection: This argument limits the maximum number of novel haplotypes to be added to the assembly haplotypes per pileup allele added", optional = true)
public int numHaplotypesToIterate = 5;
@Advanced
@Hidden
@Argument(fullName= "artifical-haplotype-filtering-kmer-size", doc = "Pileup Detection: Controls what size to kmerize reads to in order to select best supported artificial haplotypes", optional = true)
public int filteringKmerSize = 10;
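For illustration only (not part of this diff): since both new arguments are marked @Advanced and @Hidden, an invocation exercising them might look roughly like the command below, assuming the existing --pileup-detection flag (referenced in the doc strings above) is what enables the feature; the flag spellings are copied verbatim from the declarations above.

gatk HaplotypeCaller \
    -R reference.fasta \
    -I sample.bam \
    -O output.vcf.gz \
    --pileup-detection \
    --num-artificial-haplotypes-to-add-per-allele 5 \
    --artifical-haplotype-filtering-kmer-size 10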





/**
* Percentage of reads required to support the alt for a variant to be considered
@@ -138,7 +138,6 @@ public AssemblyResultSet runLocalAssembly(final AssemblyRegion assemblyRegion,
final SmithWatermanAligner aligner,
final SWParameters danglingEndSWParameters,
final SWParameters haplotypeToReferenceSWParameters) {
//TODO BG & AH assemblyRegions now has separate reads for pileup - deal with it!!!
Utils.nonNull(assemblyRegion, "Assembly engine cannot be used with a null AssemblyRegion.");
Utils.nonNull(assemblyRegion.getPaddedSpan(), "Active region must have an extended location.");
Utils.nonNull(refHaplotype, "Reference haplotype cannot be null.");
@@ -189,9 +189,3 @@ private static void incrementAltCount(byte base, Map<Byte, Integer> altCounts){
altCounts.getOrDefault(base,0) + 1);
}
}

/* Questions: how to get cigar info for each base?
How are insertions and deletions represented in Alignment context pileup.getbases
Args to test - error-correction-log-odds; this is not turned on by default; may help with precision
TODO: include Indels
* */