Skip to content

Commit

Permalink
Better HG19/B37 handling, and caching speed fixes for Funcotator. (#4927)
Browse files Browse the repository at this point in the history

Now Funcotator assumes all data sources for the HG19 reference are compliant with HG19 contig names, and translates B37 contig names to their HG19 equivalents as needed. This fixes a major performance issue with HG19/B37 inputs where we were systematically getting cache misses when querying the datasources with the wrong contig names.

Released new version of datasources to go with this release (1.4.20180615). This was necessary because the data sources needed to be made consistent with HG19 (before they were a mix of HG19 and B37 contig names).

Fixes #4586
  • Loading branch information
jonn-smith authored and droazen committed Jun 29, 2018
1 parent cd1b453 commit ee81df1
Show file tree
Hide file tree
Showing 26 changed files with 720 additions and 384 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ sqlite3 ${OUT_DB_FILE} <<EOF
.import ${COSMIC_FILE} RawCosmic
CREATE TABLE Cosmic AS SELECT * FROM RawCosmic WHERE ("Mutation AA" != "" OR "Mutation genome position" != "");
DROP TABLE RawCosmic;
UPDATE Cosmic SET "Mutation genome position" = "chr"||"Mutation genome position" WHERE "Mutation genome position" != "";
CREATE INDEX GeneIndex ON Cosmic("Gene name");
VACUUM;
EOF
Expand Down
94 changes: 60 additions & 34 deletions scripts/funcotator/data_sources/getDbSNP.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ function usage()
echo -e " 3 UNKNOWN ARGUMENT"
echo -e " 4 BAD CHECKSUM"
echo -e " 5 OUTPUT DIRECTORY ALREADY EXISTS"
echo -e " 6 COULD NOT FIND BGZIP UTILITY"
echo -e ""
}

Expand Down Expand Up @@ -160,42 +161,48 @@ function downloadAndVerifyVcfFiles() {
echo "${indentSpace}Retrieving MD5 sum file: ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${md5File} ... "
wget ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${md5File}

# Get the VCF file, then make sure that the contig names are correct for HG19 (if applicable)
echo "${indentSpace}Retrieving VCF file: ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${vcfFile} ... "
wget ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${vcfFile}

echo "${indentSpace}Retrieving VCF Index file: ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${tbiFile} ... "
wget ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${tbiFile}

echo "${indentSpace}Verifying VCF checksum ..."
if [[ "$(uname)" == "Darwin" ]] ; then
which md5sum-lite &> /dev/null
r=$?
if [ $r == 0 ] ; then
checksum=$( md5sum-lite ${vcfFile} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' )
expected=$( head -n1 ${md5File} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' )

if [[ "${checksum}" != "${expected}" ]] ; then
error "DOWNLOADED FILE IS CORRUPT! (${checksum} != ${expected})"
error "FAILING"
exit 4
fi
else
error "Unable to validate md5sum of file: cannot locate 'md5sum-lite'. Use these data with caution."
fi
if [[ "${filePrefix}" == "hg19" ]] ; then
curl ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${vcfFile} | gunzip | sed -e 's#^\([0-9][0-9]*\)#chr\1#' -e 's#^MT#chrM#' -e 's#^X#chrX#' -e 's#^Y#chrY#' | bgzip > ${vcfFile}
else
which md5sum &> /dev/null
r=$?
if [ $r == 0 ] ; then
checksum=$( md5sum ${vcfFile} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' )
expected=$( head -n1 ${md5File} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' )

if [[ "${checksum}" != "${expected}" ]] ; then
error "DOWNLOADED FILE IS CORRUPT! (${checksum} != ${expected})"
error "FAILING"
exit 4
wget ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${vcfFile}

echo "${indentSpace}Retrieving VCF Index file: ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${tbiFile} ... "
wget ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${tbiFile}

# We can only verify the checksum with hg38 because we modify the hg19 file as we stream it in:
echo "${indentSpace}Verifying VCF checksum ..."
if [[ "$(uname)" == "Darwin" ]] ; then
which md5sum-lite &> /dev/null
r=$?
if [ $r == 0 ] ; then
checksum=$( md5sum-lite ${vcfFile} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' )
expected=$( head -n1 ${md5File} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' )

if [[ "${checksum}" != "${expected}" ]] ; then
error "DOWNLOADED FILE IS CORRUPT! (${checksum} != ${expected})"
error "FAILING"
exit 4
fi
else
error "Unable to validate md5sum of file: cannot locate 'md5sum-lite'. Use these data with caution."
fi
else
error "Unable to validate md5sum of file: cannot locate 'md5sum'. Use these data with caution."
which md5sum &> /dev/null
r=$?
if [ $r == 0 ] ; then
checksum=$( md5sum ${vcfFile} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' )
expected=$( head -n1 ${md5File} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' )

if [[ "${checksum}" != "${expected}" ]] ; then
error "DOWNLOADED FILE IS CORRUPT! (${checksum} != ${expected})"
error "FAILING"
exit 4
fi
else
error "Unable to validate md5sum of file: cannot locate 'md5sum'. Use these data with caution."
fi
fi
fi

Expand All @@ -205,8 +212,10 @@ function downloadAndVerifyVcfFiles() {

echo "${indentSpace}Moving files to output directory ..."
mv ${vcfFile} ${outputFolder}/${filePrefix}_${vcfFile}
mv ${tbiFile} ${outputFolder}/${filePrefix}_${tbiFile}
rm ${md5File}
if [[ ! "${filePrefix}" == "hg19" ]] ; then
mv ${tbiFile} ${outputFolder}/${filePrefix}_${tbiFile}
rm ${md5File}
fi

echo "${indentSpace}Creating Config File ... "
createConfigFile "${DATA_SOURCE_NAME}" "${version}" ${filePrefix}_${vcfFile} "ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${vcfFile}" > ${outputFolder}/${DATA_SOURCE_NAME}.config
Expand Down Expand Up @@ -258,6 +267,14 @@ if [[ -d ${OUT_DIR_NAME} ]] ; then
exit 5
fi

# Ensure the bgzip utility is available before downloading anything — the hg19
# dbSNP stream is re-compressed with bgzip, so the whole run is useless without it:
if ! which bgzip > /dev/null ; then
    error "bgzip utility not found on path. You must have bgzip installed to get the dbSNP resource. Please install bgzip and try again. - aborting!"
    exit 6
fi

echo "Querying NCBI listing..."
tmpListing="$( makeTemp )"
curl ftp://ftp.ncbi.nih.gov/snp/organisms/ 2>/dev/null > ${tmpListing}
Expand All @@ -275,4 +292,13 @@ hg38Version=$( cat ${tmpListing} | awk '{print $9}' | grep 'human' | grep 'GRCh3

downloadAndVerifyVcfFiles ${hg38Version} ${OUT_DIR_NAME}/hg38 "hg38"

# Warn the user: the hg19 VCF is rewritten while streaming (B37 -> HG19 contig
# renaming), so the index downloaded from NCBI no longer matches it and the file
# MUST be re-indexed before use. (Fixed: the message said <GTF_FILE>, but the
# dbSNP data source files are VCFs, not GTFs.)
echo -e "\033[1;33;40m##################################################################################\033[0;0m"
echo -e "\033[1;33;40m# #\033[0;0m"
echo -e "\033[1;33;40m# \033[1;5;37;41mWARNING\033[0;0m: You \033[4;37;40mMUST\033[0;0m index the VCF files for \033[4;37;40mHG19\033[0;0m #\033[0;0m"
echo -e "\033[1;33;40m# before using this data source. #\033[0;0m"
echo -e "\033[1;33;40m# #\033[0;0m"
echo -e "\033[1;33;40m# Use gatk IndexFeatureFile <VCF_FILE> #\033[0;0m"
echo -e "\033[1;33;40m# #\033[0;0m"
echo -e "\033[1;33;40m##################################################################################\033[0;0m"

exit 0
1 change: 1 addition & 0 deletions scripts/funcotator/data_sources/sortClinvarHgmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ def row_comparator(x,y):
print 'Writing dataset ...',
FLUSH()
for row in clinvarHgmd:
row[2] = "chr" + row[2]
out_tsv_writer.writerow(row)
print 'DONE!'
FLUSH()
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.io.IOUtils;

import java.io.File;
import java.util.Iterator;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package org.broadinstitute.hellbender.engine;

import htsjdk.samtools.util.Locatable;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineException;
Expand All @@ -12,13 +11,10 @@
import org.broadinstitute.hellbender.utils.IGVUtils;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.activityprofile.ActivityProfileState;
import org.broadinstitute.hellbender.utils.downsampling.PositionalDownsampler;
import org.broadinstitute.hellbender.utils.downsampling.ReadsDownsampler;
import org.broadinstitute.hellbender.utils.read.GATKRead;

import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
Expand Down Expand Up @@ -147,7 +143,7 @@ public abstract class AssemblyRegionWalker extends GATKTool {

@Override
public String getProgressMeterRecordLabel() { return "regions"; }

private List<MultiIntervalLocalReadShard> readShards;

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -223,11 +223,23 @@ public List<CACHED_FEATURE> getCachedFeaturesUpToStopPosition( final int stopPos
}

/**
* Print statistics about the cache hit rate for debugging
* Print statistics about the cache hit rate for debugging.
*/
public void printCacheStatistics() {
// Delegate to the labeled overload with an empty source name (no label in the log line).
printCacheStatistics("");
}

/**
* Print statistics about the cache hit rate for debugging.
* @param sourceName The source for the features in this cache.
*/
public void printCacheStatistics(final String sourceName) {

final String sourceNameString = sourceName.isEmpty() ? "" : "for data source " + sourceName;

final int totalQueries = getNumCacheHits() + getNumCacheMisses();
logger.debug(String.format("Cache hit rate was %.2f%% (%d out of %d total queries)",
logger.debug(String.format("Cache hit rate %s was %.2f%% (%d out of %d total queries)",
sourceNameString,
totalQueries > 0 ? ((double)getNumCacheHits() / totalQueries) * 100.0 : 0.0,
getNumCacheHits(),
totalQueries));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import org.broadinstitute.hellbender.utils.Utils;

import java.io.File;
import java.nio.file.Path;

/**
* A FeatureWalker is a tool that processes a {@link Feature} at a time from a source of Features, with
Expand Down
14 changes: 8 additions & 6 deletions src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,6 @@
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFSimpleHeaderLine;
import java.io.File;
import java.nio.file.Path;
import java.time.ZonedDateTime;
import java.util.*;
import java.util.stream.Stream;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLinePluginDescriptor;
Expand Down Expand Up @@ -43,6 +38,12 @@
import org.broadinstitute.hellbender.utils.reference.ReferenceUtils;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;

import java.io.File;
import java.nio.file.Path;
import java.time.ZonedDateTime;
import java.util.*;
import java.util.stream.Stream;

/**
* Base class for all GATK tools. Tool authors that wish to write a "GATK" tool but not use one of
* the pre-packaged Walker traversals should feel free to extend this class directly. All other
Expand All @@ -69,7 +70,7 @@ public abstract class GATKTool extends CommandLineProgram {
private double secondsBetweenProgressUpdates = ProgressMeter.DEFAULT_SECONDS_BETWEEN_UPDATES;

@ArgumentCollection
private SequenceDictionaryValidationArgumentCollection seqValidationArguments = getSequenceDictionaryValidationArgumentCollection();
protected SequenceDictionaryValidationArgumentCollection seqValidationArguments = getSequenceDictionaryValidationArgumentCollection();

@Argument(fullName=StandardArgumentDefinitions.CREATE_OUTPUT_BAM_INDEX_LONG_NAME,
shortName=StandardArgumentDefinitions.CREATE_OUTPUT_BAM_INDEX_SHORT_NAME,
Expand Down Expand Up @@ -150,6 +151,7 @@ public abstract class GATKTool extends CommandLineProgram {
FeatureManager features;

/**
*
* Intervals to be used for traversal (null if no intervals were provided).
*
* Walker base classes (ReadWalker, etc.) are responsible for hooking these intervals up to
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.utils.SimpleInterval;

import java.nio.file.Path;

/**
* An IntervalWalker is a tool that processes a single interval at a time, with the ability to query
* optional overlapping sources of reads, reference data, and/or variants/features.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,16 @@
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.locusiterator.AlignmentContextIteratorBuilder;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.locusiterator.AlignmentContextIteratorBuilder;
import org.broadinstitute.hellbender.utils.locusiterator.LIBSDownsamplingInfo;
import org.broadinstitute.hellbender.utils.locusiterator.LocusIteratorByState;
import org.broadinstitute.hellbender.utils.read.GATKRead;

import java.util.*;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -1,16 +1,11 @@
package org.broadinstitute.hellbender.engine;

import org.broadinstitute.barclay.argparser.CommandLinePluginDescriptor;
import org.broadinstitute.hellbender.cmdline.GATKPlugin.GATKReadFilterPluginDescriptor;
import org.broadinstitute.hellbender.engine.filters.CountingReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter;
import org.broadinstitute.hellbender.transformers.ReadTransformer;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.engine.filters.CountingReadFilter;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.read.GATKRead;

import java.nio.file.Path;
import java.util.Collections;
import java.util.List;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ public final class ReferenceContext implements Iterable<Byte> {
* empty arrays/iterators in response to queries.
*/
public ReferenceContext() {
// Delegate to the windowed constructor with a null data source/interval and a
// zero-width window. (The diff rendering had merged the removed two-arg
// delegation with this added four-arg one, leaving two `this(...)` calls,
// which is invalid Java — only the post-commit call is kept.)
this(null, null, 0, 0);
}

/**
Expand Down Expand Up @@ -92,6 +92,26 @@ public ReferenceContext( final ReferenceDataSource dataSource, final SimpleInter
setWindow(windowLeadingBases, windowTrailingBases);
}

/**
 * Create a windowed ReferenceContext set up to lazily query the provided interval.
 *
 * The window (leading/trailing bases) is preserved from {@code thatContext};
 * the data source is shared with it and no sequence is cached up front.
 * NOTE(review): {@code thatContext} is dereferenced unconditionally, so it must be non-null.
 *
 * @param thatContext An existing {@link ReferenceContext} on which to base this new one.
 * @param interval our location on the reference (may be null if our location is unknown)
 */
public ReferenceContext( final ReferenceContext thatContext, final SimpleInterval interval ) {
this.dataSource = thatContext.dataSource;
this.cachedSequence = null;
this.interval = interval;

// Determine the window:
final int windowLeadingBases = thatContext.numWindowLeadingBases();
final int windowTrailingBases = thatContext.numWindowTrailingBases();

setWindow(windowLeadingBases, windowTrailingBases);
}

/**
* Create a windowed ReferenceContext set up to lazily query the provided interval,
* expanded by the specified number of bases in each direction.
Expand Down
Loading

0 comments on commit ee81df1

Please sign in to comment.