diff --git a/scripts/funcotator/data_sources/cosmic/createSqliteCosmicDb.sh b/scripts/funcotator/data_sources/cosmic/createSqliteCosmicDb.sh index f077ec9ad2c..800242de91a 100755 --- a/scripts/funcotator/data_sources/cosmic/createSqliteCosmicDb.sh +++ b/scripts/funcotator/data_sources/cosmic/createSqliteCosmicDb.sh @@ -44,6 +44,7 @@ sqlite3 ${OUT_DB_FILE} < /dev/null - r=$? - if [ $r == 0 ] ; then - checksum=$( md5sum-lite ${vcfFile} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' ) - expected=$( head -n1 ${md5File} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' ) - - if [[ "${checksum}" != "${expected}" ]] ; then - error "DOWNLOADED FILE IS CORRUPT! (${checksum} != ${expected})" - error "FAILING" - exit 4 - fi - else - error "Unable to validate md5sum of file: cannot locate 'md5sum-lite'. Use these data with caution." - fi + if [[ "${filePrefix}" == "hg19" ]] ; then + curl ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${vcfFile} | gunzip | sed -e 's#^\([0-9][0-9]*\)#chr\1#' -e 's#^MT#chrM#' -e 's#^X#chrX#' -e 's#^Y#chrY#' | bgzip > ${vcfFile} else - which md5sum &> /dev/null - r=$? - if [ $r == 0 ] ; then - checksum=$( md5sum ${vcfFile} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' ) - expected=$( head -n1 ${md5File} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' ) - - if [[ "${checksum}" != "${expected}" ]] ; then - error "DOWNLOADED FILE IS CORRUPT! (${checksum} != ${expected})" - error "FAILING" - exit 4 + wget ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${vcfFile} + + echo "${indentSpace}Retrieving VCF Index file: ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${tbiFile} ... " + wget ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${tbiFile} + + # We can only verify the checksum with hg38 because we modify the hg19 file as we stream it in: + echo "${indentSpace}Verifying VCF checksum ..." + if [[ "$(uname)" == "Darwin" ]] ; then + which md5sum-lite &> /dev/null + r=$? + if [ $r == 0 ] ; then + checksum=$( md5sum-lite ${vcfFile} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' ) + expected=$( head -n1 ${md5File} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' ) + + if [[ "${checksum}" != "${expected}" ]] ; then + error "DOWNLOADED FILE IS CORRUPT! (${checksum} != ${expected})" + error "FAILING" + exit 4 + fi + else + error "Unable to validate md5sum of file: cannot locate 'md5sum-lite'. Use these data with caution." fi else - error "Unable to validate md5sum of file: cannot locate 'md5sum'. Use these data with caution." + which md5sum &> /dev/null + r=$? + if [ $r == 0 ] ; then + checksum=$( md5sum ${vcfFile} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' ) + expected=$( head -n1 ${md5File} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' ) + + if [[ "${checksum}" != "${expected}" ]] ; then + error "DOWNLOADED FILE IS CORRUPT! (${checksum} != ${expected})" + error "FAILING" + exit 4 + fi + else + error "Unable to validate md5sum of file: cannot locate 'md5sum'. Use these data with caution." + fi fi fi @@ -205,8 +212,10 @@ function downloadAndVerifyVcfFiles() { echo "${indentSpace}Moving files to output directory ..." mv ${vcfFile} ${outputFolder}/${filePrefix}_${vcfFile} - mv ${tbiFile} ${outputFolder}/${filePrefix}_${tbiFile} - rm ${md5File} + if [[ ! "${filePrefix}" == "hg19" ]] ; then + mv ${tbiFile} ${outputFolder}/${filePrefix}_${tbiFile} + rm ${md5File} + fi echo "${indentSpace}Creating Config File ... " createConfigFile "${DATA_SOURCE_NAME}" "${version}" ${filePrefix}_${vcfFile} "ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${vcfFile}" > ${outputFolder}/${DATA_SOURCE_NAME}.config @@ -258,6 +267,14 @@ if [[ -d ${OUT_DIR_NAME} ]] ; then exit 5 fi +# Make sure bgzip exists: +which bgzip > /dev/null +r=$? +if [[ $r -ne 0 ]] ; then + error "bgzip utility not found on path. You must have bgzip installed to get the dbSNP resource. Please install bgzip and try again. - aborting!" + exit 6 +fi + echo "Querying NCBI listing..." tmpListing="$( makeTemp )" curl ftp://ftp.ncbi.nih.gov/snp/organisms/ 2>/dev/null > ${tmpListing} @@ -275,4 +292,13 @@ hg38Version=$( cat ${tmpListing} | awk '{print $9}' | grep 'human' | grep 'GRCh3 downloadAndVerifyVcfFiles ${hg38Version} ${OUT_DIR_NAME}/hg38 "hg38" +echo -e "\033[1;33;40m##################################################################################\033[0;0m" +echo -e "\033[1;33;40m# #\033[0;0m" +echo -e "\033[1;33;40m# \033[1;5;37;41mWARNING\033[0;0m: You \033[4;37;40mMUST\033[0;0m index the VCF files for \033[4;37;40mHG19\033[0;0m #\033[0;0m" +echo -e "\033[1;33;40m# before using this data source. #\033[0;0m" +echo -e "\033[1;33;40m# #\033[0;0m" +echo -e "\033[1;33;40m# Use gatk IndexFeatureFile #\033[0;0m" +echo -e "\033[1;33;40m# #\033[0;0m" +echo -e "\033[1;33;40m##################################################################################\033[0;0m" + exit 0 diff --git a/scripts/funcotator/data_sources/sortClinvarHgmd.py b/scripts/funcotator/data_sources/sortClinvarHgmd.py index 4057c661f6f..55b7650f5f4 100755 --- a/scripts/funcotator/data_sources/sortClinvarHgmd.py +++ b/scripts/funcotator/data_sources/sortClinvarHgmd.py @@ -85,6 +85,7 @@ def row_comparator(x,y): print 'Writing dataset ...', FLUSH() for row in clinvarHgmd: + row[2] = "chr" + row[2] out_tsv_writer.writerow(row) print 'DONE!' FLUSH() \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/engine/AbstractConcordanceWalker.java b/src/main/java/org/broadinstitute/hellbender/engine/AbstractConcordanceWalker.java index 82591007f84..e07275345a2 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/AbstractConcordanceWalker.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/AbstractConcordanceWalker.java @@ -14,7 +14,6 @@ import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.gcs.BucketUtils; -import org.broadinstitute.hellbender.utils.io.IOUtils; import java.io.File; import java.util.Iterator; diff --git a/src/main/java/org/broadinstitute/hellbender/engine/AssemblyRegionWalker.java b/src/main/java/org/broadinstitute/hellbender/engine/AssemblyRegionWalker.java index 246d348f3b5..91275a36aae 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/AssemblyRegionWalker.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/AssemblyRegionWalker.java @@ -1,6 +1,5 @@ package org.broadinstitute.hellbender.engine; -import htsjdk.samtools.util.Locatable; import org.broadinstitute.barclay.argparser.Advanced; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.CommandLineException; @@ -12,13 +11,10 @@ import org.broadinstitute.hellbender.utils.IGVUtils; import org.broadinstitute.hellbender.utils.IntervalUtils; import org.broadinstitute.hellbender.utils.SimpleInterval; -import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.activityprofile.ActivityProfileState; import org.broadinstitute.hellbender.utils.downsampling.PositionalDownsampler; import org.broadinstitute.hellbender.utils.downsampling.ReadsDownsampler; -import org.broadinstitute.hellbender.utils.read.GATKRead; -import java.io.File; import java.io.IOException; import java.io.PrintStream; import java.util.ArrayList; @@ -147,7 +143,7 @@ public abstract class AssemblyRegionWalker extends GATKTool { @Override public String getProgressMeterRecordLabel() { return "regions"; } - + private List readShards; /** diff --git a/src/main/java/org/broadinstitute/hellbender/engine/FeatureCache.java b/src/main/java/org/broadinstitute/hellbender/engine/FeatureCache.java index 74d351e372b..b33b35ecd0f 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/FeatureCache.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/FeatureCache.java @@ -223,11 +223,23 @@ public List getCachedFeaturesUpToStopPosition( final int stopPos } /** - * Print statistics about the cache hit rate for debugging + * Print statistics about the cache hit rate for debugging. */ public void printCacheStatistics() { + printCacheStatistics(""); + } + + /** + * Print statistics about the cache hit rate for debugging. + * @param sourceName The source for the features in this cache. + */ + public void printCacheStatistics(final String sourceName) { + + final String sourceNameString = sourceName.isEmpty() ? "" : "for data source " + sourceName; + final int totalQueries = getNumCacheHits() + getNumCacheMisses(); - logger.debug(String.format("Cache hit rate was %.2f%% (%d out of %d total queries)", + logger.debug(String.format("Cache hit rate %s was %.2f%% (%d out of %d total queries)", + sourceNameString, totalQueries > 0 ? ((double)getNumCacheHits() / totalQueries) * 100.0 : 0.0, getNumCacheHits(), totalQueries)); diff --git a/src/main/java/org/broadinstitute/hellbender/engine/FeatureWalker.java b/src/main/java/org/broadinstitute/hellbender/engine/FeatureWalker.java index 1cbf38637bf..45b0d73374c 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/FeatureWalker.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/FeatureWalker.java @@ -8,7 +8,6 @@ import org.broadinstitute.hellbender.utils.Utils; import java.io.File; -import java.nio.file.Path; /** * A FeatureWalker is a tool that processes a {@link Feature} at a time from a source of Features, with diff --git a/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java b/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java index 5bd675d2724..dcf75cefb7f 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java @@ -10,11 +10,6 @@ import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFSimpleHeaderLine; -import java.io.File; -import java.nio.file.Path; -import java.time.ZonedDateTime; -import java.util.*; -import java.util.stream.Stream; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.ArgumentCollection; import org.broadinstitute.barclay.argparser.CommandLinePluginDescriptor; @@ -43,6 +38,12 @@ import org.broadinstitute.hellbender.utils.reference.ReferenceUtils; import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils; +import java.io.File; +import java.nio.file.Path; +import java.time.ZonedDateTime; +import java.util.*; +import java.util.stream.Stream; + /** * Base class for all GATK tools. Tool authors that wish to write a "GATK" tool but not use one of * the pre-packaged Walker traversals should feel free to extend this class directly. All other @@ -69,7 +70,7 @@ public abstract class GATKTool extends CommandLineProgram { private double secondsBetweenProgressUpdates = ProgressMeter.DEFAULT_SECONDS_BETWEEN_UPDATES; @ArgumentCollection - private SequenceDictionaryValidationArgumentCollection seqValidationArguments = getSequenceDictionaryValidationArgumentCollection(); + protected SequenceDictionaryValidationArgumentCollection seqValidationArguments = getSequenceDictionaryValidationArgumentCollection(); @Argument(fullName=StandardArgumentDefinitions.CREATE_OUTPUT_BAM_INDEX_LONG_NAME, shortName=StandardArgumentDefinitions.CREATE_OUTPUT_BAM_INDEX_SHORT_NAME, @@ -150,6 +151,7 @@ public abstract class GATKTool extends CommandLineProgram { FeatureManager features; /** + * * Intervals to be used for traversal (null if no intervals were provided). * * Walker base classes (ReadWalker, etc.) are responsible for hooking these intervals up to diff --git a/src/main/java/org/broadinstitute/hellbender/engine/IntervalWalker.java b/src/main/java/org/broadinstitute/hellbender/engine/IntervalWalker.java index 20aef8a4919..f73105b5d76 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/IntervalWalker.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/IntervalWalker.java @@ -3,8 +3,6 @@ import org.broadinstitute.hellbender.engine.filters.ReadFilter; import org.broadinstitute.hellbender.utils.SimpleInterval; -import java.nio.file.Path; - /** * An IntervalWalker is a tool that processes a single interval at a time, with the ability to query * optional overlapping sources of reads, reference data, and/or variants/features. diff --git a/src/main/java/org/broadinstitute/hellbender/engine/LocusWalker.java b/src/main/java/org/broadinstitute/hellbender/engine/LocusWalker.java index b15520154c0..a0a56ecc6d6 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/LocusWalker.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/LocusWalker.java @@ -9,13 +9,16 @@ import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary; import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter; import org.broadinstitute.hellbender.exceptions.UserException; -import org.broadinstitute.hellbender.utils.locusiterator.AlignmentContextIteratorBuilder; import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.locusiterator.AlignmentContextIteratorBuilder; import org.broadinstitute.hellbender.utils.locusiterator.LIBSDownsamplingInfo; import org.broadinstitute.hellbender.utils.locusiterator.LocusIteratorByState; import org.broadinstitute.hellbender.utils.read.GATKRead; -import java.util.*; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Set; import java.util.stream.Collectors; /** diff --git a/src/main/java/org/broadinstitute/hellbender/engine/ReadWalker.java b/src/main/java/org/broadinstitute/hellbender/engine/ReadWalker.java index b9017e83863..2dbf7319588 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/ReadWalker.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/ReadWalker.java @@ -1,16 +1,11 @@ package org.broadinstitute.hellbender.engine; -import org.broadinstitute.barclay.argparser.CommandLinePluginDescriptor; -import org.broadinstitute.hellbender.cmdline.GATKPlugin.GATKReadFilterPluginDescriptor; +import org.broadinstitute.hellbender.engine.filters.CountingReadFilter; import org.broadinstitute.hellbender.engine.filters.ReadFilter; import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter; -import org.broadinstitute.hellbender.transformers.ReadTransformer; import org.broadinstitute.hellbender.utils.SimpleInterval; -import org.broadinstitute.hellbender.engine.filters.CountingReadFilter; -import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.read.GATKRead; -import java.nio.file.Path; import java.util.Collections; import java.util.List; diff --git a/src/main/java/org/broadinstitute/hellbender/engine/ReferenceContext.java b/src/main/java/org/broadinstitute/hellbender/engine/ReferenceContext.java index ba38f41242d..0482fdc2756 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/ReferenceContext.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/ReferenceContext.java @@ -60,7 +60,7 @@ public final class ReferenceContext implements Iterable { * empty arrays/iterators in response to queries. */ public ReferenceContext() { - this(null, null); + this(null, null, 0, 0); } /** @@ -92,6 +92,26 @@ public ReferenceContext( final ReferenceDataSource dataSource, final SimpleInter setWindow(windowLeadingBases, windowTrailingBases); } + /** + * Create a windowed ReferenceContext set up to lazily query the provided interval. + * + * Window is preserved from {@code thatReferenceContext}. + * + * @param thatContext An existing {@link ReferenceContext} on which to base this new one. + * @param interval our location on the reference (may be null if our location is unknown) + */ + public ReferenceContext( final ReferenceContext thatContext, final SimpleInterval interval ) { + this.dataSource = thatContext.dataSource; + this.cachedSequence = null; + this.interval = interval; + + // Determine the window: + final int windowLeadingBases = thatContext.numWindowLeadingBases(); + final int windowTrailingBases = thatContext.numWindowTrailingBases(); + + setWindow(windowLeadingBases, windowTrailingBases); + } + /** * Create a windowed ReferenceContext set up to lazily query the provided interval, * expanded by the specified number of bases in each direction. diff --git a/src/main/java/org/broadinstitute/hellbender/engine/VariantWalkerBase.java b/src/main/java/org/broadinstitute/hellbender/engine/VariantWalkerBase.java index 9fa43669a2d..b42e670a62e 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/VariantWalkerBase.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/VariantWalkerBase.java @@ -6,10 +6,12 @@ import org.broadinstitute.hellbender.engine.filters.CountingReadFilter; import org.broadinstitute.hellbender.engine.filters.VariantFilter; import org.broadinstitute.hellbender.engine.filters.VariantFilterLibrary; +import org.broadinstitute.hellbender.transformers.VariantTransformer; import org.broadinstitute.hellbender.utils.IndexUtils; import org.broadinstitute.hellbender.utils.SimpleInterval; import java.util.Spliterator; +import java.util.stream.Stream; import java.util.stream.StreamSupport; /** @@ -93,17 +95,57 @@ public final SAMSequenceDictionary getBestAvailableSequenceDictionary() { */ protected abstract Spliterator getSpliteratorForDrivingVariants(); + /** + * Returns the pre-filter variant transformer (simple or composite) that will be applied to the variants before filtering. + * The default implementation uses the {@link VariantTransformer#identity()}. + * Default implementation of {@link #traverse()} calls this method once before iterating over the variants and reuses + * the transformer object to avoid object allocation. + * + * Subclasses can extend to provide own transformers (i.e. override and call super). + * Multiple transformers can be composed by using {@link VariantTransformer} composition methods. + */ + public VariantTransformer makePreVariantFilterTransformer() { + return VariantTransformer.identity(); + } + + /** + * Returns the post-filter variant transformer (simple or composite) that will be applied to the variants after filtering. + * The default implementation uses the {@link VariantTransformer#identity()}. + * Default implementation of {@link #traverse()} calls this method once before iterating over the variants and reuses + * the transformer object to avoid object allocation. + * + * Subclasses can extend to provide own transformers (i.e. override and call super). + * Multiple transformers can be composed by using {@link VariantTransformer} composition methods. + */ + public VariantTransformer makePostVariantFilterTransformer(){ + return VariantTransformer.identity(); + } + + /** + * Returns a stream over the variants, which are: + * + * 1. Transformed with {@link #makePreVariantFilterTransformer()}. + * 2. Filtered with {@code filter}. + * 3. Transformed with {@link #makePostVariantFilterTransformer()}. + */ + protected Stream getTransformedVariantStream(final VariantFilter filter) { + final VariantTransformer preTransformer = makePreVariantFilterTransformer(); + final VariantTransformer postTransformer = makePostVariantFilterTransformer(); + return StreamSupport.stream(getSpliteratorForDrivingVariants(), false) + .map(preTransformer) + .filter(filter) + .map(postTransformer); + } + /** * Implementation of variant-based traversal. * Subclasses can override to provide their own behavior but default implementation should be suitable for most uses. */ @Override public void traverse() { - final VariantFilter variantfilter = makeVariantFilter(); final CountingReadFilter readFilter = makeReadFilter(); // Process each variant in the input stream. - StreamSupport.stream(getSpliteratorForDrivingVariants(), false) - .filter(variantfilter) + getTransformedVariantStream( makeVariantFilter() ) .forEach(variant -> { final SimpleInterval variantInterval = new SimpleInterval(variant); apply(variant, diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/Funcotator.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/Funcotator.java index a4697c3d9f1..cbdb712ed4c 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/Funcotator.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/Funcotator.java @@ -1,16 +1,17 @@ package org.broadinstitute.hellbender.tools.funcotator; +import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.tribble.Feature; import htsjdk.tribble.util.ParsingUtils; import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import org.broadinstitute.barclay.argparser.Argument; -import org.broadinstitute.barclay.argparser.BetaFeature; -import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.argparser.*; import org.broadinstitute.barclay.help.DocumentedFeature; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.engine.filters.VariantFilter; import org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.tools.funcotator.dataSources.DataSourceUtils; @@ -19,6 +20,8 @@ import org.broadinstitute.hellbender.tools.funcotator.metadata.FuncotationMetadata; import org.broadinstitute.hellbender.tools.funcotator.metadata.VcfFuncotationMetadata; import org.broadinstitute.hellbender.tools.funcotator.vcfOutput.VcfOutputRenderer; +import org.broadinstitute.hellbender.transformers.VariantTransformer; +import org.broadinstitute.hellbender.utils.SequenceDictionaryUtils; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.codecs.gencode.GencodeGtfFeature; import org.broadinstitute.hellbender.utils.codecs.xsvLocatableTable.XsvTableFeature; @@ -212,7 +215,7 @@ public class Funcotator extends VariantWalker { /** * The current version of {@link Funcotator}. */ - public static final String VERSION = "0.0.3"; + public static final String VERSION = "0.0.4"; //================================================================================================================== // Arguments: @@ -282,13 +285,6 @@ public class Funcotator extends VariantWalker { ) private List annotationOverrides = new ArrayList<>(); - @Argument( - fullName = FuncotatorArgumentDefinitions.ALLOW_HG19_GENCODE_B37_CONTIG_MATCHING_LONG_NAME, - optional = true, - doc = "Allow for the HG19 Reference version of GENCODE (or any other datasource) to match with B37 Contig names. (May create erroneous annotations in some contigs where B37 != HG19)." - ) - private boolean allowHg19ContigNamesWithB37 = true; - @Argument( fullName = FuncotatorArgumentDefinitions.LOOKAHEAD_CACHE_IN_BP_NAME, optional = true, @@ -297,12 +293,14 @@ public class Funcotator extends VariantWalker { ) private int lookaheadFeatureCachingInBp = FuncotatorArgumentDefinitions.LOOKAHEAD_CACHE_IN_BP_DEFAULT_VALUE; + @Advanced + @Hidden @Argument( - fullName = FuncotatorArgumentDefinitions.ALLOW_HG19_GENCODE_B37_CONTIG_MATCHING_OVERRIDE_LONG_NAME, + fullName = FuncotatorArgumentDefinitions.FORCE_B37_TO_HG19_REFERENCE_CONTIG_CONVERSION, optional = true, - doc = "(Advanced/Use at your own risk) Use in conjunction with allow hg19 contig names with b37. If you also select this flag, no check that your input reference is b37 is actually performed. Otherwise, ignored. Typically, this option is useful in integration tests (written by devs) only." + doc = "(Advanced / DO NOT USE*) If you select this flag, Funcotator will force a conversion of variant contig names from b37 to hg19. *This option is useful in integration tests (written by devs) only." ) - private boolean allowHg19ContigNamesWithB37Lenient = false; + private boolean forceB37ToHg19ContigNameConversion = false; //================================================================================================================== @@ -311,7 +309,12 @@ public class Funcotator extends VariantWalker { private final List> manualLocatableFeatureInputs = new ArrayList<>(); - private boolean inputReferenceIsB37 = false; + /** + * Whether the input variant contigs must be converted to hg19. + * This is only the case when the input reference is b37 AND when + * the reference version is hg19 (i.e. {@link #referenceVersion} == {@link FuncotatorArgumentDefinitions#HG19_REFERENCE_VERSION_STRING}). + */ + private boolean mustConvertInputContigsToHg19 = false; private FuncotationMetadata inputMetadata; @@ -330,7 +333,12 @@ public boolean requiresReference() { @Override public void onTraversalStart() { - // First set up our transcript list: + if (seqValidationArguments.performSequenceDictionaryValidation()) { + // Ensure that the reference dictionary is a superset of the variant dictionary: + checkReferenceDictionaryIsSupersetOfVariantDictionary(); + } + + // Next set up our transcript list: userTranscriptIdSet = processTranscriptList(userTranscriptIdSet); final LinkedHashMap annotationDefaultsMap = splitAnnotationArgsIntoMap(annotationDefaults); @@ -342,7 +350,7 @@ public void onTraversalStart() { final Map configData = DataSourceUtils.getAndValidateDataSourcesFromPaths(referenceVersion, dataSourceDirectories); initializeManualFeaturesForLocatableDataSources(configData); dataSourceFactories.addAll( - DataSourceUtils.createDataSourceFuncotationFactoriesForDataSources(configData, annotationOverridesMap, transcriptSelectionMode, userTranscriptIdSet, allowHg19ContigNamesWithB37) + DataSourceUtils.createDataSourceFuncotationFactoriesForDataSources(configData, annotationOverridesMap, transcriptSelectionMode, userTranscriptIdSet) ); // Sort our data source factories to ensure they're always in the same order: gencode datasources first @@ -378,32 +386,107 @@ public void onTraversalStart() { } // Check for reference version (in)compatibility: - if ( allowHg19ContigNamesWithB37 && - referenceVersion.equals(FuncotatorArgumentDefinitions.HG19_REFERENCE_VERSION_STRING) ) { + determineReferenceAndDatasourceCompatibility(); + } + + /** + * Checks to see that the given reference's sequence dictionary is a + * superset of the given variant file's dictionary. + * + * This is a more strict check than the one found in {@link GATKTool#validateSequenceDictionaries()}. + */ + private void checkReferenceDictionaryIsSupersetOfVariantDictionary() { + + final SAMSequenceDictionary referenceDictionary = getReferenceDictionary(); + final SAMSequenceDictionary variantDictionary = getSequenceDictionaryForDrivingVariants(); + + if ( referenceDictionary == null ) { + throw new UserException.BadInput("Reference fasta sequence dictionary is null!"); + } + + if ( variantDictionary == null ) { + throw new UserException.BadInput("Funcotator by default requires that the variant input have a sequence dictionary in its header. To disable this safety check, use argument --" + StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME); + } + + SequenceDictionaryUtils.validateDictionaries( + "Reference", getReferenceDictionary(), + "Driving Variants", getSequenceDictionaryForDrivingVariants(), + true, + false + ); + } + + private void determineReferenceAndDatasourceCompatibility() { + if ( forceB37ToHg19ContigNameConversion || + ( referenceVersion.equals(FuncotatorArgumentDefinitions.HG19_REFERENCE_VERSION_STRING) && + FuncotatorUtils.isSequenceDictionaryUsingB37Reference(getSequenceDictionaryForDrivingVariants()) )) { + // NOTE AND WARNING: // hg19 is from ucsc. b37 is from the genome reference consortium. - // ucsc decided the grc version had crap in it, so they blocked out some of the bases, aka "masked" them - // so the lengths of the contigs are the same, the bases are just _slightly_ different - inputReferenceIsB37 = FuncotatorUtils.isSequenceDictionaryUsingB37Reference(getSequenceDictionaryForDrivingVariants()); - if ( inputReferenceIsB37 ) { - logger.warn("WARNING: You are using B37 as a reference. " + - "Funcotator will convert your variants to GRCh37, and this will be fine in 99.9% of cases. " + - "There MAY be some errors in the Y chromosome due to changes between the two references."); - } + // ucsc decided the grc version had some bad data in it, so they blocked out some of the bases, aka "masked" them + // so the lengths of the contigs are the same, the bases are just _slightly_ different. + // ALSO, the contig naming convention is different between hg19 and hg38: + // hg19 uses contigs of the form "chr1" + // b37 uses contigs of the form "1" + // This naming convention difference causes a LOT of issues and was a bad idea. + + logger.warn("WARNING: You are using B37 as a reference. " + + "Funcotator will convert your variants to GRCh37, and this will be fine in the vast majority of cases. " + + "There MAY be some errors (e.g. in the Y chromosome, but possibly in other places as well) due to changes between the two references."); + + mustConvertInputContigsToHg19 = true; } + } - // Issue a warning if the lenient reference checking is being used without the b37-hg19 checking relaxed - if (allowHg19ContigNamesWithB37Lenient && !allowHg19ContigNamesWithB37) { - logger.warn(FuncotatorArgumentDefinitions.ALLOW_HG19_GENCODE_B37_CONTIG_MATCHING_OVERRIDE_LONG_NAME + " was specified without " + - FuncotatorArgumentDefinitions.ALLOW_HG19_GENCODE_B37_CONTIG_MATCHING_LONG_NAME + ". It will be ignored."); + private VariantContext getCorrectVariantContextForReference(final VariantContext variant) { + if ( mustConvertInputContigsToHg19 ) { + final VariantContextBuilder vcb = new VariantContextBuilder(variant); + vcb.chr(FuncotatorUtils.convertB37ContigToHg19Contig(variant.getContig())); + return vcb.make(); + } + else { + return variant; } } + @Override + protected VariantFilter makeVariantFilter() { + return variant -> { + // Ignore variants that have been filtered if the user requests it: + if ( removeFilteredVariants && variant.isFiltered() ) { + return false; + } + return true; + }; + } + + @Override + public VariantTransformer makePostVariantFilterTransformer(){ + return variantContext -> getCorrectVariantContextForReference(variantContext); + } + @Override public void apply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { + final ReferenceContext correctReferenceContext; + + // Check to see if we need to revert the ReferenceContext's interval to the original variant interval + // (This would only happen in the case where we were given b37 variants with hg19 data sources): + if ( mustConvertInputContigsToHg19 ) { + + // Convert our contig back to B37 here so it matches the variant: + final SimpleInterval interval = new SimpleInterval( + FuncotatorUtils.convertHG19ContigToB37Contig(variant.getContig()), variant.getStart(), variant.getEnd() + ); + + correctReferenceContext = new ReferenceContext(referenceContext, interval); + } + else { + correctReferenceContext = referenceContext; + } + // Place the variant on our queue to be funcotated: - enqueueAndHandleVariant(variant, referenceContext, featureContext); + enqueueAndHandleVariant(variant, correctReferenceContext, featureContext); } @Override @@ -463,44 +546,20 @@ private LinkedHashMap getUnaccountedForAnnotations( final List> featureSourceMap = new HashMap<>(); - // Check to see if we need to query with a different reference convention (i.e. "chr1" vs "1"). - if (allowHg19ContigNamesWithB37 && (inputReferenceIsB37 || allowHg19ContigNamesWithB37Lenient)) { - - // Construct a new contig and new interval with no "chr" in front of it: - final String hg19Contig = FuncotatorUtils.convertB37ContigToHg19Contig( variant.getContig() ); - final SimpleInterval hg19Interval = new SimpleInterval(hg19Contig, variant.getStart(), variant.getEnd()); - - // Get our features for the new interval: - for ( final FeatureInput featureInput : manualLocatableFeatureInputs ) { - @SuppressWarnings("unchecked") - final List featureList = (List)featureContext.getValues(featureInput, hg19Interval); - - // If we found features without relaxing the criteria, we should not continue to query. - if (featureList.size() == 0) { - // TODO: This is a little sloppy, since it checks every datasource twice. Once for hg19 contig names and once for b37 contig names. See https://github.com/broadinstitute/gatk/issues/4798 - featureList.addAll(featureContext.getValues(featureInput)); - } - featureSourceMap.put( featureInput.getName(), featureList); - } - } - else { - for ( final FeatureInput featureInput : manualLocatableFeatureInputs ) { - @SuppressWarnings("unchecked") - final List featureList = (List)featureContext.getValues(featureInput); - featureSourceMap.put( featureInput.getName(), featureList ); - } + for ( final FeatureInput featureInput : manualLocatableFeatureInputs ) { + @SuppressWarnings("unchecked") + final List featureList = (List)featureContext.getValues(featureInput); + featureSourceMap.put( featureInput.getName(), featureList ); } - // Create only the gencode funcotations. + //============================================================================================================== + // First create only the transcript (Gencode) funcotations: + if (retrieveGencodeFuncotationFactoryStream().count() > 1) { logger.warn("Attempting to annotate with more than one GENCODE datasource. If these have overlapping transcript IDs, errors may occur."); } @@ -510,6 +569,9 @@ private void enqueueAndHandleVariant(final VariantContext variant, final Referen .flatMap(List::stream) .map(gf -> (GencodeFuncotation) gf).collect(Collectors.toList()); + //============================================================================================================== + // Create the funcotations for non-Gencode data sources: + // Create a place to keep our funcotations: final FuncotationMap funcotationMap = FuncotationMap.createFromGencodeFuncotations(transcriptFuncotations); @@ -526,7 +588,9 @@ private void enqueueAndHandleVariant(final VariantContext variant, final Referen } } + //============================================================================================================== // Create the funcotations for the input and add to all txID mappings. + final List txIds = funcotationMap.getTranscriptList(); for (final String txId: txIds) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorArgumentDefinitions.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorArgumentDefinitions.java index 6db180434d5..2954cb4bd98 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorArgumentDefinitions.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorArgumentDefinitions.java @@ -48,16 +48,13 @@ public class FuncotatorArgumentDefinitions { */ public static final String ANNOTATION_OVERRIDES_LONG_NAME = "annotation-override"; - - public static final String ALLOW_HG19_GENCODE_B37_CONTIG_MATCHING_LONG_NAME = "allow-hg19-gencode-b37-contig-matching"; - public static final String HG19_REFERENCE_VERSION_STRING = "hg19"; public static final String HG38_REFERENCE_VERSION_STRING = "hg38"; public static final String LOOKAHEAD_CACHE_IN_BP_NAME = "lookahead-cache-bp"; public static final int LOOKAHEAD_CACHE_IN_BP_DEFAULT_VALUE = VariantWalkerBase.FEATURE_CACHE_LOOKAHEAD; - public static final String ALLOW_HG19_GENCODE_B37_CONTIG_MATCHING_OVERRIDE_LONG_NAME = "allow-hg19-gencode-b37-contig-matching-override"; + public static final String FORCE_B37_TO_HG19_REFERENCE_CONTIG_CONVERSION = "force-b37-to-hg19-reference-contig-conversion"; // ------------------------------------------------------------ // Helper Types: diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorUtils.java index a7e34644cd5..caa05531a20 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorUtils.java @@ -48,6 +48,7 @@ private FuncotatorUtils() {} private static SAMSequenceDictionary B37_SEQUENCE_DICTIONARY = null; private static final Map B37_To_HG19_CONTIG_NAME_MAP; + private static final Map HG19_TO_B37_CONTIG_NAME_MAP; /** * Initialize our hashmaps of lookup tables: @@ -71,6 +72,9 @@ private FuncotatorUtils() {} tableByLetter = Collections.unmodifiableMap(mapByLetter); B37_To_HG19_CONTIG_NAME_MAP = initializeB37ToHg19ContigNameMap(); + HG19_TO_B37_CONTIG_NAME_MAP = B37_To_HG19_CONTIG_NAME_MAP.entrySet() + .stream() + .collect(Collectors.toMap(Map.Entry::getValue, Map.Entry::getKey)); } /** @@ -1482,6 +1486,8 @@ public static String getBasesInWindowAroundReferenceAllele( final Allele refAlle final ReferenceContext referenceContext) { // TODO: This is generating too long a string for INDELs because of VCF format alleles - see Issue #4407 + // TODO: this seems to be the same as GencodeFunotationFactory::getReferenceBases - should that method call into this? + Utils.nonNull( refAllele ); Utils.nonNull( altAllele ); assertValidStrand( strand ); @@ -1492,9 +1498,12 @@ public static String getBasesInWindowAroundReferenceAllele( final Allele refAlle final String referenceBases; + // Get the reference bases for this interval. + final byte[] bases = referenceContext.getBases(referenceWindowInBases, endWindow); + if ( strand == Strand.POSITIVE ) { // Get the reference sequence: - referenceBases = new String(referenceContext.getBases(referenceWindowInBases, endWindow)); + referenceBases = new String(bases); } else { // Get the reference sequence: @@ -1748,6 +1757,15 @@ public static String convertB37ContigToHg19Contig( final String b37Contig ) { return B37_To_HG19_CONTIG_NAME_MAP.getOrDefault(b37Contig, b37Contig); } + /** + * Converts a given HG19 style contig name to the equivalent in B37. + * @param hg19Contig The contig name from the HG19 Human Genome reference to convert to the equivalent contig from the B37 Human Genome reference. + * @return The B37 equivalent of the given HG19 contig name, if such an equivalent name exists. If no equivalent name exists, returns the given {@code hg19Contig}. + */ + public static String convertHG19ContigToB37Contig( final String hg19Contig ) { + return HG19_TO_B37_CONTIG_NAME_MAP.getOrDefault(hg19Contig, hg19Contig); + } + /** * Get the overlapping exon start/stop as a {@link SimpleInterval} for the given altAllele / reference. * @param refAllele {@link Allele} for the given {@code altAllele}. Must not be {@code null}. diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtils.java index 5e16b749a7e..531ed0e0545 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtils.java @@ -59,13 +59,13 @@ private DataSourceUtils() {} @VisibleForTesting static final int MIN_MAJOR_VERSION_NUMBER = 1; @VisibleForTesting - static final int MIN_MINOR_VERSION_NUMBER = 2; + static final int MIN_MINOR_VERSION_NUMBER = 4; @VisibleForTesting static final int MIN_YEAR_RELEASED = 2018; @VisibleForTesting - static final int MIN_MONTH_RELEASED = 3; + static final int MIN_MONTH_RELEASED = 6; @VisibleForTesting - static final int MIN_DAY_RELEASED = 29; + static final int MIN_DAY_RELEASED = 15; //================================================================================================================== // Public Static Members: @@ -230,10 +230,9 @@ public static boolean isValidDirectory(final Path p) { * @return A {@link List} of {@link DataSourceFuncotationFactory} given the data source metadata, overrides, and transcript reporting priority information. */ public static List createDataSourceFuncotationFactoriesForDataSources(final Map dataSourceMetaData, - final LinkedHashMap annotationOverridesMap, - final TranscriptSelectionMode transcriptSelectionMode, - final Set userTranscriptIdSet, - final boolean isAllowingNoChrMatchesForTranscripts) { + final LinkedHashMap annotationOverridesMap, + final TranscriptSelectionMode transcriptSelectionMode, + final Set userTranscriptIdSet) { Utils.nonNull(dataSourceMetaData); Utils.nonNull(annotationOverridesMap); @@ -266,7 +265,7 @@ public static List createDataSourceFuncotationFact funcotationFactory = DataSourceUtils.createCosmicDataSource(path, properties, annotationOverridesMap); break; case GENCODE: - funcotationFactory = DataSourceUtils.createGencodeDataSource(path, properties, annotationOverridesMap, transcriptSelectionMode, userTranscriptIdSet, isAllowingNoChrMatchesForTranscripts); + funcotationFactory = DataSourceUtils.createGencodeDataSource(path, properties, annotationOverridesMap, transcriptSelectionMode, userTranscriptIdSet); break; case VCF: funcotationFactory = DataSourceUtils.createVcfDataSource(path, properties, annotationOverridesMap, transcriptSelectionMode, userTranscriptIdSet); @@ -376,14 +375,13 @@ public static CosmicFuncotationFactory createCosmicDataSource(final Path dataSou * @param annotationOverridesMap {@link LinkedHashMap}{@code String>} containing any annotation overrides to be included in the resulting data source. Must not be {@code null}. * @param transcriptSelectionMode {@link TranscriptSelectionMode} to use when choosing the transcript for detailed reporting. Must not be {@code null}. * @param userTranscriptIdSet {@link Set} of {@link String}s containing transcript IDs of interest to be selected for first. Must not be {@code null}. - * @param isAllowingNoChrMatchesForTranscripts Whether the datasource should disregard chr for a contig match. * @return A new {@link GencodeFuncotationFactory} based on the given data source file information, field overrides map, and transcript information. */ public static GencodeFuncotationFactory createGencodeDataSource(final Path dataSourceFile, final Properties dataSourceProperties, final LinkedHashMap annotationOverridesMap, final TranscriptSelectionMode transcriptSelectionMode, - final Set userTranscriptIdSet, final boolean isAllowingNoChrMatchesForTranscripts) { + final Set userTranscriptIdSet) { Utils.nonNull(dataSourceFile); Utils.nonNull(dataSourceProperties); @@ -402,8 +400,7 @@ public static GencodeFuncotationFactory createGencodeDataSource(final Path dataS name, transcriptSelectionMode, userTranscriptIdSet, - annotationOverridesMap, - isAllowingNoChrMatchesForTranscripts + annotationOverridesMap ); } @@ -573,9 +570,12 @@ private static boolean logDataSourcesInfo(final Path dataSourcesPath) { // Warn the user if they need newer stuff. if ( !dataSourcesPathIsAcceptable ) { - logger.error("ERROR: Given data source path is too old! Minimum required version is: " + CURRENT_MINIMUM_DATA_SOURCE_VERSION + " (yours: " + version + ")"); - logger.error(" You must download a newer version of the data sources from the Broad Institute FTP site: " + DATA_SOURCES_FTP_PATH); - logger.error(" or the Broad Institute Google Bucket: " + DATA_SOURCES_BUCKET_PATH); + + String message = ""; + message = message + "ERROR: Given data source path is too old! Minimum required version is: " + CURRENT_MINIMUM_DATA_SOURCE_VERSION + " (yours: " + version + ")\n"; + message = message + " You must download a newer version of the data sources from the Broad Institute FTP site: " + DATA_SOURCES_FTP_PATH + "\n"; + message = message + " or the Broad Institute Google Bucket: " + DATA_SOURCES_BUCKET_PATH + "\n"; + throw new UserException( message ); } return dataSourcesPathIsAcceptable; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/gencode/GencodeFuncotationFactory.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/gencode/GencodeFuncotationFactory.java index c4018310dda..4c90029d65a 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/gencode/GencodeFuncotationFactory.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/gencode/GencodeFuncotationFactory.java @@ -8,7 +8,6 @@ import htsjdk.tribble.annotation.Strand; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.variantcontext.VariantContextBuilder; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.broadinstitute.hellbender.engine.ReferenceContext; @@ -148,13 +147,6 @@ public class GencodeFuncotationFactory extends DataSourceFuncotationFactory { */ private final TranscriptSelectionMode transcriptSelectionMode; - /** - * Whether this factory will disregard the string "chr" in matching contig names. - * - * Setting this to true is useful in cases of b37 variants (e.g. contig 3) matching to gencode (e.g. contig chr3) - */ - private boolean isAllowingNoChrMatches = false; - /** * {@link List} of Transcript IDs that the user has requested that we annotate. * If this list is empty, will default to keeping ALL transcripts. @@ -187,8 +179,7 @@ public GencodeFuncotationFactory(final Path gencodeTranscriptFastaFile, final String name, final TranscriptSelectionMode transcriptSelectionMode, final Set userRequestedTranscripts, - final LinkedHashMap annotationOverrides, - final boolean isAllowingNoChrMatches) { + final LinkedHashMap annotationOverrides) { this.gencodeTranscriptFastaFile = gencodeTranscriptFastaFile; @@ -212,8 +203,6 @@ public GencodeFuncotationFactory(final Path gencodeTranscriptFastaFile, // Initialize overrides / defaults: initializeAnnotationOverrides( annotationOverrides ); - - this.isAllowingNoChrMatches = isAllowingNoChrMatches; } //================================================================================================================== @@ -626,18 +615,10 @@ private GencodeFuncotation createGencodeFuncotationOnTranscript(final VariantCon // TODO: check for complex INDEL and warn and skip. - VariantContext variantToUse = variant; + final VariantContext variantToUse = variant; // Find the sub-feature of transcript that contains our variant: - GencodeGtfFeature containingSubfeature = getContainingGtfSubfeature(variantToUse, transcript); - - // If we got no hits, let's check as if this contig was translated to hg19 - if (isAllowingNoChrMatches && containingSubfeature == null) { - final VariantContextBuilder vcb = new VariantContextBuilder(variant); - vcb.chr(FuncotatorUtils.convertB37ContigToHg19Contig(variant.getContig())); - variantToUse = vcb.make(); - containingSubfeature = getContainingGtfSubfeature(variantToUse, transcript); - } + final GencodeGtfFeature containingSubfeature = getContainingGtfSubfeature(variantToUse, transcript); // Make sure the sub-regions in the transcript actually contain the variant: // TODO: this is slow, and repeats work that is done later in the process (we call getSortedCdsAndStartStopPositions when creating the sequence comparison) @@ -1288,6 +1269,8 @@ private GencodeFuncotation createIntronFuncotation(final VariantContext variant, @VisibleForTesting static String getReferenceBases(final Allele refAllele, final Allele altAllele, final ReferenceContext reference, final Strand strand ) { + // TODO: this seems to be the same as FuncotatorUtils::getBasesInWindowAroundReferenceAllele - should this method call into that? + final int indelAdjustment; if ( altAllele.length() > refAllele.length() ) { indelAdjustment = altAllele.length() - refAllele.length(); @@ -1302,12 +1285,15 @@ static String getReferenceBases(final Allele refAllele, final Allele altAllele, reference.getWindow().getStart() - referenceWindow, reference.getWindow().getEnd() + referenceWindow + indelAdjustment); + // Get the reference bases for this interval. + byte[] referenceBases = reference.getBases(refBasesInterval); + // Get the bases in the correct direction: if ( strand == Strand.POSITIVE ) { - return new String(reference.getBases(refBasesInterval)); + return new String(referenceBases); } else { - return ReadUtils.getBasesReverseComplement(reference.getBases(refBasesInterval)); + return ReadUtils.getBasesReverseComplement(referenceBases); } } @@ -1580,10 +1566,11 @@ public static double calculateGcContent( final Allele refAllele, final ReferenceContext referenceContext, final int windowSize ) { + // TODO: this seems to do something similar to FuncotatorUtils::getBasesInWindowAroundReferenceAllele - should this method call into that? + Utils.nonNull( referenceContext ); ParamUtils.isPositive( windowSize, "Window size must be >= 1." ); - final int leadingWindowSize; final int trailingWindowSize = windowSize; @@ -1605,11 +1592,8 @@ public static double calculateGcContent( final Allele refAllele, leadingWindowSize = windowSize; } - // Create a placeholder for the bases: - final byte[] bases; - // Get the bases: - bases = referenceContext.getBases(leadingWindowSize, trailingWindowSize); + byte[] bases = referenceContext.getBases(leadingWindowSize, trailingWindowSize); // Get the gcCount: long gcCount = 0; diff --git a/src/main/java/org/broadinstitute/hellbender/transformers/VariantTransformer.java b/src/main/java/org/broadinstitute/hellbender/transformers/VariantTransformer.java new file mode 100644 index 00000000000..2f71fa3b0e6 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/transformers/VariantTransformer.java @@ -0,0 +1,34 @@ +package org.broadinstitute.hellbender.transformers; + +import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.hellbender.utils.SerializableFunction; + +import java.util.Objects; +import java.util.function.UnaryOperator; + +/** + * Classes which perform transformations from {@link htsjdk.variant.variantcontext.VariantContext} -> {@link htsjdk.variant.variantcontext.VariantContext} + * should implement this interface by overriding {@link SerializableFunction < VariantContext ,VariantContext>#apply(VariantContext)} + * Created by jonn on 6/26/18. + */ +@FunctionalInterface +public interface VariantTransformer extends UnaryOperator, SerializableFunction { + public static final long serialVersionUID = 1L; + + //HACK: These methods are a hack to get to get the type system to accept compositions of VariantTransformers. + @SuppressWarnings("overloads") + default VariantTransformer andThen(final VariantTransformer after) { + Objects.requireNonNull(after); + return (VariantContext vc) -> after.apply(apply(vc)); + } + + @SuppressWarnings("overloads") + default VariantTransformer compose(final VariantTransformer before) { + Objects.requireNonNull(before); + return (VariantContext vc) -> apply(before.apply(vc)); + } + + static VariantTransformer identity(){ + return variantContext -> variantContext; + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/engine/ReferenceContextUnitTest.java b/src/test/java/org/broadinstitute/hellbender/engine/ReferenceContextUnitTest.java index 365c3faddd1..d9c9f30e547 100644 --- a/src/test/java/org/broadinstitute/hellbender/engine/ReferenceContextUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/engine/ReferenceContextUnitTest.java @@ -1,15 +1,14 @@ package org.broadinstitute.hellbender.engine; -import java.nio.file.Path; -import org.broadinstitute.hellbender.utils.SimpleInterval; -import org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.GATKBaseTest; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.io.IOUtils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.io.File; +import java.nio.file.Path; import java.util.ArrayList; import java.util.Iterator; import java.util.List; @@ -24,8 +23,8 @@ public Object[][] getEmptyReferenceContextData() { // and/or null intervals should behave as empty context objects. return new Object[][] { { new ReferenceContext() }, - { new ReferenceContext(null, null) }, - { new ReferenceContext(null, new SimpleInterval("1", 1, 1) ) }, + { new ReferenceContext(null, null, 0, 0) }, + { new ReferenceContext(null, new SimpleInterval("1", 1, 1), 0, 0 ) }, { new ReferenceContext(new ReferenceFileSource(TEST_REFERENCE), null) } }; } @@ -211,6 +210,80 @@ public void testGetBasesStaticWindow() { } } + @DataProvider + private Object[][] provideForTestCopyConstructor() { + return new Object[][] { + { + new SimpleInterval("1", 11210, 11220), + new SimpleInterval("1", 2650, 2650), + 0, + 0 + }, + { + new SimpleInterval("1", 11210, 11220), + new SimpleInterval("1", 2650, 2650), + 3, + 5 + }, + { + new SimpleInterval("1", 11210, 11220), + new SimpleInterval("1", 2640, 2650), + 0, + 0 + }, + { + new SimpleInterval("1", 11210, 11220), + new SimpleInterval("1", 2640, 2650), + 3, + 5 + }, + { + new SimpleInterval("1", 11210, 11220), + new SimpleInterval("2", 2650, 2650), + 3, + 5 + }, + { + new SimpleInterval("1", 11210, 11220), + new SimpleInterval("2", 2650, 2650), + 0, + 0 + }, + { + new SimpleInterval("1", 11210, 11220), + new SimpleInterval("2", 2650, 2660), + 3, + 5 + }, + { + new SimpleInterval("1", 11210, 11220), + new SimpleInterval("2", 2650, 2660), + 0, + 0 + }, + }; + } + + @Test(dataProvider = "provideForTestCopyConstructor") + public void testCopyConstructor(final SimpleInterval originalInterval, final SimpleInterval newInterval, final int leadingBases, final int trailingBases) { + try (final ReferenceDataSource reference = new ReferenceFileSource(TEST_REFERENCE)) { + + final ReferenceContext refContext = new ReferenceContext(reference, originalInterval, leadingBases, trailingBases); + Assert.assertEquals(refContext.getInterval(), originalInterval, "Set interval is different from expected interval!"); + + final ReferenceContext newRefContext = new ReferenceContext(refContext, newInterval); + Assert.assertEquals(newRefContext.getInterval(), newInterval, "Set interval is different from expected interval!"); + + final SimpleInterval newWindow = newRefContext.getWindow(); + + final int newLeadingBases = newInterval.getStart() - newWindow.getStart(); + final int newTrailingBases = newWindow.getEnd() - newInterval.getEnd(); + + Assert.assertEquals(newLeadingBases, leadingBases, "New window leading bases are not the same as old window leading bases!"); + Assert.assertEquals(newTrailingBases, trailingBases, "New window trailing bases are not the same as old window trailing bases!"); + } + } + private void checkReferenceContextBases( final ReferenceContext refContext, final String expectedBases ) { final byte[] contextBases = refContext.getBases(); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/FuncotationMapUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/FuncotationMapUnitTest.java index f9b48160230..56e83262201 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/FuncotationMapUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/FuncotationMapUnitTest.java @@ -263,8 +263,7 @@ private static List createGencodeFuncotations(final String c final String gencode_test = "GENCODE_TEST"; final GencodeFuncotationFactory gencodeFactory = new GencodeFuncotationFactory(Paths.get(transcriptFastaFile), - "TEST", gencode_test, transcriptSelectionMode, new HashSet<>(), new LinkedHashMap<>(), - true); + "TEST", gencode_test, transcriptSelectionMode, new HashSet<>(), new LinkedHashMap<>()); return gencodeFactory.createFuncotations(variantContext, referenceContext, Collections.singletonMap(gencode_test, featureList)).stream() .map(f -> (GencodeFuncotation) f).collect(Collectors.toList()); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorIntegrationTest.java index 6644ceca907..057aeae7d89 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorIntegrationTest.java @@ -3,11 +3,13 @@ import avro.shaded.com.google.common.collect.Sets; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFCompoundHeaderLine; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFInfoHeaderLine; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.engine.FeatureDataSource; import org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.exceptions.UserException; @@ -145,6 +147,32 @@ private static void addManualAnnotationsToArguments(final ArgumentsBuilder argum arguments.addArgument(FuncotatorArgumentDefinitions.ANNOTATION_OVERRIDES_LONG_NAME, "Oreganno_Build:BUILDED_GOOD_REAL_BIG"); } + private ArgumentsBuilder createBaselineArgumentsForFuncotator(final String variantFileName, + final File outputFile, + final String referenceFileName, + final String dataSourcesPath, + final String refVer, + final FuncotatorArgumentDefinitions.OutputFormatType outputFormatType, + final boolean shouldValidateSeqDicts) { + + final ArgumentsBuilder arguments = new ArgumentsBuilder(); + + arguments.addVCF(new File(variantFileName)); + arguments.addOutput(outputFile); + arguments.addReference(new File(referenceFileName)); + arguments.addArgument(FuncotatorArgumentDefinitions.DATA_SOURCES_PATH_LONG_NAME, dataSourcesPath); + arguments.addArgument(FuncotatorArgumentDefinitions.REFERENCE_VERSION_LONG_NAME, refVer); + arguments.addArgument(FuncotatorArgumentDefinitions.OUTPUT_FORMAT_LONG_NAME, outputFormatType.toString()); + arguments.addArgument("verbosity", "INFO"); + + if ( !shouldValidateSeqDicts ) { + // Disable the sequence dictionary check for the tests: + arguments.addBooleanArgument(StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, true); + } + + return arguments; + } + //================================================================================================================== @DataProvider @@ -234,32 +262,32 @@ public Object[][] provideForLargeDataValidationTest() { { "M2_01115161-TA1-filtered.vcf", "Homo_sapiens_assembly19.fasta", - true, FuncotatorTestConstants.REFERENCE_VERSION_HG19, }, { "C828.TCGA-D3-A2JP-06A-11D-A19A-08.3-filtered.PASS.vcf", "Homo_sapiens_assembly19.fasta", - true, FuncotatorTestConstants.REFERENCE_VERSION_HG19 }, { "hg38_test_variants.vcf", "Homo_sapiens_assembly38.fasta", - false, FuncotatorTestConstants.REFERENCE_VERSION_HG38 }, { "sample21.trimmed.vcf", "Homo_sapiens_assembly38.fasta", - false, FuncotatorTestConstants.REFERENCE_VERSION_HG38 }, { "0816201804HC0_R01C01.vcf", "Homo_sapiens_assembly19.fasta", - true, FuncotatorTestConstants.REFERENCE_VERSION_HG19 + }, + { + "hg38_trio.vcf", + "Homo_sapiens_assembly38.fasta", + FuncotatorTestConstants.REFERENCE_VERSION_HG38 } }; } @@ -285,24 +313,33 @@ public void testFuncotatorWithoutValidatingResults(final String dataSourcesPath, final File outputFile = getOutputFile(outputFormatType); - final ArgumentsBuilder arguments = new ArgumentsBuilder(); - - arguments.addVCF(new File(variantFileName)); - arguments.addOutput(outputFile); - arguments.addReference(new File(referenceFileName)); - arguments.addArgument(FuncotatorArgumentDefinitions.DATA_SOURCES_PATH_LONG_NAME, dataSourcesPath); - arguments.addArgument(FuncotatorArgumentDefinitions.REFERENCE_VERSION_LONG_NAME, refVer); - arguments.addArgument(FuncotatorArgumentDefinitions.OUTPUT_FORMAT_LONG_NAME, outputFormatType.toString()); + final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator(variantFileName, outputFile, referenceFileName, dataSourcesPath, refVer, outputFormatType, false); runCommandLine(arguments); } + private void validateFuncotationsOnVcf(final Iterable vcfIterable, final String[] funcotationFieldNames) { + for (final VariantContext vc : vcfIterable ) { + final String funcotation = vc.getAttributeAsString(VcfOutputRenderer.FUNCOTATOR_VCF_FIELD_NAME, ""); + + Assert.assertNotEquals(funcotation, ""); + + final String rawFuncotations = funcotation.substring(1,funcotation.length()-1); + + Assert.assertEquals(StringUtils.countMatches(rawFuncotations, VcfOutputRenderer.FIELD_DELIMITER), funcotationFieldNames.length - 1); + + // This is here to make sure we can create the FuncotationMap object without exploding. + // It serves as a secondary check. + final FuncotationMap funkyMap = FuncotationMap.createAsAllTableFuncotationsFromVcf(FuncotationMap.NO_TRANSCRIPT_AVAILABLE_KEY, funcotationFieldNames, + funcotation, vc.getAlternateAllele(0), "VCF"); + } + } + @Test(enabled = doDebugTests, groups = {"funcotatorValidation"}, dataProvider = "provideForLargeDataValidationTest") public void largeDataValidationTest(final String inputVcfName, final String referencePath, - final boolean allowHg19B37ContigMatches, final String referenceVersion) throws IOException { // Get our main test folder path from our environment: @@ -324,21 +361,14 @@ public void largeDataValidationTest(final String inputVcfName, outputFile = getOutputFile(outFileBaseName + ".maf", "tsv"); } - final ArgumentsBuilder arguments = new ArgumentsBuilder(); - - arguments.addArgument("verbosity", "INFO"); - - arguments.addVCF(new File(testFolderInputPath + inputVcfName)); - - arguments.addReference(new File(testFolderInputPath + referencePath)); - - arguments.addArgument(FuncotatorArgumentDefinitions.DATA_SOURCES_PATH_LONG_NAME, getFuncotatorLargeDataValidationTestInputPath() + LARGE_DATASOURCES_FOLDER); - - arguments.addBooleanArgument(FuncotatorArgumentDefinitions.ALLOW_HG19_GENCODE_B37_CONTIG_MATCHING_LONG_NAME, allowHg19B37ContigMatches); - - arguments.addArgument(FuncotatorArgumentDefinitions.REFERENCE_VERSION_LONG_NAME, referenceVersion); - arguments.addOutput(outputFile); - arguments.addArgument(FuncotatorArgumentDefinitions.OUTPUT_FORMAT_LONG_NAME, outFormat.toString()); + final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator( + testFolderInputPath + inputVcfName, + outputFile, + testFolderInputPath + referencePath, + getFuncotatorLargeDataValidationTestInputPath() + LARGE_DATASOURCES_FOLDER, + referenceVersion, + outFormat, + true); // Add our manual annotations to the arguments: addManualAnnotationsToArguments(arguments); @@ -349,6 +379,21 @@ public void largeDataValidationTest(final String inputVcfName, endTime = System.nanoTime(); System.out.println(" Elapsed Time (" + outFormat.toString() + "): " + (endTime - startTime) / 1e9 + "s"); + + // Perform a simple validation if the file was a VCF: + if ( outFormat == FuncotatorArgumentDefinitions.OutputFormatType.VCF) { + + try ( final FeatureDataSource vcfReader = new FeatureDataSource<>(outputFile.getAbsolutePath()) ) { + Assert.assertTrue(vcfReader.getHeader() instanceof VCFHeader, "Header is not a VCFHeader!"); + final VCFHeader vcfHeader = (VCFHeader)vcfReader.getHeader(); + + final VCFInfoHeaderLine funcotationHeaderLine = vcfHeader.getInfoHeaderLine(VcfOutputRenderer.FUNCOTATOR_VCF_FIELD_NAME); + final String[] funcotationFieldNames = FuncotatorUtils.extractFuncotatorKeysFromHeaderDescription(funcotationHeaderLine.getDescription()); + + validateFuncotationsOnVcf(vcfReader, funcotationFieldNames); + + } + } } System.out.println("Total Elapsed Time: " + (endTime - overallStartTime) / 1e9 + "s"); @@ -374,16 +419,15 @@ public void exhaustiveArgumentTest(final String dataSourcesPath, bw.write(transcriptName); } - final ArgumentsBuilder arguments = new ArgumentsBuilder(); - // Required Args: - arguments.addVCF(new File(variantFileName)); - arguments.addOutput(outputFile); - arguments.addReference(new File(referenceFileName)); - - arguments.addArgument(FuncotatorArgumentDefinitions.DATA_SOURCES_PATH_LONG_NAME, dataSourcesPath); - arguments.addArgument(FuncotatorArgumentDefinitions.REFERENCE_VERSION_LONG_NAME, refVer); - arguments.addArgument(FuncotatorArgumentDefinitions.OUTPUT_FORMAT_LONG_NAME, outputFormatType.toString()); + final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator( + variantFileName, + outputFile, + referenceFileName, + dataSourcesPath, + refVer, + outputFormatType, + false); // Transcript selection: arguments.addArgument(FuncotatorArgumentDefinitions.TRANSCRIPT_SELECTION_MODE_LONG_NAME, TranscriptSelectionMode.BEST_EFFECT.toString()); @@ -414,15 +458,17 @@ public void testCanAnnotateMixedContigHg19Clinvar() { final FuncotatorArgumentDefinitions.OutputFormatType outputFormatType = FuncotatorArgumentDefinitions.OutputFormatType.VCF; final File outputFile = getOutputFile(outputFormatType); - final ArgumentsBuilder arguments = new ArgumentsBuilder(); + final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator( + PIK3CA_VCF_HG19, + outputFile, + b37Chr3Ref, + DS_PIK3CA_DIR, + FuncotatorTestConstants.REFERENCE_VERSION_HG19, + outputFormatType, + false); - arguments.addVCF(new File(PIK3CA_VCF_HG19)); - arguments.addOutput(outputFile); - arguments.addReference(new File(b37Chr3Ref)); - arguments.addArgument(FuncotatorArgumentDefinitions.DATA_SOURCES_PATH_LONG_NAME, DS_PIK3CA_DIR); - arguments.addArgument(FuncotatorArgumentDefinitions.REFERENCE_VERSION_LONG_NAME, FuncotatorTestConstants.REFERENCE_VERSION_HG19); - arguments.addArgument(FuncotatorArgumentDefinitions.OUTPUT_FORMAT_LONG_NAME, outputFormatType.toString()); - arguments.addBooleanArgument(FuncotatorArgumentDefinitions.ALLOW_HG19_GENCODE_B37_CONTIG_MATCHING_LONG_NAME, true); + // We need this argument since we are testing on a subset of b37 + arguments.addBooleanArgument(FuncotatorArgumentDefinitions.FORCE_B37_TO_HG19_REFERENCE_CONTIG_CONVERSION, true); runCommandLine(arguments); @@ -434,30 +480,12 @@ public void testCanAnnotateMixedContigHg19Clinvar() { final int NUM_VARIANTS = 21; final int NUM_CLINVAR_HITS = 4; - Assert.assertEquals(variantContexts.size(), NUM_VARIANTS); + Assert.assertEquals(variantContexts.size(), NUM_VARIANTS, "Found unexpected number of variants!"); // Look for "MedGen" to know that we have a clinvar hit. Assert.assertEquals(variantContexts.stream() .filter(vc -> StringUtils.contains(vc.getAttributeAsString(VcfOutputRenderer.FUNCOTATOR_VCF_FIELD_NAME, ""), "MedGen")) - .count(), NUM_CLINVAR_HITS); - } - - - private void checkVariantContextFuncotationResultsForParseability(final Pair> vcfInfo, final String[] funcotationFieldNames) { - for (final VariantContext vc : vcfInfo.getRight() ) { - final String funcotation = vc.getAttributeAsString(VcfOutputRenderer.FUNCOTATOR_VCF_FIELD_NAME, ""); - - Assert.assertNotEquals(funcotation, "", "Funcotation string is empty!"); - - final String rawFuncotations = funcotation.substring(1,funcotation.length()-1); - - Assert.assertEquals(StringUtils.countMatches(rawFuncotations, VcfOutputRenderer.FIELD_DELIMITER), funcotationFieldNames.length - 1, "Found unexpected number of funcotation delimiters (indicating wrong number of funcotations)!"); - - // This is here to make sure we can create the FuncotationMap object without exploding. - // It serves as a secondary check. - final FuncotationMap funkyMap = FuncotationMap.createAsAllTableFuncotationsFromVcf(FuncotationMap.NO_TRANSCRIPT_AVAILABLE_KEY, funcotationFieldNames, - funcotation, vc.getAlternateAllele(0), "VCF"); - } + .count(), NUM_CLINVAR_HITS, "Found unexpected number of ClinVar hits!"); } @Test @@ -465,15 +493,17 @@ public void testXsvLocatableAnnotationsHaveOnlyOneEntryForMultiHitLocations() { final FuncotatorArgumentDefinitions.OutputFormatType outputFormatType = FuncotatorArgumentDefinitions.OutputFormatType.VCF; final File outputFile = getOutputFile(outputFormatType); - final ArgumentsBuilder arguments = new ArgumentsBuilder(); + final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator( + XSV_CLINVAR_MULTIHIT_TEST_VCF, + outputFile, + b37Chr2Ref, + DS_XSV_CLINVAR_TESTS, + FuncotatorTestConstants.REFERENCE_VERSION_HG19, + outputFormatType, + false); - arguments.addVCF(new File(XSV_CLINVAR_MULTIHIT_TEST_VCF)); - arguments.addOutput(outputFile); - arguments.addReference(new File(b37Chr2Ref)); - arguments.addArgument(FuncotatorArgumentDefinitions.DATA_SOURCES_PATH_LONG_NAME, DS_XSV_CLINVAR_TESTS); - arguments.addArgument(FuncotatorArgumentDefinitions.REFERENCE_VERSION_LONG_NAME, FuncotatorTestConstants.REFERENCE_VERSION_HG19); - arguments.addArgument(FuncotatorArgumentDefinitions.OUTPUT_FORMAT_LONG_NAME, outputFormatType.toString()); - arguments.addBooleanArgument(FuncotatorArgumentDefinitions.ALLOW_HG19_GENCODE_B37_CONTIG_MATCHING_LONG_NAME, true); + // We need this argument since we are testing on a subset of b37 + arguments.addBooleanArgument(FuncotatorArgumentDefinitions.FORCE_B37_TO_HG19_REFERENCE_CONTIG_CONVERSION, true); runCommandLine(arguments); @@ -485,7 +515,7 @@ public void testXsvLocatableAnnotationsHaveOnlyOneEntryForMultiHitLocations() { final int EXPECTED_NUM_VARIANTS = 1; Assert.assertEquals(vcfInfo.getRight().size(), EXPECTED_NUM_VARIANTS, "Found more than " + EXPECTED_NUM_VARIANTS + " variants!"); - checkVariantContextFuncotationResultsForParseability(vcfInfo, funcotationFieldNames); + validateFuncotationsOnVcf(vcfInfo.getRight(), funcotationFieldNames); } @Test @@ -493,15 +523,16 @@ public void testXsvLocatableAnnotationsHaveCorrectColsForOnlyOnePositionSpecifie final FuncotatorArgumentDefinitions.OutputFormatType outputFormatType = FuncotatorArgumentDefinitions.OutputFormatType.VCF; final File outputFile = getOutputFile(outputFormatType); - final ArgumentsBuilder arguments = new ArgumentsBuilder(); + final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator( + XSV_CLINVAR_COL_TEST_VCF, + outputFile, + b37Chr2Ref, + DS_XSV_CLINVAR_TESTS, + FuncotatorTestConstants.REFERENCE_VERSION_HG19, + outputFormatType, + false); - arguments.addVCF(new File(XSV_CLINVAR_COL_TEST_VCF)); - arguments.addOutput(outputFile); - arguments.addReference(new File(b37Chr2Ref)); - arguments.addArgument(FuncotatorArgumentDefinitions.DATA_SOURCES_PATH_LONG_NAME, DS_XSV_CLINVAR_TESTS); - arguments.addArgument(FuncotatorArgumentDefinitions.REFERENCE_VERSION_LONG_NAME, FuncotatorTestConstants.REFERENCE_VERSION_HG19); - arguments.addArgument(FuncotatorArgumentDefinitions.OUTPUT_FORMAT_LONG_NAME, outputFormatType.toString()); - arguments.addBooleanArgument(FuncotatorArgumentDefinitions.ALLOW_HG19_GENCODE_B37_CONTIG_MATCHING_LONG_NAME, true); + arguments.addBooleanArgument(FuncotatorArgumentDefinitions.FORCE_B37_TO_HG19_REFERENCE_CONTIG_CONVERSION, true); runCommandLine(arguments); @@ -513,7 +544,7 @@ public void testXsvLocatableAnnotationsHaveCorrectColsForOnlyOnePositionSpecifie final int EXPECTED_NUM_VARIANTS = 10; Assert.assertEquals(vcfInfo.getRight().size(), EXPECTED_NUM_VARIANTS); - checkVariantContextFuncotationResultsForParseability(vcfInfo, funcotationFieldNames); + validateFuncotationsOnVcf(vcfInfo.getRight(), funcotationFieldNames); } @Test @@ -522,14 +553,14 @@ public void testCanAnnotateHg38ClinvarAndGencodeV28() { final FuncotatorArgumentDefinitions.OutputFormatType outputFormatType = FuncotatorArgumentDefinitions.OutputFormatType.VCF; final File outputFile = getOutputFile(outputFormatType); - final ArgumentsBuilder arguments = new ArgumentsBuilder(); - - arguments.addVCF(new File(PIK3CA_VCF_HG38)); - arguments.addOutput(outputFile); - arguments.addReference(new File(hg38Chr3Ref)); - arguments.addArgument(FuncotatorArgumentDefinitions.DATA_SOURCES_PATH_LONG_NAME, DS_PIK3CA_DIR); - arguments.addArgument(FuncotatorArgumentDefinitions.REFERENCE_VERSION_LONG_NAME, FuncotatorTestConstants.REFERENCE_VERSION_HG38); - arguments.addArgument(FuncotatorArgumentDefinitions.OUTPUT_FORMAT_LONG_NAME, outputFormatType.toString()); + final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator( + PIK3CA_VCF_HG38, + outputFile, + hg38Chr3Ref, + DS_PIK3CA_DIR, + FuncotatorTestConstants.REFERENCE_VERSION_HG38, + outputFormatType, + false); runCommandLine(arguments); @@ -552,9 +583,9 @@ public void testCanAnnotateHg38ClinvarAndGencodeV28() { @DataProvider(name = "provideForMafVcfConcordanceProteinChange") final Object[][] provideForMafVcfConcordanceProteinChange() { return new Object[][]{ - {PIK3CA_VCF_HG19_SNPS, b37Chr3Ref, FuncotatorTestConstants.REFERENCE_VERSION_HG19, Arrays.asList("Gencode_19_proteinChange"), Arrays.asList(MafOutputRendererConstants.FieldName_Protein_Change), DS_PIK3CA_DIR, 15}, - {PIK3CA_VCF_HG19_INDELS, b37Chr3Ref, FuncotatorTestConstants.REFERENCE_VERSION_HG19, Arrays.asList("Gencode_19_proteinChange"), Arrays.asList(MafOutputRendererConstants.FieldName_Protein_Change), DS_PIK3CA_DIR, 57}, - {MUC16_VCF_HG19, hg19Chr19Ref, FuncotatorTestConstants.REFERENCE_VERSION_HG19, Arrays.asList("Gencode_19_proteinChange"), Arrays.asList(MafOutputRendererConstants.FieldName_Protein_Change), DS_MUC16_DIR, 2057} + {PIK3CA_VCF_HG19_SNPS, b37Chr3Ref, FuncotatorTestConstants.REFERENCE_VERSION_HG19, Collections.singletonList("Gencode_19_proteinChange"), Collections.singletonList(MafOutputRendererConstants.FieldName_Protein_Change), DS_PIK3CA_DIR, true, 15}, + {PIK3CA_VCF_HG19_INDELS, b37Chr3Ref, FuncotatorTestConstants.REFERENCE_VERSION_HG19, Collections.singletonList("Gencode_19_proteinChange"), Collections.singletonList(MafOutputRendererConstants.FieldName_Protein_Change), DS_PIK3CA_DIR, true, 57}, + {MUC16_VCF_HG19, hg19Chr19Ref, FuncotatorTestConstants.REFERENCE_VERSION_HG19, Collections.singletonList("Gencode_19_proteinChange"), Collections.singletonList(MafOutputRendererConstants.FieldName_Protein_Change), DS_MUC16_DIR, false, 2057} }; } @@ -567,40 +598,50 @@ public void testVcfMafConcordanceForProteinChange(final String inputVcf, final S final String funcotatorRef, final List annotationsToCheckVcf, final List annotationsToCheckMaf, final String datasourceDir, + final boolean forceB37Hg19Conversion, final int gtNumVariants) { final FuncotatorArgumentDefinitions.OutputFormatType vcfOutputFormatType = FuncotatorArgumentDefinitions.OutputFormatType.VCF; final File vcfOutputFile = getOutputFile(vcfOutputFormatType); - final ArgumentsBuilder argumentsVcf = new ArgumentsBuilder(); + final ArgumentsBuilder argumentsVcf = createBaselineArgumentsForFuncotator( + inputVcf, + vcfOutputFile, + inputRef, + datasourceDir, + funcotatorRef, + vcfOutputFormatType, + false); - argumentsVcf.addVCF(new File(inputVcf)); - argumentsVcf.addOutput(vcfOutputFile); - argumentsVcf.addReference(new File(inputRef)); - argumentsVcf.addArgument(FuncotatorArgumentDefinitions.DATA_SOURCES_PATH_LONG_NAME, datasourceDir); - argumentsVcf.addArgument(FuncotatorArgumentDefinitions.REFERENCE_VERSION_LONG_NAME, funcotatorRef); - argumentsVcf.addArgument(FuncotatorArgumentDefinitions.OUTPUT_FORMAT_LONG_NAME, vcfOutputFormatType.toString()); argumentsVcf.addBooleanArgument(FuncotatorArgumentDefinitions.REMOVE_FILTERED_VARIANTS_LONG_NAME, false); argumentsVcf.addArgument(FuncotatorArgumentDefinitions.TRANSCRIPT_SELECTION_MODE_LONG_NAME, TranscriptSelectionMode.CANONICAL.toString()); - // We need this argument since we are testing on a subset of b37 - argumentsVcf.addBooleanArgument(FuncotatorArgumentDefinitions.ALLOW_HG19_GENCODE_B37_CONTIG_MATCHING_OVERRIDE_LONG_NAME, true); + if ( forceB37Hg19Conversion ) { + // We need this argument since we are testing on a subset of b37 + argumentsVcf.addBooleanArgument(FuncotatorArgumentDefinitions.FORCE_B37_TO_HG19_REFERENCE_CONTIG_CONVERSION, true); + } + runCommandLine(argumentsVcf); final FuncotatorArgumentDefinitions.OutputFormatType mafOutputFormatType = FuncotatorArgumentDefinitions.OutputFormatType.MAF; final File mafOutputFile = getOutputFile(mafOutputFormatType); - final ArgumentsBuilder argumentsMaf = new ArgumentsBuilder(); + final ArgumentsBuilder argumentsMaf = createBaselineArgumentsForFuncotator( + inputVcf, + mafOutputFile, + inputRef, + datasourceDir, + funcotatorRef, + mafOutputFormatType, + false); - argumentsMaf.addVCF(new File(inputVcf)); - argumentsMaf.addOutput(mafOutputFile); - argumentsMaf.addReference(new File(inputRef)); - argumentsMaf.addArgument(FuncotatorArgumentDefinitions.DATA_SOURCES_PATH_LONG_NAME, datasourceDir); - argumentsMaf.addArgument(FuncotatorArgumentDefinitions.REFERENCE_VERSION_LONG_NAME, funcotatorRef); - argumentsMaf.addArgument(FuncotatorArgumentDefinitions.OUTPUT_FORMAT_LONG_NAME, mafOutputFormatType.toString()); argumentsMaf.addBooleanArgument(FuncotatorArgumentDefinitions.REMOVE_FILTERED_VARIANTS_LONG_NAME, false); - argumentsMaf.addBooleanArgument(FuncotatorArgumentDefinitions.ALLOW_HG19_GENCODE_B37_CONTIG_MATCHING_OVERRIDE_LONG_NAME, true); argumentsMaf.addArgument(FuncotatorArgumentDefinitions.TRANSCRIPT_SELECTION_MODE_LONG_NAME, TranscriptSelectionMode.CANONICAL.toString()); + if ( forceB37Hg19Conversion ) { + // We need this argument since we are testing on a subset of b37 + argumentsMaf.addBooleanArgument(FuncotatorArgumentDefinitions.FORCE_B37_TO_HG19_REFERENCE_CONTIG_CONVERSION, true); + } + runCommandLine(argumentsMaf); final Pair> vcfInfo = VariantContextTestUtils.readEntireVCFIntoMemory(vcfOutputFile.getAbsolutePath()); @@ -615,11 +656,11 @@ public void testVcfMafConcordanceForProteinChange(final String inputVcf, final S // Some errors manifest as all of the variant classifications being IGR. Check to make sure that is not the case. Assert.assertTrue(maf.getRecords().stream() - .anyMatch(v -> !v.getAnnotationValue(MafOutputRendererConstants.FieldName_Variant_Classification).equals("IGR"))); + .anyMatch(v -> !v.getAnnotationValue(MafOutputRendererConstants.FieldName_Variant_Classification).equals("IGR")), "Output produced only IGR annotations!"); Assert.assertTrue(maf.getRecords().stream() .anyMatch(v -> v.getAnnotationValue(MafOutputRendererConstants.FieldName_Variant_Classification).equals("Missense_Mutation") || - v.getAnnotationValue(MafOutputRendererConstants.FieldName_Variant_Classification).startsWith("Frame_Shift"))); + v.getAnnotationValue(MafOutputRendererConstants.FieldName_Variant_Classification).startsWith("Frame_Shift")), "Output produced unexpected VariantClassification"); // Get the protein changes: final String[] funcotationKeys = extractFuncotatorKeysFromHeaderDescription(funcotationHeaderLine.getDescription()); @@ -649,19 +690,19 @@ public void testVcfDatasourceAccountsForAltAlleles() { final FuncotatorArgumentDefinitions.OutputFormatType vcfOutputFormatType = FuncotatorArgumentDefinitions.OutputFormatType.VCF; final File vcfOutputFile = getOutputFile(vcfOutputFormatType); - final ArgumentsBuilder argumentsVcf = new ArgumentsBuilder(); - - argumentsVcf.addVCF(new File(PIK3CA_VCF_HG19_ALTS)); - argumentsVcf.addOutput(vcfOutputFile); - argumentsVcf.addReference(new File(b37Chr3Ref)); - argumentsVcf.addArgument(FuncotatorArgumentDefinitions.DATA_SOURCES_PATH_LONG_NAME, DS_PIK3CA_DIR); - argumentsVcf.addArgument(FuncotatorArgumentDefinitions.REFERENCE_VERSION_LONG_NAME, FuncotatorTestConstants.REFERENCE_VERSION_HG19); - argumentsVcf.addArgument(FuncotatorArgumentDefinitions.OUTPUT_FORMAT_LONG_NAME, vcfOutputFormatType.toString()); - argumentsVcf.addBooleanArgument(FuncotatorArgumentDefinitions.REMOVE_FILTERED_VARIANTS_LONG_NAME, false); + final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator( + PIK3CA_VCF_HG19_ALTS, + vcfOutputFile, + b37Chr3Ref, + DS_PIK3CA_DIR, + FuncotatorTestConstants.REFERENCE_VERSION_HG19, + vcfOutputFormatType, + false); // We need this argument since we are testing on a subset of b37 - argumentsVcf.addBooleanArgument(FuncotatorArgumentDefinitions.ALLOW_HG19_GENCODE_B37_CONTIG_MATCHING_OVERRIDE_LONG_NAME, true); - runCommandLine(argumentsVcf); + arguments.addBooleanArgument(FuncotatorArgumentDefinitions.FORCE_B37_TO_HG19_REFERENCE_CONTIG_CONVERSION, true); + + runCommandLine(arguments); final Pair> vcfInfo = VariantContextTestUtils.readEntireVCFIntoMemory(vcfOutputFile.getAbsolutePath()); final List variantContexts = vcfInfo.getRight(); @@ -677,17 +718,17 @@ public void testVcfDatasourceAccountsForAltAlleles() { final String gtString = (i == 0) ? "MedGen:C0027672,SNOMED_CT:699346009" : ""; final Map alleleToFuncotationMap = FuncotatorUtils.createAlleleToFuncotationMapFromFuncotationVcfAttribute(funcotationKeys, variantContexts.get(i), "Gencode_19_annotationTranscript", "TEST"); - Assert.assertEquals(alleleToFuncotationMap.entrySet().size(), 1); + Assert.assertEquals(alleleToFuncotationMap.entrySet().size(), 1, "Found more than 1 alternate allele!"); final FuncotationMap funcotationMap = alleleToFuncotationMap.values().iterator().next(); - Assert.assertEquals(funcotationMap.getTranscriptList().size(), 1); - Assert.assertTrue(funcotationMap.getTranscriptList().stream().noneMatch(k -> k.equals(FuncotationMap.NO_TRANSCRIPT_AVAILABLE_KEY))); - Assert.assertTrue(funcotationMap.getTranscriptList().stream().noneMatch(StringUtils::isEmpty)); + Assert.assertEquals(funcotationMap.getTranscriptList().size(), 1, "Found more than 1 funcotation!"); + Assert.assertTrue(funcotationMap.getTranscriptList().stream().noneMatch(k -> k.equals(FuncotationMap.NO_TRANSCRIPT_AVAILABLE_KEY)), "Found a funcotation with an unknown transcript name: " + FuncotationMap.NO_TRANSCRIPT_AVAILABLE_KEY); + Assert.assertTrue(funcotationMap.getTranscriptList().stream().noneMatch(StringUtils::isEmpty), "Found a funcotation with an empty transcript!"); final List funcotations = funcotationMap.get(funcotationMap.getTranscriptList().get(0)); - Assert.assertEquals(funcotations.size(), 1); + Assert.assertEquals(funcotations.size(), 1, "Found more than one funcotation in the funcotation map!"); final Funcotation funcotation = funcotations.get(0); - Assert.assertEquals(funcotation.getField("dummy_ClinVar_VCF_CLNDISDB"), FuncotatorUtils.sanitizeFuncotationForVcf(gtString)); + Assert.assertEquals(funcotation.getField("dummy_ClinVar_VCF_CLNDISDB"), FuncotatorUtils.sanitizeFuncotationForVcf(gtString), "Field (dummy_ClinVar_VCF_CLNDISDB) was unsanititzed: " + funcotation.getField("dummy_ClinVar_VCF_CLNDISDB")); } } @@ -696,15 +737,14 @@ public void testCanAnnotateSpanningDeletions() { final FuncotatorArgumentDefinitions.OutputFormatType outputFormatType = FuncotatorArgumentDefinitions.OutputFormatType.VCF; final File outputFile = getOutputFile(outputFormatType); - final ArgumentsBuilder arguments = new ArgumentsBuilder(); - - arguments.addVCF(new File(SPANNING_DEL_VCF)); - arguments.addOutput(outputFile); - arguments.addReference(new File(hg38Chr3Ref)); - arguments.addArgument(FuncotatorArgumentDefinitions.DATA_SOURCES_PATH_LONG_NAME, DS_PIK3CA_DIR); - arguments.addArgument(FuncotatorArgumentDefinitions.REFERENCE_VERSION_LONG_NAME, FuncotatorTestConstants.REFERENCE_VERSION_HG38); - arguments.addArgument(FuncotatorArgumentDefinitions.TRANSCRIPT_SELECTION_MODE_LONG_NAME, TranscriptSelectionMode.ALL.toString()); - arguments.addArgument(FuncotatorArgumentDefinitions.OUTPUT_FORMAT_LONG_NAME, outputFormatType.toString()); + final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator( + SPANNING_DEL_VCF, + outputFile, + hg38Chr3Ref, + DS_PIK3CA_DIR, + FuncotatorTestConstants.REFERENCE_VERSION_HG38, + outputFormatType, + false); runCommandLine(arguments); @@ -743,14 +783,14 @@ public void testNoSpanningDeletionWriteWithMAF() { final FuncotatorArgumentDefinitions.OutputFormatType outputFormatType = FuncotatorArgumentDefinitions.OutputFormatType.MAF; final File outputFile = getOutputFile(outputFormatType); - final ArgumentsBuilder arguments = new ArgumentsBuilder(); - - arguments.addVCF(new File(SPANNING_DEL_VCF)); - arguments.addOutput(outputFile); - arguments.addReference(new File(hg38Chr3Ref)); - arguments.addArgument(FuncotatorArgumentDefinitions.DATA_SOURCES_PATH_LONG_NAME, DS_PIK3CA_DIR); - arguments.addArgument(FuncotatorArgumentDefinitions.REFERENCE_VERSION_LONG_NAME, FuncotatorTestConstants.REFERENCE_VERSION_HG38); - arguments.addArgument(FuncotatorArgumentDefinitions.OUTPUT_FORMAT_LONG_NAME, outputFormatType.toString()); + final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator( + SPANNING_DEL_VCF, + outputFile, + hg38Chr3Ref, + DS_PIK3CA_DIR, + FuncotatorTestConstants.REFERENCE_VERSION_HG38, + outputFormatType, + false); runCommandLine(arguments); @@ -766,17 +806,20 @@ public void testVCFToMAFPreservesFields() { final FuncotatorArgumentDefinitions.OutputFormatType outputFormatType = FuncotatorArgumentDefinitions.OutputFormatType.MAF; final File outputFile = getOutputFile(outputFormatType); - final ArgumentsBuilder arguments = new ArgumentsBuilder(); + final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator( + PIK3CA_VCF_HG19, + outputFile, + b37Chr3Ref, + DS_PIK3CA_DIR, + FuncotatorTestConstants.REFERENCE_VERSION_HG19, + outputFormatType, + false); - arguments.addVCF(new File(PIK3CA_VCF_HG19)); - arguments.addOutput(outputFile); - arguments.addReference(new File(b37Chr3Ref)); - arguments.addArgument(FuncotatorArgumentDefinitions.DATA_SOURCES_PATH_LONG_NAME, DS_PIK3CA_DIR); - arguments.addArgument(FuncotatorArgumentDefinitions.REFERENCE_VERSION_LONG_NAME, FuncotatorTestConstants.REFERENCE_VERSION_HG19); - arguments.addArgument(FuncotatorArgumentDefinitions.OUTPUT_FORMAT_LONG_NAME, outputFormatType.toString()); - arguments.addBooleanArgument(FuncotatorArgumentDefinitions.ALLOW_HG19_GENCODE_B37_CONTIG_MATCHING_LONG_NAME, true); arguments.addArgument(FuncotatorArgumentDefinitions.TRANSCRIPT_SELECTION_MODE_LONG_NAME, TranscriptSelectionMode.CANONICAL.toString()); + // Disable the sequence dictionary check for the tests: + arguments.addBooleanArgument(FuncotatorArgumentDefinitions.FORCE_B37_TO_HG19_REFERENCE_CONTIG_CONVERSION, true); + runCommandLine(arguments); final AnnotatedIntervalCollection maf = AnnotatedIntervalCollection.create(outputFile.toPath(), null); @@ -793,7 +836,7 @@ public void testVCFToMAFPreservesFields() { // Get all of the alias lists final Pair> vcf = VariantContextTestUtils.readEntireVCFIntoMemory(PIK3CA_VCF_HG19); final Set vcfHeaderInfoSet = vcf.getLeft().getInfoHeaderLines().stream() - .map(h -> h.getID()) + .map(VCFCompoundHeaderLine::getID) .map(s -> mafAliasMap.getOrDefault(s, Collections.singleton(s))) .flatMap(Set::stream) .collect(Collectors.toSet()); @@ -809,23 +852,26 @@ public void testVCFToVCFPreservesFields() { final FuncotatorArgumentDefinitions.OutputFormatType outputFormatType = FuncotatorArgumentDefinitions.OutputFormatType.VCF; final File outputFile = getOutputFile(outputFormatType); - final ArgumentsBuilder arguments = new ArgumentsBuilder(); + final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator( + PIK3CA_VCF_HG19, + outputFile, + b37Chr3Ref, + DS_PIK3CA_DIR, + FuncotatorTestConstants.REFERENCE_VERSION_HG19, + outputFormatType, + false); - arguments.addVCF(new File(PIK3CA_VCF_HG19)); - arguments.addOutput(outputFile); - arguments.addReference(new File(b37Chr3Ref)); - arguments.addArgument(FuncotatorArgumentDefinitions.DATA_SOURCES_PATH_LONG_NAME, DS_PIK3CA_DIR); - arguments.addArgument(FuncotatorArgumentDefinitions.REFERENCE_VERSION_LONG_NAME, FuncotatorTestConstants.REFERENCE_VERSION_HG19); - arguments.addArgument(FuncotatorArgumentDefinitions.OUTPUT_FORMAT_LONG_NAME, outputFormatType.toString()); - arguments.addBooleanArgument(FuncotatorArgumentDefinitions.ALLOW_HG19_GENCODE_B37_CONTIG_MATCHING_LONG_NAME, true); arguments.addArgument(FuncotatorArgumentDefinitions.TRANSCRIPT_SELECTION_MODE_LONG_NAME, TranscriptSelectionMode.CANONICAL.toString()); + // We need this argument since we are testing on a subset of b37 + arguments.addBooleanArgument(FuncotatorArgumentDefinitions.FORCE_B37_TO_HG19_REFERENCE_CONTIG_CONVERSION, true); + runCommandLine(arguments); final Pair> vcf = VariantContextTestUtils.readEntireVCFIntoMemory(outputFile.getAbsolutePath()); final Set vcfFields = vcf.getLeft().getInfoHeaderLines().stream() - .map(h -> h.getID()) + .map(VCFCompoundHeaderLine::getID) .collect(Collectors.toSet()); final Set missingFields = Sets.difference(PIK3CA_VCF_HG19_INPUT_FIELDS, vcfFields); @@ -859,15 +905,16 @@ public void testMafCustomCountFields(final String tnVcf) { final FuncotatorArgumentDefinitions.OutputFormatType outputFormatType = FuncotatorArgumentDefinitions.OutputFormatType.MAF; final File outputFile = getOutputFile(outputFormatType); - final ArgumentsBuilder arguments = new ArgumentsBuilder(); + final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator( + tnVcf, + outputFile, + b37Chr3Ref, + DS_PIK3CA_DIR, + FuncotatorTestConstants.REFERENCE_VERSION_HG19, + outputFormatType, + false); - arguments.addVCF(new File(tnVcf)); - arguments.addOutput(outputFile); - arguments.addReference(new File(b37Chr3Ref)); - arguments.addArgument(FuncotatorArgumentDefinitions.DATA_SOURCES_PATH_LONG_NAME, DS_PIK3CA_DIR); - arguments.addArgument(FuncotatorArgumentDefinitions.REFERENCE_VERSION_LONG_NAME, FuncotatorTestConstants.REFERENCE_VERSION_HG19); - arguments.addArgument(FuncotatorArgumentDefinitions.OUTPUT_FORMAT_LONG_NAME, outputFormatType.toString()); - arguments.addBooleanArgument(FuncotatorArgumentDefinitions.ALLOW_HG19_GENCODE_B37_CONTIG_MATCHING_LONG_NAME, true); + arguments.addBooleanArgument(FuncotatorArgumentDefinitions.FORCE_B37_TO_HG19_REFERENCE_CONTIG_CONVERSION, true); arguments.addArgument(FuncotatorArgumentDefinitions.TRANSCRIPT_SELECTION_MODE_LONG_NAME, TranscriptSelectionMode.CANONICAL.toString()); runCommandLine(arguments); @@ -914,15 +961,16 @@ public void testMafCustomCountFieldsTumorOnly(final String tnVcf) { final FuncotatorArgumentDefinitions.OutputFormatType outputFormatType = FuncotatorArgumentDefinitions.OutputFormatType.MAF; final File outputFile = getOutputFile(outputFormatType); - final ArgumentsBuilder arguments = new ArgumentsBuilder(); + final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator( + tnVcf, + outputFile, + b37Chr3Ref, + DS_PIK3CA_DIR, + FuncotatorTestConstants.REFERENCE_VERSION_HG19, + outputFormatType, + false); - arguments.addVCF(new File(tnVcf)); - arguments.addOutput(outputFile); - arguments.addReference(new File(b37Chr3Ref)); - arguments.addArgument(FuncotatorArgumentDefinitions.DATA_SOURCES_PATH_LONG_NAME, DS_PIK3CA_DIR); - arguments.addArgument(FuncotatorArgumentDefinitions.REFERENCE_VERSION_LONG_NAME, FuncotatorTestConstants.REFERENCE_VERSION_HG19); - arguments.addArgument(FuncotatorArgumentDefinitions.OUTPUT_FORMAT_LONG_NAME, outputFormatType.toString()); - arguments.addBooleanArgument(FuncotatorArgumentDefinitions.ALLOW_HG19_GENCODE_B37_CONTIG_MATCHING_LONG_NAME, true); + arguments.addBooleanArgument(FuncotatorArgumentDefinitions.FORCE_B37_TO_HG19_REFERENCE_CONTIG_CONVERSION, true); arguments.addArgument(FuncotatorArgumentDefinitions.TRANSCRIPT_SELECTION_MODE_LONG_NAME, TranscriptSelectionMode.CANONICAL.toString()); runCommandLine(arguments); @@ -967,18 +1015,43 @@ private void runSomaticVcf(final String vcfFile) { final FuncotatorArgumentDefinitions.OutputFormatType outputFormatType = FuncotatorArgumentDefinitions.OutputFormatType.MAF; final File outputFile = getOutputFile(outputFormatType); - final ArgumentsBuilder arguments = new ArgumentsBuilder(); + final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator( + vcfFile, + outputFile, + b37Chr3Ref, + DS_PIK3CA_DIR, + FuncotatorTestConstants.REFERENCE_VERSION_HG19, + outputFormatType, + false); - arguments.addVCF(new File(vcfFile)); - arguments.addOutput(outputFile); - arguments.addReference(new File(b37Chr3Ref)); - arguments.addArgument(FuncotatorArgumentDefinitions.DATA_SOURCES_PATH_LONG_NAME, DS_PIK3CA_DIR); - arguments.addArgument(FuncotatorArgumentDefinitions.REFERENCE_VERSION_LONG_NAME, FuncotatorTestConstants.REFERENCE_VERSION_HG19); - arguments.addArgument(FuncotatorArgumentDefinitions.OUTPUT_FORMAT_LONG_NAME, outputFormatType.toString()); - arguments.addBooleanArgument(FuncotatorArgumentDefinitions.ALLOW_HG19_GENCODE_B37_CONTIG_MATCHING_LONG_NAME, true); + arguments.addBooleanArgument(FuncotatorArgumentDefinitions.FORCE_B37_TO_HG19_REFERENCE_CONTIG_CONVERSION, true); arguments.addArgument(FuncotatorArgumentDefinitions.TRANSCRIPT_SELECTION_MODE_LONG_NAME, TranscriptSelectionMode.CANONICAL.toString()); runCommandLine(arguments); } + + @Test(expectedExceptions = UserException.IncompatibleSequenceDictionaries.class) + public void testSequenceDictionaryCheck() { + final FuncotatorArgumentDefinitions.OutputFormatType outputFormatType = FuncotatorArgumentDefinitions.OutputFormatType.VCF; + final File outputFile = getOutputFile(outputFormatType); + + // The input VCF and Reference file are incompatible because + // the reference file dictionary has only chromosome 2 and the + // input VCF has a dictionary that contains all contigs for HG19. + // Therefore the reference dictionary is NOT a superset of the input VCF dictionary. + + final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator( + XSV_CLINVAR_COL_TEST_VCF, + outputFile, + b37Chr2Ref, + DS_XSV_CLINVAR_TESTS, + FuncotatorTestConstants.REFERENCE_VERSION_HG19, + outputFormatType, + true); + + arguments.addBooleanArgument(FuncotatorArgumentDefinitions.FORCE_B37_TO_HG19_REFERENCE_CONTIG_CONVERSION, true); + + runCommandLine(arguments); + } } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/gencode/GencodeFuncotationFactoryUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/gencode/GencodeFuncotationFactoryUnitTest.java index 25cb411c2ed..e5525336094 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/gencode/GencodeFuncotationFactoryUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/gencode/GencodeFuncotationFactoryUnitTest.java @@ -70,7 +70,7 @@ public class GencodeFuncotationFactoryUnitTest extends GATKBaseTest { GencodeFuncotationFactory.DEFAULT_NAME, FuncotatorArgumentDefinitions.TRANSCRIPT_SELECTION_MODE_DEFAULT_VALUE, new HashSet<>(), - new LinkedHashMap<>(), false); + new LinkedHashMap<>()); } //================================================================================================================== @@ -1158,7 +1158,7 @@ void testMuc16SnpCreateFuncotations(final int chromosomeNumber, GencodeFuncotationFactory.DEFAULT_NAME, FuncotatorArgumentDefinitions.TRANSCRIPT_SELECTION_MODE_DEFAULT_VALUE, requestedTranscriptIds, - new LinkedHashMap<>(), true)) { + new LinkedHashMap<>())) { // Generate our funcotations: final List featureList = new ArrayList<>(); @@ -1215,7 +1215,7 @@ void createNonBasicFuncotations(final int start, final int end) { GencodeFuncotationFactory.DEFAULT_NAME, FuncotatorArgumentDefinitions.TRANSCRIPT_SELECTION_MODE_DEFAULT_VALUE, new HashSet<>(), - new LinkedHashMap<>(), false)) { + new LinkedHashMap<>())) { // Generate our funcotations: final List featureList = new ArrayList<>(); @@ -1285,7 +1285,7 @@ void testCreateFuncotations(final String expectedGeneName, GencodeFuncotationFactory.DEFAULT_NAME, FuncotatorArgumentDefinitions.TRANSCRIPT_SELECTION_MODE_DEFAULT_VALUE, requestedTranscriptIds, - new LinkedHashMap<>(), true)) { + new LinkedHashMap<>())) { final List featureList = new ArrayList<>(); featureList.add( gene ); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRendererUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRendererUnitTest.java index 31d5cf2226c..0e8d6f3d61b 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRendererUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRendererUnitTest.java @@ -108,7 +108,7 @@ private MafOutputRenderer createMafOutputRenderer(final File outputFile) { configData, new LinkedHashMap<>(), TranscriptSelectionMode.BEST_EFFECT, - new HashSet<>(), true + new HashSet<>() ); // Sort the datasources to ensure the same order every time: diff --git a/src/test/resources/large/funcotator/small_ds_pik3ca/dummy_clinvar_pik3ca/hg19/dummy_clinvar_hg19_pik3ca.vcf b/src/test/resources/large/funcotator/small_ds_pik3ca/dummy_clinvar_pik3ca/hg19/dummy_clinvar_hg19_pik3ca.vcf index b13df260408..0cfed940df1 100755 --- a/src/test/resources/large/funcotator/small_ds_pik3ca/dummy_clinvar_pik3ca/hg19/dummy_clinvar_hg19_pik3ca.vcf +++ b/src/test/resources/large/funcotator/small_ds_pik3ca/dummy_clinvar_pik3ca/hg19/dummy_clinvar_hg19_pik3ca.vcf @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69c07c3c94f5ed87b879307bd7c3ea404c12527af300362b59c567a54463751f -size 135125 +oid sha256:8b00b13d5903fa283d5aa2e041699d282fc5790767d08e3ae70b645cce551ff1 +size 135686 diff --git a/src/test/resources/large/funcotator/small_ds_pik3ca/dummy_clinvar_pik3ca/hg19/dummy_clinvar_hg19_pik3ca.vcf.idx b/src/test/resources/large/funcotator/small_ds_pik3ca/dummy_clinvar_pik3ca/hg19/dummy_clinvar_hg19_pik3ca.vcf.idx old mode 100755 new mode 100644 index 2982cfd9471..d27bc5c17af --- a/src/test/resources/large/funcotator/small_ds_pik3ca/dummy_clinvar_pik3ca/hg19/dummy_clinvar_hg19_pik3ca.vcf.idx +++ b/src/test/resources/large/funcotator/small_ds_pik3ca/dummy_clinvar_pik3ca/hg19/dummy_clinvar_hg19_pik3ca.vcf.idx @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:14226ec0498bfa25c17d0b38d69029da4deb88ab40c611ab09a6f1f895400461 -size 89784 +oid sha256:c206317ba24f60b5fb1104639172be9eed5870e931332636f87a64492afd02f9 +size 89844