diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/argumentcollections/OptionalReadInputArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/cmdline/argumentcollections/OptionalReadInputArgumentCollection.java index 0c25df0bd9a..eb9dc86b062 100644 --- a/src/main/java/org/broadinstitute/hellbender/cmdline/argumentcollections/OptionalReadInputArgumentCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/cmdline/argumentcollections/OptionalReadInputArgumentCollection.java @@ -16,8 +16,12 @@ public final class OptionalReadInputArgumentCollection extends ReadInputArgumentCollection { private static final long serialVersionUID = 1L; - @Argument(fullName = StandardArgumentDefinitions.INPUT_LONG_NAME, shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME, doc = "BAM/SAM/CRAM file containing reads", optional = true, common = true) - private List readFilesNames; + @Argument(fullName = StandardArgumentDefinitions.INPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME, + doc = "BAM/SAM/CRAM file containing reads", + optional = true, + common = true) + private List readFilesNames = new ArrayList<>(); @Override public List getReadFiles() { diff --git a/src/main/java/org/broadinstitute/hellbender/engine/FeatureContext.java b/src/main/java/org/broadinstitute/hellbender/engine/FeatureContext.java index 324a53be269..34b02280b10 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/FeatureContext.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/FeatureContext.java @@ -1,13 +1,13 @@ package org.broadinstitute.hellbender.engine; +import com.google.common.annotations.VisibleForTesting; import htsjdk.tribble.Feature; +import org.broadinstitute.hellbender.cmdline.CommandLineProgram; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.Utils; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import 
java.util.List; +import java.nio.file.Path; +import java.util.*; import java.util.stream.Collectors; /** @@ -290,4 +290,32 @@ public List getValues(final Collection> f private List subsetToStartPosition(final Collection features, final int start) { return features.stream().filter(feat -> feat.getStart() == start).collect(Collectors.toList()); } + + /** + * Convenience method to create a new instance for test methods. + * This method should be used for testing only. + * + * @param featureInputsWithType {@link Map} of a {@link FeatureInput} to the output type that must extend {@link Feature}. + * Never {@code null}, but an empty map is acceptable. + * @param dummyToolInstanceName A name to use for the "tool". Any string will work here. Never {@code null}. + * @param interval genomic interval for the result. Typically, this would be the interval of the variant. Never {@code null}. + * @param featureQueryLookahead When querying FeatureDataSources, cache this many extra bases of context beyond + * the end of query intervals in anticipation of future queries. Must be >= 0. If uncertain, use zero. + * @param cloudPrefetchBuffer See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, Path)}. If uncertain, use zero. + * @param cloudIndexPrefetchBuffer See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, Path)}. If uncertain, use zero. + * @param reference See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, Path)}. If uncertain, use {@code null}. 
+ */ + @VisibleForTesting + public static FeatureContext createFeatureContextForTesting(final Map, Class> featureInputsWithType, final String dummyToolInstanceName, + final SimpleInterval interval, final int featureQueryLookahead, final int cloudPrefetchBuffer, + final int cloudIndexPrefetchBuffer, final Path reference) { + Utils.nonNull(featureInputsWithType); + Utils.nonNull(dummyToolInstanceName); + Utils.nonNull(interval); + + final FeatureManager featureManager = new FeatureManager(featureInputsWithType, dummyToolInstanceName, + featureQueryLookahead, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, reference); + + return new FeatureContext(featureManager, interval); + } } diff --git a/src/main/java/org/broadinstitute/hellbender/engine/FeatureInput.java b/src/main/java/org/broadinstitute/hellbender/engine/FeatureInput.java index 017da21e7fd..320f9c6ef94 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/FeatureInput.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/FeatureInput.java @@ -25,6 +25,9 @@ * system only in order to be recognized by the Feature management system. This is why the constructor is * marked as protected. * + * If you still want to instantiate this class directly, you will have to call {@link GATKTool#addFeatureInputsAfterInitialization(String, String, Class, int)} + * in order to register the FeatureInput with the engine. 
+ * * FeatureInputs can be assigned logical names on the command line using the syntax: * * --argument_name logical_name:feature_file diff --git a/src/main/java/org/broadinstitute/hellbender/engine/FeatureManager.java b/src/main/java/org/broadinstitute/hellbender/engine/FeatureManager.java index 8bcdda19e34..d44c7769db7 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/FeatureManager.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/FeatureManager.java @@ -1,5 +1,6 @@ package org.broadinstitute.hellbender.engine; +import com.google.common.annotations.VisibleForTesting; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.tribble.Feature; import htsjdk.tribble.FeatureCodec; @@ -14,6 +15,7 @@ import org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.config.ConfigFactory; import org.broadinstitute.hellbender.utils.config.GATKConfig; @@ -153,6 +155,30 @@ public FeatureManager(final CommandLineProgram toolInstance, final int featureQu initializeFeatureSources(featureQueryLookahead, toolInstance, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, reference); } + /** + * Same as {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, Path)}, except used when the + * FeatureInputs (and associated types) are known. + * + * This constructor should only be used in test code. + * + * @param featureInputsToTypeMap {@link Map} of a {@link FeatureInput} to the output type that must extend {@link Feature}. 
Never {@code null} + * @param toolInstanceName See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, Path)} + * @param featureQueryLookahead See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, Path)} + * @param cloudPrefetchBuffer See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, Path)} + * @param cloudIndexPrefetchBuffer See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, Path)} + * @param reference See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, Path)} + */ + @VisibleForTesting + FeatureManager(final Map, Class> featureInputsToTypeMap, final String toolInstanceName, final int featureQueryLookahead, final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer, final Path reference) { + + Utils.nonNull(featureInputsToTypeMap); + + this.toolInstanceSimpleClassName = toolInstanceName; + this.featureSources = new LinkedHashMap<>(); + Utils.nonNull(featureInputsToTypeMap); + featureInputsToTypeMap.forEach((k,v) -> addToFeatureSources(featureQueryLookahead, k, v, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, reference)); + } + /** * Given our tool instance, discover all argument of type FeatureInput (or Collections thereof), determine * the type of each Feature-containing file, and add a FeatureDataSource for each file to our query pool. diff --git a/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java b/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java index 07c65fadf93..2c1adabad34 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java @@ -863,22 +863,8 @@ record = header.getProgramRecord(pgID); } /** - * Call {@link GATKTool#addFeatureInputsAfterInitialization(String, String, Class, int)} with no caching. 
- * - * @param filePath See {@link #addFeatureInputsAfterInitialization(String, String, Class, int)} - * @param name See {@link #addFeatureInputsAfterInitialization(String, String, Class, int)} - * @param featureType See {@link #addFeatureInputsAfterInitialization(String, String, Class, int)} - * @return The {@link FeatureInput} used as the key for this data source. - */ - protected FeatureInput addFeatureInputsAfterInitialization(final String filePath, final String name, - final Class featureType) { - - return addFeatureInputsAfterInitialization(filePath, name, featureType, 0); - } - - /** - * A method to allow a user to inject data sources after initialization that were not specified as command-line - * arguments. + * A method to allow a user to inject {@link FeatureInput}s after initialization that were not + * specified as command-line arguments. * * @param filePath path to the Feature file to register * @param name what to call the Feature input @@ -886,14 +872,15 @@ protected FeatureInput addFeatureInputsAfterInitialization(fi * @param featureQueryLookahead look ahead this many bases during queries that produce cache misses * @return The {@link FeatureInput} used as the key for this data source. */ - protected FeatureInput addFeatureInputsAfterInitialization(final String filePath, - final String name, - final Class featureType, final int featureQueryLookahead) { + public FeatureInput addFeatureInputsAfterInitialization(final String filePath, + final String name, + final Class featureType, + final int featureQueryLookahead) { final FeatureInput featureInput = new FeatureInput<>(filePath, name); - //Add datasource to the feature manager too so that it can be queried. Setting lookahead to 0 to avoid caching. - //Note: we are disabling lookahead here because of windowed queries that need to "look behind" as well. 
+ // Add the FeatureInput to our FeatureManager so that it will be available for FeatureContext queries + // from the tool features.addToFeatureSources( featureQueryLookahead, featureInput, diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/DataSourceFuncotationFactory.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/DataSourceFuncotationFactory.java index 9729cf96aaa..ea3abe44152 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/DataSourceFuncotationFactory.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/DataSourceFuncotationFactory.java @@ -5,12 +5,14 @@ import htsjdk.variant.variantcontext.VariantContext; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.broadinstitute.barclay.utils.Utils; +import org.broadinstitute.hellbender.engine.FeatureContext; +import org.broadinstitute.hellbender.engine.FeatureInput; import org.broadinstitute.hellbender.engine.ReferenceContext; import org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation; import java.io.Closeable; import java.util.*; -import java.util.stream.Collectors; /** * An abstract class to allow for the creation of a {@link Funcotation} for a given data source. @@ -37,6 +39,33 @@ public abstract class DataSourceFuncotationFactory implements Closeable { */ protected Map annotationOverrideMap; + /** + * The backing data store as a FeatureInput to leverage tribble querying. Can be {@code null} for non-locatable + * funcotation factories. + */ + protected final FeatureInput mainSourceFileAsFeatureInput; + + @VisibleForTesting + public FeatureInput getMainSourceFileAsFeatureInput() { + return mainSourceFileAsFeatureInput; + } + + /** + * Constructor to initialize final fields in this class with defaults. 
+ */ + protected DataSourceFuncotationFactory() { + this.mainSourceFileAsFeatureInput = null; + } + + /** + * Constructor to initialize final fields in this class. + * @param mainSourceFileAsFeatureInput The backing data store as a FeatureInput to leverage tribble querying. Can be {@code null} for non-locatable funcotation factories. + */ + protected DataSourceFuncotationFactory(final FeatureInput mainSourceFileAsFeatureInput) { + this.mainSourceFileAsFeatureInput = mainSourceFileAsFeatureInput; + } + + /** * Set values in {@link DataSourceFuncotationFactory#annotationOverrideMap} based on the given annotation override values * and whether or not this {@link DataSourceFuncotationFactory} supports those annotations. @@ -106,27 +135,33 @@ public String getVersion() { * Accounts for override values passed into the constructor as well. * @param variant {@link VariantContext} to annotate. * @param referenceContext {@link ReferenceContext} corresponding to the given {@code variant}. - * @param featureSourceMap {@link Map} of {@link String} -> {@link List} of {@link Feature} (data source name -> data source features corresponding to the given {@code variant}. + * @param featureContext {@link FeatureContext} corresponding to the variant. Never {@code null}. * @return {@link List} of {@link Funcotation} given the {@code variant}, {@code referenceContext}, and {@code featureContext}. This should never be empty. 
*/ - public List createFuncotations(final VariantContext variant, final ReferenceContext referenceContext, final Map> featureSourceMap) { - return createFuncotations(variant, referenceContext, featureSourceMap, null); + public List createFuncotations(final VariantContext variant, final ReferenceContext referenceContext, final FeatureContext featureContext) { + return createFuncotations(variant, referenceContext, featureContext, null); } /** * Creates a {@link List} of {@link Funcotation} for the given {@code variant}, {@code referenceContext}, {@code featureContext}, and {@code gencodeFuncotations}. * For some Data Sources knowledge of Gene Name or Transcript ID is required for annotation. * Accounts for override values passed into the constructor as well. - * @param variant {@link VariantContext} to annotate. - * @param referenceContext {@link ReferenceContext} corresponding to the given {@code variant}. - * @param featureSourceMap {@link Map} of {@link String} -> {@link List} of {@link Feature} (data source name -> data source features corresponding to the given {@code variant}. + * @param variant {@link VariantContext} to annotate. Never {@code null}. + * @param referenceContext {@link ReferenceContext} corresponding to the given {@code variant}. Never {@code null}. + * @param featureContext {@link FeatureContext} corresponding to the variant. Never {@code null}. * @param gencodeFuncotations {@link List} of {@link GencodeFuncotation} that have already been created for the given {@code variant}/{@code referenceContext}/{@code featureContext}. + * {@code null} is acceptable if there are no corresponding gencode funcotations. * @return {@link List} of {@link Funcotation} given the {@code variant}, {@code referenceContext}, and {@code featureContext}. This should never be empty. 
*/ - public List createFuncotations(final VariantContext variant, final ReferenceContext referenceContext, final Map> featureSourceMap, final List gencodeFuncotations) { + public List createFuncotations(final VariantContext variant, final ReferenceContext referenceContext, final FeatureContext featureContext, final List gencodeFuncotations) { + + Utils.nonNull(variant); + Utils.nonNull(referenceContext); + Utils.nonNull(featureContext); - // Get the features that this funcotation factory is responsible for: - final List featureList = getFeatureListFromMap(featureSourceMap); + // Query this funcotation factory to get the list of overlapping features. + @SuppressWarnings("unchecked") + final List featureList = (List) featureContext.getValues(mainSourceFileAsFeatureInput); final List outputFuncotations; @@ -175,30 +210,6 @@ private boolean isFeatureListCompatible(final List featureList) { return foundCompatibleFeature; } - /** - * Get the list of features to annotate from the given Map of features. - * Extracts the feature list given the name of this {@link DataSourceFuncotationFactory}. - * @param featureSourceMap {@link Map} of {@link String} -> ({@link List} of {@link Feature}) (Data source name -> feature list) containing all features that could be used for this {@link DataSourceFuncotationFactory}. - * @return A {@link List} of {@link Feature} that are to be annotated by this {@link DataSourceFuncotationFactory} - */ - private List getFeatureListFromMap(final Map> featureSourceMap) { - // Get the features that this funcotation factory is responsible for: - final List featureList; - - // Only worry about name filtering if we care about the specific feature type: - // NOTE: This should probably be fixed to key off some other abstract class logic. 
- if ( getAnnotationFeatureClass().equals(Feature.class) ) { - featureList = featureSourceMap.entrySet().stream() - .map(Map.Entry::getValue) - .flatMap(Collection::stream) - .collect(Collectors.toList()); - } - else { - featureList = featureSourceMap.getOrDefault( getName(), Collections.emptyList() ); - } - return featureList; - } - /** * Creates a {@link List} of {@link Funcotation} for the given {@code variant} and {@code referenceContext}. * These will be default funcotations that essentially have empty values. @@ -234,5 +245,6 @@ protected abstract List createFuncotationsOnVariant(final VariantCo /** * @return Get the {@link Class} of the feature type that can be used to create annotations by this {@link DataSourceFuncotationFactory}. */ - protected abstract Class getAnnotationFeatureClass(); + @VisibleForTesting + public abstract Class getAnnotationFeatureClass(); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotationMap.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotationMap.java index a94723178c8..08b0789cc28 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotationMap.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotationMap.java @@ -61,7 +61,9 @@ public List get(final String transcriptId) { * @param transcriptId the specified transcript ID. Use {@see NO_TRANSCRIPT_AVAILABLE_KEY} if there are no transcripts. Never {@code null} * @param fieldName The field name to search. Never {@code null} * @param allele Only return fields from funcotations with the specified allele. Never {@code null} - * @return Value of the given field for the transcript ID and allele. Return {@code null} if field not found. + * @return Value of the given field for the transcript ID and allele. Return {@code null} if field not found in any + * funcotation. 
Note that if the funcotations support the given field name, but the variant did not overlap any + * records, an empty string will be returned. */ public String getFieldValue(final String transcriptId, final String fieldName, final Allele allele) { Utils.nonNull(transcriptId); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/Funcotator.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/Funcotator.java index 75d78301747..d591da1f2e9 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/Funcotator.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/Funcotator.java @@ -1,41 +1,26 @@ package org.broadinstitute.hellbender.tools.funcotator; import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.tribble.Feature; -import htsjdk.tribble.util.ParsingUtils; import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.vcf.VCFHeader; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import org.broadinstitute.barclay.argparser.*; +import org.broadinstitute.barclay.argparser.ArgumentCollection; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; import org.broadinstitute.barclay.help.DocumentedFeature; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.engine.*; import org.broadinstitute.hellbender.engine.filters.VariantFilter; -import org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.tools.funcotator.dataSources.DataSourceUtils; -import org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation; -import org.broadinstitute.hellbender.tools.funcotator.mafOutput.MafOutputRenderer; -import 
org.broadinstitute.hellbender.tools.funcotator.metadata.FuncotationMetadata; import org.broadinstitute.hellbender.tools.funcotator.metadata.VcfFuncotationMetadata; -import org.broadinstitute.hellbender.tools.funcotator.vcfOutput.VcfOutputRenderer; import org.broadinstitute.hellbender.transformers.VariantTransformer; import org.broadinstitute.hellbender.utils.SequenceDictionaryUtils; -import org.broadinstitute.hellbender.utils.SimpleInterval; -import org.broadinstitute.hellbender.utils.codecs.gencode.GencodeGtfFeature; -import org.broadinstitute.hellbender.utils.codecs.xsvLocatableTable.XsvTableFeature; -import org.broadinstitute.hellbender.utils.io.IOUtils; import picard.cmdline.programgroups.VariantEvaluationProgramGroup; -import java.io.BufferedReader; -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; import java.nio.file.Path; import java.util.*; -import java.util.stream.Collectors; -import java.util.stream.Stream; /** * Funcotator (FUNCtional annOTATOR) analyzes given variants for their function (as retrieved from a set of data sources) and produces the analysis in a specified output file. @@ -220,105 +205,23 @@ public class Funcotator extends VariantWalker { //================================================================================================================== // Arguments: - //----------------------------------------------------- - // Required args: - - @Argument( - shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, - fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, - doc = "Output VCF file to which annotated variants should be written.") - protected File outputFile; - - @Argument( - fullName = FuncotatorArgumentDefinitions.REFERENCE_VERSION_LONG_NAME, - doc = "The version of the Human Genome reference to use (e.g. hg19, hg38, etc.). This will correspond to a sub-folder of each data source corresponding to that data source for the given reference." 
- ) - private String referenceVersion; - - @Argument( - fullName = FuncotatorArgumentDefinitions.DATA_SOURCES_PATH_LONG_NAME, - doc = "The path to a data source folder for Funcotator. May be specified more than once to handle multiple data source folders." - ) - private List dataSourceDirectories; - - @Argument( - fullName = FuncotatorArgumentDefinitions.OUTPUT_FORMAT_LONG_NAME, - doc = "The output file format. Either VCF or MAF. Please note that MAF output for germline use case VCFs is unsupported." - ) - private FuncotatorArgumentDefinitions.OutputFormatType outputFormatType; - - //----------------------------------------------------- - // Optional args: - - @Argument( - fullName = FuncotatorArgumentDefinitions.REMOVE_FILTERED_VARIANTS_LONG_NAME, - optional = true, - doc = "Ignore/drop variants that have been filtered in the input. These variants will not appear in the output file." - ) - private boolean removeFilteredVariants = false; - - @Argument( - fullName = FuncotatorArgumentDefinitions.TRANSCRIPT_SELECTION_MODE_LONG_NAME, - optional = true, - doc = "Method of detailed transcript selection. This will select the transcript for detailed annotation (CANONICAL, ALL, or BEST_EFFECT)." - ) - private TranscriptSelectionMode transcriptSelectionMode = FuncotatorArgumentDefinitions.TRANSCRIPT_SELECTION_MODE_DEFAULT_VALUE; - - @Argument( - fullName = FuncotatorArgumentDefinitions.TRANSCRIPT_LIST_LONG_NAME, - optional = true, - doc = "File to use as a list of transcripts (one transcript ID per line, version numbers are ignored) OR A set of transcript IDs to use for annotation to override selected transcript." - ) - private Set userTranscriptIdSet = new HashSet<>(); - - @Argument( - fullName = FuncotatorArgumentDefinitions.ANNOTATION_DEFAULTS_LONG_NAME, - optional = true, - doc = "Annotations to include in all annotated variants if the annotation is not specified in the data sources (in the format :). 
This will add the specified annotation to every annotated variant if it is not already present." - ) - private List annotationDefaults = new ArrayList<>(); - - @Argument( - fullName = FuncotatorArgumentDefinitions.ANNOTATION_OVERRIDES_LONG_NAME, - optional = true, - doc = "Override values for annotations (in the format :). Replaces existing annotations of the given name with given values." - ) - private List annotationOverrides = new ArrayList<>(); - - @Argument( - fullName = FuncotatorArgumentDefinitions.LOOKAHEAD_CACHE_IN_BP_NAME, - optional = true, - minValue = 0, - doc = "Number of base-pairs to cache when querying variants." - ) - private int lookaheadFeatureCachingInBp = FuncotatorArgumentDefinitions.LOOKAHEAD_CACHE_IN_BP_DEFAULT_VALUE; - - @Advanced - @Hidden - @Argument( - fullName = FuncotatorArgumentDefinitions.FORCE_B37_TO_HG19_REFERENCE_CONTIG_CONVERSION, - optional = true, - doc = "(Advanced / DO NOT USE*) If you select this flag, Funcotator will force a conversion of variant contig names from b37 to hg19. *This option is useful in integration tests (written by devs) only." - ) - private boolean forceB37ToHg19ContigNameConversion = false; + @ArgumentCollection + private final FuncotatorArgumentCollection funcotatorArgs = new FuncotatorArgumentCollection(); //================================================================================================================== private OutputRenderer outputRenderer; - private final List dataSourceFactories = new ArrayList<>(); - private final List> manualLocatableFeatureInputs = new ArrayList<>(); + private FuncotatorEngine funcotatorEngine; + + //================================================================================================================== /** - * Whether the input variant contigs must be converted to hg19. - * This is only the case when the input reference is b37 AND when - * the reference version is hg19 (i.e. 
{@link #referenceVersion} == {@link FuncotatorArgumentDefinitions#HG19_REFERENCE_VERSION_STRING}). + * @return The {@link Funcotator}-specific arguments used to instantiate this {@link Funcotator} instance. */ - private boolean mustConvertInputContigsToHg19 = false; - - private FuncotationMetadata inputMetadata; - - //================================================================================================================== + public FuncotatorArgumentCollection getArguments() { + return funcotatorArgs; + } @Override protected String getVersion() { @@ -339,56 +242,51 @@ public void onTraversalStart() { } // Next set up our transcript list: - userTranscriptIdSet = processTranscriptList(userTranscriptIdSet); + final Set finalUserTranscriptIdSet = FuncotatorEngine.processTranscriptList(funcotatorArgs.userTranscriptIdSet); + + // Get our overrides for annotations: + final LinkedHashMap annotationDefaultsMap = FuncotatorEngine.splitAnnotationArgsIntoMap(funcotatorArgs.annotationDefaults); + final LinkedHashMap annotationOverridesMap = FuncotatorEngine.splitAnnotationArgsIntoMap(funcotatorArgs.annotationOverrides); - final LinkedHashMap annotationDefaultsMap = splitAnnotationArgsIntoMap(annotationDefaults); - final LinkedHashMap annotationOverridesMap = splitAnnotationArgsIntoMap(annotationOverrides); + // Get the header for our variants: + final VCFHeader vcfHeader = getHeaderForVariants(); // Initialize all of our data sources: // Sort data sources to make them process in the same order each time: - dataSourceDirectories.sort(Comparator.naturalOrder()); - final Map configData = DataSourceUtils.getAndValidateDataSourcesFromPaths(referenceVersion, dataSourceDirectories); - initializeManualFeaturesForLocatableDataSources(configData); - dataSourceFactories.addAll( - DataSourceUtils.createDataSourceFuncotationFactoriesForDataSources(configData, annotationOverridesMap, transcriptSelectionMode, userTranscriptIdSet) + 
funcotatorArgs.dataSourceDirectories.sort(Comparator.naturalOrder()); + final Map configData = DataSourceUtils.getAndValidateDataSourcesFromPaths(funcotatorArgs.referenceVersion, funcotatorArgs.dataSourceDirectories); + + // Create the data sources from the input: + // This will also create and register the FeatureInputs (created by the Data Sources) + // with the GATK Engine, so we do not have to plumb them in after the fact. + final List dataSourceFuncotationFactories = DataSourceUtils.createDataSourceFuncotationFactoriesForDataSources( + configData, + annotationOverridesMap, + funcotatorArgs.transcriptSelectionMode, + finalUserTranscriptIdSet, + this, + funcotatorArgs.lookaheadFeatureCachingInBp ); - // Sort our data source factories to ensure they're always in the same order: gencode datasources first - dataSourceFactories.sort(DataSourceUtils::datasourceComparator); - - // Create the metadata directly from the input. - inputMetadata = VcfFuncotationMetadata.create(new ArrayList<>(getHeaderForVariants().getInfoHeaderLines())); - - // Determine which annotations are accounted for (by the funcotation factories) and which are not. 
- final LinkedHashMap unaccountedForDefaultAnnotations = getUnaccountedForAnnotations( dataSourceFactories, annotationDefaultsMap ); - final LinkedHashMap unaccountedForOverrideAnnotations = getUnaccountedForAnnotations( dataSourceFactories, annotationOverridesMap ); - - // Set up our output renderer: - switch (outputFormatType) { - case MAF: - outputRenderer = new MafOutputRenderer(outputFile.toPath(), - dataSourceFactories, - getHeaderForVariants(), - unaccountedForDefaultAnnotations, - unaccountedForOverrideAnnotations, - getDefaultToolVCFHeaderLines().stream().map(Object::toString).collect(Collectors.toCollection(LinkedHashSet::new)), - referenceVersion); - break; - case VCF: - outputRenderer = new VcfOutputRenderer(createVCFWriter(outputFile), - dataSourceFactories, - getHeaderForVariants(), - unaccountedForDefaultAnnotations, - unaccountedForOverrideAnnotations, - getDefaultToolVCFHeaderLines()); - break; - default: - throw new GATKException("Unsupported output format type specified: " + outputFormatType.toString()); - } - logger.info("Creating a " + outputFormatType + " file for output: " + outputFile.toURI()); + // Create our engine to do our work and drive this Funcotation train! 
+ funcotatorEngine = new FuncotatorEngine( + funcotatorArgs, + getSequenceDictionaryForDrivingVariants(), + VcfFuncotationMetadata.create( + new ArrayList<>(vcfHeader.getInfoHeaderLines()) + ), + dataSourceFuncotationFactories + ); - // Check for reference version (in)compatibility: - determineReferenceAndDatasourceCompatibility(); + // Create our output renderer: + logger.info("Creating a " + funcotatorArgs.outputFormatType + " file for output: " + funcotatorArgs.outputFile.toURI()); + outputRenderer = funcotatorEngine.createOutputRenderer( + annotationDefaultsMap, + annotationOverridesMap, + vcfHeader, + getDefaultToolVCFHeaderLines(), + this + ); } /** @@ -418,74 +316,22 @@ private void checkReferenceDictionaryIsSupersetOfVariantDictionary() { ); } - private void determineReferenceAndDatasourceCompatibility() { - if ( forceB37ToHg19ContigNameConversion || - ( referenceVersion.equals(FuncotatorArgumentDefinitions.HG19_REFERENCE_VERSION_STRING) && - FuncotatorUtils.isSequenceDictionaryUsingB37Reference(getSequenceDictionaryForDrivingVariants()) )) { - - // NOTE AND WARNING: - // hg19 is from ucsc. b37 is from the genome reference consortium. - // ucsc decided the grc version had some bad data in it, so they blocked out some of the bases, aka "masked" them - // so the lengths of the contigs are the same, the bases are just _slightly_ different. - // ALSO, the contig naming convention is different between hg19 and hg38: - // hg19 uses contigs of the form "chr1" - // b37 uses contigs of the form "1" - // This naming convention difference causes a LOT of issues and was a bad idea. - - logger.warn("WARNING: You are using B37 as a reference. " + - "Funcotator will convert your variants to GRCh37, and this will be fine in the vast majority of cases. " + - "There MAY be some errors (e.g. 
in the Y chromosome, but possibly in other places as well) due to changes between the two references."); - - mustConvertInputContigsToHg19 = true; - } - } - - private VariantContext getCorrectVariantContextForReference(final VariantContext variant) { - if ( mustConvertInputContigsToHg19 ) { - final VariantContextBuilder vcb = new VariantContextBuilder(variant); - vcb.chr(FuncotatorUtils.convertB37ContigToHg19Contig(variant.getContig())); - return vcb.make(); - } - else { - return variant; - } - } - @Override protected VariantFilter makeVariantFilter() { - return variant -> { - // Ignore variants that have been filtered if the user requests it: - if ( removeFilteredVariants && variant.isFiltered() ) { - return false; - } - return true; - }; + return funcotatorEngine.makeVariantFilter(); } @Override public VariantTransformer makePostVariantFilterTransformer(){ - return variantContext -> getCorrectVariantContextForReference(variantContext); + return funcotatorEngine.getDefaultVariantTransformer(); } @Override public void apply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { - final ReferenceContext correctReferenceContext; - - // Check to see if we need to revert the ReferenceContext's interval to the original variant interval - // (This would only happen in the case where we were given b37 variants with hg19 data sources): - if ( mustConvertInputContigsToHg19 ) { - - // Convert our contig back to B37 here so it matches the variant: - final SimpleInterval interval = new SimpleInterval( - FuncotatorUtils.convertHG19ContigToB37Contig(variant.getContig()), variant.getStart(), variant.getEnd() - ); - - correctReferenceContext = new ReferenceContext(referenceContext, interval); - } - else { - correctReferenceContext = referenceContext; - } + // Get the correct reference for B37/HG19 compliance: + // This is necessary because of the variant transformation that gets applied in 
VariantWalkerBase::apply. + final ReferenceContext correctReferenceContext = funcotatorEngine.getCorrectReferenceContext(variant, referenceContext); // Place the variant on our queue to be funcotated: enqueueAndHandleVariant(variant, correctReferenceContext, featureContext); @@ -498,48 +344,17 @@ public Object onTraversalSuccess() { @Override public void closeTool() { - - for ( final DataSourceFuncotationFactory factory : dataSourceFactories ) { - if ( factory != null ) { - factory.close(); - } + if ( funcotatorEngine != null) { + funcotatorEngine.close(); } + if ( outputRenderer != null ) { outputRenderer.close(); } - } //================================================================================================================== - /** - * Creates a {@link LinkedHashMap} of annotations in the given {@code annotationMap} that do not occur in the given {@code dataSourceFactories}. - * @param dataSourceFactories {@link List} of {@link DataSourceFuncotationFactory} to check for whether each annotation in the {@code annotationMap} is handled. - * @param annotationMap {@link Map} (of ANNOTATION_NAME : ANNOTATION_VALUE) to check - * @return A {@link LinkedHashMap} of annotations in the given {@code annotationMap} that do not occur in the given {@code dataSourceFactories}. 
- */ - private LinkedHashMap getUnaccountedForAnnotations( final List dataSourceFactories, - final Map annotationMap ) { - final LinkedHashMap outAnnotations = new LinkedHashMap<>(); - - // Check each field in each factory: - for ( final String field : annotationMap.keySet() ) { - boolean accountedFor = false; - for ( final DataSourceFuncotationFactory funcotationFactory : dataSourceFactories ) { - - if ( funcotationFactory.getSupportedFuncotationFields().contains(field) ) { - accountedFor = true; - break; - } - } - if ( !accountedFor ) { - outAnnotations.put(field, annotationMap.get(field)); - } - } - - return outAnnotations; - } - /** * Creates an annotation on the given {@code variant} or enqueues it to be processed during a later call to this method. * @param variant {@link VariantContext} to annotate. @@ -548,170 +363,9 @@ private LinkedHashMap getUnaccountedForAnnotations( final List> featureSourceMap = new HashMap<>(); - - for ( final FeatureInput featureInput : manualLocatableFeatureInputs ) { - @SuppressWarnings("unchecked") - final List featureList = (List)featureContext.getValues(featureInput); - featureSourceMap.put( featureInput.getName(), featureList ); - } - - //============================================================================================================== - // First create only the transcript (Gencode) funcotations: - - if (retrieveGencodeFuncotationFactoryStream().count() > 1) { - logger.warn("Attempting to annotate with more than one GENCODE datasource. 
If these have overlapping transcript IDs, errors may occur."); - } - - final List transcriptFuncotations = retrieveGencodeFuncotationFactoryStream() - .map(gf -> gf.createFuncotations(variant, referenceContext, featureSourceMap)) - .flatMap(List::stream) - .map(gf -> (GencodeFuncotation) gf).collect(Collectors.toList()); - - //============================================================================================================== - // Create the funcotations for non-Gencode data sources: - - // Create a place to keep our funcotations: - final FuncotationMap funcotationMap = FuncotationMap.createFromGencodeFuncotations(transcriptFuncotations); - - // Perform the rest of the annotation. Note that this code manually excludes the Gencode Funcotations. - for (final DataSourceFuncotationFactory funcotationFactory : dataSourceFactories ) { - - // Note that this guarantees that we do not add GencodeFuncotations a second time. - if (!funcotationFactory.getType().equals(FuncotatorArgumentDefinitions.DataSourceType.GENCODE)) { - final List txIds = funcotationMap.getTranscriptList(); - - for (final String txId: txIds) { - funcotationMap.add(txId, funcotationFactory.createFuncotations(variant, referenceContext, featureSourceMap, funcotationMap.getGencodeFuncotations(txId))); - } - } - } - - //============================================================================================================== - // Create the funcotations for the input and add to all txID mappings. 
- - final List txIds = funcotationMap.getTranscriptList(); - - for (final String txId: txIds) { - funcotationMap.add(txId, FuncotatorUtils.createFuncotations(variant, inputMetadata, FuncotatorConstants.DATASOURCE_NAME_FOR_INPUT_VCFS)); - } + final FuncotationMap funcotationMap = funcotatorEngine.createFuncotationMapForVariant(variant, referenceContext, featureContext); // At this point there is only one transcript ID in the funcotation map if canonical or best effect are selected outputRenderer.write(variant, funcotationMap); } - - private Stream retrieveGencodeFuncotationFactoryStream() { - return dataSourceFactories.stream() - .filter(f -> f.getType().equals(FuncotatorArgumentDefinitions.DataSourceType.GENCODE)); - } - - /** - * Split each element of the given {@link List} into a key and value. - * Assumes each element of the given {@link List} is formatted as follows: - * KEY:VALUE - * @param annotationArgs {@link List} of strings formatted KEY:VALUE to turn into a {@link Map}. - * @return A {@link LinkedHashMap} of KEY:VALUE pairs corresponding to entries in the given list. 
- */ - private LinkedHashMap splitAnnotationArgsIntoMap( final List annotationArgs ) { - - final LinkedHashMap annotationMap = new LinkedHashMap<>(); - - for ( final String s : annotationArgs ) { - final List keyVal = ParsingUtils.split(s, FuncotatorArgumentDefinitions.MAP_NAME_VALUE_DELIMITER); - if ( keyVal.size() != 2) { - throw new UserException.BadInput( "Argument annotation incorrectly formatted: " + s ); - } - - annotationMap.put( keyVal.get(0), keyVal.get(1) ); - } - - return annotationMap; - } - - private void initializeManualFeaturesForLocatableDataSources(final Map metaData) { - for ( final Map.Entry entry : metaData.entrySet() ) { - - logger.debug("Initializing Features for: " + entry.getValue().getProperty("name") + " ..."); - - // Note: we need no default case since we know these are valid: - final String stringType = entry.getValue().getProperty("type"); - switch ( FuncotatorArgumentDefinitions.DataSourceType.getEnum(stringType) ) { - case LOCATABLE_XSV: - // Add our features manually so we can match over them: - addFeaturesForLocatableDataSource(entry.getKey(), entry.getValue(), XsvTableFeature.class); - break; - case GENCODE: - // Add our features manually so we can match over them: - addFeaturesForLocatableDataSource(entry.getKey(), entry.getValue(), GencodeGtfFeature.class); - break; - case VCF: - // Add our features manually so we can match over them: - addFeaturesForLocatableDataSource(entry.getKey(), entry.getValue(), VariantContext.class); - break; - // Non-locatable data source types go here: - case SIMPLE_XSV: - case COSMIC: - break; - default: - throw new GATKException("Non-locatable type of DataSourceFuncotationFactory encountered: " + stringType ); - } - } - } - - private void addFeaturesForLocatableDataSource(final Path dataSourceFile, - final Properties dataSourceProperties, - final Class featureClazz) { - - final String name = dataSourceProperties.getProperty(DataSourceUtils.CONFIG_FILE_FIELD_NAME_NAME); - - // Inject our features 
into our list of feature data sources: - final FeatureInput featureInput = addFeatureInputsAfterInitialization( - dataSourceFile.resolveSibling( - IOUtils.getPath( dataSourceProperties.getProperty(DataSourceUtils.CONFIG_FILE_FIELD_NAME_SRC_FILE) ) - ).toUri().toString(), - name, - featureClazz, lookaheadFeatureCachingInBp); - - // Add our feature input to our list of manual inputs: - manualLocatableFeatureInputs.add(featureInput); - } - - /** - * Processes the given {@link Set} into a list of transcript IDs. - * This is necessary because the command-line input argument is overloaded to be either a file containing transcript - * IDs (1 per line) OR as a list of transcript IDs. - * @param rawTranscriptSet {@link Set} of {@link String}s from which to create a list of Transcript IDs. If of size 1, will try to open as a file. - * @return A {@link Set} of {@link String} contianing Transcript IDs in which the user is interested. - */ - private Set processTranscriptList(final Set rawTranscriptSet) { - if ( rawTranscriptSet.size() == 1 ) { - final String filePathString = rawTranscriptSet.iterator().next(); - try ( final BufferedReader bufferedReader = Files.newBufferedReader(IOUtils.getPath(filePathString)) ) { - logger.info("Opened transcript file: " + filePathString); - - // Create a place to put our output: - final Set transcriptIdSet = new HashSet<>(); - - String line = bufferedReader.readLine(); - while ( line != null ) { - logger.info(" Adding transcript ID to transcript set: " + line); - transcriptIdSet.add(line); - line = bufferedReader.readLine(); - } - logger.info("Transcript parsing complete."); - - return transcriptIdSet; - } - catch ( final IOException ex ) { - logger.warn("Could not open transcript selection list as a file. 
Using it as a singleton list of transcript IDs: [" + filePathString + "]"); - return rawTranscriptSet; - } - } - else { - return rawTranscriptSet; - } - } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorArgumentCollection.java new file mode 100644 index 00000000000..af888d17a0a --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorArgumentCollection.java @@ -0,0 +1,104 @@ +package org.broadinstitute.hellbender.tools.funcotator; + +import org.broadinstitute.barclay.argparser.Advanced; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.Hidden; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; + +import java.io.File; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** + * Arguments to be used by the {@link Funcotator} {@link org.broadinstitute.hellbender.engine.GATKTool}, + * which are specific to {@link Funcotator}. + * Created by jonn on 9/12/18. + */ +public class FuncotatorArgumentCollection implements Serializable { + private static final long serialVersionUID = 1L; + + //----------------------------------------------------- + // Required args: + + @Argument( + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + doc = "Output VCF file to which annotated variants should be written.") + public File outputFile; + + @Argument( + fullName = FuncotatorArgumentDefinitions.REFERENCE_VERSION_LONG_NAME, + doc = "The version of the Human Genome reference to use (e.g. hg19, hg38, etc.). This will correspond to a sub-folder of each data source corresponding to that data source for the given reference."
+ ) + public String referenceVersion; + + @Argument( + fullName = FuncotatorArgumentDefinitions.DATA_SOURCES_PATH_LONG_NAME, + doc = "The path to a data source folder for Funcotator. May be specified more than once to handle multiple data source folders." + ) + public List dataSourceDirectories; + + @Argument( + fullName = FuncotatorArgumentDefinitions.OUTPUT_FORMAT_LONG_NAME, + doc = "The output file format. Either VCF or MAF. Please note that MAF output for germline use case VCFs is unsupported." + ) + public FuncotatorArgumentDefinitions.OutputFormatType outputFormatType; + + //----------------------------------------------------- + // Optional args: + + @Argument( + fullName = FuncotatorArgumentDefinitions.REMOVE_FILTERED_VARIANTS_LONG_NAME, + optional = true, + doc = "Ignore/drop variants that have been filtered in the input. These variants will not appear in the output file." + ) + public boolean removeFilteredVariants = false; + + @Argument( + fullName = FuncotatorArgumentDefinitions.TRANSCRIPT_SELECTION_MODE_LONG_NAME, + optional = true, + doc = "Method of detailed transcript selection. This will select the transcript for detailed annotation (CANONICAL, ALL, or BEST_EFFECT)." + ) + public TranscriptSelectionMode transcriptSelectionMode = FuncotatorArgumentDefinitions.TRANSCRIPT_SELECTION_MODE_DEFAULT_VALUE; + + @Argument( + fullName = FuncotatorArgumentDefinitions.TRANSCRIPT_LIST_LONG_NAME, + optional = true, + doc = "File to use as a list of transcripts (one transcript ID per line, version numbers are ignored) OR A set of transcript IDs to use for annotation to override selected transcript." + ) + public Set userTranscriptIdSet = new HashSet<>(); + + @Argument( + fullName = FuncotatorArgumentDefinitions.ANNOTATION_DEFAULTS_LONG_NAME, + optional = true, + doc = "Annotations to include in all annotated variants if the annotation is not specified in the data sources (in the format :). 
This will add the specified annotation to every annotated variant if it is not already present." + ) + public List annotationDefaults = new ArrayList<>(); + + @Argument( + fullName = FuncotatorArgumentDefinitions.ANNOTATION_OVERRIDES_LONG_NAME, + optional = true, + doc = "Override values for annotations (in the format :). Replaces existing annotations of the given name with given values." + ) + public List annotationOverrides = new ArrayList<>(); + + @Argument( + fullName = FuncotatorArgumentDefinitions.LOOKAHEAD_CACHE_IN_BP_NAME, + optional = true, + minValue = 0, + doc = "Number of base-pairs to cache when querying variants." + ) + public int lookaheadFeatureCachingInBp = FuncotatorArgumentDefinitions.LOOKAHEAD_CACHE_IN_BP_DEFAULT_VALUE; + + @Advanced + @Hidden + @Argument( + fullName = FuncotatorArgumentDefinitions.FORCE_B37_TO_HG19_REFERENCE_CONTIG_CONVERSION, + optional = true, + doc = "(Advanced / DO NOT USE*) If you select this flag, Funcotator will force a conversion of variant contig names from b37 to hg19. *This option is useful in integration tests (written by devs) only." 
+ ) + public boolean forceB37ToHg19ContigNameConversion = false; +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorEngine.java new file mode 100644 index 00000000000..170b7ec3f3f --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorEngine.java @@ -0,0 +1,411 @@ +package org.broadinstitute.hellbender.tools.funcotator; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.tribble.util.ParsingUtils; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.broadinstitute.hellbender.engine.FeatureContext; +import org.broadinstitute.hellbender.engine.GATKTool; +import org.broadinstitute.hellbender.engine.ReferenceContext; +import org.broadinstitute.hellbender.engine.filters.VariantFilter; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.funcotator.dataSources.DataSourceUtils; +import org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation; +import org.broadinstitute.hellbender.tools.funcotator.mafOutput.MafOutputRenderer; +import org.broadinstitute.hellbender.tools.funcotator.metadata.FuncotationMetadata; +import org.broadinstitute.hellbender.tools.funcotator.vcfOutput.VcfOutputRenderer; +import org.broadinstitute.hellbender.transformers.VariantTransformer; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.io.IOUtils; + +import java.io.BufferedReader; +import java.io.IOException; +import java.nio.file.Files; +import 
java.util.*; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * Class that performs functional annotation of variants. + * + * Requires a set of data sources ({@link DataSourceFuncotationFactory}) from which to create {@link Funcotation}s. + */ +public final class FuncotatorEngine implements AutoCloseable { + + /** Obligatory logger. */ + private static final Logger logger = LogManager.getLogger(FuncotatorEngine.class); + + /** + * Information about what kinds of {@link Funcotation}s are going to be created by this {@link FuncotatorEngine}. + */ + private final FuncotationMetadata inputMetadata; + + /** + * The {@link DataSourceFuncotationFactory} that will create {@link Funcotation}s for this {@link FuncotatorEngine}. + */ + private final List dataSourceFactories; + + /** + * The arguments given to the instance of the {@link GATKTool} running this {@link FuncotatorEngine}. + */ + private final FuncotatorArgumentCollection funcotatorArgs; + + /** + * The {@link SAMSequenceDictionary} for the driving variants (i.e. the input variant file). + */ + private final SAMSequenceDictionary sequenceDictionaryForDrivingVariants; + + /** + * Whether the input variant contigs must be converted to hg19. + * This is only the case when the input reference is b37 AND when + * the reference version is hg19 (i.e. {@link FuncotatorArgumentCollection#referenceVersion} == {@link FuncotatorArgumentDefinitions#HG19_REFERENCE_VERSION_STRING}). + */ + private final boolean mustConvertInputContigsToHg19; + + /** + * Create a {@link FuncotatorEngine} using the given {@code metadata} and {@code funcotationFactories} representing + * the kinds of {@link Funcotation}s to be created and the data sources from which they should be created, + * respectively. + * @param metadata {@link FuncotationMetadata} containing information on the kinds of {@link Funcotation}s this {@link FuncotatorEngine} will create. 
+ * @param funcotationFactories A {@link List} which can create the desired {@link Funcotation}s. + */ + public FuncotatorEngine(final FuncotatorArgumentCollection funcotatorArgs, + final SAMSequenceDictionary sequenceDictionaryForDrivingVariants, + final FuncotationMetadata metadata, + final List funcotationFactories) { + + this.sequenceDictionaryForDrivingVariants = sequenceDictionaryForDrivingVariants; + this.funcotatorArgs = funcotatorArgs; + inputMetadata = metadata; + + dataSourceFactories = funcotationFactories; + // Note: The dataSourceFactories must be sorted to ensure that as we iterate through them + // to create funcotations, the inherent dependencies between different funcotation types are preserved. + // For example, most FuncotationFactories require that a GencodeFuncotation is present before they can + // create their annotations. This sorting enables such dependencies. + dataSourceFactories.sort(DataSourceUtils::datasourceComparator); + + // Determine whether we have to convert given variants from B37 to HG19: + mustConvertInputContigsToHg19 = determineReferenceAndDatasourceCompatibility(); + } + + /** + * @return An unmodifiable {@link List} being used by this {@link FuncotatorEngine} to create {@link Funcotation}s. + */ + public List getFuncotationFactories() { + return Collections.unmodifiableList(dataSourceFactories); + } + + /** + * Creates a {@link FuncotationMap} for the given {@code variantContext}. + * + * @param variantContext {@link VariantContext} to annotate. Never {@code null}. + * @param referenceContext {@link ReferenceContext} corresponding to the given {@code variantContext}. Never {@code null}. + * @param featureContext {@link FeatureContext} corresponding to the given {@code variantContext}. Never {@code null}. + * @return an instance of FuncotationMap that maps transcript IDs to lists of funcotations for the given variantContext context. 
+ */ + public FuncotationMap createFuncotationMapForVariant(final VariantContext variantContext, + final ReferenceContext referenceContext, + final FeatureContext featureContext) { + + Utils.nonNull(variantContext); + Utils.nonNull(referenceContext); + Utils.nonNull(featureContext); + + //============================================================================================================== + // First create only the transcript (Gencode) funcotations: + + if (retrieveGencodeFuncotationFactoryStream().count() > 1) { + logger.warn("Attempting to annotate with more than one GENCODE datasource. If these have overlapping transcript IDs, errors may occur."); + } + + final List transcriptFuncotations = retrieveGencodeFuncotationFactoryStream() + .map(gf -> gf.createFuncotations(variantContext, referenceContext, featureContext)) + .flatMap(List::stream) + .map(gf -> (GencodeFuncotation) gf).collect(Collectors.toList()); + + //============================================================================================================== + // Create the funcotations for non-Gencode data sources: + + // Create a place to keep our funcotations: + final FuncotationMap funcotationMap = FuncotationMap.createFromGencodeFuncotations(transcriptFuncotations); + + // Perform the rest of the annotation. Note that this code manually excludes the Gencode Funcotations. + for (final DataSourceFuncotationFactory funcotationFactory : dataSourceFactories ) { + + // Note that this guarantees that we do not add GencodeFuncotations a second time. 
+ if (!funcotationFactory.getType().equals(FuncotatorArgumentDefinitions.DataSourceType.GENCODE)) { + final List txIds = funcotationMap.getTranscriptList(); + + for (final String txId: txIds) { + funcotationMap.add(txId, funcotationFactory.createFuncotations(variantContext, referenceContext, + featureContext, funcotationMap.getGencodeFuncotations(txId))); + } + } + } + + //============================================================================================================== + // Create the funcotations for the input and add to all txID mappings. + + final List txIds = funcotationMap.getTranscriptList(); + + for (final String txId: txIds) { + funcotationMap.add(txId, FuncotatorUtils.createFuncotations(variantContext, inputMetadata, FuncotatorConstants.DATASOURCE_NAME_FOR_INPUT_VCFS)); + } + + return funcotationMap; + } + + /** + * Create an output renderer for the data created by this instance of {@link FuncotatorEngine}. + * @param annotationDefaultsMap {@link LinkedHashMap} of annotation names and their default values. + * @param annotationOverridesMap {@link LinkedHashMap} of annotation names and the values for these fields overridden by the user. + * @param headerForVariants {@link VCFHeader} for the input VCF file containing the variants to annotate. + * @param defaultToolVcfHeaderLines {@link Set} containing the default {@link VCFHeaderLine}s for the given {@code gatkToolInstance}. + * @param gatkToolInstance {@link GATKTool} instance from which we will be using this {@link FuncotatorEngine}. + * @return The requested {@link OutputRenderer} based on the given {@code funcotatorArgs}. 
+ */ + public OutputRenderer createOutputRenderer(final LinkedHashMap annotationDefaultsMap, + final LinkedHashMap annotationOverridesMap, + final VCFHeader headerForVariants, + final Set defaultToolVcfHeaderLines, + final GATKTool gatkToolInstance) { + + final OutputRenderer outputRenderer; + + // Determine which annotations are accounted for (by the funcotation factories) and which are not. + final LinkedHashMap unaccountedForDefaultAnnotations = getUnaccountedForAnnotations( getFuncotationFactories(), annotationDefaultsMap ); + final LinkedHashMap unaccountedForOverrideAnnotations = getUnaccountedForAnnotations( getFuncotationFactories(), annotationOverridesMap ); + + // Set up our output renderer: + switch (funcotatorArgs.outputFormatType) { + case MAF: + outputRenderer = new MafOutputRenderer(funcotatorArgs.outputFile.toPath(), + getFuncotationFactories(), + headerForVariants, + unaccountedForDefaultAnnotations, + unaccountedForOverrideAnnotations, + defaultToolVcfHeaderLines.stream().map(Object::toString).collect(Collectors.toCollection(LinkedHashSet::new)), + funcotatorArgs.referenceVersion); + break; + + case VCF: + outputRenderer = new VcfOutputRenderer( + gatkToolInstance.createVCFWriter(funcotatorArgs.outputFile), + getFuncotationFactories(), + headerForVariants, + unaccountedForDefaultAnnotations, + unaccountedForOverrideAnnotations, + defaultToolVcfHeaderLines + ); + break; + default: + throw new GATKException("Unsupported output format type specified: " + funcotatorArgs.outputFormatType.toString()); + } + + return outputRenderer; + } + + /** + * @return A {@link VariantFilter} that will ignore any variants that have been filtered (if the user requested that the filter is turned on). Otherwise returns a no-op filter. 
+ */ + public VariantFilter makeVariantFilter() { + return variant -> { + // Ignore variants that have been filtered if the user requests it: + if ( funcotatorArgs.removeFilteredVariants && variant.isFiltered() ) { + return false; + } + return true; + }; + } + + /** + * Create a new {@link VariantContext} which will match the given Reference if there is a mismatch for input between the B37 reference and the HG19 reference. + * @param variant A {@link VariantContext} object containing the variant to convert. + * @return A {@link VariantContext} whose contig has been transformed to HG19 if requested by the user. Otherwise, an identical variant. + */ + public VariantContext getCorrectVariantContextForReference(final VariantContext variant) { + if ( mustConvertInputContigsToHg19 ) { + final VariantContextBuilder vcb = new VariantContextBuilder(variant); + vcb.chr(FuncotatorUtils.convertB37ContigToHg19Contig(variant.getContig())); + return vcb.make(); + } + else { + return variant; + } + } + + /** + * @return The default {@link VariantTransformer} which will automatically convert from the B37 reference standard to the HG19 reference standard for contig names. + */ + public VariantTransformer getDefaultVariantTransformer() { + return variantContext -> getCorrectVariantContextForReference(variantContext); + } + + /** + * Shutdown the engine. Closes all datasource factories. + */ + public void close() { + for ( final DataSourceFuncotationFactory factory : dataSourceFactories ) { + if ( factory != null ) { + factory.close(); + } + } + } + + /** + * Processes the given {@link Set} into a list of transcript IDs. + * This is necessary because the command-line input argument is overloaded to be either a file containing transcript + * IDs (1 per line) OR as a list of transcript IDs. + * @param rawTranscriptSet {@link Set} of {@link String}s from which to create a list of Transcript IDs. If of size 1, will try to open as a file. 
+ * @return A {@link Set} of {@link String} containing Transcript IDs in which the user is interested. + */ + public static Set processTranscriptList(final Set rawTranscriptSet) { + if ( rawTranscriptSet.size() == 1 ) { + final String filePathString = rawTranscriptSet.iterator().next(); + try ( final BufferedReader bufferedReader = Files.newBufferedReader(IOUtils.getPath(filePathString)) ) { + logger.info("Opened transcript file: " + filePathString); + + // Create a place to put our output: + final Set transcriptIdSet = new HashSet<>(); + + String line = bufferedReader.readLine(); + while ( line != null ) { + logger.info(" Adding transcript ID to transcript set: " + line); + transcriptIdSet.add(line); + line = bufferedReader.readLine(); + } + logger.info("Transcript parsing complete."); + + return transcriptIdSet; + } + catch ( final IOException ex ) { + logger.warn("Could not open transcript selection list as a file. Using it as a singleton list of transcript IDs: [" + filePathString + "]"); + return rawTranscriptSet; + } + } + else { + return rawTranscriptSet; + } + } + + + /** + * Gets the correct {@link ReferenceContext} for the {@code variant} being processed based on if the B37->HG19 conversion is required. + * @param variant {@link VariantContext} to check for B37/HG19 compliance. + * @param referenceContext {@link ReferenceContext} on which the given {@code variant} was originally based before the variant transformation. + * @return A {@link ReferenceContext} that is guaranteed to match the given {@code variant} for HG19/B37 compliance.
+ */ + public ReferenceContext getCorrectReferenceContext(final VariantContext variant, final ReferenceContext referenceContext) { + + final ReferenceContext correctReferenceContext; + + // Check to see if we need to revert the ReferenceContext's interval to the original variant interval + // (This would only happen in the case where we were given b37 variants with hg19 data sources): + if ( mustConvertInputContigsToHg19 ) { + + // Convert our contig back to B37 here so it matches the variant: + final SimpleInterval interval = new SimpleInterval( + FuncotatorUtils.convertHG19ContigToB37Contig(variant.getContig()), variant.getStart(), variant.getEnd() + ); + + correctReferenceContext = new ReferenceContext(referenceContext, interval); + } + else { + correctReferenceContext = referenceContext; + } + + return correctReferenceContext; + } + + // ================================================================================================================= + + /** + * Creates a {@link LinkedHashMap} of annotations in the given {@code annotationMap} that do not occur in the given {@code dataSourceFactories}. + * @param dataSourceFactories {@link List} of {@link DataSourceFuncotationFactory} to check for whether each annotation in the {@code annotationMap} is handled. + * @param annotationMap {@link Map} (of ANNOTATION_NAME : ANNOTATION_VALUE) to check + * @return A {@link LinkedHashMap} of annotations in the given {@code annotationMap} that do not occur in the given {@code dataSourceFactories}. 
+ */ + private LinkedHashMap getUnaccountedForAnnotations( final List dataSourceFactories, + final Map annotationMap ) { + final LinkedHashMap outAnnotations = new LinkedHashMap<>(); + + // Check each field in each factory: + for ( final String field : annotationMap.keySet() ) { + boolean accountedFor = false; + for ( final DataSourceFuncotationFactory funcotationFactory : dataSourceFactories ) { + + if ( funcotationFactory.getSupportedFuncotationFields().contains(field) ) { + accountedFor = true; + break; + } + } + if ( !accountedFor ) { + outAnnotations.put(field, annotationMap.get(field)); + } + } + + return outAnnotations; + } + + /** + * Split each element of the given {@link List} into a key and value. + * Assumes each element of the given {@link List} is formatted as follows: + * KEY:VALUE + * @param annotationArgs {@link List} of strings formatted KEY:VALUE to turn into a {@link Map}. + * @return A {@link LinkedHashMap} of KEY:VALUE pairs corresponding to entries in the given list. + */ + public static LinkedHashMap splitAnnotationArgsIntoMap( final List annotationArgs ) { + + final LinkedHashMap annotationMap = new LinkedHashMap<>(); + + for ( final String s : annotationArgs ) { + final List keyVal = ParsingUtils.split(s, FuncotatorArgumentDefinitions.MAP_NAME_VALUE_DELIMITER); + if ( keyVal.size() != 2) { + throw new UserException.BadInput( "Argument annotation incorrectly formatted: " + s ); + } + + annotationMap.put( keyVal.get(0), keyVal.get(1) ); + } + + return annotationMap; + } + + private boolean determineReferenceAndDatasourceCompatibility() { + + boolean mustConvertInputContigsToHg19 = false; + + if ( funcotatorArgs.forceB37ToHg19ContigNameConversion || + ( funcotatorArgs.referenceVersion.equals(FuncotatorArgumentDefinitions.HG19_REFERENCE_VERSION_STRING) && + FuncotatorUtils.isSequenceDictionaryUsingB37Reference(sequenceDictionaryForDrivingVariants) )) { + + // NOTE AND WARNING: + // hg19 is from ucsc. 
b37 is from the genome reference consortium. + // ucsc decided the grc version had some bad data in it, so they blocked out some of the bases, aka "masked" them + // so the lengths of the contigs are the same, the bases are just _slightly_ different. + // ALSO, the contig naming convention is different between hg19 and b37: + // hg19 uses contigs of the form "chr1" + // b37 uses contigs of the form "1" + // This naming convention difference causes a LOT of issues and was a bad idea. + + logger.warn("WARNING: You are using B37 as a reference. " + + "Funcotator will convert your variants to GRCh37, and this will be fine in the vast majority of cases. " + + "There MAY be some errors (e.g. in the Y chromosome, but possibly in other places as well) due to changes between the two references."); + + mustConvertInputContigsToHg19 = true; + } + + return mustConvertInputContigsToHg19; + } + + private Stream retrieveGencodeFuncotationFactoryStream() { + return dataSourceFactories.stream() + .filter(f -> f.getType().equals(FuncotatorArgumentDefinitions.DataSourceType.GENCODE)); + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtils.java index 518331db258..23030e9a17c 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtils.java @@ -1,12 +1,11 @@ package org.broadinstitute.hellbender.tools.funcotator.dataSources; import com.google.common.annotations.VisibleForTesting; +import htsjdk.tribble.Feature; import htsjdk.variant.variantcontext.VariantContext; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import org.broadinstitute.hellbender.engine.FeatureContext; -import org.broadinstitute.hellbender.engine.ReadsContext; -import 
org.broadinstitute.hellbender.engine.ReferenceContext; +import org.broadinstitute.hellbender.engine.*; import org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.tools.funcotator.DataSourceFuncotationFactory; @@ -19,6 +18,8 @@ import org.broadinstitute.hellbender.tools.funcotator.dataSources.xsv.LocatableXsvFuncotationFactory; import org.broadinstitute.hellbender.tools.funcotator.dataSources.xsv.SimpleKeyXsvFuncotationFactory; import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.codecs.gencode.GencodeGtfFeature; +import org.broadinstitute.hellbender.utils.codecs.xsvLocatableTable.XsvTableFeature; import org.broadinstitute.hellbender.utils.io.IOUtils; import java.io.BufferedReader; @@ -228,13 +229,85 @@ public static boolean isValidDirectory(final Path p) { * @param annotationOverridesMap {@link LinkedHashMap} of {@link String}->{@link String} containing any annotation overrides to include in data sources. Must not be {@code null}. * @param transcriptSelectionMode {@link TranscriptSelectionMode} to use when choosing the transcript for detailed reporting. Must not be {@code null}. * @param userTranscriptIdSet {@link Set} of {@link String}s containing transcript IDs of interest to be selected for first. Must not be {@code null}. + * @param gatkToolInstance Instance of the {@link GATKTool} into which to add {@link FeatureInput}s. Must not be {@code null}. + * @param lookaheadFeatureCachingInBp Number of base-pairs to cache when querying variants. * @return A {@link List} of {@link DataSourceFuncotationFactory} given the data source metadata, overrides, and transcript reporting priority information. 
*/ public static List createDataSourceFuncotationFactoriesForDataSources(final Map dataSourceMetaData, final LinkedHashMap annotationOverridesMap, final TranscriptSelectionMode transcriptSelectionMode, - final Set userTranscriptIdSet) { + final Set userTranscriptIdSet, + final GATKTool gatkToolInstance, + final int lookaheadFeatureCachingInBp) { + Utils.nonNull(dataSourceMetaData); + Utils.nonNull(annotationOverridesMap); + Utils.nonNull(transcriptSelectionMode); + Utils.nonNull(userTranscriptIdSet); + Utils.nonNull(gatkToolInstance); + + final List dataSourceFactories = new ArrayList<>(dataSourceMetaData.size()); + + // Now we know we have unique and valid data. + // Now we must instantiate our data sources: + for ( final Map.Entry entry : dataSourceMetaData.entrySet() ) { + + final String funcotationFactoryName = entry.getValue().getProperty(CONFIG_FILE_FIELD_NAME_NAME); + logger.debug("Creating Funcotation Factory for " + funcotationFactoryName + " ..."); + + final Path path = entry.getKey(); + final Properties properties = entry.getValue(); + + final DataSourceFuncotationFactory funcotationFactory; + + // Note: we need no default case since we know these are valid: + final String stringType = properties.getProperty("type"); + final FeatureInput featureInput; + switch ( FuncotatorArgumentDefinitions.DataSourceType.getEnum(stringType) ) { + case LOCATABLE_XSV: + featureInput = createAndRegisterFeatureInputs(path, properties, gatkToolInstance, lookaheadFeatureCachingInBp, XsvTableFeature.class); + funcotationFactory = DataSourceUtils.createLocatableXsvDataSource(path, properties, annotationOverridesMap, featureInput); + break; + case SIMPLE_XSV: + funcotationFactory = DataSourceUtils.createSimpleXsvDataSource(path, properties, annotationOverridesMap); + break; + case COSMIC: + funcotationFactory = DataSourceUtils.createCosmicDataSource(path, properties, annotationOverridesMap); + break; + case GENCODE: + featureInput = createAndRegisterFeatureInputs(path, 
properties, gatkToolInstance, lookaheadFeatureCachingInBp, GencodeGtfFeature.class); + funcotationFactory = DataSourceUtils.createGencodeDataSource(path, properties, annotationOverridesMap, transcriptSelectionMode, userTranscriptIdSet, featureInput); + break; + case VCF: + featureInput = createAndRegisterFeatureInputs(path, properties, gatkToolInstance, lookaheadFeatureCachingInBp, VariantContext.class); + funcotationFactory = DataSourceUtils.createVcfDataSource(path, properties, annotationOverridesMap, featureInput); + break; + default: + throw new GATKException("Unknown type of DataSourceFuncotationFactory encountered: " + stringType ); + } + + // Add in our factory: + dataSourceFactories.add(funcotationFactory); + } + + logger.debug("All Data Sources have been created."); + return dataSourceFactories; + } + /** + * Create a {@link List} of {@link DataSourceFuncotationFactory} based on meta data on the data sources, overrides, and transcript reporting priority information. + * THIS METHOD IS FOR TESTING ONLY! + * @param dataSourceMetaData {@link Map} of {@link Path}->{@link Properties} containing metadata about each data source. Must not be {@code null}. + * @param annotationOverridesMap {@link LinkedHashMap} of {@link String}->{@link String} containing any annotation overrides to include in data sources. Must not be {@code null}. + * @param transcriptSelectionMode {@link TranscriptSelectionMode} to use when choosing the transcript for detailed reporting. Must not be {@code null}. + * @param userTranscriptIdSet {@link Set} of {@link String}s containing transcript IDs of interest to be selected for first. Must not be {@code null}. + * @return A {@link List} of {@link DataSourceFuncotationFactory} given the data source metadata, overrides, and transcript reporting priority information. 
+ */ + @VisibleForTesting + public static List createDataSourceFuncotationFactoriesForDataSourcesForTesting( + final Map dataSourceMetaData, + final LinkedHashMap annotationOverridesMap, + final TranscriptSelectionMode transcriptSelectionMode, + final Set userTranscriptIdSet) { Utils.nonNull(dataSourceMetaData); Utils.nonNull(annotationOverridesMap); Utils.nonNull(transcriptSelectionMode); @@ -246,7 +319,8 @@ public static List createDataSourceFuncotationFact // Now we must instantiate our data sources: for ( final Map.Entry entry : dataSourceMetaData.entrySet() ) { - logger.debug("Creating Funcotation Factory for " + entry.getValue().getProperty("name") + " ..."); + final String funcotationFactoryName = entry.getValue().getProperty(CONFIG_FILE_FIELD_NAME_NAME); + logger.debug("Creating Funcotation Factory for " + funcotationFactoryName + " ..."); final Path path = entry.getKey(); final Properties properties = entry.getValue(); @@ -255,9 +329,11 @@ public static List createDataSourceFuncotationFact // Note: we need no default case since we know these are valid: final String stringType = properties.getProperty("type"); + final FeatureInput featureInput; switch ( FuncotatorArgumentDefinitions.DataSourceType.getEnum(stringType) ) { case LOCATABLE_XSV: - funcotationFactory = DataSourceUtils.createLocatableXsvDataSource(path, properties, annotationOverridesMap); + featureInput = createFeatureInputsForTesting(path, properties); + funcotationFactory = DataSourceUtils.createLocatableXsvDataSource(path, properties, annotationOverridesMap, featureInput); break; case SIMPLE_XSV: funcotationFactory = DataSourceUtils.createSimpleXsvDataSource(path, properties, annotationOverridesMap); @@ -266,10 +342,12 @@ public static List createDataSourceFuncotationFact funcotationFactory = DataSourceUtils.createCosmicDataSource(path, properties, annotationOverridesMap); break; case GENCODE: - funcotationFactory = DataSourceUtils.createGencodeDataSource(path, properties, 
annotationOverridesMap, transcriptSelectionMode, userTranscriptIdSet); + featureInput = createFeatureInputsForTesting(path, properties); + funcotationFactory = DataSourceUtils.createGencodeDataSource(path, properties, annotationOverridesMap, transcriptSelectionMode, userTranscriptIdSet, featureInput); break; case VCF: - funcotationFactory = DataSourceUtils.createVcfDataSource(path, properties, annotationOverridesMap, transcriptSelectionMode, userTranscriptIdSet); + featureInput = createFeatureInputsForTesting(path, properties); + funcotationFactory = DataSourceUtils.createVcfDataSource(path, properties, annotationOverridesMap, featureInput); break; default: throw new GATKException("Unknown type of DataSourceFuncotationFactory encountered: " + stringType ); @@ -283,16 +361,53 @@ public static List createDataSourceFuncotationFact return dataSourceFactories; } + private static FeatureInput createAndRegisterFeatureInputs(final Path dataSourceFile, + final Properties dataSourceProperties, + final GATKTool funcotatorToolInstance, + final int lookaheadFeatureCachingInBp, + final Class featureType) { + Utils.nonNull(dataSourceFile); + Utils.nonNull(dataSourceProperties); + + final String name = dataSourceProperties.getProperty(CONFIG_FILE_FIELD_NAME_NAME); + final String sourceFile = dataSourceFile.resolveSibling(dataSourceProperties.getProperty(CONFIG_FILE_FIELD_NAME_SRC_FILE)).toString(); + + // Get feature inputs by creating them with the tool instance itself. + // This has the side effect of registering the FeatureInputs with the engine, so that they can be later queried. + return funcotatorToolInstance.addFeatureInputsAfterInitialization(sourceFile, name, featureType, lookaheadFeatureCachingInBp); + } + + /** + * Create {@link FeatureInput} FOR TESTING ONLY. 
+ * @param dataSourceFile {@link Path} to the data source file. Must not be {@code null}. + * @param dataSourceProperties {@link Properties} consisting of the contents of the config file for the data source. Must not be {@code null}. + * @return A new {@link FeatureInput} for the source file named in {@code dataSourceProperties}, constructed directly without any tool instance. + */ + private static FeatureInput createFeatureInputsForTesting(final Path dataSourceFile, + final Properties dataSourceProperties) { + + Utils.nonNull(dataSourceFile); + Utils.nonNull(dataSourceProperties); + + final String name = dataSourceProperties.getProperty(CONFIG_FILE_FIELD_NAME_NAME); + final String sourceFile = dataSourceFile.resolveSibling(dataSourceProperties.getProperty(CONFIG_FILE_FIELD_NAME_SRC_FILE)).toString(); + + // Create the feature input directly; no tool instance is involved here: + return new FeatureInput<>(sourceFile, name, Collections.emptyMap()); + } + /** * Create a {@link LocatableXsvFuncotationFactory} from filesystem resources and field overrides. * @param dataSourceFile {@link Path} to the data source file. Must not be {@code null}. * @param dataSourceProperties {@link Properties} consisting of the contents of the config file for the data source. Must not be {@code null}. * @param annotationOverridesMap {@link LinkedHashMap}{@code String>} containing any annotation overrides to be included in the resulting data source. Must not be {@code null}. + * @param featureInput The {@link FeatureInput} object for the LocatableXsv data source we are creating. * @return A new {@link LocatableXsvFuncotationFactory} based on the given data source file information and field overrides map. 
*/ - public static LocatableXsvFuncotationFactory createLocatableXsvDataSource(final Path dataSourceFile, - final Properties dataSourceProperties, - final LinkedHashMap annotationOverridesMap) { + private static LocatableXsvFuncotationFactory createLocatableXsvDataSource(final Path dataSourceFile, + final Properties dataSourceProperties, + final LinkedHashMap annotationOverridesMap, + final FeatureInput featureInput) { Utils.nonNull(dataSourceFile); Utils.nonNull(dataSourceProperties); Utils.nonNull(annotationOverridesMap); @@ -301,7 +416,13 @@ public static LocatableXsvFuncotationFactory createLocatableXsvDataSource(final final String version = dataSourceProperties.getProperty(CONFIG_FILE_FIELD_NAME_VERSION); // Create a locatable XSV feature reader to handle XSV Locatable features: - final LocatableXsvFuncotationFactory locatableXsvFuncotationFactory = new LocatableXsvFuncotationFactory(name, version, annotationOverridesMap); + final LocatableXsvFuncotationFactory locatableXsvFuncotationFactory = + new LocatableXsvFuncotationFactory( + name, + version, + annotationOverridesMap, + featureInput + ); // Set the supported fields by the LocatableXsvFuncotationFactory: locatableXsvFuncotationFactory.setSupportedFuncotationFields( @@ -324,7 +445,7 @@ public static LocatableXsvFuncotationFactory createLocatableXsvDataSource(final * @param annotationOverridesMap {@link LinkedHashMap}{@code String>} containing any annotation overrides to be included in the resulting data source. Must not be {@code null}. * @return A new {@link SimpleKeyXsvFuncotationFactory} based on the given data source file information and field overrides map. 
*/ - public static SimpleKeyXsvFuncotationFactory createSimpleXsvDataSource(final Path dataSourceFile, + private static SimpleKeyXsvFuncotationFactory createSimpleXsvDataSource(final Path dataSourceFile, final Properties dataSourceProperties, final LinkedHashMap annotationOverridesMap) { @@ -353,7 +474,7 @@ public static SimpleKeyXsvFuncotationFactory createSimpleXsvDataSource(final Pat * @param annotationOverridesMap {@link LinkedHashMap}{@code String>} containing any annotation overrides to be included in the resulting data source. Must not be {@code null}. * @return A new {@link CosmicFuncotationFactory} based on the given data source file information and field overrides map. */ - public static CosmicFuncotationFactory createCosmicDataSource(final Path dataSourceFile, + private static CosmicFuncotationFactory createCosmicDataSource(final Path dataSourceFile, final Properties dataSourceProperties, final LinkedHashMap annotationOverridesMap) { Utils.nonNull(dataSourceFile); @@ -376,13 +497,15 @@ public static CosmicFuncotationFactory createCosmicDataSource(final Path dataSou * @param annotationOverridesMap {@link LinkedHashMap}{@code String>} containing any annotation overrides to be included in the resulting data source. Must not be {@code null}. * @param transcriptSelectionMode {@link TranscriptSelectionMode} to use when choosing the transcript for detailed reporting. Must not be {@code null}. * @param userTranscriptIdSet {@link Set} of {@link String}s containing transcript IDs of interest to be selected for first. Must not be {@code null}. + * @param featureInput The {@link FeatureInput} object for the Gencode data source we are creating. * @return A new {@link GencodeFuncotationFactory} based on the given data source file information, field overrides map, and transcript information. 
*/ - public static GencodeFuncotationFactory createGencodeDataSource(final Path dataSourceFile, - final Properties dataSourceProperties, - final LinkedHashMap annotationOverridesMap, - final TranscriptSelectionMode transcriptSelectionMode, - final Set userTranscriptIdSet) { + private static GencodeFuncotationFactory createGencodeDataSource(final Path dataSourceFile, + final Properties dataSourceProperties, + final LinkedHashMap annotationOverridesMap, + final TranscriptSelectionMode transcriptSelectionMode, + final Set userTranscriptIdSet, + final FeatureInput featureInput) { Utils.nonNull(dataSourceFile); Utils.nonNull(dataSourceProperties); @@ -396,13 +519,15 @@ public static GencodeFuncotationFactory createGencodeDataSource(final Path dataS final String name = dataSourceProperties.getProperty(CONFIG_FILE_FIELD_NAME_NAME); // Create our gencode factory: - return new GencodeFuncotationFactory(dataSourceFile.resolveSibling(fastaPath), - version, - name, - transcriptSelectionMode, - userTranscriptIdSet, - annotationOverridesMap - ); + return new GencodeFuncotationFactory( + dataSourceFile.resolveSibling(fastaPath), + version, + name, + transcriptSelectionMode, + userTranscriptIdSet, + annotationOverridesMap, + featureInput + ); } /** @@ -410,34 +535,30 @@ public static GencodeFuncotationFactory createGencodeDataSource(final Path dataS * @param dataSourceFile {@link Path} to the data source file. Must not be {@code null}. * @param dataSourceProperties {@link Properties} consisting of the contents of the config file for the data source. Must not be {@code null}. * @param annotationOverridesMap {@link LinkedHashMap}{@code String>} containing any annotation overrides to be included in the resulting data source. Must not be {@code null}. - * @param transcriptSelectionMode {@link TranscriptSelectionMode} to use when choosing the transcript for detailed reporting. Must not be {@code null}. 
- * @param userTranscriptIdSet {@link Set} of {@link String}s containing transcript IDs of interest to be selected for first. Must not be {@code null}. + * @param featureInput The {@link FeatureInput} object for the VCF data source we are creating. * @return A new {@link GencodeFuncotationFactory} based on the given data source file information, field overrides map, and transcript information. */ - public static VcfFuncotationFactory createVcfDataSource(final Path dataSourceFile, - final Properties dataSourceProperties, - final LinkedHashMap annotationOverridesMap, - final TranscriptSelectionMode transcriptSelectionMode, - final Set userTranscriptIdSet) { + private static VcfFuncotationFactory createVcfDataSource(final Path dataSourceFile, + final Properties dataSourceProperties, + final LinkedHashMap annotationOverridesMap, + final FeatureInput featureInput) { Utils.nonNull(dataSourceFile); Utils.nonNull(dataSourceProperties); Utils.nonNull(annotationOverridesMap); - Utils.nonNull(transcriptSelectionMode); - Utils.nonNull(userTranscriptIdSet); // Get some metadata: - final String name = dataSourceProperties.getProperty(CONFIG_FILE_FIELD_NAME_NAME); - final String srcFile = dataSourceProperties.getProperty(CONFIG_FILE_FIELD_NAME_SRC_FILE); - final String version = dataSourceProperties.getProperty(CONFIG_FILE_FIELD_NAME_VERSION); + final String name = dataSourceProperties.getProperty(CONFIG_FILE_FIELD_NAME_NAME); + final String srcFile = dataSourceProperties.getProperty(CONFIG_FILE_FIELD_NAME_SRC_FILE); + final String version = dataSourceProperties.getProperty(CONFIG_FILE_FIELD_NAME_VERSION); // Create our VCF factory: - return new VcfFuncotationFactory( name, version, dataSourceFile.resolveSibling(srcFile).toAbsolutePath(), - annotationOverridesMap + annotationOverridesMap, + featureInput ); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/cosmic/CosmicFuncotationFactory.java 
b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/cosmic/CosmicFuncotationFactory.java index 06f50632128..6088cee292c 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/cosmic/CosmicFuncotationFactory.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/cosmic/CosmicFuncotationFactory.java @@ -129,8 +129,8 @@ public CosmicFuncotationFactory(final Path pathToCosmicDb) { public CosmicFuncotationFactory(final Path pathToCosmicDb, final LinkedHashMap annotationOverridesMap, final String version) { - this.pathToCosmicDb = pathToCosmicDb; + this.pathToCosmicDb = pathToCosmicDb; this.version = version; // Connect to the DB: @@ -165,7 +165,7 @@ public CosmicFuncotationFactory(final Path pathToCosmicDb, // Override Methods: @Override - protected Class getAnnotationFeatureClass() { + public Class getAnnotationFeatureClass() { // Returning Feature.class here implies that this class doesn't care about what features it gets. 
return Feature.class; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/gencode/GencodeFuncotationFactory.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/gencode/GencodeFuncotationFactory.java index f37a314b472..46f143d9006 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/gencode/GencodeFuncotationFactory.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/gencode/GencodeFuncotationFactory.java @@ -11,6 +11,7 @@ import htsjdk.variant.variantcontext.VariantContext; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.broadinstitute.hellbender.engine.FeatureInput; import org.broadinstitute.hellbender.engine.ReferenceContext; import org.broadinstitute.hellbender.engine.ReferenceDataSource; import org.broadinstitute.hellbender.exceptions.GATKException; @@ -177,12 +178,25 @@ public class GencodeFuncotationFactory extends DataSourceFuncotationFactory { //================================================================================================================== // Constructors: + /** + * Create a {@link GencodeFuncotationFactory}. + * @param gencodeTranscriptFastaFile {@link Path} to the FASTA file containing the sequences of all transcripts in the Gencode data source. + * @param version The version {@link String} of Gencode from which {@link Funcotation}s will be made. + * @param name A {@link String} containing the name of this {@link GencodeFuncotationFactory}. + * @param transcriptSelectionMode The {@link TranscriptSelectionMode} by which representative/verbose transcripts will be chosen for overlapping variants. + * @param userRequestedTranscripts A {@link Set} containing Gencode TranscriptIDs that the user requests to be annotated with priority over all other transcripts for overlapping variants. 
+ * @param annotationOverrides A {@link LinkedHashMap} containing user-specified overrides for specific {@link Funcotation}s. + * @param mainFeatureInput The backing {@link FeatureInput} for this {@link GencodeFuncotationFactory}, from which all {@link Funcotation}s will be created. + */ public GencodeFuncotationFactory(final Path gencodeTranscriptFastaFile, final String version, final String name, final TranscriptSelectionMode transcriptSelectionMode, final Set userRequestedTranscripts, - final LinkedHashMap annotationOverrides) { + final LinkedHashMap annotationOverrides, + final FeatureInput mainFeatureInput) { + + super(mainFeatureInput); this.gencodeTranscriptFastaFile = gencodeTranscriptFastaFile; @@ -212,7 +226,7 @@ public GencodeFuncotationFactory(final Path gencodeTranscriptFastaFile, // Override Methods: @Override - protected Class getAnnotationFeatureClass() { + public Class getAnnotationFeatureClass() { return GencodeGtfFeature.class; } @@ -664,7 +678,7 @@ else if ( GencodeGtfUTRFeature.class.isAssignableFrom(containingSubfeature.getCl } else if ( GencodeGtfTranscriptFeature.class.isAssignableFrom(containingSubfeature.getClass()) ) { // We have an intron variant - gencodeFuncotation = createIntronFuncotation(variantToUse, altAllele, reference, gtfFeature, transcript, reference); + gencodeFuncotation = createIntronFuncotation(variantToUse, altAllele, reference, gtfFeature, transcript); } else { // Uh-oh! Problemz. @@ -1230,15 +1244,13 @@ private GencodeFuncotation createUtrFuncotation(final VariantContext variant, * @param reference The {@link ReferenceContext} for the given {@code variant}. * @param gtfFeature The {@link GencodeGtfGeneFeature} in which the given {@code variant} occurs. * @param transcript The {@link GencodeGtfTranscriptFeature} in which the given {@code variant} occurs. - * @param referenceContext The {@link ReferenceContext} in which the given variant appears. 
* @return A {@link GencodeFuncotation} containing information about the given {@code variant} given the corresponding {@code transcript}. */ private GencodeFuncotation createIntronFuncotation(final VariantContext variant, final Allele altAllele, final ReferenceContext reference, final GencodeGtfGeneFeature gtfFeature, - final GencodeGtfTranscriptFeature transcript, - final ReferenceContext referenceContext) { + final GencodeGtfTranscriptFeature transcript) { // Get the strand-corrected alleles from the inputs. // Also get the reference sequence for the variant region. diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/vcf/VcfFuncotationFactory.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/vcf/VcfFuncotationFactory.java index 42b9cd0918a..c49642e884e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/vcf/VcfFuncotationFactory.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/vcf/VcfFuncotationFactory.java @@ -9,6 +9,7 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.broadinstitute.hellbender.engine.FeatureDataSource; +import org.broadinstitute.hellbender.engine.FeatureInput; import org.broadinstitute.hellbender.engine.ReferenceContext; import org.broadinstitute.hellbender.tools.funcotator.DataSourceFuncotationFactory; import org.broadinstitute.hellbender.tools.funcotator.Funcotation; @@ -87,11 +88,22 @@ public class VcfFuncotationFactory extends DataSourceFuncotationFactory { //================================================================================================================== // Constructors: - public VcfFuncotationFactory(final String name, final String version, final Path sourceFilePath) { - this(name, version, sourceFilePath, new LinkedHashMap<>()); - } + /** + * Create a {@link VcfFuncotationFactory}. 
+ * @param name A {@link String} containing the name of this {@link VcfFuncotationFactory}. + * @param version The version {@link String} of the backing data source from which {@link Funcotation}s will be made. + * @param sourceFilePath {@link Path} to the VCF file from which {@link VariantContext}s will be read in and used as Features from which to create {@link Funcotation}s. + * @param annotationOverridesMap A {@link LinkedHashMap} containing user-specified overrides for specific {@link Funcotation}s. + * @param mainSourceFileAsFeatureInput The backing {@link FeatureInput} for this {@link VcfFuncotationFactory}, from which all {@link Funcotation}s will be created. + */ + public VcfFuncotationFactory(final String name, + final String version, + final Path sourceFilePath, + final LinkedHashMap annotationOverridesMap, + final FeatureInput mainSourceFileAsFeatureInput) { + + super(mainSourceFileAsFeatureInput); - public VcfFuncotationFactory(final String name, final String version, final Path sourceFilePath, final LinkedHashMap annotationOverridesMap) { this.name = name; this.version = version; this.sourceFilePath = sourceFilePath; @@ -157,7 +169,7 @@ private static VCFInfoHeaderLine copyWithRename(final VCFInfoHeaderLine vcfInfoH // Override Methods: @Override - protected Class getAnnotationFeatureClass() { + public Class getAnnotationFeatureClass() { return VariantContext.class; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/xsv/LocatableXsvFuncotationFactory.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/xsv/LocatableXsvFuncotationFactory.java index 05f3354bdeb..1b9580bb894 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/xsv/LocatableXsvFuncotationFactory.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/xsv/LocatableXsvFuncotationFactory.java @@ -6,6 +6,7 @@ import htsjdk.tribble.readers.AsciiLineReaderIterator; import 
htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.hellbender.engine.FeatureInput; import org.broadinstitute.hellbender.engine.ReferenceContext; import org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.exceptions.UserException; @@ -71,35 +72,30 @@ public class LocatableXsvFuncotationFactory extends DataSourceFuncotationFactory //================================================================================================================== // Constructors: - public LocatableXsvFuncotationFactory(){ - this(DEFAULT_NAME, DEFAULT_VERSION_STRING); - } + /** + * Create a {@link LocatableXsvFuncotationFactory}. + * @param name A {@link String} containing the name of this {@link LocatableXsvFuncotationFactory}. + * @param version The version {@link String} of the backing data source from which {@link Funcotation}s will be made. + * @param annotationOverridesMap A {@link LinkedHashMap} containing user-specified overrides for specific {@link Funcotation}s. + * @param mainSourceFileAsFeatureInput The backing {@link FeatureInput} for this {@link LocatableXsvFuncotationFactory}, from which all {@link Funcotation}s will be created. 
+ */ + public LocatableXsvFuncotationFactory(final String name, final String version, final LinkedHashMap annotationOverridesMap, + final FeatureInput mainSourceFileAsFeatureInput){ - public LocatableXsvFuncotationFactory(final String name, final String version){ - this(name, version, new LinkedHashMap<>()); - } + super(mainSourceFileAsFeatureInput); - public LocatableXsvFuncotationFactory(final String name, final String version, final LinkedHashMap annotationOverridesMap){ this.name = name; this.version = version; this.annotationOverrideMap = new LinkedHashMap<>(annotationOverridesMap); } - @VisibleForTesting - LocatableXsvFuncotationFactory(final String name, final String version, final List supportedFields){ - this.name = name; - this.version = version; - - supportedFieldNames = new LinkedHashSet<>(supportedFields); - initializeFieldNameLists(); - } //================================================================================================================== // Override Methods: @Override - protected Class getAnnotationFeatureClass() { + public Class getAnnotationFeatureClass() { return XsvTableFeature.class; } @@ -200,6 +196,11 @@ private List createDefaultFuncotationsOnVariantHelper( final Varian return funcotationList; } + /** + * Set the field names that this {@link LocatableXsvFuncotationFactory} can create. + * Does so by reading the headers of backing data files for this {@link LocatableXsvFuncotationFactory}. + * @param inputDataFilePaths {@link List} to backing data files from which annotations can be made for this {@link LocatableXsvFuncotationFactory}. 
+ */ public void setSupportedFuncotationFields(final List inputDataFilePaths) { if ( supportedFieldNames == null ) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/xsv/SimpleKeyXsvFuncotationFactory.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/xsv/SimpleKeyXsvFuncotationFactory.java index a7cf29792ae..14cb997b649 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/xsv/SimpleKeyXsvFuncotationFactory.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/xsv/SimpleKeyXsvFuncotationFactory.java @@ -107,6 +107,7 @@ public SimpleKeyXsvFuncotationFactory(final String name, final LinkedHashMap annotationOverrides, final int numHeaderLinesToIgnore, final boolean permissiveColumns ) { + this.name = name; delimiter = delim; @@ -150,7 +151,7 @@ public SimpleKeyXsvFuncotationFactory(final String name, // Override Methods: @Override - protected Class getAnnotationFeatureClass() { + public Class getAnnotationFeatureClass() { // Returning Feature.class here implies that this class doesn't care about what features it gets. 
return Feature.class; } diff --git a/src/main/java/org/broadinstitute/hellbender/utils/codecs/xsvLocatableTable/XsvLocatableTableCodec.java b/src/main/java/org/broadinstitute/hellbender/utils/codecs/xsvLocatableTable/XsvLocatableTableCodec.java index 9181fc30d6b..25f498a7b14 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/codecs/xsvLocatableTable/XsvLocatableTableCodec.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/codecs/xsvLocatableTable/XsvLocatableTableCodec.java @@ -80,8 +80,8 @@ public final class XsvLocatableTableCodec extends AsciiFeatureCodec pik3caFeatureReader = AbstractFeatureReader.getFeatureReader( FuncotatorTestConstants.PIK3CA_GENCODE_ANNOTATIONS_FILE_NAME, new GencodeGtfCodec() ); private static final FeatureReader muc16FeatureReader = AbstractFeatureReader.getFeatureReader(FuncotatorTestConstants.MUC16_GENCODE_ANNOTATIONS_FILE_NAME, new GencodeGtfCodec() ); @@ -352,34 +359,34 @@ public Object[][] provideGencodeFuncotationCreation() { return new Object[][] { {"chr3", 178916538, 178916538, "G", "C", FuncotatorReferenceTestUtils.retrieveHg19Chr3Ref(), ReferenceDataSource.of( IOUtils.getPath(FuncotatorReferenceTestUtils.retrieveHg19Chr3Ref())), - pik3caFeatureReader, DS_PIK3CA_HG19_GENCODE_FASTA, + pik3caFeatureReader, DS_PIK3CA_HG19_GENCODE_FASTA, DS_PIK3CA_HG19_GENCODE_GTF, TranscriptSelectionMode.ALL, Collections.singletonList("ENST00000263967.3") },{"chr3", 178916538, 178916538, "G", "C", FuncotatorReferenceTestUtils.retrieveHg19Chr3Ref(), ReferenceDataSource.of( IOUtils.getPath(FuncotatorReferenceTestUtils.retrieveHg19Chr3Ref())), - pik3caFeatureReader, DS_PIK3CA_HG19_GENCODE_FASTA, + pik3caFeatureReader, DS_PIK3CA_HG19_GENCODE_FASTA, DS_PIK3CA_HG19_GENCODE_GTF, TranscriptSelectionMode.CANONICAL, Collections.singletonList("ENST00000263967.3") },{"chr19", 8994200, 8994200, "G", "C", FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref(), ReferenceDataSource.of( 
IOUtils.getPath(FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref())), - muc16FeatureReader, DS_MUC16_HG19_GENCODE_FASTA, + muc16FeatureReader, DS_MUC16_HG19_GENCODE_FASTA, DS_MUC16_HG19_GENCODE_GTF, TranscriptSelectionMode.ALL, Arrays.asList("ENST00000397910.4", "ENST00000380951.5") // Next one tests where we would be in a gene with more than one basic transcript, but variant only overlaps one. And we still ask for all, // but since one is IGR, it will never get added the the FuncotationMap. }, {"chr19", 9014550, 9014550, "T", "A", FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref(), ReferenceDataSource.of(IOUtils.getPath(FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref())), - muc16FeatureReader, DS_MUC16_HG19_GENCODE_FASTA, + muc16FeatureReader, DS_MUC16_HG19_GENCODE_FASTA, DS_MUC16_HG19_GENCODE_GTF, TranscriptSelectionMode.ALL, Collections.singletonList("ENST00000397910.4") // Next one tests where we would be in a gene with more than one basic transcript, variant overlaps both, but we are in canonical mode. },{"chr19", 8994200, 8994200, "G", "C", FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref(), ReferenceDataSource.of( IOUtils.getPath(FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref())), - muc16FeatureReader, DS_MUC16_HG19_GENCODE_FASTA, + muc16FeatureReader, DS_MUC16_HG19_GENCODE_FASTA, DS_MUC16_HG19_GENCODE_GTF, TranscriptSelectionMode.CANONICAL, Collections.singletonList("ENST00000397910.4") // Next one tests where we would be in a gene with more than one basic transcript, variant overlaps both, but we are in effect mode. 
},{"chr19", 8994200, 8994200, "G", "C", FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref(), ReferenceDataSource.of( IOUtils.getPath(FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref())), - muc16FeatureReader, DS_MUC16_HG19_GENCODE_FASTA, + muc16FeatureReader, DS_MUC16_HG19_GENCODE_FASTA, DS_MUC16_HG19_GENCODE_GTF, TranscriptSelectionMode.BEST_EFFECT, Collections.singletonList("ENST00000397910.4") } }; @@ -394,10 +401,11 @@ public void testGencodeFuncotationCreation(final String contig, final ReferenceDataSource referenceDataSource, final FeatureReader featureReader, final String transcriptFastaFile, + final String transcriptGtfFile, final TranscriptSelectionMode transcriptSelectionMode, final List gtTranscripts) { - final List gencodeFuncotations = createGencodeFuncotations(contig, start, end, ref, alt, referenceFileName, referenceDataSource, featureReader, transcriptFastaFile, transcriptSelectionMode); + final List gencodeFuncotations = createGencodeFuncotations(contig, start, end, ref, alt, referenceFileName, referenceDataSource, featureReader, transcriptFastaFile, transcriptGtfFile, transcriptSelectionMode); final FuncotationMap funcotationMap = FuncotationMap.createFromGencodeFuncotations(gencodeFuncotations); @@ -409,7 +417,7 @@ public void testGencodeFuncotationCreation(final String contig, .noneMatch(k -> ((GencodeFuncotation) funcotationMap.get(k).get(0)).getVariantClassification().equals(GencodeFuncotation.VariantClassification.COULD_NOT_DETERMINE) )); } - private static List createGencodeFuncotations(final String contig, final int start, final int end, final String ref, final String alt, final String referenceFileName, final ReferenceDataSource referenceDataSource, final FeatureReader featureReader, final String transcriptFastaFile, final TranscriptSelectionMode transcriptSelectionMode) { + private static List createGencodeFuncotations(final String contig, final int start, final int end, final String ref, final String alt, final String referenceFileName, 
final ReferenceDataSource referenceDataSource, final FeatureReader featureReader, final String transcriptFastaFile, final String transcriptGtfFile, final TranscriptSelectionMode transcriptSelectionMode) { final SimpleInterval variantInterval = new SimpleInterval( contig, start, end ); final VariantContext variantContext = createVariantContext(contig, start, end, ref, alt, referenceFileName); @@ -427,10 +435,14 @@ private static List createGencodeFuncotations(final String c final String gencode_test = "GENCODE_TEST"; final GencodeFuncotationFactory gencodeFactory = new GencodeFuncotationFactory(Paths.get(transcriptFastaFile), - "TEST", gencode_test, transcriptSelectionMode, new HashSet<>(), new LinkedHashMap<>()); + "TEST", gencode_test, transcriptSelectionMode, new HashSet<>(), new LinkedHashMap<>(), + new FeatureInput<>(transcriptGtfFile, gencode_test, Collections.emptyMap())); + + final FeatureContext featureContext = FuncotatorTestUtils.createFeatureContext(Collections.singletonList(gencodeFactory), "FuncotationMapUnitTest", + variantInterval, 0, 0, 0, null); - return gencodeFactory.createFuncotations(variantContext, referenceContext, Collections.singletonMap(gencode_test, featureList)).stream() - .map(f -> (GencodeFuncotation) f).collect(Collectors.toList()); + return gencodeFactory.createFuncotations(variantContext, referenceContext, featureContext).stream() + .map(f -> (GencodeFuncotation) f).collect(Collectors.toList()); } private List createFieldValuesFromNameList(final String prefix, final List baseFieldList) { @@ -507,7 +519,7 @@ public Object[][] provideTestAdd() { return new Object[][]{ {"chr3", 178916538, 178916538, "G", "C", FuncotatorReferenceTestUtils.retrieveHg19Chr3Ref(), ReferenceDataSource.of( IOUtils.getPath(FuncotatorReferenceTestUtils.retrieveHg19Chr3Ref())), - pik3caFeatureReader, DS_PIK3CA_HG19_GENCODE_FASTA, + pik3caFeatureReader, DS_PIK3CA_HG19_GENCODE_FASTA, DS_PIK3CA_HG19_GENCODE_GTF, TranscriptSelectionMode.ALL, 
Collections.singletonList("ENST00000263967.3"), Arrays.asList( TableFuncotation.create( @@ -531,7 +543,7 @@ public Object[][] provideTestAdd() { ) },{"chr19", 8994200, 8994200, "G", "C", FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref(), ReferenceDataSource.of( IOUtils.getPath(FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref())), - muc16FeatureReader, DS_MUC16_HG19_GENCODE_FASTA, + muc16FeatureReader, DS_MUC16_HG19_GENCODE_FASTA, DS_MUC16_HG19_GENCODE_GTF, TranscriptSelectionMode.BEST_EFFECT, Collections.singletonList("ENST00000397910.4"), Arrays.asList( TableFuncotation.create( @@ -555,7 +567,7 @@ public Object[][] provideTestAdd() { ) }, {"chr19", 8994200, 8994200, "G", "C", FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref(), ReferenceDataSource.of( IOUtils.getPath(FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref())), - muc16FeatureReader, DS_MUC16_HG19_GENCODE_FASTA, + muc16FeatureReader, DS_MUC16_HG19_GENCODE_FASTA, DS_MUC16_HG19_GENCODE_GTF, TranscriptSelectionMode.ALL, Arrays.asList("ENST00000397910.4", "ENST00000380951.5"), Arrays.asList( TableFuncotation.create( @@ -594,9 +606,10 @@ public void testAddAndGet(final String contig, final ReferenceDataSource referenceDataSource, final FeatureReader featureReader, final String transcriptFastaFile, + final String transcriptGtfFile, final TranscriptSelectionMode transcriptSelectionMode, final List gtTranscripts, final List funcotationsToAdd){ - final List gencodeFuncotations = createGencodeFuncotations(contig, start, end, ref, alt, referenceFileName, referenceDataSource, featureReader, transcriptFastaFile, transcriptSelectionMode); + final List gencodeFuncotations = createGencodeFuncotations(contig, start, end, ref, alt, referenceFileName, referenceDataSource, featureReader, transcriptFastaFile, transcriptGtfFile, transcriptSelectionMode); final FuncotationMap funcotationMap = FuncotationMap.createFromGencodeFuncotations(gencodeFuncotations); // Let's make sure that the gtTranscripts match what is in the map, 
even if this is tested elsewhere @@ -672,7 +685,7 @@ public void testAddingGencodeFuncotationToFuncotationMap() { // Create some gencode funcotations. The content does not really matter here. final List gencodeFuncotations = createGencodeFuncotations("chr19", 8994200, 8994200, "G", "C", FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref(), ReferenceDataSource.of( IOUtils.getPath(FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref())), - muc16FeatureReader, DS_MUC16_HG19_GENCODE_FASTA, + muc16FeatureReader, DS_MUC16_HG19_GENCODE_FASTA, DS_MUC16_HG19_GENCODE_GTF, TranscriptSelectionMode.ALL).stream().map(gf -> (Funcotation) gf).collect(Collectors.toList()); // Create a funcotationMap with some pre-made funcotations. Content does not really matter. diff --git a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorEngineUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorEngineUnitTest.java new file mode 100644 index 00000000000..881d985f456 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorEngineUnitTest.java @@ -0,0 +1,82 @@ +package org.broadinstitute.hellbender.tools.funcotator; + +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFHeader; +import org.apache.commons.lang3.tuple.Pair; +import org.broadinstitute.hellbender.GATKBaseTest; +import org.broadinstitute.hellbender.engine.DummyPlaceholderGatkTool; +import org.broadinstitute.hellbender.engine.FeatureContext; +import org.broadinstitute.hellbender.engine.ReferenceContext; +import org.broadinstitute.hellbender.engine.ReferenceDataSource; +import org.broadinstitute.hellbender.testutils.FuncotatorReferenceTestUtils; +import org.broadinstitute.hellbender.testutils.VariantContextTestUtils; +import org.broadinstitute.hellbender.tools.funcotator.dataSources.DataSourceUtils; +import org.broadinstitute.hellbender.tools.funcotator.metadata.VcfFuncotationMetadata; +import 
org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.test.FuncotatorTestUtils; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.*; + +public class FuncotatorEngineUnitTest extends GATKBaseTest { + final static private String INPUT_VCF = FuncotatorTestConstants.FUNCOTATOR_TEST_DIR + "/PIK3CA_SNPS_engine_test_chr3.vcf"; + private static final String DS_PIK3CA_DIR = largeFileTestDir + "funcotator" + File.separator + "small_ds_pik3ca" + File.separator; + @DataProvider + public Object[][] provideGt() { + return new Object[][] { + // ground truth gene name, hasClinvarAnnotation + {new File(INPUT_VCF), Arrays.asList("PIK3CA", "PIK3CA", "PIK3CA"), new boolean[]{true, true, false}} + }; + } + @Test(dataProvider = "provideGt") + public void testGetFuncotationFactoriesAndCreateFuncotationMapForVariant(final File vcfFile, + final List correspondingGeneName, + final boolean[] hasClinvarHit) { + + final Pair> entireVcf = VariantContextTestUtils.readEntireVCFIntoMemory(vcfFile.getAbsolutePath()); + final Map configData = DataSourceUtils.getAndValidateDataSourcesFromPaths("hg19", Collections.singletonList(DS_PIK3CA_DIR)); + + final Pair> vcfFileContents = VariantContextTestUtils.readEntireVCFIntoMemory(vcfFile.getAbsolutePath()); + + // Set up our arguments: + final FuncotatorArgumentCollection funcotatorArguments = new FuncotatorArgumentCollection(); + funcotatorArguments.referenceVersion = FuncotatorArgumentDefinitions.HG19_REFERENCE_VERSION_STRING; + + // Create the metadata directly from the input. 
+ final FuncotatorEngine funcotatorEngine = + new FuncotatorEngine( + funcotatorArguments, + vcfFileContents.getLeft().getSequenceDictionary(), + VcfFuncotationMetadata.create(new ArrayList<>(entireVcf.getLeft().getInfoHeaderLines())), + DataSourceUtils.createDataSourceFuncotationFactoriesForDataSources( + configData, + new LinkedHashMap<>(), + TranscriptSelectionMode.CANONICAL, + new HashSet<>(), + new DummyPlaceholderGatkTool(), + FuncotatorArgumentDefinitions.LOOKAHEAD_CACHE_IN_BP_DEFAULT_VALUE) + ); + + for (int i = 0; i < entireVcf.getRight().size(); i++) { + final VariantContext vc = entireVcf.getRight().get(i); + final SimpleInterval variantInterval = new SimpleInterval(vc.getContig(), vc.getStart(), vc.getEnd()); + final ReferenceContext referenceContext = new ReferenceContext(ReferenceDataSource.of(Paths.get(FuncotatorReferenceTestUtils.retrieveHg19Chr3Ref())), variantInterval); + final FeatureContext featureContext = FuncotatorTestUtils.createFeatureContext(funcotatorEngine.getFuncotationFactories(), "TEST", variantInterval, + 0,0,0, null); + final FuncotationMap funcotationMap = funcotatorEngine.createFuncotationMapForVariant(vc, referenceContext, featureContext); + + // Check that all of the transcripts at this location have the same gene name as the corresponding gene. + // The ground truth selected has the same gene name for all transcripts. + // Also, input VCF has no multiallelics. 
+ for (final String txId : funcotationMap.getTranscriptList()) { + Assert.assertEquals(funcotationMap.getFieldValue(txId, "Gencode_19_hugoSymbol", vc.getAlternateAllele(0)), correspondingGeneName.get(i)); + Assert.assertTrue((funcotationMap.getFieldValue(txId, "dummy_ClinVar_VCF_ALLELEID", vc.getAlternateAllele(0)).isEmpty()) != hasClinvarHit[i]); + } + } + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/gencode/GencodeFuncotationFactoryUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/gencode/GencodeFuncotationFactoryUnitTest.java index a1b95ffa2be..ac00edc8cec 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/gencode/GencodeFuncotationFactoryUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/gencode/GencodeFuncotationFactoryUnitTest.java @@ -14,16 +14,17 @@ import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import org.broadinstitute.hellbender.GATKBaseTest; +import org.broadinstitute.hellbender.engine.FeatureInput; import org.broadinstitute.hellbender.engine.ReferenceContext; import org.broadinstitute.hellbender.engine.ReferenceDataSource; import org.broadinstitute.hellbender.engine.ReferenceMemorySource; import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.testutils.FuncotatorReferenceTestUtils; import org.broadinstitute.hellbender.tools.funcotator.*; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.codecs.gencode.*; import org.broadinstitute.hellbender.utils.io.IOUtils; import org.broadinstitute.hellbender.utils.reference.ReferenceBases; -import org.broadinstitute.hellbender.testutils.FuncotatorReferenceTestUtils; import org.testng.Assert; import org.testng.annotations.AfterClass; import org.testng.annotations.BeforeClass; @@ -76,7 +77,8 @@ 
public class GencodeFuncotationFactoryUnitTest extends GATKBaseTest { GencodeFuncotationFactory.DEFAULT_NAME, FuncotatorArgumentDefinitions.TRANSCRIPT_SELECTION_MODE_DEFAULT_VALUE, new HashSet<>(), - new LinkedHashMap<>()); + new LinkedHashMap<>(), + createFeatureInputForMuc16Ds(GencodeFuncotationFactory.DEFAULT_NAME)); } //================================================================================================================== @@ -133,19 +135,21 @@ private static List addReferenceDataToUnitTestData(final List featureReader, final ReferenceDataSource referenceDataSource, - final String transcriptFastaFile) { + final String transcriptFastaFile, + final String transcriptGtfFile) { final List outList = new ArrayList<>(unitTestData.size()); for ( final Object[] rawData : unitTestData ) { - final Object[] dataWithReference = new Object[rawData.length + 4]; + final Object[] dataWithReference = new Object[rawData.length + 5]; for ( int i = 0; i < rawData.length; ++i ) { dataWithReference[i] = rawData[i]; } - dataWithReference[dataWithReference.length-4] = referenceFileName; - dataWithReference[dataWithReference.length-3] = featureReader; - dataWithReference[dataWithReference.length-2] = referenceDataSource; - dataWithReference[dataWithReference.length-1] = transcriptFastaFile; + dataWithReference[dataWithReference.length-5] = referenceFileName; + dataWithReference[dataWithReference.length-4] = featureReader; + dataWithReference[dataWithReference.length-3] = referenceDataSource; + dataWithReference[dataWithReference.length-2] = transcriptFastaFile; + dataWithReference[dataWithReference.length-1] = transcriptGtfFile; outList.add(dataWithReference); } @@ -371,24 +375,24 @@ Object[][] provideDataForCreateFuncotations() { final List outList = new ArrayList<>(); // MUC16 SNPs / DNPs: - outList.addAll( addReferenceDataToUnitTestData(DataProviderForMuc16MnpFullData.provideMnpDataForMuc16_1(), FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref(), muc16FeatureReader, 
refDataSourceHg19Ch19, FuncotatorTestConstants.MUC16_GENCODE_TRANSCRIPT_FASTA_FILE ) ); - outList.addAll( addReferenceDataToUnitTestData(DataProviderForMuc16MnpFullData.provideMnpDataForMuc16_2(), FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref(), muc16FeatureReader, refDataSourceHg19Ch19, FuncotatorTestConstants.MUC16_GENCODE_TRANSCRIPT_FASTA_FILE ) ); - outList.addAll( addReferenceDataToUnitTestData(DataProviderForMuc16MnpFullData.provideMnpDataForMuc16_3(), FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref(), muc16FeatureReader, refDataSourceHg19Ch19, FuncotatorTestConstants.MUC16_GENCODE_TRANSCRIPT_FASTA_FILE ) ); - outList.addAll( addReferenceDataToUnitTestData(DataProviderForMuc16MnpFullData.provideMnpDataForMuc16_4(), FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref(), muc16FeatureReader, refDataSourceHg19Ch19, FuncotatorTestConstants.MUC16_GENCODE_TRANSCRIPT_FASTA_FILE ) ); - outList.addAll( addReferenceDataToUnitTestData(DataProviderForMuc16MnpFullData.provideMnpDataForMuc16_5(), FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref(), muc16FeatureReader, refDataSourceHg19Ch19, FuncotatorTestConstants.MUC16_GENCODE_TRANSCRIPT_FASTA_FILE ) ); - outList.addAll( addReferenceDataToUnitTestData(DataProviderForMuc16MnpFullData.provideEdgeCasesForMUC16Data_1(), FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref(), muc16FeatureReader, refDataSourceHg19Ch19, FuncotatorTestConstants.MUC16_GENCODE_TRANSCRIPT_FASTA_FILE ) ); + outList.addAll( addReferenceDataToUnitTestData(DataProviderForMuc16MnpFullData.provideMnpDataForMuc16_1(), FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref(), muc16FeatureReader, refDataSourceHg19Ch19, FuncotatorTestConstants.MUC16_GENCODE_TRANSCRIPT_FASTA_FILE, FuncotatorTestConstants.MUC16_GENCODE_ANNOTATIONS_FILE_NAME ) ); + outList.addAll( addReferenceDataToUnitTestData(DataProviderForMuc16MnpFullData.provideMnpDataForMuc16_2(), FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref(), muc16FeatureReader, refDataSourceHg19Ch19, 
FuncotatorTestConstants.MUC16_GENCODE_TRANSCRIPT_FASTA_FILE, FuncotatorTestConstants.MUC16_GENCODE_ANNOTATIONS_FILE_NAME ) ); + outList.addAll( addReferenceDataToUnitTestData(DataProviderForMuc16MnpFullData.provideMnpDataForMuc16_3(), FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref(), muc16FeatureReader, refDataSourceHg19Ch19, FuncotatorTestConstants.MUC16_GENCODE_TRANSCRIPT_FASTA_FILE, FuncotatorTestConstants.MUC16_GENCODE_ANNOTATIONS_FILE_NAME ) ); + outList.addAll( addReferenceDataToUnitTestData(DataProviderForMuc16MnpFullData.provideMnpDataForMuc16_4(), FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref(), muc16FeatureReader, refDataSourceHg19Ch19, FuncotatorTestConstants.MUC16_GENCODE_TRANSCRIPT_FASTA_FILE, FuncotatorTestConstants.MUC16_GENCODE_ANNOTATIONS_FILE_NAME ) ); + outList.addAll( addReferenceDataToUnitTestData(DataProviderForMuc16MnpFullData.provideMnpDataForMuc16_5(), FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref(), muc16FeatureReader, refDataSourceHg19Ch19, FuncotatorTestConstants.MUC16_GENCODE_TRANSCRIPT_FASTA_FILE, FuncotatorTestConstants.MUC16_GENCODE_ANNOTATIONS_FILE_NAME ) ); + outList.addAll( addReferenceDataToUnitTestData(DataProviderForMuc16MnpFullData.provideEdgeCasesForMUC16Data_1(), FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref(), muc16FeatureReader, refDataSourceHg19Ch19, FuncotatorTestConstants.MUC16_GENCODE_TRANSCRIPT_FASTA_FILE, FuncotatorTestConstants.MUC16_GENCODE_ANNOTATIONS_FILE_NAME ) ); // MUC16 INDELs: - outList.addAll( addReferenceDataToUnitTestData(DataProviderForMuc16IndelData.provideIndelDataForMuc16(), FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref(), muc16FeatureReader, refDataSourceHg19Ch19, FuncotatorTestConstants.MUC16_GENCODE_TRANSCRIPT_FASTA_FILE ) ); + outList.addAll( addReferenceDataToUnitTestData(DataProviderForMuc16IndelData.provideIndelDataForMuc16(), FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref(), muc16FeatureReader, refDataSourceHg19Ch19, 
FuncotatorTestConstants.MUC16_GENCODE_TRANSCRIPT_FASTA_FILE, FuncotatorTestConstants.MUC16_GENCODE_ANNOTATIONS_FILE_NAME ) ); // PIK3CA SNPs / DNPs: - outList.addAll( addReferenceDataToUnitTestData(DataProviderForPik3caTestData.providePik3caMnpData(), FuncotatorReferenceTestUtils.retrieveHg19Chr3Ref(), pik3caFeatureReader, refDataSourceHg19Ch3, FuncotatorTestConstants.PIK3CA_GENCODE_TRANSCRIPT_FASTA_FILE ) ); + outList.addAll( addReferenceDataToUnitTestData(DataProviderForPik3caTestData.providePik3caMnpData(), FuncotatorReferenceTestUtils.retrieveHg19Chr3Ref(), pik3caFeatureReader, refDataSourceHg19Ch3, FuncotatorTestConstants.PIK3CA_GENCODE_TRANSCRIPT_FASTA_FILE, FuncotatorTestConstants.PIK3CA_GENCODE_ANNOTATIONS_FILE_NAME ) ); // PIK3CA INDELs: - outList.addAll( addReferenceDataToUnitTestData(DataProviderForPik3caTestData.providePik3caInDelData(), FuncotatorReferenceTestUtils.retrieveHg19Chr3Ref(), pik3caFeatureReader, refDataSourceHg19Ch3, FuncotatorTestConstants.PIK3CA_GENCODE_TRANSCRIPT_FASTA_FILE ) ); + outList.addAll( addReferenceDataToUnitTestData(DataProviderForPik3caTestData.providePik3caInDelData(), FuncotatorReferenceTestUtils.retrieveHg19Chr3Ref(), pik3caFeatureReader, refDataSourceHg19Ch3, FuncotatorTestConstants.PIK3CA_GENCODE_TRANSCRIPT_FASTA_FILE, FuncotatorTestConstants.PIK3CA_GENCODE_ANNOTATIONS_FILE_NAME ) ); // PIK3CA Other Indels: - outList.addAll( addReferenceDataToUnitTestData(DataProviderForPik3caTestData.providePik3caInDelData2(), FuncotatorReferenceTestUtils.retrieveHg19Chr3Ref(), pik3caFeatureReader, refDataSourceHg19Ch3, FuncotatorTestConstants.PIK3CA_GENCODE_TRANSCRIPT_FASTA_FILE ) ); + outList.addAll( addReferenceDataToUnitTestData(DataProviderForPik3caTestData.providePik3caInDelData2(), FuncotatorReferenceTestUtils.retrieveHg19Chr3Ref(), pik3caFeatureReader, refDataSourceHg19Ch3, FuncotatorTestConstants.PIK3CA_GENCODE_TRANSCRIPT_FASTA_FILE, FuncotatorTestConstants.PIK3CA_GENCODE_ANNOTATIONS_FILE_NAME ) ); return outList.toArray(new 
Object[][]{{}}); } @@ -1164,7 +1168,7 @@ void testMuc16SnpCreateFuncotations(final int chromosomeNumber, GencodeFuncotationFactory.DEFAULT_NAME, FuncotatorArgumentDefinitions.TRANSCRIPT_SELECTION_MODE_DEFAULT_VALUE, requestedTranscriptIds, - new LinkedHashMap<>())) { + new LinkedHashMap<>(), createFeatureInputForMuc16Ds(GencodeFuncotationFactory.DEFAULT_NAME))) { // Generate our funcotations: final List featureList = new ArrayList<>(); @@ -1221,7 +1225,8 @@ void createNonBasicFuncotations(final int start, final int end) { GencodeFuncotationFactory.DEFAULT_NAME, FuncotatorArgumentDefinitions.TRANSCRIPT_SELECTION_MODE_DEFAULT_VALUE, new HashSet<>(), - new LinkedHashMap<>())) { + new LinkedHashMap<>(), createFeatureInputForMuc16Ds(GencodeFuncotationFactory.DEFAULT_NAME) + )) { // Generate our funcotations: final List featureList = new ArrayList<>(); @@ -1250,7 +1255,7 @@ void testCreateFuncotations(final String expectedGeneName, final String referenceFileName, final FeatureReader featureReader, final ReferenceDataSource referenceDataSource, - final String transcriptFastaFile) { + final String transcriptFastaFile, final String transcriptGtfFile) { final String contig = "chr" + Integer.toString(chromosomeNumber); final SimpleInterval variantInterval = new SimpleInterval( contig, start, end ); @@ -1291,7 +1296,8 @@ void testCreateFuncotations(final String expectedGeneName, GencodeFuncotationFactory.DEFAULT_NAME, FuncotatorArgumentDefinitions.TRANSCRIPT_SELECTION_MODE_DEFAULT_VALUE, requestedTranscriptIds, - new LinkedHashMap<>())) { + new LinkedHashMap<>(), + new FeatureInput<>(transcriptGtfFile, GencodeFuncotationFactory.DEFAULT_NAME, Collections.emptyMap()))) { final List featureList = new ArrayList<>(); featureList.add( gene ); @@ -1565,7 +1571,7 @@ public void testMultipleGeneFeaturesOnlyProduceOneTranscript() throws IOExceptio GencodeFuncotationFactory.DEFAULT_NAME, TranscriptSelectionMode.CANONICAL, Collections.emptySet(), - new LinkedHashMap<>())) { + new 
LinkedHashMap<>(), createFeatureInputForCntn4Ds(GencodeFuncotationFactory.DEFAULT_NAME))) { final List gencodeFuncotations = funcotationFactory.createFuncotationsOnVariant(vc, referenceContext, gencodeFeatures); Assert.assertEquals(gencodeFuncotations.size(), 1); } @@ -1576,7 +1582,7 @@ public void testMultipleGeneFeaturesOnlyProduceOneTranscript() throws IOExceptio GencodeFuncotationFactory.DEFAULT_NAME, TranscriptSelectionMode.BEST_EFFECT, Collections.emptySet(), - new LinkedHashMap<>())) { + new LinkedHashMap<>(), createFeatureInputForCntn4Ds(GencodeFuncotationFactory.DEFAULT_NAME))) { final List gencodeFuncotations = funcotationFactory.createFuncotationsOnVariant(vc, referenceContext, gencodeFeatures); Assert.assertEquals(gencodeFuncotations.size(), 1); } @@ -1587,9 +1593,17 @@ public void testMultipleGeneFeaturesOnlyProduceOneTranscript() throws IOExceptio GencodeFuncotationFactory.DEFAULT_NAME, TranscriptSelectionMode.ALL, Collections.emptySet(), - new LinkedHashMap<>())) { + new LinkedHashMap<>(), createFeatureInputForCntn4Ds(GencodeFuncotationFactory.DEFAULT_NAME))) { final List gencodeFuncotations = funcotationFactory.createFuncotationsOnVariant(vc, referenceContext, gencodeFeatures); Assert.assertTrue(gencodeFuncotations.size() > 1); } } + + private static FeatureInput createFeatureInputForMuc16Ds(final String dsName) { + return new FeatureInput<>(FuncotatorTestConstants.MUC16_GENCODE_ANNOTATIONS_FILE_NAME, dsName, Collections.emptyMap()); + } + + private static FeatureInput createFeatureInputForCntn4Ds(final String dsName) { + return new FeatureInput<>(CNTN4_GENCODE_ANNOTATIONS_FILE_NAME, dsName, Collections.emptyMap()); + } } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/vcf/VcfFuncotationFactoryUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/vcf/VcfFuncotationFactoryUnitTest.java index 94ae66d3660..211bef0f6d8 100644 --- 
a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/vcf/VcfFuncotationFactoryUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/vcf/VcfFuncotationFactoryUnitTest.java @@ -12,21 +12,23 @@ import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.lang3.tuple.Triple; import org.broadinstitute.hellbender.GATKBaseTest; +import org.broadinstitute.hellbender.engine.FeatureInput; import org.broadinstitute.hellbender.engine.ReferenceContext; import org.broadinstitute.hellbender.engine.ReferenceDataSource; +import org.broadinstitute.hellbender.testutils.FuncotatorReferenceTestUtils; +import org.broadinstitute.hellbender.testutils.VariantContextTestUtils; import org.broadinstitute.hellbender.tools.funcotator.Funcotation; import org.broadinstitute.hellbender.tools.funcotator.FuncotatorArgumentDefinitions; import org.broadinstitute.hellbender.tools.funcotator.FuncotatorTestConstants; import org.broadinstitute.hellbender.tools.funcotator.dataSources.TableFuncotation; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.io.IOUtils; -import org.broadinstitute.hellbender.testutils.FuncotatorReferenceTestUtils; -import org.broadinstitute.hellbender.testutils.VariantContextTestUtils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; +import java.nio.file.Path; import java.nio.file.Paths; import java.util.*; import java.util.stream.Collectors; @@ -188,26 +190,26 @@ private Object[][] provideForTestCreateFuncotationsOnVariant() { @Test public void testGetAnnotationFeatureClass() { - final VcfFuncotationFactory vcfFuncotationFactory = new VcfFuncotationFactory(FACTORY_NAME, FACTORY_VERSION, IOUtils.getPath(FuncotatorTestConstants.VARIANT_FILE_HG19_CHR3)); + final VcfFuncotationFactory vcfFuncotationFactory = createVcfFuncotationFactory(FACTORY_NAME, FACTORY_VERSION, 
IOUtils.getPath(FuncotatorTestConstants.VARIANT_FILE_HG19_CHR3)); Assert.assertEquals(vcfFuncotationFactory.getAnnotationFeatureClass(), VariantContext.class); } @Test public void testGetType() { - final VcfFuncotationFactory vcfFuncotationFactory = new VcfFuncotationFactory(FACTORY_NAME, FACTORY_VERSION, IOUtils.getPath(FuncotatorTestConstants.VARIANT_FILE_HG19_CHR3)); + final VcfFuncotationFactory vcfFuncotationFactory = createVcfFuncotationFactory(FACTORY_NAME, FACTORY_VERSION, IOUtils.getPath(FuncotatorTestConstants.VARIANT_FILE_HG19_CHR3)); Assert.assertEquals(vcfFuncotationFactory.getType(), FuncotatorArgumentDefinitions.DataSourceType.VCF); } @Test(dataProvider = "provideForTestGetName") public void testGetName(final String name) { - final VcfFuncotationFactory vcfFuncotationFactory = new VcfFuncotationFactory(name, FACTORY_VERSION, IOUtils.getPath(FuncotatorTestConstants.VARIANT_FILE_HG19_CHR3)); + final VcfFuncotationFactory vcfFuncotationFactory = createVcfFuncotationFactory(name, FACTORY_VERSION, IOUtils.getPath(FuncotatorTestConstants.VARIANT_FILE_HG19_CHR3)); Assert.assertEquals(vcfFuncotationFactory.getName(), name); } @Test public void testGetSupportedFuncotationFields() { final VcfFuncotationFactory vcfFuncotationFactory = - new VcfFuncotationFactory(FACTORY_NAME, FACTORY_VERSION, IOUtils.getPath(FuncotatorTestConstants.DBSNP_HG19_SNIPPET_FILE_PATH)); + createVcfFuncotationFactory(FACTORY_NAME, FACTORY_VERSION, IOUtils.getPath(FuncotatorTestConstants.DBSNP_HG19_SNIPPET_FILE_PATH)); final LinkedHashSet expectedFieldNames = new LinkedHashSet<>(); @@ -225,7 +227,7 @@ public void testCreateFuncotationsOnVariant(final VariantContext variant, // Make our factory: final VcfFuncotationFactory vcfFuncotationFactory = - new VcfFuncotationFactory(FACTORY_NAME, FACTORY_VERSION, IOUtils.getPath(FuncotatorTestConstants.DBSNP_HG19_SNIPPET_FILE_PATH)); + createVcfFuncotationFactory(FACTORY_NAME, FACTORY_VERSION, 
IOUtils.getPath(FuncotatorTestConstants.DBSNP_HG19_SNIPPET_FILE_PATH)); // Create features from the file: final List vcfFeatures; @@ -262,7 +264,7 @@ public void testCreateFuncotationMetadata(final VariantContext variant, // Don't need the expected gt for this test, but useful to reuse the data provider. // Make our factory: final VcfFuncotationFactory vcfFuncotationFactory = - new VcfFuncotationFactory(FACTORY_NAME, FACTORY_VERSION, IOUtils.getPath(FuncotatorTestConstants.DBSNP_HG19_SNIPPET_FILE_PATH)); + createVcfFuncotationFactory(FACTORY_NAME, FACTORY_VERSION, IOUtils.getPath(FuncotatorTestConstants.DBSNP_HG19_SNIPPET_FILE_PATH)); // Create features from the file: final List vcfFeatures; @@ -346,7 +348,7 @@ public void testQueryIntoMultiallelic(final SimpleInterval variantInterval, fina // Make the factory final VcfFuncotationFactory vcfFuncotationFactory = - new VcfFuncotationFactory(FACTORY_NAME, FACTORY_VERSION, IOUtils.getPath(EXAC_SNIPPET)); + createVcfFuncotationFactory(FACTORY_NAME, FACTORY_VERSION, IOUtils.getPath(EXAC_SNIPPET)); final ReferenceContext referenceContext = new ReferenceContext(ReferenceDataSource.of(Paths.get(FuncotatorReferenceTestUtils.retrieveB37Chr3Ref())), variantInterval); @@ -414,7 +416,7 @@ public void testCacheOnObjectReference(){ // Create our funcotation factory to test final VcfFuncotationFactory vcfFuncotationFactory = - new VcfFuncotationFactory(FACTORY_NAME, FACTORY_VERSION, IOUtils.getPath(EXAC_SNIPPET)); + createVcfFuncotationFactory(FACTORY_NAME, FACTORY_VERSION, IOUtils.getPath(EXAC_SNIPPET)); for (int i = 0; i < VcfFuncotationFactory.LRUCache.MAX_ENTRIES; i++) { funcotateForCacheTest(vcfFuncotationFactory, dummyTriples.get(i)); @@ -461,4 +463,10 @@ private Triple> createDummyCache return Triple.of(vc, referenceContext, vcfFeatures); } + + private VcfFuncotationFactory createVcfFuncotationFactory(final String name, + final String version, + final Path sourceFilePath) { + return new VcfFuncotationFactory(name, version, 
sourceFilePath, new LinkedHashMap<>(), new FeatureInput(sourceFilePath.toString(), name, new HashMap<>())); + } } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/xsv/LocatableXsvFuncotationFactoryUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/xsv/LocatableXsvFuncotationFactoryUnitTest.java index be099ab728b..1d3434f6b1f 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/xsv/LocatableXsvFuncotationFactoryUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/xsv/LocatableXsvFuncotationFactoryUnitTest.java @@ -4,10 +4,12 @@ import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; +import org.apache.commons.io.FilenameUtils; import org.broadinstitute.hellbender.GATKBaseTest; import org.broadinstitute.hellbender.engine.ReferenceContext; import org.broadinstitute.hellbender.engine.ReferenceDataSource; import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.testutils.FuncotatorReferenceTestUtils; import org.broadinstitute.hellbender.tools.funcotator.DataSourceFuncotationFactory; import org.broadinstitute.hellbender.tools.funcotator.Funcotation; import org.broadinstitute.hellbender.tools.funcotator.FuncotatorTestConstants; @@ -15,16 +17,20 @@ import org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation; import org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotationBuilder; import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.codecs.xsvLocatableTable.XsvLocatableTableCodec; import org.broadinstitute.hellbender.utils.codecs.xsvLocatableTable.XsvTableFeature; -import org.broadinstitute.hellbender.testutils.FuncotatorReferenceTestUtils; import org.testng.Assert; import 
org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.*; +import java.util.stream.Collectors; /** * Unit tests for {@link LocatableXsvFuncotationFactory}. @@ -277,10 +283,10 @@ private Object[][] provideForTestSetSupportedFuncotationFields() { public void testGetName(final String name, final String expected) { final LocatableXsvFuncotationFactory locatableXsvFuncotationFactory; if ( name == null ) { - locatableXsvFuncotationFactory = new LocatableXsvFuncotationFactory(LocatableXsvFuncotationFactory.DEFAULT_NAME, DataSourceFuncotationFactory.DEFAULT_VERSION_STRING); + locatableXsvFuncotationFactory = new LocatableXsvFuncotationFactory(LocatableXsvFuncotationFactory.DEFAULT_NAME, DataSourceFuncotationFactory.DEFAULT_VERSION_STRING, new LinkedHashMap<>(), null); } else { - locatableXsvFuncotationFactory = new LocatableXsvFuncotationFactory(name, DataSourceFuncotationFactory.DEFAULT_VERSION_STRING); + locatableXsvFuncotationFactory = new LocatableXsvFuncotationFactory(name, DataSourceFuncotationFactory.DEFAULT_VERSION_STRING, new LinkedHashMap<>(), null); } Assert.assertEquals( locatableXsvFuncotationFactory.getName(), expected ); @@ -294,8 +300,20 @@ public void testCreateFuncotations(final VariantContext variant, final List gencodeFuncotations, final List expected) { - final LocatableXsvFuncotationFactory locatableXsvFuncotationFactory = new LocatableXsvFuncotationFactory(defaultDataSourceName, DataSourceFuncotationFactory.DEFAULT_VERSION_STRING, reportableFuncotationFieldNames); + // Create a temporary file for the "backing data" which will only contain the header: + final Path headerBackingDataFilePath = createTempPath("headerBackingDataFile", "csv"); + try { + Files.write(headerBackingDataFilePath, ("CONTIG,START,END," + 
reportableFuncotationFieldNames.stream().collect(Collectors.joining(","))).getBytes()); + + // Create a temporary file for the config file that points to the temporary file for the backing data: + createTemporaryConfigFile(headerBackingDataFilePath); + } + catch (final IOException ex) { + throw new GATKException("Could not write to temp file for testing: " + headerBackingDataFilePath.toUri(), ex); + } + final LocatableXsvFuncotationFactory locatableXsvFuncotationFactory = new LocatableXsvFuncotationFactory(defaultDataSourceName, DataSourceFuncotationFactory.DEFAULT_VERSION_STRING, new LinkedHashMap<>(), null); + locatableXsvFuncotationFactory.setSupportedFuncotationFields(new ArrayList<>(Collections.singletonList(headerBackingDataFilePath))); Assert.assertEquals( locatableXsvFuncotationFactory.createFuncotationsOnVariant( variant, referenceContext, featureList ), @@ -311,7 +329,7 @@ public void testCreateFuncotations(final VariantContext variant, @Test(dataProvider = "provideForTestSetSupportedFuncotationFields") public void testSetSupportedFuncotationFields(final List dataFilePaths, final LinkedHashSet expected) { - final LocatableXsvFuncotationFactory locatableXsvFuncotationFactory = new LocatableXsvFuncotationFactory(); + final LocatableXsvFuncotationFactory locatableXsvFuncotationFactory = new LocatableXsvFuncotationFactory(LocatableXsvFuncotationFactory.DEFAULT_NAME, DataSourceFuncotationFactory.DEFAULT_VERSION_STRING, new LinkedHashMap<>(), null); locatableXsvFuncotationFactory.setSupportedFuncotationFields(dataFilePaths); @@ -323,7 +341,71 @@ public void testSetSupportedFuncotationFields(final List dataFilePaths, @Test(expectedExceptions = GATKException.class) public void testGetSupportedFuncotationFields() { - final LocatableXsvFuncotationFactory locatableXsvFuncotationFactory = new LocatableXsvFuncotationFactory(); + final LocatableXsvFuncotationFactory locatableXsvFuncotationFactory = new 
LocatableXsvFuncotationFactory(LocatableXsvFuncotationFactory.DEFAULT_NAME, DataSourceFuncotationFactory.DEFAULT_VERSION_STRING, new LinkedHashMap<>(), null); locatableXsvFuncotationFactory.getSupportedFuncotationFields(); } + + private void createTemporaryConfigFile(final Path backingDataSourcePath) throws IOException { + + // Config file must be next to backingDataSourcePath, and have the same base name, with the .config extension: + final String backingDataSourceFileName = backingDataSourcePath.toFile().getName(); + final String configFileBaseName = FilenameUtils.removeExtension(backingDataSourceFileName); + final Path configPath = backingDataSourcePath.resolveSibling(configFileBaseName + XsvLocatableTableCodec.CONFIG_FILE_EXTENSION); + + final File configFile = configPath.toAbsolutePath().toFile(); + configFile.createNewFile(); + + try(final PrintWriter writer = new PrintWriter(configPath.toAbsolutePath().toFile())) { + writer.println("name = "); + writer.println("version = TEST"); + writer.println("src_file = " + backingDataSourceFileName); + writer.println("origin_location = LocatableXsvFuncotationFactoryUnitTest.java"); + writer.println("preprocessing_script = "); + writer.println(""); + writer.println("# Supported types:"); + writer.println("# simpleXSV -- Arbitrary separated value table (e.g. CSV), keyed off Gene Name OR Transcript ID"); + writer.println("# locatableXSV -- Arbitrary separated value table (e.g. 
CSV), keyed off a genome location"); + writer.println("# gencode -- Custom datasource class for GENCODE"); + writer.println("# cosmic -- Custom datasource class for COSMIC"); + writer.println("# vcf -- Custom datasource class for Variant Call Format (VCF) files"); + writer.println(" type = locatableXSV"); + writer.println(""); + writer.println("# Required field for GENCODE files."); + writer.println("# Path to the FASTA file from which to load the sequences for GENCODE transcripts:"); + writer.println(" gencode_fasta_path ="); + writer.println(""); + writer.println("# Required field for simpleXSV files."); + writer.println("# Valid values:"); + writer.println("# GENE_NAME"); + writer.println("# TRANSCRIPT_ID"); + writer.println(" xsv_key = "); + writer.println(""); + writer.println("# Required field for simpleXSV files."); + writer.println("# The 0-based index of the column containing the key on which to match"); + writer.println(" xsv_key_column ="); + writer.println(""); + writer.println("# Required field for simpleXSV AND locatableXSV files."); + writer.println("# The delimiter by which to split the XSV file into columns."); + writer.println(" xsv_delimiter = ,"); + writer.println(""); + writer.println("# Required field for simpleXSV files."); + writer.println("# Whether to permissively match the number of columns in the header and data rows"); + writer.println("# Valid values:"); + writer.println("# true"); + writer.println("# false"); + writer.println(" xsv_permissive_cols = "); + writer.println(""); + writer.println("# Required field for locatableXSV files."); + writer.println("# The 0-based index of the column containing the contig for each row"); + writer.println(" contig_column = 0 "); + writer.println(""); + writer.println("# Required field for locatableXSV files."); + writer.println("# The 0-based index of the column containing the start position for each row"); + writer.println(" start_column = 1 "); + writer.println(""); + writer.println("# Required 
field for locatableXSV files."); + writer.println("# The 0-based index of the column containing the end position for each row"); + writer.println(" end_column = 2"); + } + } } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/CustomMafFuncotationCreatorUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/CustomMafFuncotationCreatorUnitTest.java index 1fb0c2be69e..69de18c400d 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/CustomMafFuncotationCreatorUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/CustomMafFuncotationCreatorUnitTest.java @@ -8,6 +8,8 @@ import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.vcf.VCFFileReader; import org.broadinstitute.hellbender.GATKBaseTest; +import org.broadinstitute.hellbender.engine.FeatureContext; +import org.broadinstitute.hellbender.engine.FeatureInput; import org.broadinstitute.hellbender.engine.ReferenceContext; import org.broadinstitute.hellbender.engine.ReferenceDataSource; import org.broadinstitute.hellbender.tools.funcotator.DataSourceFuncotationFactory; @@ -19,16 +21,15 @@ import org.broadinstitute.hellbender.tools.funcotator.metadata.TumorNormalPair; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.io.IOUtils; +import org.broadinstitute.hellbender.utils.test.FuncotatorTestUtils; import org.codehaus.plexus.util.StringUtils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.nio.file.Path; import java.nio.file.Paths; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.stream.Collectors; import static org.broadinstitute.hellbender.tools.funcotator.mafOutput.CustomMafFuncotationCreator.MAF_DBSNP_VAL_STATUS_DELIMITER; @@ -145,8 +146,15 @@ public 
Object[][] provideDbSnpVariants(){ */ @Test(dataProvider = "provideDbSnpVariants") public void testCreateDbSnpCustomFields(final VariantContext variant, final int gtNumHits, final String gtDbSnpValStatusField) { + + final Path sourceFilePath = IOUtils.getPath(FuncotatorTestConstants.DBSNP_HG19_SNIPPET_FILE_PATH); final DataSourceFuncotationFactory vcfFuncotationFactory = - new VcfFuncotationFactory(DBSNP_DS_NAME, "snippetTest", IOUtils.getPath(FuncotatorTestConstants.DBSNP_HG19_SNIPPET_FILE_PATH)); + new VcfFuncotationFactory(DBSNP_DS_NAME, + "snippetTest", + sourceFilePath, + new LinkedHashMap<>(), + new FeatureInput(sourceFilePath.toString(), DBSNP_DS_NAME, new HashMap<>()) + ); /* dbSNP records of relevance. 1 10177 rs367896724 A AC . . RS=367896724;RSPOS=10177;dbSNPBuildID=138;SSR=0;SAO=0;VP=0x050000020005170026000200;GENEINFO=DDX11L1:100287102;WGT=1;VC=DIV;R5;ASP;VLD;G5A;G5;KGPhase3;CAF=0.5747,0.4253;COMMON=1 @@ -166,7 +174,12 @@ public void testCreateDbSnpCustomFields(final VariantContext variant, final int new SimpleInterval(variant.getContig(), variant.getStart(), variant.getEnd())); - final List funcotations = vcfFuncotationFactory.createFuncotations(variant, referenceContext, vcfFuncotationSourceMap); + final FeatureContext featureContext = FuncotatorTestUtils.createFeatureContext(Collections.singletonList(vcfFuncotationFactory), + "TEST_CREATE_DB_SNP_CUSTOM_FIELDS", + new SimpleInterval(variant.getContig(), variant.getStart(), variant.getEnd()), + 0, 0, 0, null); + + final List funcotations = vcfFuncotationFactory.createFuncotations(variant, referenceContext, featureContext); Assert.assertTrue(funcotations.size() > 0); for (final Funcotation f : funcotations) { Assert.assertEquals(StringUtils.split(f.getField(DBSNP_DS_NAME + "_VLD"), "|").length, vcfFuncotationSourceMap.get(DBSNP_DS_NAME).size()); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRendererUnitTest.java 
b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRendererUnitTest.java index dcd019b7bc7..ed0676a0398 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRendererUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRendererUnitTest.java @@ -6,7 +6,9 @@ import htsjdk.variant.vcf.VCFHeader; import org.apache.commons.collections.MapUtils; import org.broadinstitute.hellbender.GATKBaseTest; +import org.broadinstitute.hellbender.engine.DummyPlaceholderGatkTool; import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.testutils.IntegrationTestSpec; import org.broadinstitute.hellbender.tools.funcotator.*; import org.broadinstitute.hellbender.tools.funcotator.dataSources.DataSourceUtils; import org.broadinstitute.hellbender.tools.funcotator.dataSources.TableFuncotation; @@ -14,7 +16,6 @@ import org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotationFactory; import org.broadinstitute.hellbender.tools.funcotator.vcfOutput.VcfOutputRenderer; import org.broadinstitute.hellbender.utils.io.IOUtils; -import org.broadinstitute.hellbender.testutils.IntegrationTestSpec; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -108,7 +109,9 @@ private MafOutputRenderer createMafOutputRenderer(final File outputFile, final S configData, new LinkedHashMap<>(), TranscriptSelectionMode.BEST_EFFECT, - new HashSet<>() + new HashSet<>(), + new DummyPlaceholderGatkTool(), + FuncotatorArgumentDefinitions.LOOKAHEAD_CACHE_IN_BP_DEFAULT_VALUE ); // Sort the datasources to ensure the same order every time: diff --git a/src/test/java/org/broadinstitute/hellbender/utils/test/FuncotatorTestUtils.java b/src/test/java/org/broadinstitute/hellbender/utils/test/FuncotatorTestUtils.java new file mode 100644 index 00000000000..360f3f0c385 --- /dev/null +++ 
b/src/test/java/org/broadinstitute/hellbender/utils/test/FuncotatorTestUtils.java @@ -0,0 +1,52 @@ +package org.broadinstitute.hellbender.utils.test; + +import com.google.common.annotations.VisibleForTesting; +import htsjdk.tribble.Feature; +import org.broadinstitute.hellbender.cmdline.CommandLineProgram; +import org.broadinstitute.hellbender.engine.FeatureContext; +import org.broadinstitute.hellbender.engine.FeatureInput; +import org.broadinstitute.hellbender.engine.FeatureManager; +import org.broadinstitute.hellbender.tools.funcotator.DataSourceFuncotationFactory; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.Utils; + +import java.nio.file.Path; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +public class FuncotatorTestUtils { + private FuncotatorTestUtils() {} + + + /** + * Since funcotation factories need an instance of {@link FeatureContext} to funcotate, this convenience method can + * create a new instance for test methods. + * + * @param funcotationFactories {@link List} of {@link DataSourceFuncotationFactory} that should be used to generate the + * {@link FeatureContext}. Never {@code null}, but empty list is acceptable. + * @param dummyToolInstanceName A name to use for the "tool". Any string will work here. Never {@code null}. + * @param interval genomic interval for the result. Typically, this would be the interval of the variant. Never {@code null}. + * @param featureQueryLookahead When querying FeatureDataSources, cache this many extra bases of context beyond + * the end of query intervals in anticipation of future queries. Must be >= 0. If uncertain, use zero. + * @param cloudPrefetchBuffer See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, Path)} If uncertain, use zero. + * @param cloudIndexPrefetchBuffer See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, Path)} If uncertain, use zero. 
+ * @param reference See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, Path)} If uncertain, use {@code null}. + * @return a {@link FeatureContext} ready for querying the funcotation factories on the given interval. Never {@code null}. + */ + @VisibleForTesting + public static FeatureContext createFeatureContext(final List funcotationFactories, final String dummyToolInstanceName, + final SimpleInterval interval, final int featureQueryLookahead, final int cloudPrefetchBuffer, + final int cloudIndexPrefetchBuffer, final Path reference) { + Utils.nonNull(funcotationFactories); + Utils.nonNull(dummyToolInstanceName); + Utils.nonNull(interval); + + final Map, Class> featureInputsWithType = + funcotationFactories.stream() + .collect(Collectors.toMap(ff -> ff.getMainSourceFileAsFeatureInput(), ff -> ff.getAnnotationFeatureClass())); + + return FeatureContext.createFeatureContextForTesting(featureInputsWithType, dummyToolInstanceName, interval, + featureQueryLookahead, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, reference); + } +} diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/funcotator/PIK3CA_SNPS_engine_test_chr3.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/funcotator/PIK3CA_SNPS_engine_test_chr3.vcf new file mode 100644 index 00000000000..a95b28bfc1b --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/funcotator/PIK3CA_SNPS_engine_test_chr3.vcf @@ -0,0 +1,8 @@ +##fileformat=VCFv4.1 +##fileDate=201708028 +##source=FuncotatorTestsV0.1 +##reference=file:///Users/jonn/Development/references/Homo_sapiens_assembly19.fasta +#CHROM POS ID REF ALT QUAL FILTER INFO +chr3 178866587 . G A 40 . . +chr3 178916400 . A T 40 . . +chr3 178916617 . C T 40 . .