Skip to content

Commit

Permalink
Funcotation Engine Refactoring (#5134)
Browse files Browse the repository at this point in the history
Massive refactoring of `Funcotator` tool to aid long-term maintenance.

Extracted a `FuncotatorEngine` from `Funcotator.java`

A large refactoring of the architecture of `Funcotator` to make the
creation of annotations separate from the command line interface.

This will enable better testing of the annotation functionality by
allowing for unit tests that can test the created funcotations directly
(as opposed to requiring integration tests for the same purpose).
  • Loading branch information
LeeTL1220 authored and jonn-smith committed Sep 19, 2018
1 parent fadc67b commit 6391a47
Show file tree
Hide file tree
Showing 27 changed files with 1,293 additions and 606 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,12 @@
public final class OptionalReadInputArgumentCollection extends ReadInputArgumentCollection {
private static final long serialVersionUID = 1L;

@Argument(fullName = StandardArgumentDefinitions.INPUT_LONG_NAME, shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME, doc = "BAM/SAM/CRAM file containing reads", optional = true, common = true)
private List<String> readFilesNames;
@Argument(fullName = StandardArgumentDefinitions.INPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME,
doc = "BAM/SAM/CRAM file containing reads",
optional = true,
common = true)
private List<String> readFilesNames = new ArrayList<>();

@Override
public List<File> getReadFiles() {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
package org.broadinstitute.hellbender.engine;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.tribble.Feature;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;

/**
Expand Down Expand Up @@ -290,4 +290,32 @@ public <T extends Feature> List<T> getValues(final Collection<FeatureInput<T>> f
private <T extends Feature> List<T> subsetToStartPosition(final Collection<T> features, final int start) {
return features.stream().filter(feat -> feat.getStart() == start).collect(Collectors.toList());
}

/**
* Convenience method to create a new instance for test methods.
* This method should be used for testing only.
*
* @param featureInputsWithType {@link Map} of a {@link FeatureInput} to the output type that must extend {@link Feature}.
* Never {@code null}, but empty list is acceptable.
* @param dummyToolInstanceName A name to use for the "tool". Any string will work here. Never {@code null}.
* @param interval genomic interval for the result. Typically, this would be the interval of the variant. Never {@link null}.
* @param featureQueryLookahead When querying FeatureDataSources, cache this many extra bases of context beyond
* the end of query intervals in anticipation of future queries. Must be >= 0. If uncertain, use zero.
* @param cloudPrefetchBuffer See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, Path)} If uncertain, use zero.
* @param cloudIndexPrefetchBuffer See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, Path)} If uncertain, use zero.
* @param reference See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, Path)} If uncertain, use {@code null}.
*/
@VisibleForTesting
public static FeatureContext createFeatureContextForTesting(final Map<FeatureInput<? extends Feature>, Class<? extends Feature>> featureInputsWithType, final String dummyToolInstanceName,
final SimpleInterval interval, final int featureQueryLookahead, final int cloudPrefetchBuffer,
final int cloudIndexPrefetchBuffer, final Path reference) {
Utils.nonNull(featureInputsWithType);
Utils.nonNull(dummyToolInstanceName);
Utils.nonNull(interval);

final FeatureManager featureManager = new FeatureManager(featureInputsWithType, dummyToolInstanceName,
featureQueryLookahead, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, reference);

return new FeatureContext(featureManager, interval);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@
* system only in order to be recognized by the Feature management system. This is why the constructor is
* marked as protected.
*
* If you still want to instantiate this class directly, you will have to call {@link GATKTool#addFeatureInputsAfterInitialization(String, String, Class, int)}
* in order to register the FeatureInput with the engine.
*
* FeatureInputs can be assigned logical names on the command line using the syntax:
*
* --argument_name logical_name:feature_file
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.broadinstitute.hellbender.engine;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.tribble.Feature;
import htsjdk.tribble.FeatureCodec;
Expand All @@ -14,6 +15,7 @@
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.config.ConfigFactory;
import org.broadinstitute.hellbender.utils.config.GATKConfig;

Expand Down Expand Up @@ -153,6 +155,30 @@ public FeatureManager(final CommandLineProgram toolInstance, final int featureQu
initializeFeatureSources(featureQueryLookahead, toolInstance, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, reference);
}

/**
* Same as {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, Path)}, except used when the
* FeatureInputs (and associated types) are known.
*
* This constructor should only be used in test code.
*
* @param featureInputsToTypeMap {@link Map} of a {@link FeatureInput} to the output type that must extend {@link Feature}. Never {@code null}
* @param toolInstanceName See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, Path)}
* @param featureQueryLookahead See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, Path)}
* @param cloudPrefetchBuffer See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, Path)}
* @param cloudIndexPrefetchBuffer See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, Path)}
* @param reference See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, Path)}
*/
@VisibleForTesting
FeatureManager(final Map<FeatureInput<? extends Feature>, Class<? extends Feature>> featureInputsToTypeMap, final String toolInstanceName, final int featureQueryLookahead, final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer, final Path reference) {

Utils.nonNull(featureInputsToTypeMap);

this.toolInstanceSimpleClassName = toolInstanceName;
this.featureSources = new LinkedHashMap<>();
Utils.nonNull(featureInputsToTypeMap);
featureInputsToTypeMap.forEach((k,v) -> addToFeatureSources(featureQueryLookahead, k, v, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, reference));
}

/**
* Given our tool instance, discover all argument of type FeatureInput (or Collections thereof), determine
* the type of each Feature-containing file, and add a FeatureDataSource for each file to our query pool.
Expand Down
29 changes: 8 additions & 21 deletions src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java
Original file line number Diff line number Diff line change
Expand Up @@ -863,37 +863,24 @@ record = header.getProgramRecord(pgID);
}

/**
* Call {@link GATKTool#addFeatureInputsAfterInitialization(String, String, Class, int)} with no caching.
*
* @param filePath See {@link #addFeatureInputsAfterInitialization(String, String, Class, int)}
* @param name See {@link #addFeatureInputsAfterInitialization(String, String, Class, int)}
* @param featureType See {@link #addFeatureInputsAfterInitialization(String, String, Class, int)}
* @return The {@link FeatureInput} used as the key for this data source.
*/
protected FeatureInput<? extends Feature> addFeatureInputsAfterInitialization(final String filePath, final String name,
final Class<? extends Feature> featureType) {

return addFeatureInputsAfterInitialization(filePath, name, featureType, 0);
}

/**
* A method to allow a user to inject data sources after initialization that were not specified as command-line
* arguments.
* A method to allow a user to inject {@link FeatureInput}s after initialization that were not
* specified as command-line arguments.
*
* @param filePath path to the Feature file to register
* @param name what to call the Feature input
* @param featureType class of features
* @param featureQueryLookahead look ahead this many bases during queries that produce cache misses
* @return The {@link FeatureInput} used as the key for this data source.
*/
protected FeatureInput<? extends Feature> addFeatureInputsAfterInitialization(final String filePath,
final String name,
final Class<? extends Feature> featureType, final int featureQueryLookahead) {
public FeatureInput<? extends Feature> addFeatureInputsAfterInitialization(final String filePath,
final String name,
final Class<? extends Feature> featureType,
final int featureQueryLookahead) {

final FeatureInput<? extends Feature> featureInput = new FeatureInput<>(filePath, name);

//Add datasource to the feature manager too so that it can be queried. Setting lookahead to 0 to avoid caching.
//Note: we are disabling lookahead here because of windowed queries that need to "look behind" as well.
// Add the FeatureInput to our FeatureManager so that it will be available for FeatureContext queries
// from the tool
features.addToFeatureSources(
featureQueryLookahead,
featureInput,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.utils.Utils;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.FeatureInput;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation;

import java.io.Closeable;
import java.util.*;
import java.util.stream.Collectors;

/**
* An abstract class to allow for the creation of a {@link Funcotation} for a given data source.
Expand All @@ -37,6 +39,33 @@ public abstract class DataSourceFuncotationFactory implements Closeable {
*/
protected Map<String, String> annotationOverrideMap;

/**
* The backing data store as a FeatureInput to leverage tribble querying. Can be {@code null} for non-locatable
* funcotation factories.
*/
protected final FeatureInput<? extends Feature> mainSourceFileAsFeatureInput;

@VisibleForTesting
public FeatureInput<? extends Feature> getMainSourceFileAsFeatureInput() {
return mainSourceFileAsFeatureInput;
}

/**
* Constructor to initialize final fields in this class with defaults.
*/
protected DataSourceFuncotationFactory() {
this.mainSourceFileAsFeatureInput = null;
}

/**
* Constructor to initialize final fields in this class.
* @param mainSourceFileAsFeatureInput The backing data store as a FeatureInput to leverage tribble querying. Can be {@code null} for non-locatable funcotation factories.
*/
protected DataSourceFuncotationFactory(final FeatureInput<? extends Feature> mainSourceFileAsFeatureInput) {
this.mainSourceFileAsFeatureInput = mainSourceFileAsFeatureInput;
}


/**
* Set values in {@link DataSourceFuncotationFactory#annotationOverrideMap} based on the given annotation override values
* and whether or not this {@link DataSourceFuncotationFactory} supports those annotations.
Expand Down Expand Up @@ -106,27 +135,33 @@ public String getVersion() {
* Accounts for override values passed into the constructor as well.
* @param variant {@link VariantContext} to annotate.
* @param referenceContext {@link ReferenceContext} corresponding to the given {@code variant}.
* @param featureSourceMap {@link Map} of {@link String} -> {@link List} of {@link Feature} (data source name -> data source features corresponding to the given {@code variant}.
* @param featureContext {@link FeatureContext} corresponding to the variant. Never {@code null}.
* @return {@link List} of {@link Funcotation} given the {@code variant}, {@code referenceContext}, and {@code featureContext}. This should never be empty.
*/
public List<Funcotation> createFuncotations(final VariantContext variant, final ReferenceContext referenceContext, final Map<String, List<Feature>> featureSourceMap) {
return createFuncotations(variant, referenceContext, featureSourceMap, null);
public List<Funcotation> createFuncotations(final VariantContext variant, final ReferenceContext referenceContext, final FeatureContext featureContext) {
return createFuncotations(variant, referenceContext, featureContext, null);
}

/**
* Creates a {@link List} of {@link Funcotation} for the given {@code variant}, {@code referenceContext}, {@code featureContext}, and {@code gencodeFuncotations}.
* For some Data Sources knowledge of Gene Name or Transcript ID is required for annotation.
* Accounts for override values passed into the constructor as well.
* @param variant {@link VariantContext} to annotate.
* @param referenceContext {@link ReferenceContext} corresponding to the given {@code variant}.
* @param featureSourceMap {@link Map} of {@link String} -> {@link List} of {@link Feature} (data source name -> data source features corresponding to the given {@code variant}.
* @param variant {@link VariantContext} to annotate. Never {@code null}.
* @param referenceContext {@link ReferenceContext} corresponding to the given {@code variant}. Never {@code null}.
* @param featureContext {@link FeatureContext} corresponding to the variant. Never {@code null}.
* @param gencodeFuncotations {@link List} of {@link GencodeFuncotation} that have already been created for the given {@code variant}/{@code referenceContext}/{@code featureContext}.
* {@code null} is acceptable if there are no corresponding gencode funcotations.
* @return {@link List} of {@link Funcotation} given the {@code variant}, {@code referenceContext}, and {@code featureContext}. This should never be empty.
*/
public List<Funcotation> createFuncotations(final VariantContext variant, final ReferenceContext referenceContext, final Map<String, List<Feature>> featureSourceMap, final List<GencodeFuncotation> gencodeFuncotations) {
public List<Funcotation> createFuncotations(final VariantContext variant, final ReferenceContext referenceContext, final FeatureContext featureContext, final List<GencodeFuncotation> gencodeFuncotations) {

Utils.nonNull(variant);
Utils.nonNull(referenceContext);
Utils.nonNull(featureContext);

// Get the features that this funcotation factory is responsible for:
final List<Feature> featureList = getFeatureListFromMap(featureSourceMap);
// Query this funcotation factory to get the list of overlapping features.
@SuppressWarnings("unchecked")
final List<Feature> featureList = (List<Feature>) featureContext.getValues(mainSourceFileAsFeatureInput);

final List<Funcotation> outputFuncotations;

Expand Down Expand Up @@ -175,30 +210,6 @@ private boolean isFeatureListCompatible(final List<Feature> featureList) {
return foundCompatibleFeature;
}

/**
* Get the list of features to annotate from the given Map of features.
* Extracts the feature list given the name of this {@link DataSourceFuncotationFactory}.
* @param featureSourceMap {@link Map} of {@link String} -> ({@link List} of {@link Feature}) (Data source name -> feature list) containing all features that could be used for this {@link DataSourceFuncotationFactory}.
* @return A {@link List} of {@link Feature} that are to be annotated by this {@link DataSourceFuncotationFactory}
*/
private List<Feature> getFeatureListFromMap(final Map<String, List<Feature>> featureSourceMap) {
// Get the features that this funcotation factory is responsible for:
final List<Feature> featureList;

// Only worry about name filtering if we care about the specific feature type:
// NOTE: This should probably be fixed to key off some other abstract class logic.
if ( getAnnotationFeatureClass().equals(Feature.class) ) {
featureList = featureSourceMap.entrySet().stream()
.map(Map.Entry::getValue)
.flatMap(Collection::stream)
.collect(Collectors.toList());
}
else {
featureList = featureSourceMap.getOrDefault( getName(), Collections.emptyList() );
}
return featureList;
}

/**
* Creates a {@link List} of {@link Funcotation} for the given {@code variant} and {@code referenceContext}.
* These will be default funcotations that essentially have empty values.
Expand Down Expand Up @@ -234,5 +245,6 @@ protected abstract List<Funcotation> createFuncotationsOnVariant(final VariantCo
/**
* @return Get the {@link Class} of the feature type that can be used to create annotations by this {@link DataSourceFuncotationFactory}.
*/
protected abstract Class<? extends Feature> getAnnotationFeatureClass();
@VisibleForTesting
public abstract Class<? extends Feature> getAnnotationFeatureClass();
}
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,9 @@ public List<Funcotation> get(final String transcriptId) {
* @param transcriptId the specified transcript ID. Use {@see NO_TRANSCRIPT_AVAILABLE_KEY} if there are no transcripts. Never {@code null}
* @param fieldName The field name to search. Never {@code null}
* @param allele Only return fields from funcotations with the specified allele. Never {@code null}
* @return Value of the given field for the transcript ID and allele. Return {@code null} if field not found.
* @return Value of the given field for the transcript ID and allele. Return {@code null} if field not found in any
* funcotation. Note that if the funcotations support the given field name, but the variant did not overlap any
* records, an empty string will be returned.
*/
public String getFieldValue(final String transcriptId, final String fieldName, final Allele allele) {
Utils.nonNull(transcriptId);
Expand Down
Loading

0 comments on commit 6391a47

Please sign in to comment.