Skip to content

Commit

Permalink
- Added flag to ease restrictions on b37 reference for chr/non-chr co…
Browse files Browse the repository at this point in the history
…mparisons. This is useful for dev testing.

- Added parsing of funcotation attribute using the VCF Header.  Mostly used for dev testing.
- VCF Datasources now have to match the alt and ref alleles to be condsidered a hit.
- Fixing #4808
- Added FuncotationMap to improve encapsulation and make rendering a bit easier.
- Fixed issue with FuncotatorIntegrationTest where default values were references that were changed, thereby causing cross-test errors.
- A lot of refactoring to create FuncotationMap and use that to send to OutputRenderers.
- VCF and MAF will honor the canonical vs. all vs best effect.
  • Loading branch information
LeeTL1220 committed May 31, 2018
1 parent ca2a4e3 commit 43c5098
Show file tree
Hide file tree
Showing 53 changed files with 1,230 additions and 368 deletions.
11 changes: 11 additions & 0 deletions src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java
Original file line number Diff line number Diff line change
Expand Up @@ -821,6 +821,17 @@ record = header.getProgramRecord(pgID);
*/
protected String getToolkitShortName() { return "GATK"; }

/**
* Call {@link GATKTool#addFeatureInputsAfterInitialization(String, String, Class, int)} with no caching.
*
* @return The {@link FeatureInput} used as the key for this data source.
*/
protected FeatureInput<? extends Feature> addFeatureInputsAfterInitialization(final String filePath, final String name,
final Class<? extends Feature> featureType) {

return addFeatureInputsAfterInitialization(filePath, name, featureType, 0);
}

/**
* A method to allow a user to inject data sources after initialization that were not specified as command-line
* arguments.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ public AnnotatedIntervalHeader readActualHeader(final LineIterator reader) {

@Override
public boolean canDecode(final String path) {
return (path.endsWith(".seg") && xsvLocatableTableCodec.canDecodeMinusExtensionChecks(path));
return (path.endsWith(".seg") || path.endsWith(".maf") || path.endsWith(".maf.annotated")) && xsvLocatableTableCodec.canDecodeMinusExtensionChecks(path);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,11 +128,13 @@ public List<Funcotation> createFuncotations(final VariantContext variant, final
// Get the features that this funcotation factory is responsible for:
final List<Feature> featureList = getFeatureListFromMap(featureSourceMap);

final List<Funcotation> outputFuncotations;

// If our featureList is compatible with this DataSourceFuncotationFactory, then we make our funcotations:
if ( isFeatureListCompatible(featureList) ) {

// Create our funcotations:
final List<Funcotation> outputFuncotations;

if ( gencodeFuncotations == null ) {
outputFuncotations = createFuncotationsOnVariant(variant, referenceContext, featureList);
}
Expand All @@ -142,12 +144,16 @@ public List<Funcotation> createFuncotations(final VariantContext variant, final

// Set our overrides:
setOverrideValuesInFuncotations(outputFuncotations);

return outputFuncotations;
}
else {
return createDefaultFuncotationsOnVariant(variant, referenceContext);
}

if ((outputFuncotations == null) || (outputFuncotations.size() == 0)) {
return createDefaultFuncotationsOnVariant(variant, referenceContext);
} else {
return outputFuncotations;
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,4 +72,9 @@ default void setFieldSerializationOverrideValues(final Map<String,String> fieldS
default String serializeToVcfString(final String manualAnnotationString) {
return (manualAnnotationString == null ? "" : manualAnnotationString) + serializeToVcfString();
}

/**
* @return Return whether the field exists in this {@link Funcotation}.
*/
boolean hasField(final String fieldName);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
package org.broadinstitute.hellbender.tools.funcotator;

import htsjdk.variant.variantcontext.Allele;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.TableFuncotation;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation;
import org.broadinstitute.hellbender.tools.funcotator.vcfOutput.VcfOutputRenderer;
import org.broadinstitute.hellbender.utils.Utils;

import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;


/**
* A linked map of transcript IDs to funcotations. Also, supports some querying.
*
* Supports ordering of the transcripts.
*
* Multiple GencodeFuncotations for the same transcript are prohibited.
*
* Not thread-safe.
*/
public class FuncotationMap {

public final static String NO_TRANSCRIPT_AVAILABLE_KEY = "no_transcript";

/** Standard Logger. */
protected static final Logger logger = LogManager.getLogger(FuncotationMap.class);


final private Map<String, LinkedHashSet<Funcotation>> txToFuncotations = new LinkedHashMap<>();

private FuncotationMap() {}

private static boolean isGencodeFuncotation(final Funcotation f) {
return f instanceof GencodeFuncotation;
}

/**
* @param transcriptId the specified transcript ID. Use {@see NO_TRANSCRIPT_AVAILABLE_KEY} if there are no transcripts. Never {@code null}
* @return A list of the Gencode Funcotations only. Empty list, if nothing found. Never {@code null}
*/
public List<GencodeFuncotation> getGencodeFuncotations(final String transcriptId) {
Utils.nonNull(transcriptId);
return txToFuncotations.getOrDefault(transcriptId, new LinkedHashSet<>()).stream()
.filter(FuncotationMap::isGencodeFuncotation).map(f-> (GencodeFuncotation) f).collect(Collectors.toList());
}

/**
* Get all funcotations for the given transcript. Note that the alleles will be mixed.
* @param transcriptId the specified transcript ID. Use {@see NO_TRANSCRIPT_AVAILABLE_KEY} if there are no transcripts.
* @return Empty list, if nothing found. Never {@code null}
*/
public List<Funcotation> get(final String transcriptId) {
Utils.nonNull(transcriptId);
return new ArrayList<>(txToFuncotations.getOrDefault(transcriptId, new LinkedHashSet<>()));
}

/**
* @param transcriptId the specified transcript ID. Use {@see NO_TRANSCRIPT_AVAILABLE_KEY} if there are no transcripts. Never {@code null}
* @param fieldName The field name to search. Never {@code null}
* @param allele Only return fields from funcotations with the specified allele. Never {@code null}
* @return Value of the given field for the transcript ID and allele. Return {@code null} if field not found.
*/
public String getFieldValue(final String transcriptId, final String fieldName, final Allele allele) {
Utils.nonNull(transcriptId);
Utils.nonNull(fieldName);
Utils.nonNull(allele);
final List<String> values = txToFuncotations.getOrDefault(transcriptId, new LinkedHashSet<>()).stream()
.filter(f -> f.hasField(fieldName))
.filter(f -> f.getAltAllele().equals(allele))
.map(f -> f.getField(fieldName))
.collect(Collectors.toList());
if (values.size() > 1) {
throw new GATKException.ShouldNeverReachHereException("Found more than one value for " + transcriptId + ", "
+ allele + ", " + fieldName);
}
if (values.size() == 0) {
return null;
} else {
return values.get(0);
}
}

/**
* @return An empty FuncotationMap. Never {@code null}
*/
private static FuncotationMap createEmpty() {
return new FuncotationMap();
}

/**
* @return A FuncotationMap with only one transcript ID, which is {@see NO_TRANSCRIPT_AVAILABLE_KEY}. Never {@code null}
*/
public static FuncotationMap createNoTranscriptInfo(final List<Funcotation> funcotations) {
final FuncotationMap result = createEmpty();
result.add(NO_TRANSCRIPT_AVAILABLE_KEY, funcotations);
return result;
}

/** This method checks for transcript duplications in the list of GencodeFuncotations.
* Recommended to gather all gencode funcotations before calling this method.
* @param gencodeFuncotations Never {@code null}
* @return A new FuncotationMap created only from the given list of GencodeFuncotations. Never {@code null}
*/
public static FuncotationMap createFromGencodeFuncotations(final List<GencodeFuncotation> gencodeFuncotations) {

Utils.nonNull(gencodeFuncotations);
Utils.validateArg(!areDuplicateTranscriptIDsFound(gencodeFuncotations), "Duplicate transcript ID entries were found in input: " +
gencodeFuncotations.stream().map(gf -> gf.getAnnotationTranscript()).collect(Collectors.joining(",")));
final FuncotationMap result = createEmpty();
gencodeFuncotations.forEach(f -> result.add(f.getAnnotationTranscript(), f));
return result;
}

/**
* Add the given funcotations to the given transcript ID.
*
* @param txId Never {@code null}
* @param funcotations Never {@code null}
*/
public void add(final String txId, final List<Funcotation> funcotations) {
Utils.nonNull(txId);
Utils.nonNull(funcotations);

final LinkedHashSet<Funcotation> existingFuncotationsToUpdate = txToFuncotations.getOrDefault(txId, new LinkedHashSet<>());
existingFuncotationsToUpdate.addAll(funcotations);
txToFuncotations.put(txId, existingFuncotationsToUpdate);
}
/** Add the given funcotation to the given transcript ID.
*
* @param txId Never {@code null}
* @param funcotation Never {@code null}
*/
public void add(final String txId, final Funcotation funcotation) {
Utils.nonNull(txId);
Utils.nonNull(funcotation);
final LinkedHashSet<Funcotation> existingFuncotationsToUpdate = txToFuncotations.getOrDefault(txId, new LinkedHashSet<>());
existingFuncotationsToUpdate.add(funcotation);
txToFuncotations.put(txId, existingFuncotationsToUpdate);
}

/**
* Get the key list (i.e. transcript IDs) in order.
*
* @return Never {@code null}
*/
public List<String> keyList() {
return new ArrayList<>(txToFuncotations.keySet());
}

/** Create a FuncotationMap where all Funcotations will be TableFuncotations. This is useful for parsing existing
* VCFs.
* Only renders for a single allele.
* See {@link FuncotatorUtils#extractFuncotatorKeysFromHeaderDescription(String)} for getting the funcotation keys.
*
* @param transcriptFieldName The field name to use for transcript IDs. Use {@see NO_TRANSCRIPT_AVAILABLE_KEY} if unknown.
* If not in the funcotation keys, then the Funcotation map will be created with one transcript ID, {@see NO_TRANSCRIPT_AVAILABLE_KEY}
* Never {@code null}
* @param funcotationKeys The ordered keys of the funcotation field. Never {@code null}
* @param funcotationAttributeForSingleAllele The funcotation attribute from a VCF. Never {@code null}
* @param altAllele The alternate allele for the created funcotations. Never {@code null}
* @param datasourceName The datasource name to use for all of the created funcotatinos. Never {@code null}
* @return a funcotation map. Note that no funcotations will be GencodeFuncotations. Never {@code null}
*/
public static FuncotationMap createAsAllTableFuncotationsFromVcf(final String transcriptFieldName, final String[] funcotationKeys,
final String funcotationAttributeForSingleAllele, final Allele altAllele,
final String datasourceName) {
Utils.nonNull(transcriptFieldName);
Utils.nonNull(funcotationKeys);
Utils.nonNull(funcotationAttributeForSingleAllele);
Utils.nonNull(altAllele);
Utils.nonNull(datasourceName);

final FuncotationMap result = createEmpty();
final String[] funcotationAttributeForSingleAlleleByTranscript = StringUtils.splitByWholeSeparator(funcotationAttributeForSingleAllele, "]#[");

for (final String funcotationAttribute : funcotationAttributeForSingleAlleleByTranscript) {
final String[] values = StringUtils.splitByWholeSeparatorPreserveAllTokens(funcotationAttribute, "|");
if (values[0].startsWith(VcfOutputRenderer.START_TRANSCRIPT_DELIMITER)) {
values[0] = values[0].replace(VcfOutputRenderer.START_TRANSCRIPT_DELIMITER, "");
}
if (values[values.length - 1].endsWith(VcfOutputRenderer.END_TRANSCRIPT_DELIMITER)) {
values[values.length - 1] = values[values.length - 1].replace(VcfOutputRenderer.END_TRANSCRIPT_DELIMITER, "");
}
if (values.length != funcotationKeys.length) {
logger.error("Keys: " + StringUtils.join(funcotationKeys, ", "));
logger.error("Values: " + StringUtils.join(values, ", "));
throw new GATKException.ShouldNeverReachHereException("Cannot parse the funcotation attribute. Num values: " + values.length + " Num keys: " + funcotationKeys.length);
}
final Map<String, String> simpleNameValuePairs = IntStream.range(0, values.length).boxed().collect(Collectors
.toMap(i -> funcotationKeys[i], i-> values[i]));

final List<String> valuesAsList = Arrays.asList(funcotationKeys).stream().map(k -> simpleNameValuePairs.get(k)).collect(Collectors.toList());
result.add(simpleNameValuePairs.getOrDefault(transcriptFieldName, NO_TRANSCRIPT_AVAILABLE_KEY), new TableFuncotation(Arrays.asList(funcotationKeys), valuesAsList, altAllele, datasourceName));

}
return result;
}

private static boolean areDuplicateTranscriptIDsFound(final List<GencodeFuncotation> gencodeFuncotations) {
return gencodeFuncotations.size() != new HashSet<>(gencodeFuncotations).size();
}
}
Loading

0 comments on commit 43c5098

Please sign in to comment.