Skip to content

Commit

Permalink
prevent sequence dictionary validation when aligning reads
Browse files Browse the repository at this point in the history
previously, tools that align reads required you to manually disable sequence dictionary validation
if you didn't, they would fail because the unaligned bam didn't have the required sequence dictionary

extracting out a SequenceDictionaryValidationArgumentCollection and providing a method for GATKSparkTools to configure it
ReadsPipeline couldn't easily make use of this, so instead it overrides the method that does validation

BwaSpark / BwaAndMarkDuplicatesPipelineSpark now do not require or allow dictionary validation
fixes #4131
  • Loading branch information
lbergelson committed May 9, 2018
1 parent 869292b commit 3195b02
Show file tree
Hide file tree
Showing 10 changed files with 122 additions and 25 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package org.broadinstitute.hellbender.cmdline.argumentcollections;

import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;

/**
* interface for argument collections that control
*/
public interface SequenceDictionaryValidationArgumentCollection {

/**
* Should sequence dictionary validation be performed
* @return true if the tool should perform sequence dictionary validation
*/
boolean performSequenceDictionaryValidation();


/**
* most tools will want to use this, it defaults to performing sequence dictionary validation but provides the option
* to disable it
*/
class StandardValidationCollection implements SequenceDictionaryValidationArgumentCollection {
@Argument(fullName = StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, shortName = StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, doc = "If specified, do not check the sequence dictionaries from our inputs for compatibility. Use at your own risk!", optional = true)
private boolean disableSequenceDictionaryValidation = false;

@Override
public boolean performSequenceDictionaryValidation() {
return !disableSequenceDictionaryValidation;
}
}

/**
* doesn't provide a configuration argument, and always returns false, useful for tools that do not want to perform
* sequence dictionary validation, like aligners
*/
class NoValidationCollection implements SequenceDictionaryValidationArgumentCollection {
@Override
public boolean performSequenceDictionaryValidation() {
return false;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter;
import org.broadinstitute.hellbender.cmdline.argumentcollections.SequenceDictionaryValidationArgumentCollection;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.transformers.ReadTransformer;
import org.broadinstitute.hellbender.utils.SequenceDictionaryUtils;
Expand Down Expand Up @@ -66,8 +67,8 @@ public abstract class GATKTool extends CommandLineProgram {
@Argument(fullName = SECONDS_BETWEEN_PROGRESS_UPDATES_NAME, shortName = SECONDS_BETWEEN_PROGRESS_UPDATES_NAME, doc = "Output traversal statistics every time this many seconds elapse", optional = true, common = true)
private double secondsBetweenProgressUpdates = ProgressMeter.DEFAULT_SECONDS_BETWEEN_UPDATES;

@Argument(fullName = StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, shortName = StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, doc = "If specified, do not check the sequence dictionaries from our inputs for compatibility. Use at your own risk!", optional = true, common = true)
private boolean disableSequenceDictionaryValidation = false;
@ArgumentCollection
SequenceDictionaryValidationArgumentCollection seqValidationArguments = new SequenceDictionaryValidationArgumentCollection.StandardValidationCollection();

@Argument(fullName=StandardArgumentDefinitions.CREATE_OUTPUT_BAM_INDEX_LONG_NAME,
shortName=StandardArgumentDefinitions.CREATE_OUTPUT_BAM_INDEX_SHORT_NAME,
Expand Down Expand Up @@ -558,7 +559,7 @@ protected void onStartup() {

initializeIntervals(); // Must be initialized after reference, reads and features, since intervals currently require a sequence dictionary from another data source

if ( ! disableSequenceDictionaryValidation ) {
if ( seqValidationArguments.performSequenceDictionaryValidation()) {
validateSequenceDictionaries();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLinePluginDescriptor;
import org.broadinstitute.hellbender.cmdline.GATKPlugin.GATKReadFilterPluginDescriptor;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.argumentcollections.*;
import org.broadinstitute.hellbender.engine.TraversalParameters;
import org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource;
Expand Down Expand Up @@ -83,8 +82,9 @@ public abstract class GATKSparkTool extends SparkCommandLineProgram {
optional = true)
protected long bamPartitionSplitSize = 0;

@Argument(fullName = StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, shortName = StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, doc = "If specified, do not check the sequence dictionaries from our inputs for compatibility. Use at your own risk!", optional = true)
private boolean disableSequenceDictionaryValidation = false;

@ArgumentCollection
protected SequenceDictionaryValidationArgumentCollection sequenceDictionaryValidationArguments = getSequenceDictionaryValidationArgumentCollection();

@Argument(doc = "For tools that write an output, write the output in multiple pieces (shards)",
fullName = SHARDED_OUTPUT_LONG_NAME,
Expand Down Expand Up @@ -179,6 +179,14 @@ public SerializableFunction<GATKRead, SimpleInterval> getReferenceWindowFunction
return ReferenceWindowFunctions.IDENTITY_FUNCTION;
}

/**
* subclasses can override this to provide different default behavior for sequence dictionary validation
* @return a SequenceDictionaryValidationArgumentCollection
*/
protected SequenceDictionaryValidationArgumentCollection getSequenceDictionaryValidationArgumentCollection() {
return new SequenceDictionaryValidationArgumentCollection.StandardValidationCollection();
}

/**
* Returns the "best available" sequence dictionary. This will be the reference sequence dictionary if
* there is a reference, otherwise it will be the sequence dictionary constructed from the reads if
Expand Down Expand Up @@ -383,7 +391,7 @@ public List<SimpleInterval> getIntervals() {
@Override
protected void runPipeline( JavaSparkContext sparkContext ) {
initializeToolInputs(sparkContext);
validateToolInputs();
validateSequenceDictionaries();
runTool(sparkContext);
}

Expand Down Expand Up @@ -477,8 +485,8 @@ protected List<SimpleInterval> editIntervals(final List<SimpleInterval> rawInter
/**
* Validates standard tool inputs against each other.
*/
private void validateToolInputs() {
if ( ! disableSequenceDictionaryValidation ) {
protected void validateSequenceDictionaries() {
if ( sequenceDictionaryValidationArguments.performSequenceDictionaryValidation() ) {
// Validate the reference sequence dictionary against the reads sequence dictionary, if both are present,
// using standard GATK validation settings (requiring a common subset of equivalent contigs without respect
// to ordering).
Expand All @@ -502,4 +510,5 @@ private void validateToolInputs() {
* @param ctx our Spark context
*/
protected abstract void runTool( JavaSparkContext ctx );

}
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.argumentcollections.SequenceDictionaryValidationArgumentCollection;
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
Expand All @@ -22,8 +23,8 @@
import java.util.List;

@DocumentedFeature
@CommandLineProgramProperties(summary = "Runs BWA",
oneLineSummary = "BWA on Spark",
@CommandLineProgramProperties(summary = "align reads using BWA",
oneLineSummary = "align reads to a given reference using BWA on Spark",
programGroup = ReadDataManipulationProgramGroup.class)
@BetaFeature
public final class BwaSpark extends GATKSparkTool {
Expand Down Expand Up @@ -54,6 +55,11 @@ public List<ReadFilter> getDefaultReadFilters() {
return Arrays.asList(ReadFilterLibrary.PRIMARY_LINE, ReadFilterLibrary.SEQ_IS_STORED);
}

@Override
public SequenceDictionaryValidationArgumentCollection getSequenceDictionaryValidationArgumentCollection(){
return new SequenceDictionaryValidationArgumentCollection.NoValidationCollection();
}

@Override
protected void runTool(final JavaSparkContext ctx) {
try ( final BwaSparkEngine bwaEngine =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.argumentcollections.SequenceDictionaryValidationArgumentCollection;
import org.broadinstitute.hellbender.cmdline.argumentcollections.MarkDuplicatesSparkArgumentCollection;
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
Expand Down Expand Up @@ -56,6 +57,11 @@ public final class BwaAndMarkDuplicatesPipelineSpark extends GATKSparkTool {
protected MarkDuplicatesSparkArgumentCollection markDuplicatesSparkArgumentCollection = new MarkDuplicatesSparkArgumentCollection();


@Override
protected SequenceDictionaryValidationArgumentCollection getSequenceDictionaryValidationArgumentCollection() {
return new SequenceDictionaryValidationArgumentCollection.NoValidationCollection();
}

@Override
protected void runTool(final JavaSparkContext ctx) {
try (final BwaSparkEngine bwaEngine = new BwaSparkEngine(ctx, referenceArguments.getReferenceFileName(), bwaArgs.indexImageFile, getHeaderForReads(), getReferenceSequenceDictionary())) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,14 @@ public SerializableFunction<GATKRead, SimpleInterval> getReferenceWindowFunction
return BaseRecalibrationEngine.BQSR_REFERENCE_WINDOW_FUNCTION;
}

@Override
protected void validateSequenceDictionaries(){
//don't validate unaligned reads because we don't require them to have a sequence dictionary
if( !align ){
super.validateSequenceDictionaries();
}
}

@Override
protected void runTool(final JavaSparkContext ctx) {
if (joinStrategy == JoinStrategy.BROADCAST && ! getReference().isCompatibleWithSparkBroadcast()){
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package org.broadinstitute.hellbender.cmdline.argumentcollections;

import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLineArgumentParser;
import org.broadinstitute.barclay.argparser.CommandLineParser;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.argumentcollections.SequenceDictionaryValidationArgumentCollection;
import org.testng.Assert;
import org.testng.annotations.Test;

public class SequenceDictionaryValidationArgumentCollectionTest {

private static class StandardArgumentCollection {
@ArgumentCollection
public SequenceDictionaryValidationArgumentCollection standard = new SequenceDictionaryValidationArgumentCollection.StandardValidationCollection();
}

@Test
public void testStandardArgumentCollectionDefaultsToTrue(){
Assert.assertTrue(new SequenceDictionaryValidationArgumentCollection.StandardValidationCollection().performSequenceDictionaryValidation());
}

@Test
public void testStandardArgumentCollectionCanBeDisabled(){
final String[] disabled = {"--"+StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME};
StandardArgumentCollection std = new StandardArgumentCollection();
CommandLineParser clp = new CommandLineArgumentParser(std);
clp.parseArguments(System.out, disabled);
Assert.assertFalse(std.standard.performSequenceDictionaryValidation());
}

@Test
public void testNoValidationDefaultsToFalse(){
Assert.assertFalse(new SequenceDictionaryValidationArgumentCollection.NoValidationCollection().performSequenceDictionaryValidation());
}

}
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package org.broadinstitute.hellbender.tools.spark.bwa;

import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.argumentcollections.MarkDuplicatesSparkArgumentCollection;
import org.broadinstitute.hellbender.tools.spark.pipelines.BwaAndMarkDuplicatesPipelineSpark;
import org.broadinstitute.hellbender.utils.test.ArgumentsBuilder;
Expand Down Expand Up @@ -34,8 +33,7 @@ public void test() throws Exception {
args.addReference(ref);
args.addInput(input);
args.addOutput(output);
args.addBooleanArgument(StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, true);
args.add("--"+MarkDuplicatesSparkArgumentCollection.DO_NOT_MARK_UNMAPPED_MATES_LONG_NAME);
args.addBooleanArgument(MarkDuplicatesSparkArgumentCollection.DO_NOT_MARK_UNMAPPED_MATES_LONG_NAME, true);
this.runCommandLine(args.getArgsArray());

SamAssertionUtils.assertSamsEqual(output, expectedSam);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ public void testPairedEnd() throws Exception {
final ArgumentsBuilder args = new ArgumentsBuilder();
args.addFileArgument(StandardArgumentDefinitions.REFERENCE_LONG_NAME, ref);
args.addFileArgument(StandardArgumentDefinitions.INPUT_LONG_NAME, input);
args.addBooleanArgument(StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, true); // disable since input does not have a sequence dictionary
args.addBooleanArgument(GATKSparkTool.SHARDED_OUTPUT_LONG_NAME, true);
args.addArgument(GATKSparkTool.NUM_REDUCERS_LONG_NAME,"1");
args.addOutput(output);
Expand All @@ -52,7 +51,6 @@ public void testSingleEnd() throws Exception {
final ArgumentsBuilder args = new ArgumentsBuilder();
args.addFileArgument(StandardArgumentDefinitions.REFERENCE_LONG_NAME, ref);
args.addFileArgument(StandardArgumentDefinitions.INPUT_LONG_NAME, input);
args.addBooleanArgument(StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME , true); // disable since input does not have a sequence dictionary
args.addBooleanArgument(GATKSparkTool.SHARDED_OUTPUT_LONG_NAME, true);
args.addArgument(GATKSparkTool.NUM_REDUCERS_LONG_NAME,"1");
args.addOutput(output);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,6 @@ private PipelineTest(String referenceURL, String bam, String outputExtension, St
this.expectedVcfFileName = expectedVcfFileName;
}

public String getCommandLine() {
return " -R " + referenceURL +
" -I " + bam +
" " + args +
(knownSites.isEmpty() ? "": " --known-sites " + knownSites) +
" -O %s";
}

@Override
public String toString() {
return String.format("ReadsPipeline(bam='%s', args='%s')", bam, args);
Expand Down Expand Up @@ -97,7 +89,7 @@ public Object[][] createReadsPipelineSparkTestData() {
{new PipelineTest(GRCh37Ref2bit_chr2021, hiSeqBam_chr20, ".bam", dbSNPb37_20, "--join-strategy OVERLAPS_PARTITIONER --read-shard-padding 1000 --known-sites " + more20Sites, getResourceDir() + expectedMultipleKnownSites, getResourceDir() + expectedMultipleKnownSitesVcf)},

// BWA-MEM
{new PipelineTest(GRCh37Ref2bit_chr2021, unalignedBam, ".bam", dbSNPb37_20, "--align --bwa-mem-index-image " + GRCh37Ref_2021_img + " --disable-sequence-dictionary-validation true --join-strategy BROADCAST --known-sites " + more20Sites, null, largeFileTestDir + expectedMultipleKnownSitesFromUnalignedVcf)},
{new PipelineTest(GRCh37Ref2bit_chr2021, unalignedBam, ".bam", dbSNPb37_20, "--align --bwa-mem-index-image " + GRCh37Ref_2021_img + " --join-strategy BROADCAST --known-sites " + more20Sites, null, largeFileTestDir + expectedMultipleKnownSitesFromUnalignedVcf)},
};
}

Expand Down

0 comments on commit 3195b02

Please sign in to comment.