From 21322db3abce729ae7369ccec796b08c2e41933b Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Tue, 30 Jan 2018 14:53:25 -0500 Subject: [PATCH] prevent sequence dictionary validation when aligning reads previously, tools that align reads required you to manually disable sequence dictionary validation if you didn't, they would fail because the unaligned bam didn't have the required sequence dictionary extracting out a SequenceDictionaryValidationArgumentCollection and providing a method for GATKSparkTools to configure it ReadsPipeline couldn't easily make use of this, so instead it overrides the method that does validation BwaSpark / BwaAndMarkDuplicatesPipelineSpark now do not require or allow dictionary validation fixes #4131 --- ...ictionaryValidationArgumentCollection.java | 42 +++++++++++++++++++ .../hellbender/engine/GATKTool.java | 18 ++++++-- .../engine/spark/GATKSparkTool.java | 21 +++++++--- .../hellbender/tools/spark/bwa/BwaSpark.java | 10 ++++- .../BwaAndMarkDuplicatesPipelineSpark.java | 6 +++ .../spark/pipelines/ReadsPipelineSpark.java | 8 ++++ ...onaryValidationArgumentCollectionTest.java | 37 ++++++++++++++++ ...uplicatesPipelineSparkIntegrationTest.java | 4 +- .../spark/bwa/BwaSparkIntegrationTest.java | 2 - .../ReadsPipelineSparkIntegrationTest.java | 10 +---- 10 files changed, 133 insertions(+), 25 deletions(-) create mode 100644 src/main/java/org/broadinstitute/hellbender/cmdline/argumentcollections/SequenceDictionaryValidationArgumentCollection.java create mode 100644 src/test/java/org/broadinstitute/hellbender/cmdline/argumentcollections/SequenceDictionaryValidationArgumentCollectionTest.java diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/argumentcollections/SequenceDictionaryValidationArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/cmdline/argumentcollections/SequenceDictionaryValidationArgumentCollection.java new file mode 100644 index 00000000000..b8ae6e4720d --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/cmdline/argumentcollections/SequenceDictionaryValidationArgumentCollection.java @@ -0,0 +1,42 @@ +package org.broadinstitute.hellbender.cmdline.argumentcollections; + +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; + +/** + * interface for argument collections that control how sequence dictionary validation should be handled + */ +public interface SequenceDictionaryValidationArgumentCollection { + + /** + * Should sequence dictionary validation be performed + * @return true if the tool should perform sequence dictionary validation + */ + boolean performSequenceDictionaryValidation(); + + + /** + * most tools will want to use this, it defaults to performing sequence dictionary validation but provides the option + * to disable it + */ + class StandardValidationCollection implements SequenceDictionaryValidationArgumentCollection { + @Argument(fullName = StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, shortName = StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, doc = "If specified, do not check the sequence dictionaries from our inputs for compatibility. Use at your own risk!", optional = true) + private boolean disableSequenceDictionaryValidation = false; + + @Override + public boolean performSequenceDictionaryValidation() { + return !disableSequenceDictionaryValidation; + } + } + + /** + * doesn't provide a configuration argument, and always returns false, useful for tools that do not want to perform + * sequence dictionary validation, like aligners + */ + class NoValidationCollection implements SequenceDictionaryValidationArgumentCollection { + @Override + public boolean performSequenceDictionaryValidation() { + return false; + } + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java b/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java index c9eee1182d7..001ce1a893f 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java @@ -21,6 +21,7 @@ import org.broadinstitute.hellbender.engine.filters.ReadFilter; import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary; import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter; +import org.broadinstitute.hellbender.cmdline.argumentcollections.SequenceDictionaryValidationArgumentCollection; import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.transformers.ReadTransformer; import org.broadinstitute.hellbender.utils.SequenceDictionaryUtils; @@ -66,8 +67,8 @@ public abstract class GATKTool extends CommandLineProgram { @Argument(fullName = SECONDS_BETWEEN_PROGRESS_UPDATES_NAME, shortName = SECONDS_BETWEEN_PROGRESS_UPDATES_NAME, doc = "Output traversal statistics every time this many seconds elapse", optional = true, common = true) private double secondsBetweenProgressUpdates = ProgressMeter.DEFAULT_SECONDS_BETWEEN_UPDATES; - @Argument(fullName = StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, shortName = StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, doc = "If specified, do not check the sequence dictionaries from our inputs for compatibility. Use at your own risk!", optional = true, common = true) - private boolean disableSequenceDictionaryValidation = false; + @ArgumentCollection + private SequenceDictionaryValidationArgumentCollection seqValidationArguments = getSequenceDictionaryValidationArgumentCollection(); @Argument(fullName=StandardArgumentDefinitions.CREATE_OUTPUT_BAM_INDEX_LONG_NAME, shortName=StandardArgumentDefinitions.CREATE_OUTPUT_BAM_INDEX_SHORT_NAME, @@ -452,6 +453,17 @@ public boolean requiresIntervals() { return false; } + + /** + * Get the {@link SequenceDictionaryValidationArgumentCollection} for the tool. + * Subclasses may override this method in order to customize validation options. + * + * @return a SequenceDictionaryValidationArgumentCollection + */ + protected SequenceDictionaryValidationArgumentCollection getSequenceDictionaryValidationArgumentCollection() { + return new SequenceDictionaryValidationArgumentCollection.StandardValidationCollection(); + } + /** * Load the master sequence dictionary as specified in {@code masterSequenceDictionaryFilename}. * Will only load the master sequence dictionary if it has not already been loaded. @@ -562,7 +574,7 @@ protected void onStartup() { initializeIntervals(); // Must be initialized after reference, reads and features, since intervals currently require a sequence dictionary from another data source - if ( ! disableSequenceDictionaryValidation ) { + if ( seqValidationArguments.performSequenceDictionaryValidation()) { validateSequenceDictionaries(); } diff --git a/src/main/java/org/broadinstitute/hellbender/engine/spark/GATKSparkTool.java b/src/main/java/org/broadinstitute/hellbender/engine/spark/GATKSparkTool.java index 91221c05781..c257bb18f72 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/spark/GATKSparkTool.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/spark/GATKSparkTool.java @@ -8,7 +8,6 @@ import org.broadinstitute.barclay.argparser.ArgumentCollection; import org.broadinstitute.barclay.argparser.CommandLinePluginDescriptor; import org.broadinstitute.hellbender.cmdline.GATKPlugin.GATKReadFilterPluginDescriptor; -import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.cmdline.argumentcollections.*; import org.broadinstitute.hellbender.engine.TraversalParameters; import org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource; @@ -84,8 +83,9 @@ public abstract class GATKSparkTool extends SparkCommandLineProgram { optional = true) protected long bamPartitionSplitSize = 0; - @Argument(fullName = StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, shortName = StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, doc = "If specified, do not check the sequence dictionaries from our inputs for compatibility. Use at your own risk!", optional = true) - private boolean disableSequenceDictionaryValidation = false; + + @ArgumentCollection + protected SequenceDictionaryValidationArgumentCollection sequenceDictionaryValidationArguments = getSequenceDictionaryValidationArgumentCollection(); @Argument(doc = "For tools that write an output, write the output in multiple pieces (shards)", fullName = SHARDED_OUTPUT_LONG_NAME, @@ -187,6 +187,14 @@ public SerializableFunction getReferenceWindowFunction return ReferenceWindowFunctions.IDENTITY_FUNCTION; } + /** + * subclasses can override this to provide different default behavior for sequence dictionary validation + * @return a SequenceDictionaryValidationArgumentCollection + */ + protected SequenceDictionaryValidationArgumentCollection getSequenceDictionaryValidationArgumentCollection() { + return new SequenceDictionaryValidationArgumentCollection.StandardValidationCollection(); + } + /** * Returns the "best available" sequence dictionary. This will be the reference sequence dictionary if * there is a reference, otherwise it will be the sequence dictionary constructed from the reads if @@ -391,7 +399,7 @@ public List getIntervals() { @Override protected void runPipeline( JavaSparkContext sparkContext ) { initializeToolInputs(sparkContext); - validateToolInputs(); + validateSequenceDictionaries(); runTool(sparkContext); } @@ -485,8 +493,8 @@ protected List editIntervals(final List rawInter /** * Validates standard tool inputs against each other. */ - private void validateToolInputs() { - if ( ! disableSequenceDictionaryValidation ) { + protected void validateSequenceDictionaries() { + if ( sequenceDictionaryValidationArguments.performSequenceDictionaryValidation() ) { // Validate the reference sequence dictionary against the reads sequence dictionary, if both are present, // using standard GATK validation settings (requiring a common subset of equivalent contigs without respect // to ordering). @@ -510,4 +518,5 @@ private void validateToolInputs() { * @param ctx our Spark context */ protected abstract void runTool( JavaSparkContext ctx ); + } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/bwa/BwaSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/bwa/BwaSpark.java index dd69ae4abc7..70df3334b23 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/bwa/BwaSpark.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/bwa/BwaSpark.java @@ -8,6 +8,7 @@ import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; import org.broadinstitute.barclay.help.DocumentedFeature; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.argumentcollections.SequenceDictionaryValidationArgumentCollection; import picard.cmdline.programgroups.ReadDataManipulationProgramGroup; import org.broadinstitute.hellbender.engine.filters.ReadFilter; import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary; @@ -22,8 +23,8 @@ import java.util.List; @DocumentedFeature -@CommandLineProgramProperties(summary = "Runs BWA", - oneLineSummary = "BWA on Spark", +@CommandLineProgramProperties(summary = "Align reads using BWA", + oneLineSummary = "Align reads to a given reference using BWA on Spark", programGroup = ReadDataManipulationProgramGroup.class) @BetaFeature public final class BwaSpark extends GATKSparkTool { @@ -54,6 +55,11 @@ public List getDefaultReadFilters() { return Arrays.asList(ReadFilterLibrary.PRIMARY_LINE, ReadFilterLibrary.SEQ_IS_STORED); } + @Override + public SequenceDictionaryValidationArgumentCollection getSequenceDictionaryValidationArgumentCollection(){ + return new SequenceDictionaryValidationArgumentCollection.NoValidationCollection(); + } + @Override protected void runTool(final JavaSparkContext ctx) { try ( final BwaSparkEngine bwaEngine = diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/BwaAndMarkDuplicatesPipelineSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/BwaAndMarkDuplicatesPipelineSpark.java index 035ec7988e8..303bcd81a74 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/BwaAndMarkDuplicatesPipelineSpark.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/BwaAndMarkDuplicatesPipelineSpark.java @@ -8,6 +8,7 @@ import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; import org.broadinstitute.barclay.help.DocumentedFeature; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.argumentcollections.SequenceDictionaryValidationArgumentCollection; import org.broadinstitute.hellbender.cmdline.argumentcollections.MarkDuplicatesSparkArgumentCollection; import org.broadinstitute.hellbender.utils.read.markduplicates.SerializableOpticalDuplicatesFinder; import picard.cmdline.programgroups.ReadDataManipulationProgramGroup; @@ -57,6 +58,11 @@ public final class BwaAndMarkDuplicatesPipelineSpark extends GATKSparkTool { protected MarkDuplicatesSparkArgumentCollection markDuplicatesSparkArgumentCollection = new MarkDuplicatesSparkArgumentCollection(); + @Override + protected SequenceDictionaryValidationArgumentCollection getSequenceDictionaryValidationArgumentCollection() { + return new SequenceDictionaryValidationArgumentCollection.NoValidationCollection(); + } + @Override protected void runTool(final JavaSparkContext ctx) { try (final BwaSparkEngine bwaEngine = new BwaSparkEngine(ctx, referenceArguments.getReferenceFileName(), bwaArgs.indexImageFile, getHeaderForReads(), getReferenceSequenceDictionary())) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/ReadsPipelineSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/ReadsPipelineSpark.java index 690afdf8d74..d1325ad54f3 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/ReadsPipelineSpark.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/ReadsPipelineSpark.java @@ -142,6 +142,14 @@ public SerializableFunction getReferenceWindowFunction return BaseRecalibrationEngine.BQSR_REFERENCE_WINDOW_FUNCTION; } + @Override + protected void validateSequenceDictionaries(){ + //don't validate unaligned reads because we don't require them to have a sequence dictionary + if( !align ){ + super.validateSequenceDictionaries(); + } + } + @Override protected void runTool(final JavaSparkContext ctx) { if (joinStrategy == JoinStrategy.BROADCAST && ! getReference().isCompatibleWithSparkBroadcast()){ diff --git a/src/test/java/org/broadinstitute/hellbender/cmdline/argumentcollections/SequenceDictionaryValidationArgumentCollectionTest.java b/src/test/java/org/broadinstitute/hellbender/cmdline/argumentcollections/SequenceDictionaryValidationArgumentCollectionTest.java new file mode 100644 index 00000000000..76b1065b9d8 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/cmdline/argumentcollections/SequenceDictionaryValidationArgumentCollectionTest.java @@ -0,0 +1,37 @@ +package org.broadinstitute.hellbender.cmdline.argumentcollections; + +import org.broadinstitute.barclay.argparser.ArgumentCollection; +import org.broadinstitute.barclay.argparser.CommandLineArgumentParser; +import org.broadinstitute.barclay.argparser.CommandLineParser; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.argumentcollections.SequenceDictionaryValidationArgumentCollection; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class SequenceDictionaryValidationArgumentCollectionTest { + + private static class StandardArgumentCollection { + @ArgumentCollection + public SequenceDictionaryValidationArgumentCollection standard = new SequenceDictionaryValidationArgumentCollection.StandardValidationCollection(); + } + + @Test + public void testStandardArgumentCollectionDefaultsToTrue(){ + Assert.assertTrue(new SequenceDictionaryValidationArgumentCollection.StandardValidationCollection().performSequenceDictionaryValidation()); + } + + @Test + public void testStandardArgumentCollectionCanBeDisabled(){ + final String[] disabled = {"--"+StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME}; + StandardArgumentCollection std = new StandardArgumentCollection(); + CommandLineParser clp = new CommandLineArgumentParser(std); + clp.parseArguments(System.out, disabled); + Assert.assertFalse(std.standard.performSequenceDictionaryValidation()); + } + + @Test + public void testNoValidationDefaultsToFalse(){ + Assert.assertFalse(new SequenceDictionaryValidationArgumentCollection.NoValidationCollection().performSequenceDictionaryValidation()); + } + +} \ No newline at end of file diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/bwa/BwaAndMarkDuplicatesPipelineSparkIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/bwa/BwaAndMarkDuplicatesPipelineSparkIntegrationTest.java index e149fd2ca58..c9da134feb1 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/bwa/BwaAndMarkDuplicatesPipelineSparkIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/bwa/BwaAndMarkDuplicatesPipelineSparkIntegrationTest.java @@ -1,7 +1,6 @@ package org.broadinstitute.hellbender.tools.spark.bwa; import org.broadinstitute.hellbender.CommandLineProgramTest; -import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.cmdline.argumentcollections.MarkDuplicatesSparkArgumentCollection; import org.broadinstitute.hellbender.tools.spark.pipelines.BwaAndMarkDuplicatesPipelineSpark; import org.broadinstitute.hellbender.utils.test.ArgumentsBuilder; @@ -34,8 +33,7 @@ public void test() throws Exception { args.addReference(ref); args.addInput(input); args.addOutput(output); - args.addBooleanArgument(StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, true); - args.add("--"+MarkDuplicatesSparkArgumentCollection.DO_NOT_MARK_UNMAPPED_MATES_LONG_NAME); + args.addBooleanArgument(MarkDuplicatesSparkArgumentCollection.DO_NOT_MARK_UNMAPPED_MATES_LONG_NAME, true); this.runCommandLine(args.getArgsArray()); SamAssertionUtils.assertSamsEqual(output, expectedSam); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/bwa/BwaSparkIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/bwa/BwaSparkIntegrationTest.java index ae495151857..7519c9bb2ea 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/bwa/BwaSparkIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/bwa/BwaSparkIntegrationTest.java @@ -30,7 +30,6 @@ public void testPairedEnd() throws Exception { final ArgumentsBuilder args = new ArgumentsBuilder(); args.addFileArgument(StandardArgumentDefinitions.REFERENCE_LONG_NAME, ref); args.addFileArgument(StandardArgumentDefinitions.INPUT_LONG_NAME, input); - args.addBooleanArgument(StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME, true); // disable since input does not have a sequence dictionary args.addBooleanArgument(GATKSparkTool.SHARDED_OUTPUT_LONG_NAME, true); args.addArgument(GATKSparkTool.NUM_REDUCERS_LONG_NAME,"1"); args.addOutput(output); @@ -52,7 +51,6 @@ public void testSingleEnd() throws Exception { final ArgumentsBuilder args = new ArgumentsBuilder(); args.addFileArgument(StandardArgumentDefinitions.REFERENCE_LONG_NAME, ref); args.addFileArgument(StandardArgumentDefinitions.INPUT_LONG_NAME, input); - args.addBooleanArgument(StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME , true); // disable since input does not have a sequence dictionary args.addBooleanArgument(GATKSparkTool.SHARDED_OUTPUT_LONG_NAME, true); args.addArgument(GATKSparkTool.NUM_REDUCERS_LONG_NAME,"1"); args.addOutput(output); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/ReadsPipelineSparkIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/ReadsPipelineSparkIntegrationTest.java index 3dd6ba2d649..3c752e8ed7c 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/ReadsPipelineSparkIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/ReadsPipelineSparkIntegrationTest.java @@ -38,14 +38,6 @@ private PipelineTest(String referenceURL, String bam, String outputExtension, St this.expectedVcfFileName = expectedVcfFileName; } - public String getCommandLine() { - return " -R " + referenceURL + - " -I " + bam + - " " + args + - (knownSites.isEmpty() ? "": " --known-sites " + knownSites) + - " -O %s"; - } - @Override public String toString() { return String.format("ReadsPipeline(bam='%s', args='%s')", bam, args); @@ -97,7 +89,7 @@ public Object[][] createReadsPipelineSparkTestData() { {new PipelineTest(GRCh37Ref2bit_chr2021, hiSeqBam_chr20, ".bam", dbSNPb37_20, "--join-strategy OVERLAPS_PARTITIONER --read-shard-padding 1000 --known-sites " + more20Sites, getResourceDir() + expectedMultipleKnownSites, getResourceDir() + expectedMultipleKnownSitesVcf)}, // BWA-MEM - {new PipelineTest(GRCh37Ref2bit_chr2021, unalignedBam, ".bam", dbSNPb37_20, "--align --bwa-mem-index-image " + GRCh37Ref_2021_img + " --disable-sequence-dictionary-validation true --join-strategy BROADCAST --known-sites " + more20Sites, null, largeFileTestDir + expectedMultipleKnownSitesFromUnalignedVcf)}, + {new PipelineTest(GRCh37Ref2bit_chr2021, unalignedBam, ".bam", dbSNPb37_20, "--align --bwa-mem-index-image " + GRCh37Ref_2021_img + " --join-strategy BROADCAST --known-sites " + more20Sites, null, largeFileTestDir + expectedMultipleKnownSitesFromUnalignedVcf)}, }; }