From 8c24baa8d9025b167adc7345288ba9dafe9c7731 Mon Sep 17 00:00:00 2001 From: Ted Sharpe Date: Wed, 29 Jun 2022 16:40:44 -0400 Subject: [PATCH] CondenseDepthEvidence tool --- .../tools/sv/CondenseDepthEvidence.java | 167 ++++++++++++++++++ .../utils/io/FeatureOutputStream.java | 2 +- .../CondenseDepthEvidenceIntegrationTest.java | 48 +++++ .../walkers/sv/printevidence/merged.rd.txt | 51 ++++++ .../sv/printevidence/no_condense.rd.txt | 7 + 5 files changed, 274 insertions(+), 1 deletion(-) create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/sv/CondenseDepthEvidence.java create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/sv/CondenseDepthEvidenceIntegrationTest.java create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/printevidence/merged.rd.txt create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/printevidence/no_condense.rd.txt diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/CondenseDepthEvidence.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/CondenseDepthEvidence.java new file mode 100644 index 00000000000..f6b77564967 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/CondenseDepthEvidence.java @@ -0,0 +1,167 @@ +package org.broadinstitute.hellbender.tools.sv; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.tribble.Feature; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.argparser.ExperimentalFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.utils.MathUtils; +import org.broadinstitute.hellbender.utils.codecs.FeatureOutputCodec; +import org.broadinstitute.hellbender.utils.codecs.FeatureOutputCodecFinder; +import org.broadinstitute.hellbender.utils.codecs.FeatureSink; + +/** + *

Combines adjacent intervals in DepthEvidence files.

+ * + *

Inputs

+ * + * + * + *

Output

+ * + * + * + *

Usage example

+ * + *
+ *     gatk CondenseDepthEvidence \
+ *       -F input.rd.txt.gz \
+ *       -O merged.rd.txt.gz
+ * 
+ * + * @author Ted Sharpe <tsharpe@broadinstitute.org> + */ +@CommandLineProgramProperties( + summary = "Merges adjacent DepthEvidence records.", + oneLineSummary = "Merges adjacent DepthEvidence records.", + programGroup = StructuralVariantDiscoveryProgramGroup.class +) +@ExperimentalFeature +public class CondenseDepthEvidence extends FeatureWalker { + public static final String INPUT_ARGNAME = "depth-evidence"; + public static final String COMPRESSION_LEVEL_ARGNAME = "compression-level"; + public static final String MAX_INTERVAL_SIZE_ARGNAME = "max-interval-size"; + public static final String MIN_INTERVAL_SIZE_ARGNAME = "min-interval-size"; + + @Argument( + doc = "DepthEvidence input file", + fullName = INPUT_ARGNAME, + shortName = StandardArgumentDefinitions.FEATURE_SHORT_NAME + ) + private GATKPath inputPath; + + @Argument( + doc = "Merged DepthEvidence output file", + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME + ) + private GATKPath outputPath; + + @Argument( + doc = "Maximum interval size", + fullName = MAX_INTERVAL_SIZE_ARGNAME, + optional = true + ) + private int maxIntervalLength = 1000; + + @Argument( + doc = "Minimum interval size", + fullName = MIN_INTERVAL_SIZE_ARGNAME, + optional = true + ) + private int minIntervalLength = 0; + + @Argument( + doc = "Output compression level", + fullName = COMPRESSION_LEVEL_ARGNAME, + minValue = 0, maxValue = 9, optional = true + ) + private int compressionLevel = 4; + + private FeatureSink outputSink; + private DepthEvidence accumulator; + + + @Override + public GATKPath getDrivingFeaturePath() { + return inputPath; + } + + @Override + protected boolean isAcceptableFeatureType( final Class featureType ) { + return featureType.equals(DepthEvidence.class); + } + + @Override + @SuppressWarnings("unchecked") + public void onTraversalStart() { + super.onTraversalStart(); + + if ( minIntervalLength > maxIntervalLength ) { + throw new UserException("Minimum interval length exceeds maximum interval length."); + } + + final FeatureOutputCodec> codec = + FeatureOutputCodecFinder.find(outputPath); + final Class codecFeatureClass = codec.getFeatureType(); + if ( !codecFeatureClass.equals(DepthEvidence.class) ) { + throw new UserException("Output file " + outputPath + " implies Feature subtype " + + codecFeatureClass.getSimpleName() + + ", but this tool expects to write DepthEvidence."); + } + final SVFeaturesHeader header = (SVFeaturesHeader)getDrivingFeaturesHeader(); + final SAMSequenceDictionary dict = + header.getDictionary() != null ? header.getDictionary() : getBestAvailableSequenceDictionary(); + outputSink = (FeatureSink)codec.makeSink(outputPath, dict, + header.getSampleNames(), compressionLevel); + } + + @Override + public void apply( final DepthEvidence feature, + final ReadsContext readsContext, + final ReferenceContext referenceContext, + final FeatureContext featureContext ) { + if ( accumulator == null ) { + accumulator = feature; + return; + } + final int intervalLength = accumulator.getLengthOnReference(); + if ( !isAdjacent(accumulator, feature) || intervalLength >= maxIntervalLength ) { + if ( intervalLength >= minIntervalLength ) { + outputSink.write(accumulator); + } + accumulator = feature; + return; + } + final int[] accumCounts = accumulator.getCounts(); + MathUtils.addToArrayInPlace(accumCounts, feature.getCounts()); + accumulator = new DepthEvidence(feature.getContig(), accumulator.getStart(), + feature.getEnd(), accumCounts); + } + + @Override + public Object onTraversalSuccess() { + super.onTraversalSuccess(); + if ( accumulator != null && accumulator.getLengthOnReference() >= minIntervalLength ) { + outputSink.write(accumulator); + } + outputSink.close(); + return null; + } + + private static boolean isAdjacent( final DepthEvidence ev1, final DepthEvidence ev2 ) { + return ev1.getContig().equals(ev2.getContig()) && ev1.getEnd() + 1 == ev2.getStart(); + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/utils/io/FeatureOutputStream.java b/src/main/java/org/broadinstitute/hellbender/utils/io/FeatureOutputStream.java index b4b0b0fba4d..c82a8384329 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/io/FeatureOutputStream.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/io/FeatureOutputStream.java @@ -108,11 +108,11 @@ public void write(final F feature) { @Override public void close() { try { + outputStream.close(); // do this first so that the timestamp on the index will be later if (indexCreator != null) { final Index index = indexCreator.finalizeIndex(locationAware.getPosition()); index.writeBasedOnFeaturePath(featurePath); } - outputStream.close(); } catch (final IOException e) { throw new GATKException("Error closing output", e); } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/CondenseDepthEvidenceIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/CondenseDepthEvidenceIntegrationTest.java new file mode 100644 index 00000000000..6c749582b2c --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/CondenseDepthEvidenceIntegrationTest.java @@ -0,0 +1,48 @@ +package org.broadinstitute.hellbender.tools.sv; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.util.Log; +import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; +import org.broadinstitute.hellbender.testutils.IntegrationTestSpec; +import org.testng.annotations.Test; + +import java.io.IOException; +import java.util.Collections; + +public class CondenseDepthEvidenceIntegrationTest extends CommandLineProgramTest { + public static final String printEvidenceTestDir = toolsTestDir + "walkers/sv/printevidence/"; + public static final String sequenceDict = largeFileTestDir + "human_g1k_v37.20.21.dict"; + + @Test + public void positiveTest() throws IOException { + final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); + argsBuilder.add(StandardArgumentDefinitions.VERBOSITY_NAME, Log.LogLevel.ERROR.name()); + argsBuilder.add(StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME, sequenceDict); + argsBuilder.add(CondenseDepthEvidence.MIN_INTERVAL_SIZE_ARGNAME, 101); + argsBuilder.add(StandardArgumentDefinitions.FEATURE_SHORT_NAME, printEvidenceTestDir + "output.test_hg38.rd.txt"); + argsBuilder.add(StandardArgumentDefinitions.OUTPUT_SHORT_NAME, "%s"); + final IntegrationTestSpec testSpec = + new IntegrationTestSpec(argsBuilder.getString(), + Collections.singletonList(printEvidenceTestDir + "merged.rd.txt")); + testSpec.setOutputFileExtension("rd.txt"); + testSpec.executeTest("positive test", this); + } + + @Test + public void negativeTest() throws IOException { + final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); + argsBuilder.add(StandardArgumentDefinitions.VERBOSITY_NAME, Log.LogLevel.ERROR.name()); + argsBuilder.add(StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME, sequenceDict); + argsBuilder.add(StandardArgumentDefinitions.FEATURE_SHORT_NAME, printEvidenceTestDir + "no_condense.rd.txt"); + argsBuilder.add(StandardArgumentDefinitions.OUTPUT_SHORT_NAME, "%s"); + + // output equal to input, because this file contains rows that can't be merged for various reasons + final IntegrationTestSpec testSpec = + new IntegrationTestSpec(argsBuilder.getString(), + Collections.singletonList(printEvidenceTestDir + "no_condense.rd.txt")); + testSpec.setOutputFileExtension("rd.txt"); + testSpec.executeTest("negative test", this); + } +} diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/printevidence/merged.rd.txt b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/printevidence/merged.rd.txt new file mode 100644 index 00000000000..76a3ba97cd0 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/printevidence/merged.rd.txt @@ -0,0 +1,51 @@ +#Chr Start End HG00096 HG00129 HG00140 +chr22 30499964 30500964 217 182 243 +chr22 30500964 30501964 242 175 218 +chr22 30501964 30502964 244 220 261 +chr22 30502964 30503964 261 223 243 +chr22 30503964 30504964 224 204 234 +chr22 30504964 30505964 313 217 307 +chr22 30505964 30506964 241 197 247 +chr22 30506964 30507964 240 226 238 +chr22 30507964 30508964 295 240 262 +chr22 30508964 30509964 234 179 206 +chr22 30509964 30510964 212 214 250 +chr22 30510964 30511964 243 210 247 +chr22 30511964 30512964 263 206 231 +chr22 30512964 30513964 235 202 231 +chr22 30513964 30514964 286 237 290 +chr22 30514964 30515964 252 208 241 +chr22 30515964 30516964 192 203 200 +chr22 30516964 30517964 253 185 225 +chr22 30517964 30518964 226 188 243 +chr22 30518964 30519964 208 166 204 +chr22 30519964 30520964 253 230 202 +chr22 30520964 30521964 252 279 257 +chr22 30521964 30522964 229 189 229 +chr22 30522964 30523964 234 201 235 +chr22 30523964 30524964 270 217 215 +chr22 30524964 30525964 313 239 267 +chr22 30525964 30526964 301 202 245 +chr22 30526964 30527964 206 237 223 +chr22 30527964 30528964 260 203 272 +chr22 30528964 30529964 248 195 295 +chr22 30529964 30530964 239 193 241 +chr22 30530964 30531964 267 236 265 +chr22 30531964 30532964 306 238 317 +chr22 30532964 30533964 283 238 274 +chr22 30533964 30534964 243 177 294 +chr22 30534964 30535964 257 220 287 +chr22 30535964 30536964 229 204 210 +chr22 30536964 30537964 275 210 275 +chr22 30537964 30538964 201 227 193 +chr22 30538964 30539964 243 219 234 +chr22 30539964 30540964 263 213 250 +chr22 30540964 30541964 270 242 261 +chr22 30541964 30542964 281 197 254 +chr22 30542964 30543964 271 197 249 +chr22 30543964 30544964 233 226 246 +chr22 30544964 30545964 245 210 233 +chr22 30545964 30546964 218 149 229 +chr22 30546964 30547964 250 165 287 +chr22 30547964 30548964 241 215 243 +chr22 30548964 30549964 262 248 285 diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/printevidence/no_condense.rd.txt b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/printevidence/no_condense.rd.txt new file mode 100644 index 00000000000..86b5ed4f2fb --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/printevidence/no_condense.rd.txt @@ -0,0 +1,7 @@ +#Chr Start End HG00096 HG00129 HG00140 +chr20 30499964 30500964 217 182 243 +chr20 30500963 30501964 242 175 218 +chr20 30501965 30502964 244 220 261 +chr21 30502964 30503964 261 223 243 +chr21 30504000 30505000 261 223 243 +chr21 30505000 30506000 261 223 243