broadinstitute · jamesemery · May 9, 2018 · Apr 17, 2018 · Apr 25, 2018 · May 4, 2018
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/spark/GATKSparkTool.java b/src/main/java/org/broadinstitute/hellbender/engine/spark/GATKSparkTool.java
@@ -66,6 +66,7 @@ public abstract class GATKSparkTool extends SparkCommandLineProgram {
     public static final String BAM_PARTITION_SIZE_LONG_NAME = "bam-partition-size";
     public static final String NUM_REDUCERS_LONG_NAME = "num-reducers";
     public static final String SHARDED_OUTPUT_LONG_NAME = "sharded-output";
+    public static final String OUTPUT_SHARD_DIR_LONG_NAME = "output-shard-tmp-dir";
 
     @ArgumentCollection
     public final ReferenceInputArgumentCollection referenceArguments = requiresReference() ? new RequiredReferenceInputArgumentCollection() :  new OptionalReferenceInputArgumentCollection();
@@ -88,9 +89,16 @@ public abstract class GATKSparkTool extends SparkCommandLineProgram {
 
     @Argument(doc = "For tools that write an output, write the output in multiple pieces (shards)",
             fullName = SHARDED_OUTPUT_LONG_NAME,
-            optional = true)
+            optional = true,
+            mutex = {OUTPUT_SHARD_DIR_LONG_NAME})
     protected boolean shardedOutput = false;
 
+    @Argument(doc = "when writing a bam, in single sharded mode this directory to write the temporary intermediate output shards, if not specified .parts/ will be used",
+            fullName = OUTPUT_SHARD_DIR_LONG_NAME,
+            optional = true,
+            mutex = {SHARDED_OUTPUT_LONG_NAME})
+    protected String shardedPartsDir = null;
+
     @Argument(doc="For tools that shuffle data or write an output, sets the number of reducers. Defaults to 0, which gives one partition per 10MB of input.",
             fullName = NUM_REDUCERS_LONG_NAME,
             optional = true)
@@ -277,7 +285,7 @@ public void writeReads(final JavaSparkContext ctx, final String outputFile, Java
             ReadsSparkSink.writeReads(ctx, outputFile,
                     hasReference() ? referenceArguments.getReferencePath().toAbsolutePath().toUri().toString() : null,
                     reads, header, shardedOutput ? ReadsWriteFormat.SHARDED : ReadsWriteFormat.SINGLE,
-                    getRecommendedNumReducers());
+                    getRecommendedNumReducers(), shardedPartsDir);
         } catch (IOException e) {
             throw new UserException.CouldNotCreateOutputFile(outputFile,"writing failed", e);
         }

diff --git a/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSink.java b/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSink.java
@@ -19,6 +19,7 @@
 import org.bdgenomics.adam.models.RecordGroupDictionary;
 import org.bdgenomics.adam.models.SequenceDictionary;
 import org.bdgenomics.formats.avro.AlignmentRecord;
+import org.broadinstitute.hellbender.exceptions.GATKException;
 import org.broadinstitute.hellbender.exceptions.UserException;
 import org.broadinstitute.hellbender.tools.walkers.annotator.VariantAnnotatorEngine;
 import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
@@ -122,7 +123,7 @@ public SparkHeaderlessCRAMOutputFormat() {
     public static void writeReads(
             final JavaSparkContext ctx, final String outputFile, final String referenceFile, final JavaRDD<GATKRead> reads,
             final SAMFileHeader header, ReadsWriteFormat format) throws IOException {
-        writeReads(ctx, outputFile, referenceFile, reads, header, format, 0);
+        writeReads(ctx, outputFile, referenceFile, reads, header, format, 0, null);
     }
 
     /**
@@ -135,10 +136,11 @@ public static void writeReads(
      * @param format should the output be a single file, sharded, ADAM, etc.
      * @param numReducers the number of reducers to use when writing a single file. A value of zero indicates that the default
      *                    should be used.
+     * @param outputPartsDir directory for temporary files for SINGLE output format, should be null for default value of filename + .parts/
      */
     public static void writeReads(
             final JavaSparkContext ctx, final String outputFile, final String referenceFile, final JavaRDD<GATKRead> reads,
-            final SAMFileHeader header, ReadsWriteFormat format, final int numReducers) throws IOException {
+            final SAMFileHeader header, ReadsWriteFormat format, final int numReducers, final String outputPartsDir) throws IOException {
 
         SAMFormat samOutputFormat = IOUtils.isCramFileName(outputFile) ? SAMFormat.CRAM : SAMFormat.BAM;
 
@@ -155,10 +157,16 @@ public static void writeReads(
         final JavaRDD<SAMRecord> samReads = reads.map(read -> read.convertToSAMRecord(null));
 
         if (format == ReadsWriteFormat.SINGLE) {
-            writeReadsSingle(ctx, absoluteOutputFile, absoluteReferenceFile, samOutputFormat, samReads, header, numReducers);
+            writeReadsSingle(ctx, absoluteOutputFile, absoluteReferenceFile, samOutputFormat, samReads, header, numReducers, outputPartsDir);
         } else if (format == ReadsWriteFormat.SHARDED) {
+            if (outputPartsDir!=null) {
+                throw new  GATKException(String.format("You specified the bam output parts directory %s, but requested a sharded output format which does not use this option",outputPartsDir));
+            }
             saveAsShardedHadoopFiles(ctx, absoluteOutputFile, absoluteReferenceFile, samOutputFormat, samReads, header, true);
         } else if (format == ReadsWriteFormat.ADAM) {
+            if (outputPartsDir!=null) {
+                throw new  GATKException(String.format("You specified the bam output parts directory %s, but requested an ADAM output format which does not use this option",outputPartsDir));
+            }
             writeReadsADAM(ctx, absoluteOutputFile, samReads, header);
         }
     }
@@ -228,10 +236,10 @@ private static JavaRDD<SAMRecord> setHeaderForEachPartition(final JavaRDD<SAMRec
 
     private static void writeReadsSingle(
             final JavaSparkContext ctx, final String outputFile, final String referenceFile, final SAMFormat samOutputFormat, final JavaRDD<SAMRecord> reads,
-            final SAMFileHeader header, final int numReducers) throws IOException {
+            final SAMFileHeader header, final int numReducers, final String outputPartsDir) throws IOException {
 
         final JavaRDD<SAMRecord> sortedReads = SparkUtils.sortReads(reads, header, numReducers);
-        final String outputPartsDirectory = outputFile + ".parts/";
+        final String outputPartsDirectory = (outputPartsDir == null)? getDefaultPartsDirectory(outputFile)  : outputPartsDir;
         saveAsShardedHadoopFiles(ctx, outputPartsDirectory, referenceFile, samOutputFormat, sortedReads,  header, false);
         logger.info("Finished sorting the bam file and dumping read shards to disk, proceeding to merge the shards into a single file using the master thread");
         SAMFileMerger.mergeParts(outputPartsDirectory, outputFile, samOutputFormat, header);
@@ -301,4 +309,11 @@ private static void setHadoopBAMConfigurationProperties(final JavaSparkContext c
         }
     }
 
+    /**
+     * Returns the default parts directory for {@code file}: the given path string with ".parts/" appended (plain concatenation only).
+     */
+    public static String getDefaultPartsDirectory(String file) {
+        return file + ".parts/";
+    }
+
 }
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/pathseq/PathSeqBwaSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/pathseq/PathSeqBwaSpark.java
@@ -180,7 +180,7 @@ private void writeBam(final JavaRDD<GATKRead> reads, final String inputBamPath,
         try {
             ReadsSparkSink.writeReads(ctx, outputPath, bwaArgs.referencePath, reads, header,
                     shardedOutput ? ReadsWriteFormat.SHARDED : ReadsWriteFormat.SINGLE,
-                    PSUtils.pathseqGetRecommendedNumReducers(inputBamPath, numReducers, getTargetPartitionSize()));
+                    PSUtils.pathseqGetRecommendedNumReducers(inputBamPath, numReducers, getTargetPartitionSize()), shardedPartsDir);
         } catch (final IOException e) {
             throw new UserException.CouldNotCreateOutputFile(outputPath, "Writing failed", e);
         }

diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/pathseq/PathSeqPipelineSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/pathseq/PathSeqPipelineSpark.java
@@ -289,7 +289,7 @@ protected void runTool(final JavaSparkContext ctx) {
                 final int numPartitions = Math.max(1, (int) (numTotalReads / readsPerPartitionOutput));
                 final JavaRDD<GATKRead> readsFinalRepartitioned = readsFinal.coalesce(numPartitions, false);
                 ReadsSparkSink.writeReads(ctx, outputPath, null, readsFinalRepartitioned, header,
-                        shardedOutput ? ReadsWriteFormat.SHARDED : ReadsWriteFormat.SINGLE, numPartitions);
+                        shardedOutput ? ReadsWriteFormat.SHARDED : ReadsWriteFormat.SINGLE, numPartitions, shardedPartsDir);
             } catch (final IOException e) {
                 throw new UserException.CouldNotCreateOutputFile(outputPath, "writing failed", e);
             }

diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/pathseq/PathSeqScoreSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/pathseq/PathSeqScoreSpark.java
@@ -221,7 +221,7 @@ protected void runTool(final JavaSparkContext ctx) {
         if (outputPath != null) {
             try {
                 ReadsSparkSink.writeReads(ctx, outputPath, null, readsFinal, header,
-                        shardedOutput ? ReadsWriteFormat.SHARDED : ReadsWriteFormat.SINGLE, recommendedNumReducers);
+                        shardedOutput ? ReadsWriteFormat.SHARDED : ReadsWriteFormat.SINGLE, recommendedNumReducers, shardedPartsDir);
             } catch (final IOException e) {
                 throw new UserException.CouldNotCreateOutputFile(outputPath, "writing failed", e);
             }

diff --git a/...rg/broadinstitute/hellbender/tools/spark/pipelines/BwaAndMarkDuplicatesPipelineSpark.java b/...rg/broadinstitute/hellbender/tools/spark/pipelines/BwaAndMarkDuplicatesPipelineSpark.java
@@ -66,7 +66,7 @@ protected void runTool(final JavaSparkContext ctx) {
                         referenceArguments.getReferencePath().toAbsolutePath().toUri().toString(),
                         markedReads, bwaEngine.getHeader(),
                         shardedOutput ? ReadsWriteFormat.SHARDED : ReadsWriteFormat.SINGLE,
-                        getRecommendedNumReducers());
+                        getRecommendedNumReducers(), shardedPartsDir);
             } catch (IOException e) {
                 throw new GATKException("unable to write bam: " + e);
             }

diff --git a/...t/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSinkUnitTest.java b/...t/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSinkUnitTest.java
@@ -25,7 +25,9 @@
 import org.testng.annotations.DataProvider;
 import org.testng.annotations.Test;
 
+import java.io.BufferedWriter;
 import java.io.File;
+import java.io.FileWriter;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.util.ArrayList;
@@ -87,14 +89,43 @@ public Object[][] loadReadsADAM() {
     @Test(dataProvider = "loadReadsBAM", groups = "spark")
     public void readsSinkTest(String inputBam, String outputFileName, String referenceFile, String outputFileExtension) throws IOException {
         final File outputFile = createTempFile(outputFileName, outputFileExtension);
-        assertSingleShardedWritingWorks(inputBam, referenceFile, outputFile.getAbsolutePath());
+        assertSingleShardedWritingWorks(inputBam, referenceFile, outputFile.getAbsolutePath(), null);
+    }
+
+    @Test(dataProvider = "loadReadsBAM", groups = "spark")
+    public void testSpecifyPartsDir(String inputBam, String outputFileName, String referenceFile, String outputFileExtension) throws IOException, InterruptedException {
+        final File outputFile = createTempFile(outputFileName, outputFileExtension);
+        final File nonDefaultShardsDir = createTempDir(outputFileName + ".someOtherPlace");
+
+        final java.nio.file.Path defaultPartsDir = IOUtils.getPath(ReadsSparkSink.getDefaultPartsDirectory(outputFile.getAbsolutePath()));
+        final java.nio.file.Path subpath = defaultPartsDir.resolve("subpath");
+
+        try {
+            // Make a read-only directory where the default parts dir would live; argv form avoids whitespace splitting of the path
+            Files.createDirectory(defaultPartsDir);
+            Files.createFile(subpath);
+            Runtime.getRuntime().exec(new String[]{"chmod", "-R", "a-w", defaultPartsDir.toString()}).waitFor(); // wait: the write below must not race chmod
+
+            //assert it fails when writing to the default path
+            Assert.assertThrows(() -> assertSingleShardedWritingWorks(inputBam, referenceFile, outputFile.getAbsolutePath(), null));
+
+            //show this succeeds when specifying a different path for the parts directory
+            assertSingleShardedWritingWorks(inputBam, referenceFile, outputFile.getAbsolutePath(), nonDefaultShardsDir.getAbsolutePath());
+
+            // Test that the file wasn't deleted when spark cleared its temp directory
+            Assert.assertTrue(Files.exists(defaultPartsDir));
+        } finally {
+            // Restore write permission first (a read-only directory cannot be emptied), then remove it and wait for completion
+            Runtime.getRuntime().exec(new String[]{"chmod", "-R", "u+w", defaultPartsDir.toString()}).waitFor();
+            Runtime.getRuntime().exec(new String[]{"rm", "-r", defaultPartsDir.toString()}).waitFor();
+        }
     }
 
     @Test(dataProvider = "loadReadsBAM", groups = "spark")
     public void readsSinkHDFSTest(String inputBam, String outputFileName, String referenceFileName, String outputFileExtension) throws IOException {
         final String outputHDFSPath = MiniClusterUtils.getTempPath(cluster, outputFileName, outputFileExtension).toString();
         Assert.assertTrue(BucketUtils.isHadoopUrl(outputHDFSPath));
-        assertSingleShardedWritingWorks(inputBam, referenceFileName, outputHDFSPath);
+        assertSingleShardedWritingWorks(inputBam, referenceFileName, outputHDFSPath, null);
     }
 
     @Test(dataProvider = "loadReadsBAM", groups = "spark")
@@ -103,24 +134,24 @@ public void testWritingToAnExistingFileHDFS(String inputBam, String outputFileNa
         final FileSystem fs = outputPath.getFileSystem(new Configuration());
         Assert.assertTrue(fs.createNewFile(outputPath));
         Assert.assertTrue(fs.exists(outputPath));
-        assertSingleShardedWritingWorks(inputBam, referenceFileName, outputPath.toString());
+        assertSingleShardedWritingWorks(inputBam, referenceFileName, outputPath.toString(), null);
     }
 
     @Test(groups = "spark")
     public void testWritingToFileURL() throws IOException {
         String inputBam = testDataDir + "tools/BQSR/HiSeq.1mb.1RG.2k_lines.bam";
         String outputUrl = "file:///" + createTempFile("ReadsSparkSinkUnitTest1", ".bam").getAbsolutePath();
-        assertSingleShardedWritingWorks(inputBam, null, outputUrl);
+        assertSingleShardedWritingWorks(inputBam, null, outputUrl, null);
     }
 
-    private void assertSingleShardedWritingWorks(String inputBam, String referenceFile, String outputPath) throws IOException {
+    private void assertSingleShardedWritingWorks(String inputBam, String referenceFile, String outputPath, String outputPartsPath) throws IOException {
         JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
 
         ReadsSparkSource readSource = new ReadsSparkSource(ctx);
         JavaRDD<GATKRead> rddParallelReads = readSource.getParallelReads(inputBam, referenceFile);
         SAMFileHeader header = readSource.getHeader(inputBam, referenceFile);
 
-        ReadsSparkSink.writeReads(ctx, outputPath, referenceFile, rddParallelReads, header, ReadsWriteFormat.SINGLE);
+        ReadsSparkSink.writeReads(ctx, outputPath, referenceFile, rddParallelReads, header, ReadsWriteFormat.SINGLE, 0, outputPartsPath);
 
         // check that a splitting bai file is created
         if (IOUtils.isBamFileName(outputPath)) {