Commit 9c44b25: gitignore merge
Schaudge committed May 24, 2024
2 parents 7a1cccb + 4ed93fe
Showing 19 changed files with 371 additions and 84 deletions.
9 changes: 5 additions & 4 deletions .github/actions/upload-gatk-test-results/action.yml
@@ -40,9 +40,10 @@ runs:
name: test-results-${{ inputs.is-docker == 'true' && 'docker-' || '' }}${{ matrix.Java }}-${{ matrix.testType }}
path: build/reports/tests

- name: Upload to codecov
run: bash <(curl -s https://raw.githubusercontent.com/broadinstitute/codecov-bash-uploader/main/codecov-verified.bash)
shell: bash
# Disabling codecov because it is timing out and failing builds that otherwise succeed.
## - name: Upload to codecov
## run: bash <(curl -s https://raw.githubusercontent.com/broadinstitute/codecov-bash-uploader/main/codecov-verified.bash)
## shell: bash

- name: Upload Reports
if: ${{ inputs.only-artifact != 'true' }}
@@ -91,4 +92,4 @@ runs:
run: |
pip install --user PyGithub;
python scripts/github_actions/Reporter.py ${{ steps.uploadreports.outputs.view_url }};
shell: bash
shell: bash
2 changes: 2 additions & 0 deletions .gitignore
@@ -45,4 +45,6 @@ funcotator_tmp
#Test generated dot files
test*.dot

.vscode/

check
5 changes: 5 additions & 0 deletions .sapient/mock_preferences.json
@@ -0,0 +1,5 @@
{
"note" : "Please restart the plugin after making changes to the lists below. If you want to mock files of any package, please add the package to packagesToMock list ex org.apache.commons. If you don't want to mock files of any package, please add the package to packagesToNotMock list ex com.google.gson Please make sure the json is a valid json, or it will revert to default list of packages.",
"classesAndPackagesNotToMock" : [ ],
"classesAndPackagesToMock" : [ ]
}
46 changes: 21 additions & 25 deletions Dockerfile
@@ -5,12 +5,14 @@ FROM ${BASE_DOCKER} AS gradleBuild
LABEL stage=gatkIntermediateBuildImage
ARG RELEASE=false

RUN ls .

ADD . /gatk
WORKDIR /gatk

# Get an updated gcloud signing key, in case the one in the base image has expired
RUN rm /etc/apt/sources.list.d/google-cloud-sdk.list && \
#Download only resources required for the build, not for testing
RUN ls . && \
rm /etc/apt/sources.list.d/google-cloud-sdk.list && \
apt update &&\
apt-key list && \
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \
@@ -19,16 +21,13 @@ RUN rm /etc/apt/sources.list.d/google-cloud-sdk.list && \
apt-get -y clean && \
apt-get -y autoclean && \
apt-get -y autoremove && \
rm -rf /var/lib/apt/lists/*
RUN git lfs install --force

#Download only resources required for the build, not for testing
RUN git lfs pull --include src/main/resources/large

RUN export GRADLE_OPTS="-Xmx4048m -Dorg.gradle.daemon=false" && /gatk/gradlew clean collectBundleIntoDir shadowTestClassJar shadowTestJar -Drelease=$RELEASE
RUN cp -r $( find /gatk/build -name "*bundle-files-collected" )/ /gatk/unzippedJar/
RUN unzip -o -j $( find /gatk/unzippedJar -name "gatkPython*.zip" ) -d /gatk/unzippedJar/scripts
RUN chmod -R a+rw /gatk/unzippedJar
rm -rf /var/lib/apt/lists/* && \
git lfs install --force && \
git lfs pull --include src/main/resources/large && \
export GRADLE_OPTS="-Xmx4048m -Dorg.gradle.daemon=false" && /gatk/gradlew clean collectBundleIntoDir shadowTestClassJar shadowTestJar -Drelease=$RELEASE && \
cp -r $( find /gatk/build -name "*bundle-files-collected" )/ /gatk/unzippedJar/ && \
unzip -o -j $( find /gatk/unzippedJar -name "gatkPython*.zip" ) -d /gatk/unzippedJar/scripts && \
chmod -R a+rw /gatk/unzippedJar

FROM ${BASE_DOCKER}

@@ -47,17 +46,17 @@ RUN chmod -R a+rw /gatk
COPY --from=gradleBuild /gatk/unzippedJar .

#Setup linked jars that may be needed for running gatk
RUN ln -s $( find /gatk -name "gatk*local.jar" ) gatk.jar
RUN ln -s $( find /gatk -name "gatk*local.jar" ) /root/gatk.jar
RUN ln -s $( find /gatk -name "gatk*spark.jar" ) gatk-spark.jar
RUN ln -s $( find /gatk -name "gatk*local.jar" ) gatk.jar && \
ln -s $( find /gatk -name "gatk*local.jar" ) /root/gatk.jar && \
ln -s $( find /gatk -name "gatk*spark.jar" ) gatk-spark.jar

WORKDIR /root

# Make sure we can see a help message
RUN java -jar gatk.jar -h
RUN mkdir /gatkCloneMountPoint
RUN mkdir /jars
RUN mkdir .gradle
RUN java -jar gatk.jar -h && \
mkdir /gatkCloneMountPoint && \
mkdir /jars && \
mkdir .gradle

WORKDIR /gatk

@@ -80,15 +79,12 @@ RUN echo "source activate gatk" > /root/run_unit_tests.sh && \
echo "ln -s /gatkCloneMountPoint/build/ /gatkCloneMountPoint/scripts/docker/build" >> /root/run_unit_tests.sh && \
echo "cd /gatk/ && /gatkCloneMountPoint/gradlew -Dfile.encoding=UTF-8 -b /gatkCloneMountPoint/dockertest.gradle testOnPackagedReleaseJar jacocoTestReportOnPackagedReleaseJar -a -p /gatkCloneMountPoint" >> /root/run_unit_tests.sh

WORKDIR /root
RUN cp -r /root/run_unit_tests.sh /gatk
RUN cp -r gatk.jar /gatk
ENV CLASSPATH /gatk/gatk.jar:$CLASSPATH
RUN cp -r /root/run_unit_tests.sh /gatk && \
cp -r /root/gatk.jar /gatk
ENV CLASSPATH=/gatk/gatk.jar:$CLASSPATH PATH=$CONDA_PATH/envs/gatk/bin:$CONDA_PATH/bin:$PATH

# Start GATK Python environment

WORKDIR /gatk
ENV PATH $CONDA_PATH/envs/gatk/bin:$CONDA_PATH/bin:$PATH
RUN conda env create -n gatk -f /gatk/gatkcondaenv.yml && \
echo "source activate gatk" >> /gatk/gatkenv.rc && \
echo "source /gatk/gatk-completion.sh" >> /gatk/gatkenv.rc && \
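Note on the Dockerfile restructuring above: the gcloud key refresh, git-lfs setup, Gradle build, and jar unpacking now chain into single RUN instructions instead of one RUN per command, which cuts the layer count in both the intermediate build stage and the final image. A sketch of building the image under these changes; the base image tag below is an assumption, not something this commit pins:

    # Illustrative build; the BASE_DOCKER value is a placeholder tag.
    docker build \
        --build-arg BASE_DOCKER=broadinstitute/gatkbase:latest \
        --build-arg RELEASE=false \
        -t gatk:local .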
2 changes: 1 addition & 1 deletion build.gradle
@@ -63,7 +63,7 @@ final barclayVersion = System.getProperty('barclay.version','5.0.0')
final sparkVersion = System.getProperty('spark.version', '3.5.0')
final hadoopVersion = System.getProperty('hadoop.version', '3.3.6')
final disqVersion = System.getProperty('disq.version','0.3.8')
final genomicsdbVersion = System.getProperty('genomicsdb.version','1.5.2')
final genomicsdbVersion = System.getProperty('genomicsdb.version','1.5.3')
final bigQueryVersion = System.getProperty('bigQuery.version', '2.35.0')
final bigQueryStorageVersion = System.getProperty('bigQueryStorage.version', '2.47.0')
final guavaVersion = System.getProperty('guava.version', '32.1.3-jre')
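Because the build reads each dependency version through System.getProperty with the literal as only a default, the GenomicsDB bump to 1.5.3 remains overridable per invocation. A sketch, with the task name chosen only for illustration:

    # Pin GenomicsDB back to 1.5.2 for one build, e.g. to bisect a regression.
    ./gradlew test -Dgenomicsdb.version=1.5.2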
41 changes: 13 additions & 28 deletions scripts/docker/gatkbase/Dockerfile
@@ -3,10 +3,14 @@
# NOTE: If you update the ubuntu version make sure to update the samtools/bcftools/bedtools versions in the README
FROM ubuntu:22.04

# Set environment variables.
# Avoid interactive prompts during apt installs/upgrades
ENV DEBIAN_FRONTEND noninteractive
ENV DEBIAN_FRONTEND="noninteractive" HOME="/root" JAVA_LIBRARY_PATH="/usr/lib/jni" DOWNLOAD_DIR="/downloads" CONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-py310_23.10.0-1-Linux-x86_64.sh" CONDA_SHA256="c7a34df472feb69805b64df6e8db58363c5ccab41cd3b40b07e3e6dfb924359a" CONDA_PATH="/opt/miniconda" PATH="/opt/miniconda/bin:$PATH"

# Define working directory.
WORKDIR /root

#### Basic image utilities
#### Basic image utilities, google cloud support, and miniconda
RUN apt update && \
apt full-upgrade -y && \
apt install -y --no-install-recommends \
@@ -32,12 +36,9 @@ RUN apt update && \
apt -y clean && \
apt -y autoclean && \
apt -y autoremove && \
rm -rf /var/lib/apt/lists/*

RUN java -version

#### Specific for google cloud support
RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" \
rm -rf /var/lib/apt/lists/* && \
java -version && \
echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" \
| tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg \
| apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \
apt update -y && \
@@ -49,26 +50,8 @@ RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.c
# Remove the anthos cli tool and related files since they are very large and we / anyone using the docker are unlikely to use them
# Remove the bundled python because we have python installed separately
rm -rf /usr/lib/google-cloud-sdk/bin/anthoscli /usr/lib/google-cloud-sdk/platform/anthoscli_licenses /usr/lib/google-cloud-sdk/platform/bundledpythonunix && \
find / -wholename "*__pycache__/*.pyc" -exec rm {} +

# Set environment variables.
ENV HOME /root

# Define working directory.
WORKDIR /root

# Define default command.
CMD ["bash"]

ENV JAVA_LIBRARY_PATH /usr/lib/jni

# Install miniconda
ENV DOWNLOAD_DIR /downloads
ENV CONDA_URL https://repo.anaconda.com/miniconda/Miniconda3-py310_23.10.0-1-Linux-x86_64.sh
ENV CONDA_SHA256 "c7a34df472feb69805b64df6e8db58363c5ccab41cd3b40b07e3e6dfb924359a"
ENV CONDA_PATH /opt/miniconda
ENV PATH $CONDA_PATH/bin:$PATH
RUN mkdir $DOWNLOAD_DIR && \
find / -wholename "*__pycache__/*.pyc" -exec rm {} + && \
mkdir $DOWNLOAD_DIR && \
wget -nv -O $DOWNLOAD_DIR/miniconda.sh $CONDA_URL && \
test "$(sha256sum $DOWNLOAD_DIR/miniconda.sh | awk -v FS=' ' -v ORS='' '{print $1}')" = "$CONDA_SHA256" && \
bash $DOWNLOAD_DIR/miniconda.sh -p $CONDA_PATH -b && \
@@ -77,3 +60,5 @@ RUN mkdir $DOWNLOAD_DIR && \
conda config --set auto_update_conda false && \
conda config --set solver libmamba && \
rm -rf /root/.cache/pip

CMD ["bash"]
@@ -114,6 +114,7 @@ public final class VariantFiltration extends VariantWalker {
public static final String CLUSTER_WINDOW_SIZE_LONG_NAME = "cluster-window-size";
public static final String MASK_EXTENSION_LONG_NAME = "mask-extension";
public static final String MASK_NAME_LONG_NAME = "mask-name";
public static final String MASK_DESCRIPTION_LONG_NAME = "mask-description";
public static final String FILTER_NOT_IN_MASK_LONG_NAME = "filter-not-in-mask";
public static final String MISSING_VAL_LONG_NAME = "missing-values-evaluate-as-failing";
public static final String INVERT_LONG_NAME = "invert-filter-expression";
@@ -238,6 +239,14 @@ public final class VariantFiltration extends VariantWalker {
@Argument(fullName=ALLELE_SPECIFIC_LONG_NAME, optional=true, doc="Set mask at the allele level. This option is not compatible with clustering.")
public boolean applyForAllele = false;

/**
* If a mask interval list is provided, then set the description of the filter in the VCF header to this String.
* Note that if spaces are needed, then the entire description should be enclosed in quotes. Also note that if
* --filter-not-in-mask is used, the description should be adapted to reflect the reverse logic.
*/
@Argument(fullName=MASK_DESCRIPTION_LONG_NAME, optional=true, doc="Description to add to the FILTER field in VCF header for the mask filter.")
public String maskDescription;

// JEXL expressions for the filters
private List<JexlVCMatchExp> filterExps;
private List<JexlVCMatchExp> genotypeFilterExps;
@@ -305,7 +314,9 @@ private void initializeVcfWriter() {
}

if ( mask != null ) {
if (filterRecordsNotInMask) {
if (maskDescription != null) {
hInfo.add(new VCFFilterHeaderLine(maskName, maskDescription));
} else if (filterRecordsNotInMask) {
hInfo.add(new VCFFilterHeaderLine(maskName, "Doesn't overlap a user-input mask"));
} else {
hInfo.add(new VCFFilterHeaderLine(maskName, "Overlaps a user-input mask"));
@@ -331,6 +342,9 @@ public void onTraversalStart() {
if (filterRecordsNotInMask && mask == null) {
throw new CommandLineException.BadArgumentValue(FILTER_NOT_IN_MASK_LONG_NAME, "argument not allowed if mask argument is not provided");
}
if (maskDescription != null && mask == null) {
throw new CommandLineException.BadArgumentValue(MASK_DESCRIPTION_LONG_NAME, "argument not allowed if mask argument is not provided");
}
filterExps = VariantContextUtils.initializeMatchExps(filterNames, filterExpressions);
genotypeFilterExps = VariantContextUtils.initializeMatchExps(genotypeFilterNames, genotypeFilterExpressions);
howToTreatMissingValues = failMissingValues ? JexlMissingValueTreatment.TREAT_AS_MATCH : JexlMissingValueTreatment.TREAT_AS_MISMATCH;
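Taken together, the VariantFiltration changes let a user attach a custom description to the mask filter's VCF header line, with a new onTraversalStart() guard rejecting --mask-description when no mask is supplied. A minimal sketch of the new argument in use; the file paths and filter name are placeholders:

    # Hypothetical invocation; input.vcf, mask.bed, and output.vcf are
    # placeholder paths.
    gatk VariantFiltration \
        -V input.vcf \
        --mask mask.bed \
        --mask-name ProblemRegion \
        --mask-description "Overlaps a curated list of problematic regions" \
        -O output.vcf

As the new javadoc notes, when --filter-not-in-mask is also set the description should be phrased for the inverted logic, since the custom string replaces either default header line.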
@@ -29,6 +29,8 @@
import java.util.stream.Collectors;
import java.util.stream.IntStream;

import com.google.common.annotations.VisibleForTesting;

/**
* Filtering haplotypes that contribute weak alleles to the genotyping.
*
@@ -278,7 +280,8 @@ private AlleleLikelihoods<GATKRead, Haplotype> subsetHaplotypesByAlleles(final A
* @param sorThreshold only variants with SOR above threshold will be considered
* @return list of alleles that can be removed
*/
private List<Event> identifyBadAlleles(final List<Integer> collectedRPLs, final List<Double> collectedSORs,
@VisibleForTesting
List<Event> identifyBadAlleles(final List<Integer> collectedRPLs, final List<Double> collectedSORs,
final List<Event> alleles,
final double qualThreshold,
final double sorThreshold) {
@@ -303,9 +306,11 @@ private List<Event> identifyBadAlleles(final List<Integer> collectedRPLs, final
//we then add alleles with high SOR. Note that among all alleles with an SOR higher than the SOR_THRESHOLD
//we will first filter the one with the lowest QUAL.
logger.debug(() -> String.format("SHA:: Have %d candidates with low QUAL", rplCount));
for (int i = sorIndices.length-1 ; (i >= 0) && (collectedSORs.get(sorIndices[i])>SOR_THRESHOLD) ; i--) {
if (!result.contains(alleles.get(sorIndices[i]))) {
result.add(alleles.get(sorIndices[i]));
for (int i = sorIndices.length-1 ; (i >= 0) ; i--) {
if (collectedSORs.get(sorIndices[i])>SOR_THRESHOLD){
if (!result.contains(alleles.get(sorIndices[i]))) {
result.add(alleles.get(sorIndices[i]));
}
}
}
logger.debug(() -> String.format("SHA:: Have %d candidates with high SOR", result.size() - rplCount));
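Two changes land here: identifyBadAlleles loosens from private to package-private with @VisibleForTesting so unit tests can call it directly, and the SOR scan moves its threshold check from the loop condition into the body, so the loop now walks the whole sorIndices array rather than stopping at the first entry whose SOR falls to or below SOR_THRESHOLD.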
@@ -11,6 +11,8 @@
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.locusiterator.AlignmentStateMachine;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;
import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
import org.broadinstitute.hellbender.utils.read.Fragment;
import org.broadinstitute.hellbender.utils.read.GATKRead;
@@ -20,6 +22,7 @@

import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/**
* For each sample and for each allele a list feature vectors of supporting reads
@@ -33,6 +36,11 @@ public class FeaturizedReadSets {
public static final int DEFAULT_BASE_QUALITY = 25;

private static final SmithWatermanAligner aligner = SmithWatermanAligner.getAligner(SmithWatermanAligner.Implementation.JAVA);
private static final int FEATURES_PER_RANGE = 5;
private static final List<Integer> RANGES = List.of(5, 10, 25, 50);
public static final int NUM_RANGED_FEATURES = FEATURES_PER_RANGE * RANGES.size();
private static final int VERY_BAD_QUAL_THRESHOLD = 10;
private static final int BAD_QUAL_THRESHOLD = 20;

private FeaturizedReadSets() { }

@@ -92,9 +100,9 @@ private static List<Integer> featurize(final GATKRead read, final VariantContext
result.add(read.isReverseStrand() ? 1 : 0);

// distances from ends of read
final int readPosition = ReadPosition.getPosition(read, vc).orElse(0);
result.add(readPosition);
result.add(read.getLength() - readPosition);
final int readPositionOfVariantStart = ReadPosition.getPosition(read, vc).orElse(0);
result.add(readPositionOfVariantStart);
result.add(read.getLength() - readPositionOfVariantStart);


result.add(Math.abs(read.getFragmentLength()));
@@ -123,15 +131,64 @@ private static List<Integer> featurize(final GATKRead read, final VariantContext
vc.getContig(), vc.getStart()));
result.add(3);
result.add(2);

for (int n = 0; n < NUM_RANGED_FEATURES; n++) {
result.add(0);
}
} else {
final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotype.getBases(), read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP);
byte[] haplotypeBases = haplotype.getBases();
final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotypeBases, read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP);
final GATKRead copy = read.copy();
copy.setCigar(readToHaplotypeAlignment.getCigar());
final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotype.getBases(), readToHaplotypeAlignment.getAlignmentOffset()).numMismatches;
final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotypeBases, readToHaplotypeAlignment.getAlignmentOffset()).numMismatches;
result.add(mismatchCount);

final long indelsVsBestHaplotype = readToHaplotypeAlignment.getCigar().getCigarElements().stream().filter(el -> el.getOperator().isIndel()).count();
result.add((int) indelsVsBestHaplotype);

final int readStartInHaplotype = readToHaplotypeAlignment.getAlignmentOffset();
final AlignmentStateMachine asm = new AlignmentStateMachine(copy);
asm.stepForwardOnGenome();
final List<int[]> rangedFeatures = RANGES.stream().map(range -> new int[FEATURES_PER_RANGE]).toList();

while (!asm.isRightEdge()) {
final PileupElement pe = asm.makePileupElement();
final int distanceFromVariant = Math.abs(asm.getReadOffset() - readPositionOfVariantStart);

// pick which array's features we are accumulating. If the ranges are 5, 10, 25, 50 and the distance is, say, 8, then the '<= 10' range is relevant
final OptionalInt relevantRange = IntStream.range(0, RANGES.size()).filter(n -> distanceFromVariant <= RANGES.get(n)).findFirst();
if (relevantRange.isPresent()) {
final int[] featuresToAddTo = rangedFeatures.get(relevantRange.getAsInt());
if (pe.isAfterInsertion()) {
featuresToAddTo[0]++;
}

if (pe.isDeletion()) {
featuresToAddTo[1]++;
} else {
final byte base = pe.getBase();
final byte qual = pe.getQual();
final byte haplotypeBase = haplotypeBases[asm.getGenomeOffset() + readStartInHaplotype];

if (base != haplotypeBase) {
featuresToAddTo[2]++;
}

if (qual < VERY_BAD_QUAL_THRESHOLD) {
featuresToAddTo[3]++;
} else if (qual < BAD_QUAL_THRESHOLD) {
featuresToAddTo[4]++;
}
}
}
asm.stepForwardOnGenome();
}

for (final int[] featuresToAdd : rangedFeatures) {
for (final int val : featuresToAdd) {
result.add(val);
}
}
}
Utils.validate(result.size() == mutect3DatasetMode.getNumReadFeatures(), "Wrong number of features");

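The new ranged features summarize read context near the variant: each pileup position is assigned to the tightest of four distance bands from the variant start (within 5, 10, 25, or 50 bases), and within that band the loop tallies five counts: positions following an insertion, deletions, mismatches against the best haplotype, base qualities below 10, and base qualities below 20, for NUM_RANGED_FEATURES = 20 additional features per read. Reads with no assigned haplotype are padded with twenty zeros, and the final Utils.validate confirms the feature count matches what the Mutect3 dataset mode expects.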