Commit 9c44b25: gitignore merge
Schaudge committed May 24, 2024
2 parents 7a1cccb + 4ed93fe
Showing 19 changed files with 371 additions and 84 deletions.
9 changes: 5 additions & 4 deletions .github/actions/upload-gatk-test-results/action.yml
@@ -40,9 +40,10 @@ runs:
name: test-results-${{ inputs.is-docker == 'true' && 'docker-' || '' }}${{ matrix.Java }}-${{ matrix.testType }}
path: build/reports/tests

- name: Upload to codecov
run: bash <(curl -s https://raw.githubusercontent.com/broadinstitute/codecov-bash-uploader/main/codecov-verified.bash)
shell: bash
# Disabling codecov because it is timing out and failing builds that otherwise succeed.
## - name: Upload to codecov
## run: bash <(curl -s https://raw.githubusercontent.com/broadinstitute/codecov-bash-uploader/main/codecov-verified.bash)
## shell: bash

- name: Upload Reports
if: ${{ inputs.only-artifact != 'true' }}
@@ -91,4 +92,4 @@ runs:
run: |
pip install --user PyGithub;
python scripts/github_actions/Reporter.py ${{ steps.uploadreports.outputs.view_url }};
shell: bash
shell: bash
2 changes: 2 additions & 0 deletions .gitignore
@@ -45,4 +45,6 @@ funcotator_tmp
#Test generated dot files
test*.dot

.vscode/

check
5 changes: 5 additions & 0 deletions .sapient/mock_preferences.json
@@ -0,0 +1,5 @@
{
"note" : "Please restart the plugin after making changes to the lists below. If you want to mock files of any package, please add the package to packagesToMock list ex org.apache.commons. If you don't want to mock files of any package, please add the package to packagesToNotMock list ex com.google.gson Please make sure the json is a valid json, or it will revert to default list of packages.",
"classesAndPackagesNotToMock" : [ ],
"classesAndPackagesToMock" : [ ]
}
46 changes: 21 additions & 25 deletions Dockerfile
@@ -5,12 +5,14 @@ FROM ${BASE_DOCKER} AS gradleBuild
LABEL stage=gatkIntermediateBuildImage
ARG RELEASE=false

RUN ls .

ADD . /gatk
WORKDIR /gatk

# Get an updated gcloud signing key, in case the one in the base image has expired
RUN rm /etc/apt/sources.list.d/google-cloud-sdk.list && \
#Download only resources required for the build, not for testing
RUN ls . && \
rm /etc/apt/sources.list.d/google-cloud-sdk.list && \
apt update &&\
apt-key list && \
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \
@@ -19,16 +21,13 @@ RUN rm /etc/apt/sources.list.d/google-cloud-sdk.list && \
apt-get -y clean && \
apt-get -y autoclean && \
apt-get -y autoremove && \
rm -rf /var/lib/apt/lists/*
RUN git lfs install --force

#Download only resources required for the build, not for testing
RUN git lfs pull --include src/main/resources/large

RUN export GRADLE_OPTS="-Xmx4048m -Dorg.gradle.daemon=false" && /gatk/gradlew clean collectBundleIntoDir shadowTestClassJar shadowTestJar -Drelease=$RELEASE
RUN cp -r $( find /gatk/build -name "*bundle-files-collected" )/ /gatk/unzippedJar/
RUN unzip -o -j $( find /gatk/unzippedJar -name "gatkPython*.zip" ) -d /gatk/unzippedJar/scripts
RUN chmod -R a+rw /gatk/unzippedJar
rm -rf /var/lib/apt/lists/* && \
git lfs install --force && \
git lfs pull --include src/main/resources/large && \
export GRADLE_OPTS="-Xmx4048m -Dorg.gradle.daemon=false" && /gatk/gradlew clean collectBundleIntoDir shadowTestClassJar shadowTestJar -Drelease=$RELEASE && \
cp -r $( find /gatk/build -name "*bundle-files-collected" )/ /gatk/unzippedJar/ && \
unzip -o -j $( find /gatk/unzippedJar -name "gatkPython*.zip" ) -d /gatk/unzippedJar/scripts && \
chmod -R a+rw /gatk/unzippedJar

FROM ${BASE_DOCKER}

@@ -47,17 +46,17 @@ RUN chmod -R a+rw /gatk
COPY --from=gradleBuild /gatk/unzippedJar .

#Setup linked jars that may be needed for running gatk
RUN ln -s $( find /gatk -name "gatk*local.jar" ) gatk.jar
RUN ln -s $( find /gatk -name "gatk*local.jar" ) /root/gatk.jar
RUN ln -s $( find /gatk -name "gatk*spark.jar" ) gatk-spark.jar
RUN ln -s $( find /gatk -name "gatk*local.jar" ) gatk.jar && \
ln -s $( find /gatk -name "gatk*local.jar" ) /root/gatk.jar && \
ln -s $( find /gatk -name "gatk*spark.jar" ) gatk-spark.jar

WORKDIR /root

# Make sure we can see a help message
RUN java -jar gatk.jar -h
RUN mkdir /gatkCloneMountPoint
RUN mkdir /jars
RUN mkdir .gradle
RUN java -jar gatk.jar -h && \
mkdir /gatkCloneMountPoint && \
mkdir /jars && \
mkdir .gradle

WORKDIR /gatk

@@ -80,15 +79,12 @@ RUN echo "source activate gatk" > /root/run_unit_tests.sh && \
echo "ln -s /gatkCloneMountPoint/build/ /gatkCloneMountPoint/scripts/docker/build" >> /root/run_unit_tests.sh && \
echo "cd /gatk/ && /gatkCloneMountPoint/gradlew -Dfile.encoding=UTF-8 -b /gatkCloneMountPoint/dockertest.gradle testOnPackagedReleaseJar jacocoTestReportOnPackagedReleaseJar -a -p /gatkCloneMountPoint" >> /root/run_unit_tests.sh

WORKDIR /root
RUN cp -r /root/run_unit_tests.sh /gatk
RUN cp -r gatk.jar /gatk
ENV CLASSPATH /gatk/gatk.jar:$CLASSPATH
RUN cp -r /root/run_unit_tests.sh /gatk && \
cp -r /root/gatk.jar /gatk
ENV CLASSPATH=/gatk/gatk.jar:$CLASSPATH PATH=$CONDA_PATH/envs/gatk/bin:$CONDA_PATH/bin:$PATH

# Start GATK Python environment

WORKDIR /gatk
ENV PATH $CONDA_PATH/envs/gatk/bin:$CONDA_PATH/bin:$PATH
RUN conda env create -n gatk -f /gatk/gatkcondaenv.yml && \
echo "source activate gatk" >> /gatk/gatkenv.rc && \
echo "source /gatk/gatk-completion.sh" >> /gatk/gatkenv.rc && \
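Note on the Dockerfile restructuring above: the gcloud key refresh, git-lfs setup, Gradle build, and jar unpacking now chain into single RUN instructions instead of one RUN per command, which cuts the layer count in both the intermediate build stage and the final image. A sketch of building the image under these changes; the base image tag below is an assumption, not something this commit pins:

    # Illustrative build; the BASE_DOCKER value is a placeholder tag.
    docker build \
        --build-arg BASE_DOCKER=broadinstitute/gatkbase:latest \
        --build-arg RELEASE=false \
        -t gatk:local .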
2 changes: 1 addition & 1 deletion build.gradle
@@ -63,7 +63,7 @@ final barclayVersion = System.getProperty('barclay.version','5.0.0')
final sparkVersion = System.getProperty('spark.version', '3.5.0')
final hadoopVersion = System.getProperty('hadoop.version', '3.3.6')
final disqVersion = System.getProperty('disq.version','0.3.8')
final genomicsdbVersion = System.getProperty('genomicsdb.version','1.5.2')
final genomicsdbVersion = System.getProperty('genomicsdb.version','1.5.3')
final bigQueryVersion = System.getProperty('bigQuery.version', '2.35.0')
final bigQueryStorageVersion = System.getProperty('bigQueryStorage.version', '2.47.0')
final guavaVersion = System.getProperty('guava.version', '32.1.3-jre')
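Because the build reads each dependency version through System.getProperty with the literal as only a default, the GenomicsDB bump to 1.5.3 remains overridable per invocation. A sketch, with the task name chosen only for illustration:

    # Pin GenomicsDB back to 1.5.2 for one build, e.g. to bisect a regression.
    ./gradlew test -Dgenomicsdb.version=1.5.2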
41 changes: 13 additions & 28 deletions scripts/docker/gatkbase/Dockerfile
@@ -3,10 +3,14 @@
# NOTE: If you update the ubuntu version make sure to update the samtools/bcftools/bedtools versions in the README
FROM ubuntu:22.04

# Set environment variables.
# Avoid interactive prompts during apt installs/upgrades
ENV DEBIAN_FRONTEND noninteractive
ENV DEBIAN_FRONTEND="noninteractive" HOME="/root" JAVA_LIBRARY_PATH="/usr/lib/jni" DOWNLOAD_DIR="/downloads" CONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-py310_23.10.0-1-Linux-x86_64.sh" CONDA_SHA256="c7a34df472feb69805b64df6e8db58363c5ccab41cd3b40b07e3e6dfb924359a" CONDA_PATH="/opt/miniconda" PATH="/opt/miniconda/bin:$PATH"

# Define working directory.
WORKDIR /root

#### Basic image utilities
#### Basic image utilities, google cloud support, and miniconda
RUN apt update && \
apt full-upgrade -y && \
apt install -y --no-install-recommends \
@@ -32,12 +36,9 @@ RUN apt update && \
apt -y clean && \
apt -y autoclean && \
apt -y autoremove && \
rm -rf /var/lib/apt/lists/*

RUN java -version

#### Specific for google cloud support
RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" \
rm -rf /var/lib/apt/lists/* && \
java -version && \
echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" \
| tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg \
| apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \
apt update -y && \
@@ -49,26 +50,8 @@ RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.c
# Remove the anthos cli tool and related files since they are very large and we / anyone using the docker are unlikely to use them
# Remove the bundled python because we have python installed separately
rm -rf /usr/lib/google-cloud-sdk/bin/anthoscli /usr/lib/google-cloud-sdk/platform/anthoscli_licenses /usr/lib/google-cloud-sdk/platform/bundledpythonunix && \
find / -wholename "*__pycache__/*.pyc" -exec rm {} +

# Set environment variables.
ENV HOME /root

# Define working directory.
WORKDIR /root

# Define default command.
CMD ["bash"]

ENV JAVA_LIBRARY_PATH /usr/lib/jni

# Install miniconda
ENV DOWNLOAD_DIR /downloads
ENV CONDA_URL https://repo.anaconda.com/miniconda/Miniconda3-py310_23.10.0-1-Linux-x86_64.sh
ENV CONDA_SHA256 "c7a34df472feb69805b64df6e8db58363c5ccab41cd3b40b07e3e6dfb924359a"
ENV CONDA_PATH /opt/miniconda
ENV PATH $CONDA_PATH/bin:$PATH
RUN mkdir $DOWNLOAD_DIR && \
find / -wholename "*__pycache__/*.pyc" -exec rm {} + && \
mkdir $DOWNLOAD_DIR && \
wget -nv -O $DOWNLOAD_DIR/miniconda.sh $CONDA_URL && \
test "$(sha256sum $DOWNLOAD_DIR/miniconda.sh | awk -v FS=' ' -v ORS='' '{print $1}')" = "$CONDA_SHA256" && \
bash $DOWNLOAD_DIR/miniconda.sh -p $CONDA_PATH -b && \
@@ -77,3 +60,5 @@ RUN mkdir $DOWNLOAD_DIR && \
conda config --set auto_update_conda false && \
conda config --set solver libmamba && \
rm -rf /root/.cache/pip

CMD ["bash"]
@@ -114,6 +114,7 @@ public final class VariantFiltration extends VariantWalker {
public static final String CLUSTER_WINDOW_SIZE_LONG_NAME = "cluster-window-size";
public static final String MASK_EXTENSION_LONG_NAME = "mask-extension";
public static final String MASK_NAME_LONG_NAME = "mask-name";
public static final String MASK_DESCRIPTION_LONG_NAME = "mask-description";
public static final String FILTER_NOT_IN_MASK_LONG_NAME = "filter-not-in-mask";
public static final String MISSING_VAL_LONG_NAME = "missing-values-evaluate-as-failing";
public static final String INVERT_LONG_NAME = "invert-filter-expression";
@@ -238,6 +239,14 @@ public final class VariantFiltration extends VariantWalker {
@Argument(fullName=ALLELE_SPECIFIC_LONG_NAME, optional=true, doc="Set mask at the allele level. This option is not compatible with clustering.")
public boolean applyForAllele = false;

/**
* If a mask interval list is provided, then set the description of the filter in the VCF header to this String.
* Note that if spaces are needed, then the entire description should be enclosed in quotes. Also note that if
* --filter-not-in-mask is used, the description should be adapted to reflect the reverse logic.
*/
@Argument(fullName=MASK_DESCRIPTION_LONG_NAME, optional=true, doc="Description to add to the FILTER field in VCF header for the mask filter.")
public String maskDescription;

// JEXL expressions for the filters
private List<JexlVCMatchExp> filterExps;
private List<JexlVCMatchExp> genotypeFilterExps;
@@ -305,7 +314,9 @@ private void initializeVcfWriter() {
}

if ( mask != null ) {
if (filterRecordsNotInMask) {
if (maskDescription != null) {
hInfo.add(new VCFFilterHeaderLine(maskName, maskDescription));
} else if (filterRecordsNotInMask) {
hInfo.add(new VCFFilterHeaderLine(maskName, "Doesn't overlap a user-input mask"));
} else {
hInfo.add(new VCFFilterHeaderLine(maskName, "Overlaps a user-input mask"));
@@ -331,6 +342,9 @@ public void onTraversalStart() {
if (filterRecordsNotInMask && mask == null) {
throw new CommandLineException.BadArgumentValue(FILTER_NOT_IN_MASK_LONG_NAME, "argument not allowed if mask argument is not provided");
}
if (maskDescription != null && mask == null) {
throw new CommandLineException.BadArgumentValue(MASK_DESCRIPTION_LONG_NAME, "argument not allowed if mask argument is not provided");
}
filterExps = VariantContextUtils.initializeMatchExps(filterNames, filterExpressions);
genotypeFilterExps = VariantContextUtils.initializeMatchExps(genotypeFilterNames, genotypeFilterExpressions);
howToTreatMissingValues = failMissingValues ? JexlMissingValueTreatment.TREAT_AS_MATCH : JexlMissingValueTreatment.TREAT_AS_MISMATCH;
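Taken together, the VariantFiltration changes let a user attach a custom description to the mask filter's VCF header line, with a new onTraversalStart() guard rejecting --mask-description when no mask is supplied. A minimal sketch of the new argument in use; the file paths and filter name are placeholders:

    # Hypothetical invocation; input.vcf, mask.bed, and output.vcf are
    # placeholder paths.
    gatk VariantFiltration \
        -V input.vcf \
        --mask mask.bed \
        --mask-name ProblemRegion \
        --mask-description "Overlaps a curated list of problematic regions" \
        -O output.vcf

As the new javadoc notes, when --filter-not-in-mask is also set the description should be phrased for the inverted logic, since the custom string replaces either default header line.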
@@ -29,6 +29,8 @@
import java.util.stream.Collectors;
import java.util.stream.IntStream;

import com.google.common.annotations.VisibleForTesting;

/**
* Filtering haplotypes that contribute weak alleles to the genotyping.
*
@@ -278,7 +280,8 @@ private AlleleLikelihoods<GATKRead, Haplotype> subsetHaplotypesByAlleles(final A
* @param sorThreshold only variants with SOR above threshold will be considered
* @return list of alleles that can be removed
*/
private List<Event> identifyBadAlleles(final List<Integer> collectedRPLs, final List<Double> collectedSORs,
@VisibleForTesting
List<Event> identifyBadAlleles(final List<Integer> collectedRPLs, final List<Double> collectedSORs,
final List<Event> alleles,
final double qualThreshold,
final double sorThreshold) {
@@ -303,9 +306,11 @@ private List<Event> identifyBadAlleles(final List<Integer> collectedRPLs, final
//we then add alleles with high SOR. Note that among all alleles with an SOR higher than the SOR_THRESHOLD
//we will first filter the one with the lowest QUAL.
logger.debug(() -> String.format("SHA:: Have %d candidates with low QUAL", rplCount));
for (int i = sorIndices.length-1 ; (i >= 0) && (collectedSORs.get(sorIndices[i])>SOR_THRESHOLD) ; i--) {
if (!result.contains(alleles.get(sorIndices[i]))) {
result.add(alleles.get(sorIndices[i]));
for (int i = sorIndices.length-1 ; (i >= 0) ; i--) {
if (collectedSORs.get(sorIndices[i])>SOR_THRESHOLD){
if (!result.contains(alleles.get(sorIndices[i]))) {
result.add(alleles.get(sorIndices[i]));
}
}
}
logger.debug(() -> String.format("SHA:: Have %d candidates with high SOR", result.size() - rplCount));
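Two changes land here: identifyBadAlleles loosens from private to package-private with @VisibleForTesting so unit tests can call it directly, and the SOR scan moves its threshold check from the loop condition into the body, so the loop now walks the whole sorIndices array rather than stopping at the first entry whose SOR falls to or below SOR_THRESHOLD.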
@@ -11,6 +11,8 @@
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.locusiterator.AlignmentStateMachine;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;
import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
import org.broadinstitute.hellbender.utils.read.Fragment;
import org.broadinstitute.hellbender.utils.read.GATKRead;
@@ -20,6 +22,7 @@

import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/**
* For each sample and for each allele a list feature vectors of supporting reads
@@ -33,6 +36,11 @@ public class FeaturizedReadSets {
public static final int DEFAULT_BASE_QUALITY = 25;

private static final SmithWatermanAligner aligner = SmithWatermanAligner.getAligner(SmithWatermanAligner.Implementation.JAVA);
private static final int FEATURES_PER_RANGE = 5;
private static final List<Integer> RANGES = List.of(5, 10, 25, 50);
public static final int NUM_RANGED_FEATURES = FEATURES_PER_RANGE * RANGES.size();
private static final int VERY_BAD_QUAL_THRESHOLD = 10;
private static final int BAD_QUAL_THRESHOLD = 20;

private FeaturizedReadSets() { }

@@ -92,9 +100,9 @@ private static List<Integer> featurize(final GATKRead read, final VariantContext
result.add(read.isReverseStrand() ? 1 : 0);

// distances from ends of read
final int readPosition = ReadPosition.getPosition(read, vc).orElse(0);
result.add(readPosition);
result.add(read.getLength() - readPosition);
final int readPositionOfVariantStart = ReadPosition.getPosition(read, vc).orElse(0);
result.add(readPositionOfVariantStart);
result.add(read.getLength() - readPositionOfVariantStart);


result.add(Math.abs(read.getFragmentLength()));
@@ -123,15 +131,64 @@ private static List<Integer> featurize(final GATKRead read, final VariantContext
vc.getContig(), vc.getStart()));
result.add(3);
result.add(2);

for (int n = 0; n < NUM_RANGED_FEATURES; n++) {
result.add(0);
}
} else {
final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotype.getBases(), read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP);
byte[] haplotypeBases = haplotype.getBases();
final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotypeBases, read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP);
final GATKRead copy = read.copy();
copy.setCigar(readToHaplotypeAlignment.getCigar());
final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotype.getBases(), readToHaplotypeAlignment.getAlignmentOffset()).numMismatches;
final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotypeBases, readToHaplotypeAlignment.getAlignmentOffset()).numMismatches;
result.add(mismatchCount);

final long indelsVsBestHaplotype = readToHaplotypeAlignment.getCigar().getCigarElements().stream().filter(el -> el.getOperator().isIndel()).count();
result.add((int) indelsVsBestHaplotype);

final int readStartInHaplotype = readToHaplotypeAlignment.getAlignmentOffset();
final AlignmentStateMachine asm = new AlignmentStateMachine(copy);
asm.stepForwardOnGenome();
final List<int[]> rangedFeatures = RANGES.stream().map(range -> new int[FEATURES_PER_RANGE]).toList();

while (!asm.isRightEdge()) {
final PileupElement pe = asm.makePileupElement();
final int distanceFromVariant = Math.abs(asm.getReadOffset() - readPositionOfVariantStart);

// pick which array's features we are accumulating. If the ranges are 5, 10, 25, 50 and the distance is, say, 8, then the '<= 10' range is relevant
final OptionalInt relevantRange = IntStream.range(0, RANGES.size()).filter(n -> distanceFromVariant <= RANGES.get(n)).findFirst();
if (relevantRange.isPresent()) {
final int[] featuresToAddTo = rangedFeatures.get(relevantRange.getAsInt());
if (pe.isAfterInsertion()) {
featuresToAddTo[0]++;
}

if (pe.isDeletion()) {
featuresToAddTo[1]++;
} else {
final byte base = pe.getBase();
final byte qual = pe.getQual();
final byte haplotypeBase = haplotypeBases[asm.getGenomeOffset() + readStartInHaplotype];

if (base != haplotypeBase) {
featuresToAddTo[2]++;
}

if (qual < VERY_BAD_QUAL_THRESHOLD) {
featuresToAddTo[3]++;
} else if (qual < BAD_QUAL_THRESHOLD) {
featuresToAddTo[4]++;
}
}
}
asm.stepForwardOnGenome();
}

for (final int[] featuresToAdd : rangedFeatures) {
for (final int val : featuresToAdd) {
result.add(val);
}
}
}
Utils.validate(result.size() == mutect3DatasetMode.getNumReadFeatures(), "Wrong number of features");

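The new ranged features summarize read context near the variant: each pileup position is assigned to the tightest of four distance bands from the variant start (within 5, 10, 25, or 50 bases), and within that band the loop tallies five counts: positions following an insertion, deletions, mismatches against the best haplotype, base qualities below 10, and base qualities below 20, for NUM_RANGED_FEATURES = 20 additional features per read. Reads with no assigned haplotype are padded with twenty zeros, and the final Utils.validate confirms the feature count matches what the Mutect3 dataset mode expects.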