Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate missing bowtie2 index files #164

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# minute Changelog

## v0.4.0

### Features

* PR #164: Bowtie2 indexes are now explicit on `minute.yaml` file as a
`bowtie2_index` field, and will be dynamically generated if this field is
missing or the path provided is not valid.

## v0.3.1

### Bug fixes
Expand Down
48 changes: 45 additions & 3 deletions minute/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,49 @@ rule demultiplex_stats:
print(lines // 2, file=fo)


bowtie2_ext = [
"1.bt2",
"2.bt2",
"3.bt2",
"4.bt2",
"rev.1.bt2",
"rev.2.bt2",
]

rule bowtie2_build:
output:
index=expand("bowtie2_indexes/{{reference}}.{ext}", ext=bowtie2_ext),
flag=touch("bowtie2_indexes/{reference}.flag"),
input:
fasta=lambda wildcards: str(references[wildcards.reference].fasta),
params:
index=lambda wildcards: references[wildcards.reference].bowtie_index,
log:
"log/bowtie2_build/{reference}.log"
threads:
8
run:
if params.index:
with open(str(log), "w") as logf:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems more like debugging output, do you think this should be in here?

print(f"Found index files for {wildcards.reference}: {params.index}.", file=logf)
for index_file in output.index:
matched_ext = "".join(Path(index_file).suffixes)
os.link(
Path(params.index).with_suffix(matched_ext),
index_file,
)
else:
with open(str(log), "w") as logf:
print(f"Generating index files for {wildcards.reference}.", file=logf)
shell(
"bowtie2-build"
" {input.fasta}"
" bowtie2_indexes/{wildcards.reference}"
" --threads {threads}"
" >> {log}"
)


rule bowtie2:
threads:
19 # One fewer than available to allow other jobs to run in parallel
Expand All @@ -246,8 +289,7 @@ rule bowtie2:
input:
r1="final/demultiplexed/{sample}_rep{replicate}_R1.fastq.gz",
r2="final/demultiplexed/{sample}_rep{replicate}_R2.fastq.gz",
params:
index=lambda wildcards: references[wildcards.reference].bowtie_index
ref="bowtie2_indexes/{reference}.flag",
log:
"log/4-mapped/{sample}_rep{replicate}.{reference}.log"
# TODO
Expand All @@ -259,7 +301,7 @@ rule bowtie2:
"bowtie2"
" --reorder"
" -p {threads}"
" -x {params.index}"
" -x bowtie2_indexes/{wildcards.reference}"
" -1 {input.r1}"
" -2 {input.r2}"
" --fast"
Expand Down
54 changes: 25 additions & 29 deletions minute/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class ParseError(Exception):
class Reference:
name: str
fasta: Path
bowtie_index: Path
bowtie_index: Optional[Path]
exclude_bed: Optional[Path]


Expand Down Expand Up @@ -172,10 +172,11 @@ def make_references(config) -> Dict[str, Reference]:
for name, ref in config.items():
fasta = Path(ref["fasta"])
exclude_bed = Path(ref["exclude"]) if ref["exclude"] else None
bowtie_index = Path(ref["bowtie2_index"]) if ref["bowtie2_index"] else None
try:
bowtie_index = detect_bowtie_index_name(ref["fasta"])
except FileNotFoundError as e:
sys.exit(str(e))
bowtie_index = validate_bowtie_index(bowtie_index)
except (FileNotFoundError, TypeError, KeyError):
bowtie_index = None
references[name] = Reference(
name=name,
fasta=fasta,
Expand Down Expand Up @@ -328,33 +329,28 @@ def compute_genome_size(fasta: str) -> int:
return n


def detect_bowtie_index_name(fasta_path: str) -> Path:
def validate_bowtie_index(base_path: str) -> Path:
"""
Given the path to a reference FASTA (which may optionally be compressed),
detect the base name of the Bowtie2 index assumed to be in the same
location.

Given "ref.fasta.gz", this function checks for the existence of
- "ref.fasta.gz.1.bt2"
- "ref.fasta.1.bt2"
- "ref.1.bt2"
in that order and then returns the name of the first file it finds
minus the ".1.bt2" suffix.
Given a base bowtie2 index name, checks that all 6 index files exist.
If any of the indexes are missing, it raises an exception. Otherwise it
returns the same bowtie2 base path.
"""
path = Path(fasta_path)
bowtie_index_extension = ".1.bt2"
bases = [path]
if path.suffix == ".gz":
bases.append(path.with_suffix(""))
if bases[-1].suffix:
bases.append(bases[-1].with_suffix(""))
for base in bases:
if base.with_name(base.name + bowtie_index_extension).exists():
return base
raise FileNotFoundError(
f"No Bowtie2 index found for '{fasta_path}', expected one of\n- "
+ "\n- ".join(str(b) + bowtie_index_extension for b in bases)
)
path = Path(base_path)
bowtie_index_extensions = [
".1.bt2",
".2.bt2",
".3.bt2",
".4.bt2",
".rev.1.bt2",
".rev.2.bt2",
]
for ext in bowtie_index_extensions:
if not path.with_name(path.name + ext).exists():
raise FileNotFoundError(
f"Bowtie2 index file missing for '{path}'\n- "
+ str(path) + ext
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It’s a bit inconsistent to mix an f-string with string concatenation. I’d recommend using only an f-string. You should also remove the \n- part, which only made sense in the previous version where the - was used as a bullet, but now this doesn’t make sense as you complain only about a single file.

)
return(path)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There should be no parentheses around the returned value.



def get_replicates(libraries, sample):
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ author_email =
url = https://github.com/NBISweden/minute/
description = MINUTE-ChIP data analysis workflow
license = MIT
version = 0.3.1
version = 0.4.0
long_description = file: README.md
long_description_content_type = text/markdown
classifiers =
Expand Down
2 changes: 1 addition & 1 deletion test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,6 @@ tar --strip-components=1 -xf ../${testdata_file}
cp ../testdata/*.{tsv,bed,sizes,yaml} .
mkdir fastq && mv -- *.fastq.gz fastq/
mkdir ref && mv ref*.fa.gz ref/
( cd ref && bowtie2-build -q ref1.fa.gz ref1 && bowtie2-build -q ref2.fa.gz ref2 )
( cd ref && bowtie2-build -q ref2.fa.gz ref2 )

minute run "$@"
7 changes: 6 additions & 1 deletion testdata/minute.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,20 @@ references:
mini: # Arbitrary name for this reference. This is also used in output file names.

# Path to a reference FASTA file (may be gzip-compressed).
# A matching Bowtie2 index must exist in the same location.
fasta: "ref/ref1.fa.gz"

# Matching Bowtie2 index name without extension (any of the index files
# not including the final .1.bt2 / .rev.1.bt2 etc).
# If this is empty, minute will build the index dynamically.
bowtie2_index:

# Path to a BED file with regions to exclude.
# If this is empty, no regions are excluded.
exclude: "exclude1.bed"

small:
fasta: "ref/ref2.fa.gz"
bowtie2_index: "ref/ref2"
exclude:


Expand Down