From 1756a4eabc4c8145610ccfe7d1d1bacb469ad116 Mon Sep 17 00:00:00 2001 From: cnluzon Date: Fri, 3 Mar 2023 14:08:03 +0100 Subject: [PATCH 1/7] Add Bowtie2 index to config.yaml --- minute/__init__.py | 50 +++++++++++++++++++------------------------- testdata/minute.yaml | 5 +++++ 2 files changed, 26 insertions(+), 29 deletions(-) diff --git a/minute/__init__.py b/minute/__init__.py index e86ffe0..35b1330 100644 --- a/minute/__init__.py +++ b/minute/__init__.py @@ -172,10 +172,7 @@ def make_references(config) -> Dict[str, Reference]: for name, ref in config.items(): fasta = Path(ref["fasta"]) exclude_bed = Path(ref["exclude"]) if ref["exclude"] else None - try: - bowtie_index = detect_bowtie_index_name(ref["fasta"]) - except FileNotFoundError as e: - sys.exit(str(e)) + bowtie_index = validate_bowtie_index(Path(ref["bowtie2_index"])) references[name] = Reference( name=name, fasta=fasta, @@ -328,33 +325,28 @@ def compute_genome_size(fasta: str) -> int: return n -def detect_bowtie_index_name(fasta_path: str) -> Path: +def validate_bowtie_index(base_path: str) -> Path: """ - Given the path to a reference FASTA (which may optionally be compressed), - detect the base name of the Bowtie2 index assumed to be in the same - location. - - Given "ref.fasta.gz", this function checks for the existence of - - "ref.fasta.gz.1.bt2" - - "ref.fasta.1.bt2" - - "ref.1.bt2" - in that order and then returns the name of the first file it finds - minus the ".1.bt2" suffix. + Given a base bowtie2 index name, checks that all 6 index files exist. + If any of the indexes are missing, it raises an exception. Otherwise it + returns the same bowtie2 base path. """ - path = Path(fasta_path) - bowtie_index_extension = ".1.bt2" - bases = [path] - if path.suffix == ".gz": - bases.append(path.with_suffix("")) - if bases[-1].suffix: - bases.append(bases[-1].with_suffix("")) - for base in bases: - if base.with_name(base.name + bowtie_index_extension).exists(): - return base - raise FileNotFoundError( - f"No Bowtie2 index found for '{fasta_path}', expected one of\n- " - + "\n- ".join(str(b) + bowtie_index_extension for b in bases) - ) + path = Path(base_path) + bowtie_index_extensions = [ + ".1.bt2", + ".2.bt2", + ".3.bt2", + ".4.bt2", + ".rev.1.bt2", + ".rev.2.bt2", + ] + for ext in bowtie_index_extensions: + if not path.with_name(path.name + ext).exists(): + raise FileNotFoundError( + f"Bowtie2 index file missing for '{path}'\n- " + + str(path) + ext + ) + return(path) def get_replicates(libraries, sample): diff --git a/testdata/minute.yaml b/testdata/minute.yaml index 6723ac1..660fae5 100644 --- a/testdata/minute.yaml +++ b/testdata/minute.yaml @@ -8,12 +8,17 @@ references: # A matching Bowtie2 index must exist in the same location. fasta: "ref/ref1.fa.gz" + # Matching Bowtie2 index name without extension (any of the index files + # not including the final .1.bt2 / .rev.1.bt2 etc) + bowtie2_index: "ref/ref1" + # Path to a BED file with regions to exclude. # If this is empty, no regions are excluded. exclude: "exclude1.bed" small: fasta: "ref/ref2.fa.gz" + bowtie2_index: "ref/ref2" exclude: From 23f1803d907ca727fee0abe2616af7af23f27362 Mon Sep 17 00:00:00 2001 From: cnluzon Date: Fri, 3 Mar 2023 15:56:22 +0100 Subject: [PATCH 2/7] Generate missing bowtie2 indexes --- minute/Snakefile | 34 +++++++++++++++++++++++++++++++--- minute/__init__.py | 9 +++++++-- test.sh | 2 +- testdata/minute.yaml | 6 +++--- 4 files changed, 42 insertions(+), 9 deletions(-) diff --git a/minute/Snakefile b/minute/Snakefile index 9ad0777..0e538c0 100644 --- a/minute/Snakefile +++ b/minute/Snakefile @@ -238,6 +238,35 @@ rule demultiplex_stats: print(lines // 2, file=fo) +bowtie2_ext = [ + "1.bt2", + "2.bt2", + "3.bt2", + "4.bt2", + "rev.1.bt2", + "rev.2.bt2", +] + +rule bowtie2_build: + output: + index=expand("bowtie2_indexes/{{reference}}.{ext}", ext=bowtie2_ext), + flag=touch("bowtie2_indexes/{reference}.flag"), + input: + fasta=lambda wildcards: str(references[wildcards.reference].fasta), + params: + index=lambda wildcards: references[wildcards.reference].bowtie_index, + run: + if params.index: + for index_file in output.index: + matched_ext = "".join(Path(index_file).suffixes) + os.link( + Path(params.index).with_suffix(matched_ext), + index_file, + ) + else: + shell("bowtie2-build -q {input.fasta} bowtie2_indexes/{wildcards.reference}") + + rule bowtie2: threads: 19 # One fewer than available to allow other jobs to run in parallel @@ -246,8 +275,7 @@ rule bowtie2: input: r1="final/demultiplexed/{sample}_rep{replicate}_R1.fastq.gz", r2="final/demultiplexed/{sample}_rep{replicate}_R2.fastq.gz", - params: - index=lambda wildcards: references[wildcards.reference].bowtie_index + ref="bowtie2_indexes/{reference}.flag", log: "log/4-mapped/{sample}_rep{replicate}.{reference}.log" # TODO @@ -259,7 +287,7 @@ rule bowtie2: "bowtie2" " --reorder" " -p {threads}" - " -x {params.index}" + " -x bowtie2_indexes/{wildcards.reference}" " -1 {input.r1}" " -2 {input.r2}" " --fast" diff --git a/minute/__init__.py b/minute/__init__.py index 35b1330..f28aff9 100644 --- a/minute/__init__.py +++ b/minute/__init__.py @@ -27,7 +27,7 @@ class ParseError(Exception): class Reference: name: str fasta: Path - bowtie_index: Path + bowtie_index: Optional[Path] exclude_bed: Optional[Path] @@ -172,7 +172,12 @@ def make_references(config) -> Dict[str, Reference]: for name, ref in config.items(): fasta = Path(ref["fasta"]) exclude_bed = Path(ref["exclude"]) if ref["exclude"] else None - bowtie_index = validate_bowtie_index(Path(ref["bowtie2_index"])) + try: + bowtie_index = validate_bowtie_index(ref["bowtie2_index"]) + except (FileNotFoundError, TypeError): + print(f"Bowtie2 index files not found for ref {name}. " + f"Generating index files dynamically.") + bowtie_index = None references[name] = Reference( name=name, fasta=fasta, diff --git a/test.sh b/test.sh index 3cd9a18..41364be 100755 --- a/test.sh +++ b/test.sh @@ -17,6 +17,6 @@ tar --strip-components=1 -xf ../${testdata_file} cp ../testdata/*.{tsv,bed,sizes,yaml} . mkdir fastq && mv -- *.fastq.gz fastq/ mkdir ref && mv ref*.fa.gz ref/ -( cd ref && bowtie2-build -q ref1.fa.gz ref1 && bowtie2-build -q ref2.fa.gz ref2 ) +( cd ref && bowtie2-build -q ref2.fa.gz ref2 ) minute run "$@" diff --git a/testdata/minute.yaml b/testdata/minute.yaml index 660fae5..9f93e88 100644 --- a/testdata/minute.yaml +++ b/testdata/minute.yaml @@ -5,12 +5,12 @@ references: mini: # Arbitrary name for this reference. This is also used in output file names. # Path to a reference FASTA file (may be gzip-compressed). - # A matching Bowtie2 index must exist in the same location. fasta: "ref/ref1.fa.gz" # Matching Bowtie2 index name without extension (any of the index files - # not including the final .1.bt2 / .rev.1.bt2 etc) - bowtie2_index: "ref/ref1" + # not including the final .1.bt2 / .rev.1.bt2 etc). + # If this is empty, minute will build the index dynamically. + bowtie2_index: # Path to a BED file with regions to exclude. # If this is empty, no regions are excluded. From 2362bc8b205eca07c9ba5e02440143467cb71a92 Mon Sep 17 00:00:00 2001 From: cnluzon Date: Fri, 3 Mar 2023 15:59:15 +0100 Subject: [PATCH 3/7] Allow for missing bowtie2_index field in config.yaml --- minute/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/minute/__init__.py b/minute/__init__.py index f28aff9..7596d10 100644 --- a/minute/__init__.py +++ b/minute/__init__.py @@ -174,7 +174,7 @@ def make_references(config) -> Dict[str, Reference]: exclude_bed = Path(ref["exclude"]) if ref["exclude"] else None try: bowtie_index = validate_bowtie_index(ref["bowtie2_index"]) - except (FileNotFoundError, TypeError): + except (FileNotFoundError, TypeError, KeyError): print(f"Bowtie2 index files not found for ref {name}. " f"Generating index files dynamically.") bowtie_index = None From 7cffcd0432063c7551bfcc0143ac6b8836785932 Mon Sep 17 00:00:00 2001 From: cnluzon Date: Fri, 3 Mar 2023 16:23:34 +0100 Subject: [PATCH 4/7] Version bump + changelog --- CHANGES.md | 8 ++++++++ setup.cfg | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 201869c..9c11fe9 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,13 @@ # minute Changelog +## v0.4.0 + +### Features + +* PR #164: Bowtie2 indexes are now explicit on `minute.yaml` file as a +`bowtie2_index` field, and will be dynamically generated if this field is +missing or the path provided is not valid. + ## v0.3.1 ### Bug fixes diff --git a/setup.cfg b/setup.cfg index 0684ccd..21c93c5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -5,7 +5,7 @@ author_email = url = https://github.com/NBISweden/minute/ description = MINUTE-ChIP data analysis workflow license = MIT -version = 0.3.1 +version = 0.4.0 long_description = file: README.md long_description_content_type = text/markdown classifiers = From 3bc205d71c9a55db9fcbdccc37d031c0334d7e10 Mon Sep 17 00:00:00 2001 From: cnluzon Date: Fri, 3 Mar 2023 17:42:38 +0100 Subject: [PATCH 5/7] bowtie2_build output moved to log --- minute/Snakefile | 8 +++++++- minute/__init__.py | 2 -- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/minute/Snakefile b/minute/Snakefile index 0e538c0..6e84f2f 100644 --- a/minute/Snakefile +++ b/minute/Snakefile @@ -255,8 +255,12 @@ rule bowtie2_build: fasta=lambda wildcards: str(references[wildcards.reference].fasta), params: index=lambda wildcards: references[wildcards.reference].bowtie_index, + log: + "log/bowtie2_build/{reference}.log" run: if params.index: + with open(str(log), "w") as logf: + print(f"Found index files for {wildcards.reference}: {params.index}.", file=logf) for index_file in output.index: matched_ext = "".join(Path(index_file).suffixes) os.link( @@ -264,7 +268,9 @@ rule bowtie2_build: index_file, ) else: - shell("bowtie2-build -q {input.fasta} bowtie2_indexes/{wildcards.reference}") + with open(str(log), "w") as logf: + print(f"Generating index files for {wildcards.reference}.", file=logf) + shell("bowtie2-build {input.fasta} bowtie2_indexes/{wildcards.reference} >> {log}") rule bowtie2: diff --git a/minute/__init__.py b/minute/__init__.py index 7596d10..190a3e1 100644 --- a/minute/__init__.py +++ b/minute/__init__.py @@ -175,8 +175,6 @@ def make_references(config) -> Dict[str, Reference]: try: bowtie_index = validate_bowtie_index(ref["bowtie2_index"]) except (FileNotFoundError, TypeError, KeyError): - print(f"Bowtie2 index files not found for ref {name}. " - f"Generating index files dynamically.") bowtie_index = None references[name] = Reference( name=name, From a7bee704002eed1555d81222b28d1fa15ad833f5 Mon Sep 17 00:00:00 2001 From: cnluzon Date: Fri, 3 Mar 2023 17:47:11 +0100 Subject: [PATCH 6/7] Allow for empty string as bowtie2_index in minute.yaml --- minute/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/minute/__init__.py b/minute/__init__.py index 190a3e1..753616f 100644 --- a/minute/__init__.py +++ b/minute/__init__.py @@ -172,8 +172,9 @@ def make_references(config) -> Dict[str, Reference]: for name, ref in config.items(): fasta = Path(ref["fasta"]) exclude_bed = Path(ref["exclude"]) if ref["exclude"] else None + bowtie_index = Path(ref["bowtie2_index"]) if ref["bowtie2_index"] else None try: - bowtie_index = validate_bowtie_index(ref["bowtie2_index"]) + bowtie_index = validate_bowtie_index(bowtie_index) except (FileNotFoundError, TypeError, KeyError): bowtie_index = None references[name] = Reference( From 2e03641b1fcd499e529a5a756a4fc7fe98518cb3 Mon Sep 17 00:00:00 2001 From: cnluzon Date: Fri, 3 Mar 2023 17:50:27 +0100 Subject: [PATCH 7/7] Add threads to bowtie2_build rule --- minute/Snakefile | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/minute/Snakefile b/minute/Snakefile index 6e84f2f..cb4f496 100644 --- a/minute/Snakefile +++ b/minute/Snakefile @@ -257,6 +257,8 @@ rule bowtie2_build: index=lambda wildcards: references[wildcards.reference].bowtie_index, log: "log/bowtie2_build/{reference}.log" + threads: + 8 run: if params.index: with open(str(log), "w") as logf: @@ -270,7 +272,13 @@ rule bowtie2_build: else: with open(str(log), "w") as logf: print(f"Generating index files for {wildcards.reference}.", file=logf) - shell("bowtie2-build {input.fasta} bowtie2_indexes/{wildcards.reference} >> {log}") + shell( + "bowtie2-build" + " {input.fasta}" + " bowtie2_indexes/{wildcards.reference}" + " --threads {threads}" + " >> {log}" + ) rule bowtie2: