elsasserlab · cnluzon · Mar 3, 2023 · Mar 3, 2023 · Mar 3, 2023 · Mar 3, 2023
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,5 +1,13 @@
 # minute Changelog
 
+## v0.4.0
+
+### Features
+
+* PR #164: Bowtie2 indexes are now explicit on `minute.yaml` file as a
+`bowtie2_index` field, and will be dynamically generated if this field is
+missing or the path provided is not valid.
+
 ## v0.3.1
 
 ### Bug fixes

diff --git a/minute/Snakefile b/minute/Snakefile
@@ -238,6 +238,49 @@ rule demultiplex_stats:
             print(lines // 2, file=fo)
 
 
+bowtie2_ext = [
+    "1.bt2",
+    "2.bt2",
+    "3.bt2",
+    "4.bt2",
+    "rev.1.bt2",
+    "rev.2.bt2",
+]
+
+rule bowtie2_build:
+    output:
+        index=expand("bowtie2_indexes/{{reference}}.{ext}", ext=bowtie2_ext),
+        flag=touch("bowtie2_indexes/{reference}.flag"),
+    input:
+        fasta=lambda wildcards: str(references[wildcards.reference].fasta),
+    params:
+        index=lambda wildcards: references[wildcards.reference].bowtie_index,
+    log:
+        "log/bowtie2_build/{reference}.log"
+    threads:
+        8
+    run:
+        if params.index:
+            with open(str(log), "w") as logf:
+                print(f"Found index files for {wildcards.reference}: {params.index}.", file=logf)
+            for index_file in output.index:
+                matched_ext = "".join(Path(index_file).suffixes)
+                os.link(
+                    Path(params.index).with_suffix(matched_ext),
+                    index_file,
+                )
+        else:
+            with open(str(log), "w") as logf:
+                print(f"Generating index files for {wildcards.reference}.", file=logf)
+            shell(
+                "bowtie2-build"
+                " {input.fasta}"
+                " bowtie2_indexes/{wildcards.reference}"
+                " --threads {threads}"
+                " >> {log}"
+            )
+
+
 rule bowtie2:
     threads:
         19  # One fewer than available to allow other jobs to run in parallel
@@ -246,8 +289,7 @@ rule bowtie2:
     input:
         r1="final/demultiplexed/{sample}_rep{replicate}_R1.fastq.gz",
         r2="final/demultiplexed/{sample}_rep{replicate}_R2.fastq.gz",
-    params:
-        index=lambda wildcards: references[wildcards.reference].bowtie_index
+        ref="bowtie2_indexes/{reference}.flag",
     log:
         "log/4-mapped/{sample}_rep{replicate}.{reference}.log"
     # TODO
@@ -259,7 +301,7 @@ rule bowtie2:
         "bowtie2"
         " --reorder"
         " -p {threads}"
-        " -x {params.index}"
+        " -x bowtie2_indexes/{wildcards.reference}"
         " -1 {input.r1}"
         " -2 {input.r2}"
         " --fast"

diff --git a/minute/__init__.py b/minute/__init__.py
@@ -27,7 +27,7 @@ class ParseError(Exception):
 class Reference:
     name: str
     fasta: Path
-    bowtie_index: Path
+    bowtie_index: Optional[Path]
     exclude_bed: Optional[Path]
 
 
@@ -172,10 +172,11 @@ def make_references(config) -> Dict[str, Reference]:
     for name, ref in config.items():
         fasta = Path(ref["fasta"])
         exclude_bed = Path(ref["exclude"]) if ref["exclude"] else None
+        bowtie_index = Path(ref["bowtie2_index"]) if ref["bowtie2_index"] else None
         try:
-            bowtie_index = detect_bowtie_index_name(ref["fasta"])
-        except FileNotFoundError as e:
-            sys.exit(str(e))
+            bowtie_index = validate_bowtie_index(bowtie_index)
+        except (FileNotFoundError, TypeError, KeyError):
+            bowtie_index = None
         references[name] = Reference(
             name=name,
             fasta=fasta,
@@ -328,33 +329,28 @@ def compute_genome_size(fasta: str) -> int:
     return n
 
 
-def detect_bowtie_index_name(fasta_path: str) -> Path:
+def validate_bowtie_index(base_path: str) -> Path:
     """
-    Given the path to a reference FASTA (which may optionally be compressed),
-    detect the base name of the Bowtie2 index assumed to be in the same
-    location.
-
-    Given "ref.fasta.gz", this function checks for the existence of
-    - "ref.fasta.gz.1.bt2"
-    - "ref.fasta.1.bt2"
-    - "ref.1.bt2"
-    in that order and then returns the name of the first file it finds
-    minus the ".1.bt2" suffix.
+    Given a base bowtie2 index name, checks that all 6 index files exist.
+    If any of the indexes are missing, it raises an exception. Otherwise it
+    returns the same bowtie2 base path.
     """
-    path = Path(fasta_path)
-    bowtie_index_extension = ".1.bt2"
-    bases = [path]
-    if path.suffix == ".gz":
-        bases.append(path.with_suffix(""))
-    if bases[-1].suffix:
-        bases.append(bases[-1].with_suffix(""))
-    for base in bases:
-        if base.with_name(base.name + bowtie_index_extension).exists():
-            return base
-    raise FileNotFoundError(
-        f"No Bowtie2 index found for '{fasta_path}', expected one of\n- "
-        + "\n- ".join(str(b) + bowtie_index_extension for b in bases)
-    )
+    path = Path(base_path)
+    bowtie_index_extensions = [
+        ".1.bt2",
+        ".2.bt2",
+        ".3.bt2",
+        ".4.bt2",
+        ".rev.1.bt2",
+        ".rev.2.bt2",
+    ]
+    for ext in bowtie_index_extensions:
+        if not path.with_name(path.name + ext).exists():
+            raise FileNotFoundError(
+                f"Bowtie2 index file missing for '{path}'\n- "
+                + str(path) + ext
+            )
+    return(path)
 
 
 def get_replicates(libraries, sample):

diff --git a/setup.cfg b/setup.cfg
@@ -5,7 +5,7 @@ author_email =
 url = https://github.com/NBISweden/minute/
 description = MINUTE-ChIP data analysis workflow
 license = MIT
-version = 0.3.1
+version = 0.4.0
 long_description = file: README.md
 long_description_content_type = text/markdown
 classifiers =

diff --git a/test.sh b/test.sh
@@ -17,6 +17,6 @@ tar --strip-components=1 -xf ../${testdata_file}
 cp ../testdata/*.{tsv,bed,sizes,yaml} .
 mkdir fastq && mv -- *.fastq.gz fastq/
 mkdir ref && mv ref*.fa.gz ref/
-( cd ref && bowtie2-build -q ref1.fa.gz ref1 && bowtie2-build -q ref2.fa.gz ref2 )
+( cd ref && bowtie2-build -q ref2.fa.gz ref2 )
 
 minute run "$@"
diff --git a/testdata/minute.yaml b/testdata/minute.yaml
@@ -5,15 +5,20 @@ references:
   mini:  # Arbitrary name for this reference. This is also used in output file names.
 
     # Path to a reference FASTA file (may be gzip-compressed).
-    # A matching Bowtie2 index must exist in the same location.
     fasta: "ref/ref1.fa.gz"
 
+    # Matching Bowtie2 index name without extension (any of the index files
+    # not including the final .1.bt2 / .rev.1.bt2 etc).
+    # If this is empty, minute will build the index dynamically.
+    bowtie2_index:
+
     # Path to a BED file with regions to exclude.
     # If this is empty, no regions are excluded.
     exclude: "exclude1.bed"
 
   small:
     fasta: "ref/ref2.fa.gz"
+    bowtie2_index: "ref/ref2"
     exclude: