Add test builds to cover different inputs

These tests cover the different starting points for sequence data in each input (sequences vs aligned vs masked vs filtered). They also cover uncompressed and (xz) compressed starting points, as well as local and remote (s3) addresses. (Note: `.gz` compression is not yet covered by these tests.) How to run the tests is described in the `tests/different-inputs.t` file itself.
nextstrain · Jun 23, 2021 · 20b7564 · 20b7564
1 parent b70cdf3
commit 20b7564
Show file tree

Hide file tree

Showing 21 changed files with 252 additions and 0 deletions.
diff --git a/tests/.gitignore b/tests/.gitignore
@@ -0,0 +1,2 @@
+*.err
+/output
diff --git a/tests/check_auspice_json.py b/tests/check_auspice_json.py
@@ -0,0 +1,31 @@
+import argparse
+import json
+import sys
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description="Ensure certain values are present for a given node trait",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument('--json', type=str, metavar="JSON", required=True, help="Auspice JSON")
+    parser.add_argument('--attr', type=str, metavar="KEY", required=True, help="node attr to collect")
+    parser.add_argument('--values', type=str, nargs="+", metavar="VALUE", required=True, help="values to check")
+    args = parser.parse_args()
+
+    with open(args.json, "r", encoding="utf-8") as f:
+        input_json = json.load(f)
+
+    # Walk the tree with an explicit stack: recursion raises RecursionError on very deep trees.
+    values_seen = set()
+    stack = [input_json["tree"]]
+    while stack:
+        node = stack.pop()
+        v = node.get("node_attrs", {}).get(args.attr, {}).get("value", "")
+        if v:
+            values_seen.add(v)
+        stack.extend(node.get("children", []))
+
+    # Report what is missing and exit with status 1 unless every requested value was seen.
+    if not values_seen >= set(args.values):
+        print("Following values missing from JSON:", set(args.values)-values_seen)
+        sys.exit(1)
diff --git a/tests/different-inputs.t b/tests/different-inputs.t
@@ -0,0 +1,65 @@
+Integration tests for nCoV pipeline.
+
+Note that running these tests requires setup steps, and that the tests must be
+run one at a time because they share a single test environment; otherwise
+snakemake may reuse intermediate files from previous runs, producing
+inconsistent test results.
+
+Cram should be run in an environment which can run the pipeline via
+`cram --preserve-env tests/different-inputs.t` or similar.
+
+Set-up test environment. We could set up the correct data inside $TMP for each test
+if we prefer. For simplicity, we create a directory "output".
+
+  $ pushd "$TESTDIR" > /dev/null
+  $ basename $( pwd )
+  tests
+  $ rm -rf output && mkdir output && cd output
+  $ cp -r ../../defaults . && cp -r ../../scripts . && mkdir data/ && cp ../../data/references* data/
+  $ cd ../..
+  $ basename $( pwd )
+  ncov
+
+Test various input starting points, all from local (.xz) compressed files
+
+  $ snakemake --directory tests/output --profile tests/local-inputs-compressed \
+  > auspice/ncov_test-local-compressed.json >tests/output/local-inputs-compressed.cram.log.txt 2>&1
+
+  $ python3 tests/check_auspice_json.py --json tests/output/auspice/ncov_test-local-compressed.json \
+  > --attr region --values "North America" "Europe" "Asia" "Oceania"
+
+  $ rm -rf tests/output/results
+
+Test various input starting points, all from remote (.xz) compressed files
+
+  $ snakemake --directory tests/output --profile tests/remote-inputs-compressed \
+  > auspice/ncov_test-remote-compressed.json >tests/output/remote-inputs-compressed.cram.log.txt 2>&1
+
+  $ python3 tests/check_auspice_json.py --json tests/output/auspice/ncov_test-remote-compressed.json \
+  > --attr region --values "North America" "Europe" "Asia" "Oceania"
+
+  $ rm -rf tests/output/results tests/output/data/downloaded_test*compressed*
+
+Test various input starting points, all from local uncompressed files
+
+  $ cp tests/local-inputs-compressed/data/*xz tests/local-inputs-uncompressed/data/
+
+  $ for i in tests/local-inputs-uncompressed/data/*.xz; do xz -d -f "$i"; done
+
+  $ snakemake --directory tests/output --profile tests/local-inputs-uncompressed \
+  > auspice/ncov_test-local-uncompressed.json >tests/output/local-inputs-uncompressed.cram.log.txt 2>&1
+
+  $ python3 tests/check_auspice_json.py --json tests/output/auspice/ncov_test-local-uncompressed.json \
+  > --attr region --values "North America" "Europe" "Asia" "Oceania"
+
+  $ rm -rf tests/output/results tests/local-inputs-uncompressed/data/*.fasta tests/local-inputs-uncompressed/data/*.tsv
+
+Test various input starting points which support remote uncompressed files (this is a subset of available inputs)
+
+  $ snakemake --directory tests/output --profile tests/remote-inputs-uncompressed \
+  > auspice/ncov_test-remote-uncompressed.json >tests/output/remote-inputs-uncompressed.cram.log.txt 2>&1
+
+  $ python3 tests/check_auspice_json.py --json tests/output/auspice/ncov_test-remote-uncompressed.json \
+  > --attr region --values "North America" "Europe" "Asia" "Oceania"
+
+  $ rm -rf tests/output/results tests/output/data/downloaded_test*uncompressed*
diff --git a/tests/local-inputs-compressed/builds.yaml b/tests/local-inputs-compressed/builds.yaml
@@ -0,0 +1,24 @@
+inputs:
+  # Note: paths are relative to the --directory handed to snakemake
+  - name: test-local-compressed-asia-sequences
+    metadata: ../local-inputs-compressed/data/asia_metadata.tsv.xz
+    sequences: ../local-inputs-compressed/data/asia_sequences.fasta.xz
+  - name: test-local-compressed-europe-aligned
+    metadata: ../local-inputs-compressed/data/europe_metadata.tsv.xz
+    aligned: ../local-inputs-compressed/data/europe_aligned.fasta.xz
+  - name: test-local-compressed-oceania-masked
+    metadata: ../local-inputs-compressed/data/oceania_metadata.tsv.xz
+    masked: ../local-inputs-compressed/data/oceania_masked.fasta.xz
+  - name: test-local-compressed-americas-filtered
+    metadata: ../local-inputs-compressed/data/americas_metadata.tsv.xz
+    filtered: ../local-inputs-compressed/data/americas_filtered.fasta.xz
+
+builds:
+  test-local-compressed:
+    subsampling_scheme: small
+
+subsampling:
+  small:
+    small-sample:
+      group_by: "region"
+      max_sequences: 100
diff --git a/tests/local-inputs-compressed/config.yaml b/tests/local-inputs-compressed/config.yaml
@@ -0,0 +1,12 @@
+configfile:
+  - defaults/parameters.yaml
+  - tests/local-inputs-compressed/builds.yaml
+
+# Set the maximum number of cores you want Snakemake to use for this pipeline.
+cores: 2
+
+# Always print the commands that will be run to the screen for debugging.
+printshellcmds: True
+
+# Print log files of failed jobs
+show-failed-logs: True
diff --git a/tests/local-inputs-compressed/data/.gitignore b/tests/local-inputs-compressed/data/.gitignore
@@ -0,0 +1,2 @@
+*.fasta
+*.tsv
diff --git a/tests/local-inputs-compressed/data/americas_filtered.fasta.xz b/tests/local-inputs-compressed/data/americas_filtered.fasta.xz
diff --git a/tests/local-inputs-compressed/data/americas_metadata.tsv.xz b/tests/local-inputs-compressed/data/americas_metadata.tsv.xz
diff --git a/tests/local-inputs-compressed/data/asia_metadata.tsv.xz b/tests/local-inputs-compressed/data/asia_metadata.tsv.xz
diff --git a/tests/local-inputs-compressed/data/asia_sequences.fasta.xz b/tests/local-inputs-compressed/data/asia_sequences.fasta.xz
diff --git a/tests/local-inputs-compressed/data/europe_aligned.fasta.xz b/tests/local-inputs-compressed/data/europe_aligned.fasta.xz
diff --git a/tests/local-inputs-compressed/data/europe_metadata.tsv.xz b/tests/local-inputs-compressed/data/europe_metadata.tsv.xz
diff --git a/tests/local-inputs-compressed/data/oceania_masked.fasta.xz b/tests/local-inputs-compressed/data/oceania_masked.fasta.xz
diff --git a/tests/local-inputs-compressed/data/oceania_metadata.tsv.xz b/tests/local-inputs-compressed/data/oceania_metadata.tsv.xz
diff --git a/tests/local-inputs-uncompressed/builds.yaml b/tests/local-inputs-uncompressed/builds.yaml
@@ -0,0 +1,24 @@
+inputs:
+  # Note: paths are relative to the --directory handed to snakemake
+  - name: test-local-uncompressed-asia-sequences
+    metadata: ../local-inputs-uncompressed/data/asia_metadata.tsv
+    sequences: ../local-inputs-uncompressed/data/asia_sequences.fasta
+  - name: test-local-uncompressed-europe-aligned
+    metadata: ../local-inputs-uncompressed/data/europe_metadata.tsv
+    aligned: ../local-inputs-uncompressed/data/europe_aligned.fasta
+  - name: test-local-uncompressed-oceania-masked
+    metadata: ../local-inputs-uncompressed/data/oceania_metadata.tsv
+    masked: ../local-inputs-uncompressed/data/oceania_masked.fasta
+  - name: test-local-uncompressed-americas-filtered
+    metadata: ../local-inputs-uncompressed/data/americas_metadata.tsv
+    filtered: ../local-inputs-uncompressed/data/americas_filtered.fasta
+
+builds:
+  test-local-uncompressed:
+    subsampling_scheme: small
+
+subsampling:
+  small:
+    small-sample:
+      group_by: "region"
+      max_sequences: 100
diff --git a/tests/local-inputs-uncompressed/config.yaml b/tests/local-inputs-uncompressed/config.yaml
@@ -0,0 +1,12 @@
+configfile:
+  - defaults/parameters.yaml
+  - tests/local-inputs-uncompressed/builds.yaml
+
+# Set the maximum number of cores you want Snakemake to use for this pipeline.
+cores: 2
+
+# Always print the commands that will be run to the screen for debugging.
+printshellcmds: True
+
+# Print log files of failed jobs
+show-failed-logs: True
diff --git a/tests/local-inputs-uncompressed/data/.gitignore b/tests/local-inputs-uncompressed/data/.gitignore
@@ -0,0 +1,2 @@
+*.fasta
+*.tsv
diff --git a/tests/remote-inputs-compressed/builds.yaml b/tests/remote-inputs-compressed/builds.yaml
@@ -0,0 +1,23 @@
+inputs:
+  - name: test-remote-compressed-asia-sequences
+    metadata: s3://nextstrain-data/files/ncov/test-data/asia_metadata.tsv.xz
+    sequences: s3://nextstrain-data/files/ncov/test-data/asia_sequences.fasta.xz
+  - name: test-remote-compressed-europe-aligned
+    metadata: s3://nextstrain-data/files/ncov/test-data/europe_metadata.tsv.xz
+    aligned: s3://nextstrain-data/files/ncov/test-data/europe_aligned.fasta.xz
+  - name: test-remote-compressed-oceania-masked
+    metadata: s3://nextstrain-data/files/ncov/test-data/oceania_metadata.tsv.xz
+    masked: s3://nextstrain-data/files/ncov/test-data/oceania_masked.fasta.xz
+  - name: test-remote-compressed-americas-filtered
+    metadata: s3://nextstrain-data/files/ncov/test-data/americas_metadata.tsv.xz
+    filtered: s3://nextstrain-data/files/ncov/test-data/americas_filtered.fasta.xz
+
+builds:
+  test-remote-compressed:
+    subsampling_scheme: small
+
+subsampling:
+  small:
+    small-sample:
+      group_by: "region"
+      max_sequences: 100
diff --git a/tests/remote-inputs-compressed/config.yaml b/tests/remote-inputs-compressed/config.yaml
@@ -0,0 +1,12 @@
+configfile:
+  - defaults/parameters.yaml
+  - tests/remote-inputs-compressed/builds.yaml
+
+# Set the maximum number of cores you want Snakemake to use for this pipeline.
+cores: 2
+
+# Always print the commands that will be run to the screen for debugging.
+printshellcmds: True
+
+# Print log files of failed jobs
+show-failed-logs: True
diff --git a/tests/remote-inputs-uncompressed/builds.yaml b/tests/remote-inputs-uncompressed/builds.yaml
@@ -0,0 +1,31 @@
+inputs:
+  # NOTE: there is no input defining an uncompressed `sequences` address
+  # as this pipeline only supports compressed sequence input for remote addresses
+  - name: test-remote-uncompressed-europe-aligned
+    metadata: s3://nextstrain-data/files/ncov/test-data/europe_metadata.tsv
+    aligned: s3://nextstrain-data/files/ncov/test-data/europe_aligned.fasta
+  - name: test-remote-uncompressed-oceania-masked
+    metadata: s3://nextstrain-data/files/ncov/test-data/oceania_metadata.tsv
+    masked: s3://nextstrain-data/files/ncov/test-data/oceania_masked.fasta
+  - name: test-remote-uncompressed-americas-filtered
+    metadata: s3://nextstrain-data/files/ncov/test-data/americas_metadata.tsv
+    filtered: s3://nextstrain-data/files/ncov/test-data/americas_filtered.fasta
+  - name: reference
+    metadata: data/references_metadata.tsv
+    sequences: data/references_sequences.fasta
+
+# As we are not including the test data from Asia (see above), this build will
+# be missing the default root sequence. We instead use
+# `data/references_sequences.fasta` (Wuhan/WH01/2019)
+refine:
+  root: "Wuhan/WH01/2019"
+
+builds:
+  test-remote-uncompressed:
+    subsampling_scheme: small
+
+subsampling:
+  small:
+    small-sample:
+      group_by: "region"
+      max_sequences: 100
diff --git a/tests/remote-inputs-uncompressed/config.yaml b/tests/remote-inputs-uncompressed/config.yaml
@@ -0,0 +1,12 @@
+configfile:
+  - defaults/parameters.yaml
+  - tests/remote-inputs-uncompressed/builds.yaml
+
+# Set the maximum number of cores you want Snakemake to use for this pipeline.
+cores: 2
+
+# Always print the commands that will be run to the screen for debugging.
+printshellcmds: True
+
+# Print log files of failed jobs
+show-failed-logs: True