From b69e80d35501beda2f29cbf635f9e4768751975f Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Thu, 9 Nov 2023 12:00:43 +0100 Subject: [PATCH] feat: update tx support to latest cdot with MANE label transfer (#245) (#248) --- Cargo.lock | 367 +++++-- Cargo.toml | 25 +- README.md | 5 +- build.rs | 12 +- docs/db_build.md | 4 +- src/annotate/seqvars/ann.rs | 35 +- src/annotate/seqvars/csq.rs | 360 ++++++- src/annotate/seqvars/mod.rs | 50 +- src/annotate/seqvars/provider.rs | 191 +++- ...snv_brca1_one_variant@17-41196309-G-C.snap | 16 +- ...snv_brca1_one_variant@17-41196310-G-C.snap | 16 +- ...snv_brca1_one_variant@17-41196311-G-C.snap | 16 +- ...snv_brca1_one_variant@17-41196312-G-C.snap | 16 +- ...snv_brca1_one_variant@17-41196313-G-C.snap | 16 +- ...snv_brca1_one_variant@17-41197701-G-C.snap | 16 +- ...snv_brca1_one_variant@17-41197818-G-C.snap | 16 +- ...snv_brca1_one_variant@17-41197819-G-C.snap | 16 +- ...snv_brca1_one_variant@17-41197820-G-C.snap | 16 +- ...snv_brca1_one_variant@17-41197821-G-C.snap | 16 +- ...snv_brca1_one_variant@17-41197822-G-C.snap | 16 +- ...snv_brca1_one_variant@17-41197823-G-C.snap | 16 +- ...snv_brca1_one_variant@17-41277379-A-C.snap | 16 +- ...snv_brca1_one_variant@17-41277380-G-C.snap | 16 +- ...snv_brca1_one_variant@17-41277381-G-T.snap | 16 +- ...snv_brca1_one_variant@17-41277382-G-C.snap | 16 +- ...snv_brca1_one_variant@17-41277383-A-C.snap | 16 +- ...snv_brca1_one_variant@17-41277384-G-C.snap | 16 +- ...reporting@17-41197701-G-C-false-false.snap | 36 + ..._reporting@17-41197701-G-C-false-true.snap | 154 +++ ..._reporting@17-41197701-G-C-true-false.snap | 36 + ...g_reporting@17-41197701-G-C-true-true.snap | 36 + ...reporting@2-179393094-C-T-false-false.snap | 36 + ..._reporting@2-179393094-C-T-false-true.snap | 186 ++++ ..._reporting@2-179393094-C-T-true-false.snap | 36 + ...reporting@2-179631246-G-A-false-false.snap | 36 + ..._reporting@2-179631246-G-A-false-true.snap | 217 ++++ ..._reporting@2-179631246-G-A-true-false.snap | 
36 + ...g_reporting@2-179631246-G-A-true-true.snap | 67 ++ src/annotate/strucvars/csq.rs | 7 +- src/annotate/strucvars/mod.rs | 22 +- ...trucvars__test__guess_sv_caller_delly.snap | 6 +- ...ars__test__guess_sv_caller_dragen_cnv.snap | 6 +- ...vars__test__guess_sv_caller_dragen_sv.snap | 6 +- ...strucvars__test__guess_sv_caller_gcnv.snap | 6 +- ...trucvars__test__guess_sv_caller_manta.snap | 6 +- ...strucvars__test__guess_sv_caller_melt.snap | 6 +- ...rucvars__test__guess_sv_caller_popdel.snap | 6 +- src/db/create/mod.rs | 959 +++++++++++++++++- ...ate__test__filter_transcripts_brca1-2.snap | 10 + ...ate__test__filter_transcripts_brca1-3.snap | 5 + ...eate__test__filter_transcripts_brca1.snap} | 4 +- src/db/create/txs/mod.rs | 868 ---------------- ...txs__test__filter_transcripts_brca1-2.snap | 10 - ...txs__test__filter_transcripts_brca1-3.snap | 5 - src/db/dump/mod.rs | 27 + src/db/mod.rs | 1 + src/main.rs | 24 +- src/{db/create/txs => proto}/data.proto3 | 6 +- src/server/actix_server/seqvars_csq.rs | 13 +- src/server/mod.rs | 6 +- src/verify/seqvars.rs | 30 +- tests/data/annotate/db/grch37/bootstrap.sh | 4 +- tests/data/annotate/db/grch37/tx-mane.tsv | 3 + tests/data/annotate/db/grch37/txs.bin.zst | 4 +- .../annotate/db/grch37/txs.bin.zst.report | 4 +- tests/data/annotate/vars/postproc-snpeff.sh | 2 +- tests/data/db/create/txs/bootstrap.py | 3 - tests/data/db/create/txs/bootstrap.sh | 2 +- tests/data/db/create/txs/brca1.fasta | 4 +- ....2.21.refseq.grch37_grch38.brca1_opa1.json | 3 + .../data/db/create/txs/latest/aliases.sqlite3 | 4 +- .../2023/0327/0944/1679910246.7746782.fa.bgz | 3 - .../0327/0944/1679910246.7746782.fa.bgz.fai | 3 - .../0327/0944/1679910246.7746782.fa.bgz.gzi | 3 - .../2023/0327/0944/1679910247.3800669.fa.bgz | 3 - .../0327/0944/1679910247.3800669.fa.bgz.fai | 3 - .../0327/0944/1679910247.3800669.fa.bgz.gzi | 3 - .../2023/1108/0919/1699435149.8201442.fa.bgz | 3 + .../1108/0919/1699435149.8201442.fa.bgz.fai | 3 + 
.../1108/0919/1699435149.8201442.fa.bgz.gzi | 3 + .../2023/1108/0919/1699435152.474355.fa.bgz | 3 + .../1108/0919/1699435152.474355.fa.bgz.fai | 3 + .../1108/0919/1699435152.474355.fa.bgz.gzi | 3 + .../db/create/txs/latest/sequences/db.sqlite3 | 4 +- tests/data/db/create/txs/opa1.fasta | 4 +- tests/data/db/create/txs/txs_main.tsv | 3 + 86 files changed, 3053 insertions(+), 1279 deletions(-) create mode 100644 src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_transcript_picking_reporting@17-41197701-G-C-false-false.snap create mode 100644 src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_transcript_picking_reporting@17-41197701-G-C-false-true.snap create mode 100644 src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_transcript_picking_reporting@17-41197701-G-C-true-false.snap create mode 100644 src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_transcript_picking_reporting@17-41197701-G-C-true-true.snap create mode 100644 src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179393094-C-T-false-false.snap create mode 100644 src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179393094-C-T-false-true.snap create mode 100644 src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179393094-C-T-true-false.snap create mode 100644 src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179631246-G-A-false-false.snap create mode 100644 src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179631246-G-A-false-true.snap create mode 100644 
src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179631246-G-A-true-false.snap create mode 100644 src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179631246-G-A-true-true.snap create mode 100644 src/db/create/snapshots/mehari__db__create__test__filter_transcripts_brca1-2.snap create mode 100644 src/db/create/snapshots/mehari__db__create__test__filter_transcripts_brca1-3.snap rename src/db/create/{txs/snapshots/mehari__db__create__txs__test__filter_transcripts_brca1.snap => snapshots/mehari__db__create__test__filter_transcripts_brca1.snap} (50%) delete mode 100644 src/db/create/txs/mod.rs delete mode 100644 src/db/create/txs/snapshots/mehari__db__create__txs__test__filter_transcripts_brca1-2.snap delete mode 100644 src/db/create/txs/snapshots/mehari__db__create__txs__test__filter_transcripts_brca1-3.snap create mode 100644 src/db/dump/mod.rs rename src/{db/create/txs => proto}/data.proto3 (97%) create mode 100644 tests/data/annotate/db/grch37/tx-mane.tsv delete mode 100644 tests/data/db/create/txs/bootstrap.py create mode 100644 tests/data/db/create/txs/cdot-0.2.21.refseq.grch37_grch38.brca1_opa1.json delete mode 100644 tests/data/db/create/txs/latest/sequences/2023/0327/0944/1679910246.7746782.fa.bgz delete mode 100644 tests/data/db/create/txs/latest/sequences/2023/0327/0944/1679910246.7746782.fa.bgz.fai delete mode 100644 tests/data/db/create/txs/latest/sequences/2023/0327/0944/1679910246.7746782.fa.bgz.gzi delete mode 100644 tests/data/db/create/txs/latest/sequences/2023/0327/0944/1679910247.3800669.fa.bgz delete mode 100644 tests/data/db/create/txs/latest/sequences/2023/0327/0944/1679910247.3800669.fa.bgz.fai delete mode 100644 tests/data/db/create/txs/latest/sequences/2023/0327/0944/1679910247.3800669.fa.bgz.gzi create mode 100644 tests/data/db/create/txs/latest/sequences/2023/1108/0919/1699435149.8201442.fa.bgz create mode 
100644 tests/data/db/create/txs/latest/sequences/2023/1108/0919/1699435149.8201442.fa.bgz.fai create mode 100644 tests/data/db/create/txs/latest/sequences/2023/1108/0919/1699435149.8201442.fa.bgz.gzi create mode 100644 tests/data/db/create/txs/latest/sequences/2023/1108/0919/1699435152.474355.fa.bgz create mode 100644 tests/data/db/create/txs/latest/sequences/2023/1108/0919/1699435152.474355.fa.bgz.fai create mode 100644 tests/data/db/create/txs/latest/sequences/2023/1108/0919/1699435152.474355.fa.bgz.gzi create mode 100644 tests/data/db/create/txs/txs_main.tsv diff --git a/Cargo.lock b/Cargo.lock index 9b23ade8..9cfec00a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -65,7 +65,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e01ed3140b2f8d422c68afa1ed2e85d996ea619c988ac834d255db32138655cb" dependencies = [ "quote", - "syn 2.0.38", + "syn 2.0.39", ] [[package]] @@ -178,7 +178,7 @@ dependencies = [ "actix-router", "proc-macro2", "quote", - "syn 2.0.38", + "syn 2.0.39", ] [[package]] @@ -198,9 +198,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "ahash" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd7d5a2cecb58716e47d67d5703a249964b14c7be1ec3cad3affc295b2d1c35d" +checksum = "91429305e9f0a25f6205c5b8e0d2db09e0708a7a6df0f42212bb56c32c8ac97a" dependencies = [ "cfg-if", "getrandom", @@ -256,9 +256,9 @@ dependencies = [ [[package]] name = "annonars" -version = "0.24.4" +version = "0.24.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0433a67ac720f710267253606ae2470969ea29d0e0c712835744483c967225ed" +checksum = "52c5a439b949bb0571edac27895435b3f23371c80faa4b1cb3e51f20764035f3" dependencies = [ "actix-web", "anyhow", @@ -270,7 +270,7 @@ dependencies = [ "clap", "clap-verbosity-flag", "csv", - "enum-map 2.7.0", + "enum-map 2.7.1", "env_logger", "flate2", "indexmap 2.1.0", @@ -436,7 +436,7 @@ 
dependencies = [ "log", "parking", "polling", - "rustix 0.37.26", + "rustix 0.37.27", "slab", "socket2 0.4.10", "waker-fn", @@ -492,7 +492,7 @@ checksum = "a66537f1bb974b254c98ed142ff995236e81b9d0fe4db0575f46612cb15eb0f9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.38", + "syn 2.0.39", ] [[package]] @@ -564,7 +564,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.38", + "syn 2.0.39", ] [[package]] @@ -626,7 +626,7 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "82c91bd0b62cec4b6317445506ed4947a53c320942a681c5a086f4d576aa4c99" dependencies = [ - "enum-map 2.7.0", + "enum-map 2.7.1", "flate2", "indexmap 2.1.0", "lazy_static", @@ -705,9 +705,9 @@ dependencies = [ [[package]] name = "brotli-decompressor" -version = "2.5.0" +version = "2.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da74e2b81409b1b743f8f0c62cc6254afefb8b8e50bbfe3735550f7aeefa3448" +checksum = "4e2e4afe60d7dd600fdd3de8d0f08c2b7ec039712e3b6137ff98b7004e82de4f" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -741,9 +741,9 @@ dependencies = [ [[package]] name = "bytecount" -version = "0.6.5" +version = "0.6.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1a12477b7237a01c11a80a51278165f9ba0edd28fa6db00a65ab230320dc58c" +checksum = "e1e5f035d16fc623ae5f74981db80a439803888314e3a555fd6f04acd51a3205" [[package]] name = "bytemuck" @@ -765,9 +765,9 @@ checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" [[package]] name = "bytestring" -version = "1.3.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "238e4886760d98c4f899360c834fa93e62cf7f721ac3c2da375cbdf4b8679aae" +checksum = "74d80203ea6b29df88012294f62733de21cfeab47f17b41af3a38bc30a03ee72" dependencies = [ "bytes", ] @@ -875,7 +875,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.38", + "syn 2.0.39", ] [[package]] @@ -937,9 
+937,9 @@ checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" [[package]] name = "cpufeatures" -version = "0.2.10" +version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fbc60abd742b35f2492f808e1abbb83d45f72db402e14c55057edc9c7b1e9e4" +checksum = "ce420fe07aecd3e67c5f910618fe65e94158f6dcc0adf44e00d69ce2bdfe0fd0" dependencies = [ "libc", ] @@ -1033,14 +1033,38 @@ version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef8ae57c4978a2acd8b869ce6b9ca1dfe817bff704c220209fdef2c0b75a01b9" +[[package]] +name = "darling" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850" +dependencies = [ + "darling_core 0.14.4", + "darling_macro 0.14.4", +] + [[package]] name = "darling" version = "0.20.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0209d94da627ab5605dcccf08bb18afa5009cfbef48d8a8b7d7bdbc79be25c5e" dependencies = [ - "darling_core", - "darling_macro", + "darling_core 0.20.3", + "darling_macro 0.20.3", +] + +[[package]] +name = "darling_core" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 1.0.109", ] [[package]] @@ -1054,7 +1078,18 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.38", + "syn 2.0.39", +] + +[[package]] +name = "darling_macro" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e" +dependencies = [ + "darling_core 0.14.4", + "quote", + "syn 1.0.109", ] [[package]] @@ -1063,9 +1098,9 @@ version = "0.20.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"836a9bbc7ad63342d6d6e7b815ccab164bc77a2d95d84bc3117a8c0d5c98e2d5" dependencies = [ - "darling_core", + "darling_core 0.20.3", "quote", - "syn 2.0.38", + "syn 2.0.39", ] [[package]] @@ -1100,6 +1135,37 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "derive_builder" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d67778784b508018359cbc8696edb3db78160bab2c2a28ba7f56ef6932997f8" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c11bdc11a0c47bc7d37d582b5285da6849c96681023680b906673c5707af7b0f" +dependencies = [ + "darling 0.14.4", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "derive_builder_macro" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebcda35c7a396850a55ffeac740804b40ffec779b98fffbb1738f4033f0ee79e" +dependencies = [ + "derive_builder_core", + "syn 1.0.109", +] + [[package]] name = "derive_more" version = "0.99.17" @@ -1168,11 +1234,11 @@ dependencies = [ [[package]] name = "enum-map" -version = "2.7.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53337c2dbf26a3c31eccc73a37b10c1614e8d4ae99b6a50d553e8936423c1f16" +checksum = "ed40247825a1a0393b91b51d475ea1063a6cbbf0847592e7f13fb427aca6a716" dependencies = [ - "enum-map-derive 0.14.0", + "enum-map-derive 0.15.0", "serde", ] @@ -1189,13 +1255,13 @@ dependencies = [ [[package]] name = "enum-map-derive" -version = "0.14.0" +version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04d0b288e3bb1d861c4403c1774a6f7a798781dfc519b3647df2a3dd4ae95f25" +checksum = "7933cd46e720348d29ed1493f89df9792563f272f96d8f13d18afe03b32f8cb8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.38", + "syn 2.0.39", ] [[package]] @@ -1219,9 +1285,9 @@ checksum = 
"5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.5" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860" +checksum = "7c18ee0ed65a5f1f81cac6b1d213b69c35fa47d4252ad41f1486dbd8226fe36e" dependencies = [ "libc", "windows-sys 0.48.0", @@ -1374,7 +1440,7 @@ checksum = "53b153fd91e4b0147f4aced87be237c98248656bb01050b96bf3ee89220a8ddb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.38", + "syn 2.0.39", ] [[package]] @@ -1434,9 +1500,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.10" +version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +checksum = "fe9006bed769170c11f845cf00c7c1e9092aeb3f268e007c3e760ac68008070f" dependencies = [ "cfg-if", "libc", @@ -1543,15 +1609,15 @@ checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" [[package]] name = "hgvs" -version = "0.12.0" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd1c9bcd21975b12b0d6f4db48882cb3668dffc3dbe2820b9ae9d642f1a9e273" +checksum = "87482326c27e244841df05d4aff8478987a9838200d7bbf696e4d0290caa7a99" dependencies = [ "base16ct", "bio", "biocommons-bioutils", "chrono", - "enum-map 2.7.0", + "enum-map 2.7.1", "flate2", "indexmap 2.1.0", "lazy_static", @@ -1562,7 +1628,7 @@ dependencies = [ "quick_cache", "regex", "rustc-hash", - "seqrepo", + "seqrepo 0.8.0", "serde", "serde_json", "thiserror", @@ -1776,9 +1842,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.64" +version = "0.3.65" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" +checksum = "54c0c35952f67de54bb584e9fd912b3023117cbafc0a77d8f3dee1fb5f572fe8" dependencies = [ 
"wasm-bindgen", ] @@ -1823,9 +1889,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.149" +version = "0.2.150" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" +checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" [[package]] name = "libloading" @@ -1894,15 +1960,15 @@ checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" [[package]] name = "linux-raw-sys" -version = "0.4.10" +version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f" +checksum = "969488b55f8ac402214f3f5fd243ebb7206cf82de60d3172994707a4bcc2b829" [[package]] name = "local-channel" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0a493488de5f18c8ffcba89eebb8532ffc562dc400490eb65b84893fae0b178" +checksum = "b6cbc85e69b8df4b8bb8b89ec634e7189099cea8927a276b7384ce5488e53ec8" dependencies = [ "futures-core", "futures-sink", @@ -1911,9 +1977,9 @@ dependencies = [ [[package]] name = "local-waker" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e34f76eb3611940e0e7d53a9aaa4e6a3151f69541a282fd0dad5571420c53ff1" +checksum = "4d873d7c67ce09b42110d801813efbc9364414e356be9935700d368351657487" [[package]] name = "lock_api" @@ -1944,6 +2010,15 @@ dependencies = [ "libc", ] +[[package]] +name = "matchers" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558" +dependencies = [ + "regex-automata 0.1.10", +] + [[package]] name = "matrixmultiply" version = "0.3.8" @@ -1983,6 +2058,7 @@ dependencies = [ "clap-verbosity-flag", "csv", "derivative", + 
"derive_builder", "env_logger", "flate2", "futures", @@ -2012,10 +2088,11 @@ dependencies = [ "rocksdb", "rstest", "rustc-hash", - "seqrepo", + "seqrepo 0.9.0", "serde", "serde_json", "serde_with", + "serde_yaml", "strum", "temp_testdir", "tempfile", @@ -2023,6 +2100,7 @@ dependencies = [ "tokio", "tracing", "tracing-subscriber", + "tracing-test", "uuid", "zstd 0.13.0", ] @@ -2190,9 +2268,9 @@ checksum = "94fbe3192fe33acacabaedd387657f39b0fc606f1996d546db0dfe14703b843a" [[package]] name = "noodles-csi" -version = "0.25.1" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39d86e55b4784ba7c38b4ffbfc24e122bc05ce971b0a664e8e1a15ffd9de68a7" +checksum = "e0531175d5473e6057c1724c1242b19bfc42dba644fe275b4df89c5b8d31a782" dependencies = [ "bit-vec", "byteorder", @@ -2227,12 +2305,13 @@ dependencies = [ [[package]] name = "noodles-tabix" -version = "0.31.0" +version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d12d6981ba752798cb03abc9604324ff6cbc1e5354252f95c372545d967a6daf" +checksum = "415e319f97784c110a85756a8747bf26e9a18bf321b113d00984ca1af7a6fef9" dependencies = [ "bit-vec", "byteorder", + "indexmap 2.1.0", "noodles-bgzf 0.25.0", "noodles-core", "noodles-csi", @@ -2241,9 +2320,9 @@ dependencies = [ [[package]] name = "noodles-vcf" -version = "0.43.0" +version = "0.45.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f13f0225114584f7ab147f146532dccc584e810006a4dfb275544957e7722884" +checksum = "6000206d1ec762d3c983fb189edde86a22c969de3bb90e4048c4916c492babfa" dependencies = [ "futures", "indexmap 2.1.0", @@ -2404,7 +2483,7 @@ dependencies = [ "regex", "regex-syntax 0.7.5", "structmeta", - "syn 2.0.38", + "syn 2.0.39", ] [[package]] @@ -2500,9 +2579,9 @@ dependencies = [ [[package]] name = "portable-atomic" -version = "1.5.0" +version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b559898e0b4931ed2d3b959ab0c2da4d99cc644c4b0b1a35b4d344027f474023" +checksum = "3bccab0e7fd7cc19f820a1c8c91720af652d0c88dc9664dd72aef2614f04af3b" [[package]] name = "postgres" @@ -2577,7 +2656,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ae005bd773ab59b4725093fd7df83fd7892f7d8eafb48dbd7de6e024e4215f9d" dependencies = [ "proc-macro2", - "syn 2.0.38", + "syn 2.0.39", ] [[package]] @@ -2666,7 +2745,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.38", + "syn 2.0.39", "tempfile", "which", ] @@ -2681,7 +2760,7 @@ dependencies = [ "itertools", "proc-macro2", "quote", - "syn 2.0.38", + "syn 2.0.39", ] [[package]] @@ -2797,10 +2876,19 @@ checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" dependencies = [ "aho-corasick", "memchr", - "regex-automata", + "regex-automata 0.4.3", "regex-syntax 0.8.2", ] +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" +dependencies = [ + "regex-syntax 0.6.29", +] + [[package]] name = "regex-automata" version = "0.4.3" @@ -2812,6 +2900,12 @@ dependencies = [ "regex-syntax 0.8.2", ] +[[package]] +name = "regex-syntax" +version = "0.6.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" + [[package]] name = "regex-syntax" version = "0.7.5" @@ -2876,7 +2970,7 @@ dependencies = [ "regex", "relative-path", "rustc_version 0.4.0", - "syn 2.0.38", + "syn 2.0.39", "unicode-ident", ] @@ -2926,9 +3020,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.37.26" +version = "0.37.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84f3f8f960ed3b5a59055428714943298bf3fa2d4a1d53135084e0544829d995" +checksum = "fea8ca367a3a01fe35e6943c400addf443c0f57670e6ec51196f71a4b8762dd2" dependencies = [ 
"bitflags 1.3.2", "errno", @@ -2947,7 +3041,7 @@ dependencies = [ "bitflags 2.4.1", "errno", "libc", - "linux-raw-sys 0.4.10", + "linux-raw-sys 0.4.11", "windows-sys 0.48.0", ] @@ -3005,24 +3099,39 @@ dependencies = [ "tracing", ] +[[package]] +name = "seqrepo" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2952aee50ad05e57127f0857aa9553bbce29aea59817da2127c34ce09f6dd65" +dependencies = [ + "chrono", + "noodles-bgzf 0.25.0", + "noodles-core", + "noodles-fasta 0.30.0", + "rusqlite", + "thiserror", + "tracing", +] + [[package]] name = "serde" -version = "1.0.191" +version = "1.0.192" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a834c4821019838224821468552240d4d95d14e751986442c816572d39a080c9" +checksum = "bca2a08484b285dcb282d0f67b26cadc0df8b19f8c12502c13d966bf9482f001" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.191" +version = "1.0.192" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46fa52d5646bce91b680189fe5b1c049d2ea38dabb4e2e7c8d00ca12cfbfbcfd" +checksum = "d6c7207fbec9faa48073f3e3074cbe553af6ea512d7c21ba46e434e70ea9fbc1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.38", + "syn 2.0.39", ] [[package]] @@ -3072,10 +3181,23 @@ version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93634eb5f75a2323b16de4748022ac4297f9e76b6dced2be287a099f41b5e788" dependencies = [ - "darling", + "darling 0.20.3", "proc-macro2", "quote", - "syn 2.0.38", + "syn 2.0.39", +] + +[[package]] +name = "serde_yaml" +version = "0.9.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3cc7a1570e38322cfe4154732e5110f887ea57e22b76f4bfd32b5bdd3368666c" +dependencies = [ + "indexmap 2.1.0", + "itoa", + "ryu", + "serde", + "unsafe-libyaml", ] [[package]] @@ -3223,7 +3345,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta-derive", - "syn 2.0.38", + "syn 2.0.39", ] 
[[package]] @@ -3234,7 +3356,7 @@ checksum = "a60bcaff7397072dca0017d1db428e30d5002e00b6847703e2e42005c95fbe00" dependencies = [ "proc-macro2", "quote", - "syn 2.0.38", + "syn 2.0.39", ] [[package]] @@ -3256,7 +3378,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.38", + "syn 2.0.39", ] [[package]] @@ -3278,9 +3400,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.38" +version = "2.0.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b" +checksum = "23e78b90f2fcf45d3e842032ce32e3f2d1545ba6636271dcbf24fa306d87be7a" dependencies = [ "proc-macro2", "quote", @@ -3332,7 +3454,7 @@ checksum = "266b2e40bc00e5a6c09c3584011e08b06f123c00362c92b975ba9843aaaa14b8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.38", + "syn 2.0.39", ] [[package]] @@ -3422,7 +3544,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.38", + "syn 2.0.39", ] [[package]] @@ -3453,9 +3575,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.9" +version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d68074620f57a0b21594d9735eb2e98ab38b17f80d3fcb189fca266771ca60d" +checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15" dependencies = [ "bytes", "futures-core", @@ -3485,7 +3607,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.38", + "syn 2.0.39", ] [[package]] @@ -3515,14 +3637,41 @@ version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77" dependencies = [ + "matchers", "nu-ansi-term", + "once_cell", + "regex", "sharded-slab", "smallvec", "thread_local", + "tracing", "tracing-core", "tracing-log", ] +[[package]] +name = 
"tracing-test" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a2c0ff408fe918a94c428a3f2ad04e4afd5c95bbc08fcf868eff750c15728a4" +dependencies = [ + "lazy_static", + "tracing-core", + "tracing-subscriber", + "tracing-test-macro", +] + +[[package]] +name = "tracing-test-macro" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "258bc1c4f8e2e73a977812ab339d503e6feeb92700f6d07a6de4d321522d5c08" +dependencies = [ + "lazy_static", + "quote", + "syn 1.0.109", +] + [[package]] name = "triple_accel" version = "0.4.0" @@ -3562,6 +3711,12 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" +[[package]] +name = "unsafe-libyaml" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28467d3e1d3c6586d8f25fa243f544f5800fec42d97032474e17222c2b75cfa" + [[package]] name = "url" version = "2.4.1" @@ -3643,9 +3798,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.87" +version = "0.2.88" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" +checksum = "7daec296f25a1bae309c0cd5c29c4b260e510e6d813c286b19eaadf409d40fce" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -3653,24 +3808,24 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.87" +version = "0.2.88" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" +checksum = "e397f4664c0e4e428e8313a469aaa58310d302159845980fd23b0f22a847f217" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn 2.0.38", + "syn 2.0.39", "wasm-bindgen-shared", ] [[package]] name = 
"wasm-bindgen-futures" -version = "0.4.37" +version = "0.4.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c02dbc21516f9f1f04f187958890d7e6026df8d16540b7ad9492bc34a67cea03" +checksum = "9afec9963e3d0994cac82455b2b3502b81a7f40f9a0d32181f7528d9f4b43e02" dependencies = [ "cfg-if", "js-sys", @@ -3680,9 +3835,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.87" +version = "0.2.88" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" +checksum = "5961017b3b08ad5f3fe39f1e79877f8ee7c23c5e5fd5eb80de95abc41f1f16b2" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -3690,28 +3845,28 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.87" +version = "0.2.88" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" +checksum = "c5353b8dab669f5e10f5bd76df26a9360c748f054f862ff5f3f8aae0c7fb3907" dependencies = [ "proc-macro2", "quote", - "syn 2.0.38", + "syn 2.0.39", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.87" +version = "0.2.88" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" +checksum = "0d046c5d029ba91a1ed14da14dca44b68bf2f124cfbaf741c54151fdb3e0750b" [[package]] name = "web-sys" -version = "0.3.64" +version = "0.3.65" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b" +checksum = "5db499c5f66323272151db0e666cd34f78617522fb0c1604d31a27c50c206a85" dependencies = [ "js-sys", "wasm-bindgen", @@ -3938,22 +4093,22 @@ checksum = "09041cd90cf85f7f8b2df60c646f853b7f535ce68f85244eb6731cf89fa498ec" [[package]] name = "zerocopy" -version = "0.7.11" +version = 
"0.7.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c19fae0c8a9efc6a8281f2e623db8af1db9e57852e04cde3e754dd2dc29340f" +checksum = "8cd369a67c0edfef15010f980c3cbe45d7f651deac2cd67ce097cd801de16557" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.7.11" +version = "0.7.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc56589e9ddd1f1c28d4b4b5c773ce232910a6bb67a70133d61c9e347585efe9" +checksum = "c2f140bda219a26ccc0cdb03dba58af72590c53b22642577d88a927bc5c87d6b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.38", + "syn 2.0.39", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 11d01304..d24a5af2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,17 +33,18 @@ async-compression = { version = "0.4", features = ["tokio", "gzip"] } bgzip = "0.3" bio = "1.3" biocommons-bioutils = "0.1.4" -byte-unit = "4.0" byteorder = "1.4" +byte-unit = "4.0" chrono = "0.4" -clap = { version = "4.4", features = ["derive"] } clap-verbosity-flag = "2.0" +clap = { version = "4.4", features = ["derive"] } csv = "1.3" derivative = "2.2" +derive_builder = { version = "0.12", features = ["clippy"] } env_logger = "0.10" flate2 = "1.0" -futures = "0.3.29" -hgvs = "0.12" +futures = "0.3" +hgvs = "0.13" indexmap = { version = "2.1", features = ["serde"] } indicatif = "0.17" jsonl = "4.0" @@ -52,10 +53,10 @@ log = "0.4" nom = "7.1" noodles-bgzf = { version = "0.25", features = ["async"] } noodles-core = "0.12" -noodles-csi = "0.25" +noodles-csi = "0.26" noodles-fasta = "0.30" -noodles-tabix = "0.31" -noodles-vcf = { version = "0.43", features = ["async"] } +noodles-tabix = "0.32" +noodles-vcf = { version = "0.45", features = ["async"] } parse-display = "0.8" procfs = "0.16" prost = "0.12" @@ -64,16 +65,17 @@ rand = "0.8" rand_core = "0.6" rocksdb = { version = "0.21", features = ["multi-threaded-cf"] } rustc-hash = "1.1" -seqrepo = "0.8" -serde = { version = "1.0", features = ["derive"] } 
+seqrepo = "0.9" serde_json = "1.0" +serde = { version = "1.0", features = ["derive"] } serde_with = { version = "3.3", features = ["indexmap_2"] } +serde_yaml = "0.9" strum = { version = "0.25", features = ["derive"] } tempfile = "3" thousands = "0.2" tokio = { version = "1.33", features = ["full"] } -tracing = { version = "0.1", features = ["log"] } tracing-subscriber = "0.3" +tracing = { version = "0.1", features = ["log"] } uuid = { version = "1.4", features = ["fast-rng", "serde"] } zstd = "0.13" @@ -83,8 +85,9 @@ prost-build = "0.12" [dev-dependencies] async-std = { version = "1.12", features = ["attributes"] } csv = "1.3" -hxdmp = "0.2.1" +hxdmp = "0.2" insta = { version = "1.34", features = ["yaml"] } pretty_assertions = "1.4" rstest = "0.18" temp_testdir = "0.2" +tracing-test = "0.2" diff --git a/README.md b/README.md index e4ba7a40..34ef8701 100644 --- a/README.md +++ b/README.md @@ -166,8 +166,9 @@ cargo run --release -- \ -v \ db create txs \ --path-out /tmp/txs-out.bin.zst \ - --path-cdot-json ../cdot-0.2.12.ensembl.grch37_grch38.json.gz \ - --path-cdot-json ../cdot-0.2.12.refseq.grch37_grch38.json.gz \ + --path-lable-tsv PATH_TO_MANE_LABEL.tsv \ + --path-cdot-json ../cdot-0.2.21.ensembl.grch37_grch38.json.gz \ + --path-cdot-json ../cdot-0.2.21.refseq.grch37_grch38.json.gz \ --path-seqrepo-instance ../hgvs-rs-data/seqrepo-data/master/master ``` diff --git a/build.rs b/build.rs index 7e96650c..e260bfc1 100644 --- a/build.rs +++ b/build.rs @@ -1,6 +1,14 @@ // The custom build script, needed as we use prost. fn main() { - println!("cargo:rerun-if-changed=src/db/create/txs/data.proto3"); - prost_build::compile_protos(&["src/db/create/txs/data.proto3"], &["src/"]).unwrap(); + println!("cargo:rerun-if-changed=src/proto/data.proto3"); + prost_build::Config::new() + .protoc_arg("-Isrc/proto") + // Add serde serialization and deserialization to the generated code. 
+ .type_attribute(".", "#[derive(serde::Serialize, serde::Deserialize)]") + // Skip serializing `None` values. + .type_attribute(".", "#[serde_with::skip_serializing_none]") + // Define the protobuf files to compile. + .compile_protos(&["src/proto/data.proto3"], &["src/"]) + .unwrap(); } diff --git a/docs/db_build.md b/docs/db_build.md index 50994be7..0aea41b1 100644 --- a/docs/db_build.md +++ b/docs/db_build.md @@ -164,8 +164,8 @@ $ mehari db create txs \ \ --path-seqrepo-instance path/to/seqrepo-data/master \ \ - --path-cdot-json cdot-0.2.12.refseq.grch37_grch38.json \ - --path-cdot-json cdot-0.2.12.ensembl.grch37_grch38.json \ + --path-cdot-json cdot-0.2.21.refseq.grch37_grch38.json \ + --path-cdot-json cdot-0.2.21.ensembl.grch37_grch38.json \ \ --path-seqrepo-instance path/to/seqrepo-data/master \ \ diff --git a/src/annotate/seqvars/ann.rs b/src/annotate/seqvars/ann.rs index 91b4762f..4290b7ee 100644 --- a/src/annotate/seqvars/ann.rs +++ b/src/annotate/seqvars/ann.rs @@ -361,16 +361,19 @@ impl FromStr for FeatureType { strum::EnumIter, )] pub enum FeatureBiotype { + /// Is coding transcript. Coding, + /// Is non-coding transcript. Noncoding, + /// Is in MANE Select set. + ManeSelect, + /// Is in MANE Plus Clinical set. + ManePlusClinical, } impl FeatureBiotype { pub fn is_coding(&self) -> bool { - match self { - FeatureBiotype::Coding => true, - FeatureBiotype::Noncoding => false, - } + matches!(self, FeatureBiotype::Coding) } } @@ -509,7 +512,7 @@ pub struct AnnField { /// The feature identifier. pub feature_id: String, /// The feature biotype. - pub feature_biotype: FeatureBiotype, + pub feature_biotype: Vec, /// The exon / intron rank. pub rank: Option, /// HGVS c. notation. 
@@ -542,7 +545,7 @@ impl Default for AnnField { term: SoFeature::Transcript, }, feature_id: Default::default(), - feature_biotype: FeatureBiotype::Coding, + feature_biotype: vec![FeatureBiotype::Coding], rank: Default::default(), hgvs_t: Default::default(), hgvs_p: Default::default(), @@ -572,7 +575,13 @@ impl FromStr for AnnField { let gene_id = fields.next().unwrap().to_string(); let feature_type = fields.next().unwrap().parse()?; let feature_id = fields.next().unwrap().to_string(); - let feature_biotype = fields.next().unwrap().parse()?; + let feature_biotype = fields + .next() + .unwrap() + .split('&') + .map(|s| s.parse()) + .collect::, _>>() + .map_err(|e| anyhow::anyhow!("could not parse feature biotype: {}", e))?; let rank = fields.next().unwrap(); let rank = if rank.is_empty() { None @@ -669,7 +678,15 @@ impl std::fmt::Display for AnnField { write!(f, "|")?; write!(f, "{}", self.feature_id)?; write!(f, "|")?; - write!(f, "{}", self.feature_biotype)?; + write!( + f, + "{}", + self.feature_biotype + .iter() + .map(|t| format!("{}", t)) + .collect::>() + .join("&") + )?; write!(f, "|")?; if let Some(rank) = &self.rank { write!(f, "{}", rank)?; @@ -1082,7 +1099,7 @@ mod test { term: SoFeature::Transcript, }, feature_id: String::from("feature_id"), - feature_biotype: FeatureBiotype::Coding, + feature_biotype: vec![FeatureBiotype::Coding], rank: Some(Rank { ord: 1, total: 2 }), hgvs_t: Some(String::from("HGVS.c")), hgvs_p: Some(String::from("HGVS.p")), diff --git a/src/annotate/seqvars/csq.rs b/src/annotate/seqvars/csq.rs index 84d3c4df..781a46aa 100644 --- a/src/annotate/seqvars/csq.rs +++ b/src/annotate/seqvars/csq.rs @@ -9,12 +9,13 @@ use hgvs::{ Accession, CdsFrom, GenomeInterval, GenomeLocEdit, HgvsVariant, Mu, NaEdit, ProtLocEdit, }, }; +use rustc_hash::FxHashMap; -use crate::db::create::txs::data::{Strand, TranscriptBiotype}; +use crate::db::create::data::{self, Strand, TranscriptBiotype}; use super::{ ann::{Allele, AnnField, Consequence, FeatureBiotype, 
FeatureType, Pos, Rank, SoFeature}, - provider::MehariProvider, + provider::Provider as MehariProvider, }; /// A variant description how VCF would do it. @@ -30,6 +31,21 @@ pub struct VcfVariant { pub alternative: String, } +/// Configuration for consequence prediction. +#[derive(Debug, Clone, derive_builder::Builder)] +#[builder(pattern = "immutable")] +pub struct Config { + /// Whether to report consequences for all picked transcripts. + #[builder(default = "true")] + pub report_all_transcripts: bool, +} + +impl Default for Config { + fn default() -> Self { + ConfigBuilder::default().build().unwrap() + } +} + /// Wrap mapper, provider, and map for consequence prediction. #[derive(derivative::Derivative)] #[derivative(Debug)] @@ -43,6 +59,9 @@ pub struct ConsequencePredictor { /// Mapping from chromosome name to accession. #[derivative(Debug = "ignore")] chrom_to_acc: HashMap, + /// Configuration for the predictor. + #[derivative(Debug = "ignore")] + config: Config, } /// Padding to look for genes upstream/downstream. 
@@ -51,8 +70,8 @@ pub const PADDING: i32 = 5_000; pub const ALT_ALN_METHOD: &str = "splign"; impl ConsequencePredictor { - pub fn new(provider: Arc, assembly: Assembly) -> Self { - let acc_to_chrom = provider.get_assembly_map(assembly); + pub fn new(provider: Arc, assembly: Assembly, config: Config) -> Self { + let acc_to_chrom: indexmap::IndexMap = provider.get_assembly_map(assembly); let mut chrom_to_acc = HashMap::new(); for (acc, chrom) in &acc_to_chrom { let chrom = if chrom.starts_with("chr") { @@ -64,22 +83,43 @@ impl ConsequencePredictor { chrom_to_acc.insert(format!("chr{}", chrom), acc.clone()); } - let config = assembly::Config { + let mapper_config = assembly::Config { replace_reference: false, strict_bounds: false, renormalize_g: false, genome_seq_available: false, ..Default::default() }; - let mapper = assembly::Mapper::new(config, provider.clone()); + let mapper = assembly::Mapper::new(mapper_config, provider.clone()); ConsequencePredictor { provider, mapper, chrom_to_acc, + config, } } + /// Predict the consequences of a variant. + /// + /// Note that the predictions will be affected by whether transcript picking has been + /// enabled in the data provider and the configuration of the predictor, in particular + /// `Config::report_all_transcripts`. + /// + /// # Args + /// + /// * `var`: The variant to predict consequences for. + /// + /// # Returns + /// + /// A list of `AnnField` records, one for each transcript affected by the variant + /// sorted lexicographically by transcript accession. + /// + /// If the accession is not valid, then `None` will be returned. + /// + /// # Errors + /// + /// If there was any error during the prediction. pub fn predict(&self, var: &VcfVariant) -> Result>, anyhow::Error> { // Normalize variant by stripping common prefix and suffix.
let norm_var = self.normalize_variant(var); @@ -107,24 +147,91 @@ impl ConsequencePredictor { }; let qry_start = var_start - PADDING; let qry_end = var_end + PADDING; - let mut txs = - self.provider - .get_tx_for_region(chrom_acc, ALT_ALN_METHOD, qry_start, qry_end)?; - txs.sort_by(|a, b| a.tx_ac.cmp(&b.tx_ac)); + let txs = { + let mut txs = + self.provider + .get_tx_for_region(chrom_acc, ALT_ALN_METHOD, qry_start, qry_end)?; + txs.sort_by(|a, b| a.tx_ac.cmp(&b.tx_ac)); + // Filter transcripts to the picked ones. + tracing::info!(" txs = {:#?}", &txs); + self.filter_picked_txs(txs) + }; + tracing::info!(" txs = {:#?}", &txs); + + // Compute annotations for all (picked) transcripts first, skipping `None` results. + let anns_all_txs = txs + .into_iter() + .map(|tx| { + self.build_ann_field(var, &norm_var, tx, chrom_acc.clone(), var_start, var_end) + }) + .collect::, _>>()? + .into_iter() + .flatten() + .collect::>(); + + // Return all or worst annotation only. + Ok(Some(self.filter_ann_fields(anns_all_txs))) + } - // Generate `AnnField` records for each transcript. + // Filter transcripts to the picked ones. + fn filter_picked_txs(&self, txs: Vec) -> Vec { + // Short-circuit if transcript picking has been disabled. + if !self.provider.transcript_picking() { + return txs; + } + + // Get gene ids for all transcripts in `txs`, then obtain the picked transcript + // identifiers for these genes and limit `txs` to those transcripts. + let picked_txs = txs + .iter() + .flat_map(|tx| self.provider.get_tx(&tx.tx_ac)) + .flat_map(|tx| self.provider.get_picked_transcripts(&tx.gene_id)) + .flatten() + .collect::>(); + // tracing::trace!("Picked transcripts: {:?}", &picked_txs); + txs.into_iter() + .filter(|tx| picked_txs.contains(&tx.tx_ac)) + .collect::>() + } + + /// Filter the ANN fields depending on the configuration. + /// + /// If all transcripts are to be reported then return `ann_fields` as is, otherwise + /// select one worst consequence per gene.
+ fn filter_ann_fields(&self, ann_fields: Vec) -> Vec { + // Short-circuit if to report all transcript results. + if self.config.report_all_transcripts { + return ann_fields; + } + + // First, split annotations by gene. + let mut anns_by_gene: FxHashMap> = FxHashMap::default(); + for ann in ann_fields { + let gene_id = ann.gene_id.clone(); + anns_by_gene.entry(gene_id).or_default().push(ann); + } + + /// Return sort order for ANN biotype, gives priority to ManeSelect and ManePlusClinical. + fn biotype_order(biotypes: &[FeatureBiotype]) -> i32 { + if biotypes.contains(&FeatureBiotype::ManeSelect) { + 0 + } else if biotypes.contains(&FeatureBiotype::ManePlusClinical) { + 1 + } else { + 2 + } + } + + // Now, sort by consequence, giving priority to ManeSelect and ManePlusClinical. // - // Skip `None` results. - Ok(Some( - txs.into_iter() - .map(|tx| { - self.build_ann_field(var, &norm_var, tx, chrom_acc.clone(), var_start, var_end) - }) - .collect::, _>>()? - .into_iter() - .flatten() - .collect::>(), - )) + // This uses the invariant that the consequences in the ANN fields are sorted already + // and there is at least one consequence. 
+ let mut result = Vec::new(); + for anns in anns_by_gene.values_mut() { + anns.sort_by_key(|ann| (ann.consequences[0], biotype_order(&ann.feature_biotype))); + result.push(anns.remove(0)); + } + result } fn build_ann_field( @@ -309,20 +416,34 @@ impl ConsequencePredictor { let min_start = min_start.expect("must have seen exon"); let max_end = max_end.expect("must have seen exon"); - let feature_biotype = - match TranscriptBiotype::try_from(tx.biotype).expect("invalid transcript biotype") { + let transcript_biotype = + TranscriptBiotype::try_from(tx.biotype).expect("invalid transcript biotype"); + let feature_biotype = { + let mut feature_biotypes = vec![match transcript_biotype { TranscriptBiotype::Coding => FeatureBiotype::Coding, TranscriptBiotype::NonCoding => FeatureBiotype::Noncoding, - }; + }]; + + if tx.tags.contains(&(data::TranscriptTag::ManeSelect as i32)) { + feature_biotypes.push(FeatureBiotype::ManeSelect); + } else if tx + .tags + .contains(&(data::TranscriptTag::ManePlusClinical as i32)) + { + feature_biotypes.push(FeatureBiotype::ManePlusClinical); + } + + feature_biotypes + }; let is_upstream = var_end <= min_start; let is_downstream = var_start >= max_end; if is_exonic { - if !feature_biotype.is_coding() { + if transcript_biotype == TranscriptBiotype::NonCoding { consequences.push(Consequence::NonCodingTranscriptExonVariant); } } else if is_intronic { - if !feature_biotype.is_coding() { + if transcript_biotype == TranscriptBiotype::NonCoding { consequences.push(Consequence::NonCodingTranscriptIntronVariant); } else { consequences.push(Consequence::IntronVariant); @@ -414,8 +535,8 @@ impl ConsequencePredictor { _ => panic!("Invalid tx position: {:?}", &var_n), }; - let (var_t, _var_p, hgvs_p, cds_pos, protein_pos) = match feature_biotype { - FeatureBiotype::Coding => { + let (var_t, _var_p, hgvs_p, cds_pos, protein_pos) = match transcript_biotype { + TranscriptBiotype::Coding => { let cds_len = tx.stop_codon.unwrap() - tx.start_codon.unwrap(); let 
prot_len = cds_len / 3; @@ -596,7 +717,7 @@ impl ConsequencePredictor { (var_c, Some(var_p), hgvs_p, cds_pos, protein_pos) } - FeatureBiotype::Noncoding => (var_n, None, None, None, None), + TranscriptBiotype::NonCoding => (var_n, None, None, None, None), }; let hgvs_t = format!("{}", &var_t); let hgvs_t = hgvs_t.split(':').nth(1).unwrap().to_owned(); @@ -639,12 +760,12 @@ impl ConsequencePredictor { }, consequences, putative_impact, - gene_symbol: tx.gene_name.clone(), - gene_id: format!("HGNC:{}", &tx.gene_id), + gene_symbol: tx.gene_symbol, + gene_id: tx.gene_id, feature_type: FeatureType::SoTerm { term: SoFeature::Transcript, }, - feature_id: tx.id.clone(), + feature_id: tx.id, feature_biotype, rank, hgvs_t, @@ -706,6 +827,7 @@ mod test { use serde::Deserialize; use crate::annotate::seqvars::load_tx_db; + use crate::annotate::seqvars::provider::ConfigBuilder as MehariProviderConfigBuilder; use super::*; @@ -744,9 +866,14 @@ mod test { let tx_path = "tests/data/annotate/db/grch37/txs.bin.zst"; let tx_db = load_tx_db(tx_path)?; - let provider = Arc::new(MehariProvider::new(tx_db, Assembly::Grch37p10)); + let provider = Arc::new(MehariProvider::new( + tx_db, + Assembly::Grch37p10, + Default::default(), + )); - let predictor = ConsequencePredictor::new(provider, Assembly::Grch37p10); + let predictor = + ConsequencePredictor::new(provider, Assembly::Grch37p10, Default::default()); let res = predictor .predict(&VcfVariant { @@ -769,6 +896,116 @@ mod test { Ok(()) } + #[tracing_test::traced_test] + #[rstest::rstest] + #[case("17:41197701:G:C", false, false)] // don't pick transcripts, report worst + #[case("17:41197701:G:C", false, true)] // don't pick transcripts, report all + #[case("17:41197701:G:C", true, false)] // pick transcripts, report worst + #[case("17:41197701:G:C", true, true)] // pick transcripts, report all + fn annotate_snv_brca1_transcript_picking_reporting( + #[case] spdi: &str, + #[case] pick_transcripts: bool, + #[case] report_all_transcripts: 
bool, + ) -> Result<(), anyhow::Error> { + crate::common::set_snapshot_suffix!( + "{}-{}-{}", + spdi.replace(':', "-"), + pick_transcripts, + report_all_transcripts + ); + + let spdi = spdi.split(':').map(|s| s.to_string()).collect::>(); + + let tx_path = "tests/data/annotate/db/grch37/txs.bin.zst"; + let tx_db = load_tx_db(tx_path)?; + let provider = Arc::new(MehariProvider::new( + tx_db, + Assembly::Grch37p10, + MehariProviderConfigBuilder::default() + .transcript_picking(pick_transcripts) + .build() + .unwrap(), + )); + + let predictor = ConsequencePredictor::new( + provider, + Assembly::Grch37p10, + ConfigBuilder::default() + .report_all_transcripts(report_all_transcripts) + .build() + .unwrap(), + ); + + let res = predictor + .predict(&VcfVariant { + chromosome: spdi[0].clone(), + position: spdi[1].parse()?, + reference: spdi[2].clone(), + alternative: spdi[3].clone(), + })? + .unwrap(); + + insta::assert_yaml_snapshot!(res); + + Ok(()) + } + + // Test predictions on TTN where we have a ManeSelect and a ManePlusClinical + // transcript. 
+ #[tracing_test::traced_test] + #[rstest::rstest] + #[case("2:179631246:G:A", false, false)] // don't pick transcripts, report worst + #[case("2:179631246:G:A", false, true)] // don't pick transcripts, report all + #[case("2:179631246:G:A", true, false)] // pick transcripts, report worst + #[case("2:179631246:G:A", true, true)] // pick transcripts, report all + fn annotate_snv_ttn_transcript_picking_reporting( + #[case] spdi: &str, + #[case] pick_transcripts: bool, + #[case] report_all_transcripts: bool, + ) -> Result<(), anyhow::Error> { + crate::common::set_snapshot_suffix!( + "{}-{}-{}", + spdi.replace(':', "-"), + pick_transcripts, + report_all_transcripts + ); + + let spdi = spdi.split(':').map(|s| s.to_string()).collect::>(); + + let tx_path = "tests/data/annotate/db/grch37/txs.bin.zst"; + let tx_db = load_tx_db(tx_path)?; + let provider = Arc::new(MehariProvider::new( + tx_db, + Assembly::Grch37p10, + MehariProviderConfigBuilder::default() + .transcript_picking(pick_transcripts) + .build() + .unwrap(), + )); + + let predictor = ConsequencePredictor::new( + provider, + Assembly::Grch37p10, + ConfigBuilder::default() + .report_all_transcripts(report_all_transcripts) + .build() + .unwrap(), + ); + + let res = predictor + .predict(&VcfVariant { + chromosome: spdi[0].clone(), + position: spdi[1].parse()?, + reference: spdi[2].clone(), + alternative: spdi[3].clone(), + })? + .unwrap(); + + insta::assert_yaml_snapshot!(res); + + Ok(()) + } + #[derive(Debug, Deserialize)] struct Record { pub var: String, @@ -779,24 +1016,30 @@ mod test { // Compare to SnpEff annotated variants for OPA1, touching special cases. #[test] fn annotate_opa1_hand_picked_vars() -> Result<(), anyhow::Error> { - annotate_opa1_vars("tests/data/annotate/vars/opa1.hand_picked.tsv") + annotate_opa1_vars("tests/data/annotate/vars/opa1.hand_picked.tsv", true) } // Compare to SnpEff annotated ClinVar variants for OPA1 (slow). 
#[ignore] #[test] fn annotate_opa1_clinvar_vars_snpeff() -> Result<(), anyhow::Error> { - annotate_opa1_vars("tests/data/annotate/vars/clinvar.excerpt.snpeff.opa1.tsv") + annotate_opa1_vars( + "tests/data/annotate/vars/clinvar.excerpt.snpeff.opa1.tsv", + true, + ) } // Compare to SnpEff annotated ClinVar variants for OPA1 (slow). #[ignore] #[test] fn annotate_opa1_clinvar_vars_vep() -> Result<(), anyhow::Error> { - annotate_opa1_vars("tests/data/annotate/vars/clinvar.excerpt.vep.opa1.tsv") + annotate_opa1_vars( + "tests/data/annotate/vars/clinvar.excerpt.vep.opa1.tsv", + true, + ) } - fn annotate_opa1_vars(path_tsv: &str) -> Result<(), anyhow::Error> { + fn annotate_opa1_vars(path_tsv: &str, all_transcripts: bool) -> Result<(), anyhow::Error> { let txs = vec![ String::from("NM_001354663.2"), String::from("NM_001354664.2"), @@ -806,30 +1049,36 @@ mod test { String::from("NM_130837.3"), ]; - annotate_vars(path_tsv, &txs) + annotate_vars(path_tsv, &txs, all_transcripts) } // Compare to SnpEff annotated variants for BRCA1, touching special cases. #[test] fn annotate_brca1_hand_picked_vars() -> Result<(), anyhow::Error> { - annotate_brca1_vars("tests/data/annotate/vars/brca1.hand_picked.tsv") + annotate_brca1_vars("tests/data/annotate/vars/brca1.hand_picked.tsv", true) } // Compare to SnpEff annotated ClinVar variants for BRCA1 (slow). #[ignore] #[test] fn annotate_brca1_clinvar_vars_snpeff() -> Result<(), anyhow::Error> { - annotate_brca1_vars("tests/data/annotate/vars/clinvar.excerpt.snpeff.brca1.tsv") + annotate_brca1_vars( + "tests/data/annotate/vars/clinvar.excerpt.snpeff.brca1.tsv", + true, + ) } // Compare to SnpEff annotated ClinVar variants for BRCA1 (slow). 
#[ignore] #[test] fn annotate_brca1_clinvar_vars_vep() -> Result<(), anyhow::Error> { - annotate_brca1_vars("tests/data/annotate/vars/clinvar.excerpt.vep.brca1.tsv") + annotate_brca1_vars( + "tests/data/annotate/vars/clinvar.excerpt.vep.brca1.tsv", + true, + ) } - fn annotate_brca1_vars(path_tsv: &str) -> Result<(), anyhow::Error> { + fn annotate_brca1_vars(path_tsv: &str, all_transcripts: bool) -> Result<(), anyhow::Error> { let txs = vec![ String::from("NM_007294.4"), String::from("NM_007297.4"), @@ -838,14 +1087,29 @@ mod test { String::from("NM_007300.4"), ]; - annotate_vars(path_tsv, &txs) + annotate_vars(path_tsv, &txs, all_transcripts) } - fn annotate_vars(path_tsv: &str, txs: &[String]) -> Result<(), anyhow::Error> { + fn annotate_vars( + path_tsv: &str, + txs: &[String], + all_transcripts: bool, + ) -> Result<(), anyhow::Error> { let tx_path = "tests/data/annotate/db/grch37/txs.bin.zst"; let tx_db = load_tx_db(tx_path)?; - let provider = Arc::new(MehariProvider::new(tx_db, Assembly::Grch37p10)); - let predictor = ConsequencePredictor::new(provider, Assembly::Grch37p10); + let provider = Arc::new(MehariProvider::new( + tx_db, + Assembly::Grch37p10, + Default::default(), + )); + let predictor = ConsequencePredictor::new( + provider, + Assembly::Grch37p10, + ConfigBuilder::default() + .report_all_transcripts(all_transcripts) + .build() + .unwrap(), + ); let mut reader = ReaderBuilder::new() .delimiter(b'\t') diff --git a/src/annotate/seqvars/mod.rs b/src/annotate/seqvars/mod.rs index 336bb8d8..9877342d 100644 --- a/src/annotate/seqvars/mod.rs +++ b/src/annotate/seqvars/mod.rs @@ -45,11 +45,15 @@ use rustc_hash::FxHashMap; use serde::{Deserialize, Serialize}; use thousands::Separable; -use crate::annotate::seqvars::csq::{ConsequencePredictor, VcfVariant}; -use crate::annotate::seqvars::provider::MehariProvider; +use crate::annotate::seqvars::csq::{ + ConfigBuilder as ConsequencePredictorConfigBuilder, ConsequencePredictor, VcfVariant, +}; +use 
crate::annotate::seqvars::provider::{ + ConfigBuilder as MehariProviderConfigBuilder, Provider as MehariProvider, +}; use crate::common::GenomeRelease; -use crate::db::create::txs::data::TxSeqDatabase; +use crate::db::create::data::TxSeqDatabase; use crate::finalize_buf_writer; use crate::ped::{PedigreeByName, Sex}; @@ -88,6 +92,14 @@ pub struct Args { #[command(flatten)] pub output: PathOutput, + /// Whether to report for all picked transcripts. + #[arg(long, default_value_t = true)] + pub report_all_transcripts: bool, + /// Limit transcripts to (a) ManeSelect+ManePlusClinical, (b) ManeSelect, + /// (c) longest transcript for the gene - the first available. + #[arg(long, default_value_t = false)] + pub transcript_picking: bool, + /// For debug purposes, maximal number of variants to annotate. #[arg(long)] pub max_var_count: Option, @@ -1040,7 +1052,7 @@ impl VarFishSeqvarTsvWriter { tsv_record.ensembl_gene_id = Some(hgnc_record.ensembl_gene_id.clone()); tsv_record.ensembl_transcript_id = Some(ann.feature_id.clone()); tsv_record.ensembl_transcript_coding = - Some(ann.feature_biotype == FeatureBiotype::Coding); + Some(ann.feature_biotype.contains(&FeatureBiotype::Coding)); tsv_record.ensembl_hgvs_c = ann.hgvs_t.clone(); tsv_record.ensembl_hgvs_p = ann.hgvs_p.clone(); if !ann.consequences.is_empty() { @@ -1069,7 +1081,7 @@ impl VarFishSeqvarTsvWriter { tsv_record.refseq_gene_id = Some(hgnc_record.entrez_id.clone()); tsv_record.refseq_transcript_id = Some(ann.feature_id.clone()); tsv_record.refseq_transcript_coding = - Some(ann.feature_biotype == FeatureBiotype::Coding); + Some(ann.feature_biotype.contains(&FeatureBiotype::Coding)); tsv_record.refseq_hgvs_c = ann.hgvs_t.clone(); tsv_record.refseq_hgvs_p = ann.hgvs_p.clone(); if !ann.consequences.is_empty() { @@ -1400,7 +1412,7 @@ impl AnnotatedVcfWriter for VarFishSeqvarTsvWriter { /// Run the annotation with the given `Write` within the `VcfWriter`. 
fn run_with_writer(writer: &mut dyn AnnotatedVcfWriter, args: &Args) -> Result<(), anyhow::Error> { tracing::info!("Open VCF and read header"); - let mut reader = VariantReaderBuilder.build_from_path(&args.path_input_vcf)?; + let mut reader = VariantReaderBuilder::default().build_from_path(&args.path_input_vcf)?; let header_in = reader.read_header()?; let header_out = build_header(&header_in); @@ -1455,8 +1467,22 @@ fn run_with_writer(writer: &mut dyn AnnotatedVcfWriter, args: &Args) -> Result<( path_component(assembly) ))?; tracing::info!("Building transcript interval trees ..."); - let provider = Arc::new(MehariProvider::new(tx_db, assembly)); - let predictor = ConsequencePredictor::new(provider, assembly); + let provider = Arc::new(MehariProvider::new( + tx_db, + assembly, + MehariProviderConfigBuilder::default() + .transcript_picking(args.transcript_picking) + .build() + .unwrap(), + )); + let predictor = ConsequencePredictor::new( + provider, + assembly, + ConsequencePredictorConfigBuilder::default() + .report_all_transcripts(args.report_all_transcripts) + .build() + .unwrap(), + ); tracing::info!("... done building transcript interval trees"); // Perform the VCF annotation. 
@@ -1647,6 +1673,8 @@ mod test { }; let args = Args { genome_release: None, + report_all_transcripts: false, + transcript_picking: false, path_db: String::from("tests/data/annotate/db"), path_input_vcf: String::from( "tests/data/db/create/seqvar_freqs/db-rs1263393206/input.vcf", @@ -1682,6 +1710,8 @@ mod test { }; let args = Args { genome_release: None, + report_all_transcripts: true, + transcript_picking: false, path_db: String::from("tests/data/annotate/db"), path_input_vcf: String::from( "tests/data/db/create/seqvar_freqs/db-rs1263393206/input.vcf", @@ -1729,6 +1759,8 @@ mod test { }; let args = Args { genome_release: None, + report_all_transcripts: true, + transcript_picking: false, path_db: String::from("tests/data/annotate/db"), path_input_vcf: String::from("tests/data/db/create/badly_formed_vcf_entry.vcf"), output: PathOutput { @@ -1764,6 +1796,8 @@ mod test { }; let args = Args { genome_release: None, + report_all_transcripts: true, + transcript_picking: false, path_db: String::from("tests/data/annotate/db"), path_input_vcf: String::from("tests/data/db/create/mitochondrial_variants.vcf"), output: PathOutput { diff --git a/src/annotate/seqvars/provider.rs b/src/annotate/seqvars/provider.rs index 2ec5c67a..3e980a4a 100644 --- a/src/annotate/seqvars/provider.rs +++ b/src/annotate/seqvars/provider.rs @@ -19,7 +19,7 @@ use hgvs::{ use crate::{ annotate::seqvars::csq::ALT_ALN_METHOD, - db::create::txs::data::{Strand, Transcript, TxSeqDatabase}, + db::create::data::{GeneToTxId, Strand, Transcript, TranscriptTag, TxSeqDatabase}, }; type IntervalTree = ArrayBackedIntervalTree; @@ -92,16 +92,52 @@ impl TxIntervalTrees { } } -pub struct MehariProvider { +/// Configuration for constructing the `Provider`. +#[derive(Debug, Clone, Default, derive_builder::Builder)] +#[builder(pattern = "immutable")] +pub struct Config { + /// * `transcript_picking` - Whether to use transcript picking. 
When + /// enabled, only use (a) ManeSelect+ManePlusClinical, (b) ManeSelect, + /// (c) longest transcript (the first available). + pub transcript_picking: bool, +} + +/// Provider based on the protobuf `TxSeqDatabase`. +pub struct Provider { + /// Database of transcripts and sequences as deserialized from protobuf. pub tx_seq_db: TxSeqDatabase, + /// Interval trees for the transcripts. pub tx_trees: TxIntervalTrees, + /// Mapping from gene identifier to index in `TxSeqDatabase::tx_db::gene_to_tx`. + gene_map: HashMap, + /// Mapping from transcript accession to index in `TxSeqDatabase::tx_db::transcripts`. tx_map: HashMap, + /// Mapping from sequence accession to index in `TxSeqDatabase::seq_db::seqs`. seq_map: HashMap, + /// When transcript picking is enabled, contains the `GeneToTxId` entries + /// for each gene; the order matches the one of `tx_seq_db.gene_to_tx`. + picked_gene_to_tx_id: Option>, } -impl MehariProvider { - pub fn new(tx_seq_db: TxSeqDatabase, assembly: Assembly) -> Self { +impl Provider { + /// Create a new `Provider` from a `TxSeqDatabase`. + /// + /// # Arguments + /// + /// * `tx_seq_db` - The `TxSeqDatabase` to use. + /// * `assembly` - The assembly to use. + pub fn new(mut tx_seq_db: TxSeqDatabase, assembly: Assembly, config: Config) -> Self { let tx_trees = TxIntervalTrees::new(&tx_seq_db, assembly); + let gene_map = HashMap::from_iter( + tx_seq_db + .tx_db + .as_ref() + .expect("no tx_db?") + .gene_to_tx + .iter() + .enumerate() + .map(|(idx, entry)| (entry.gene_id.clone(), idx as u32)), + ); let tx_map = HashMap::from_iter( tx_seq_db .tx_db @@ -130,27 +166,160 @@ impl MehariProvider { .map(|(alias, idx)| (alias.clone(), *idx)), ); + // When transcript picking is enabled, restrict to ManeSelect and ManePlusClinical if + // we have any such transcript. Otherwise, fall back to the longest transcript.
+ let picked_gene_to_tx_id = if config.transcript_picking { + if let Some(tx_db) = tx_seq_db.tx_db.as_mut() { + // The new gene-to-txid mapping we will build. + let mut new_gene_to_tx = Vec::new(); + + // Process each gene. + for entry in tx_db.gene_to_tx.iter() { + // First, determine whether we have any MANE transcripts. + let mane_tx_ids = entry + .tx_ids + .iter() + .filter(|tx_id| { + tx_map + .get(*tx_id) + .map(|tx_idx| { + let tx = &tx_db.transcripts[*tx_idx as usize]; + tx.tags.contains(&TranscriptTag::ManePlusClinical.into()) + || tx.tags.contains(&TranscriptTag::ManeSelect.into()) + }) + .unwrap_or_default() + }) + .cloned() + .collect::>(); + + // Now, construct gene-to-txid mapping entry. + let new_entry = if !mane_tx_ids.is_empty() { + // For the case that we have MANE transcripts. + GeneToTxId { + gene_id: entry.gene_id.clone(), + tx_ids: mane_tx_ids, + } + } else { + // Otherwise, determine the longest transcript's length. + let (_, tx_id) = entry + .tx_ids + .iter() + .map(|tx_id| { + tx_map + .get(tx_id) + .map(|tx_idx| { + // A slight complication, we need to look at all genome alignments... + let tx = &tx_db.transcripts[*tx_idx as usize]; + let mut max_tx_length = 0; + for genome_alignment in tx.genome_alignments.iter() { + // We just count length in reference so we don't have to look + // into the CIGAR string. 
+ let mut tx_length = 0; + for exon_alignment in genome_alignment.exons.iter() { + tx_length += exon_alignment.alt_cds_end_i() + - exon_alignment.alt_cds_start_i(); + } + if tx_length > max_tx_length { + max_tx_length = tx_length; + } + } + (max_tx_length, tx_id.clone()) + }) + .unwrap_or_default() + }) + .max() + .unwrap_or_else(|| panic!("no length for gene {}", &entry.gene_id)); + + GeneToTxId { + gene_id: entry.gene_id.clone(), + tx_ids: vec![tx_id], + } + }; + + tracing::trace!( + "picked transcripts {:?} for gene {}", + new_entry.tx_ids, + new_entry.gene_id + ); + new_gene_to_tx.push(new_entry); + } + + Some(new_gene_to_tx) + } else { + None + } + } else { + None + }; + Self { tx_seq_db, tx_trees, + gene_map, tx_map, seq_map, + picked_gene_to_tx_id, } } + /// Return whether transcript picking is enabled. + pub fn transcript_picking(&self) -> bool { + self.picked_gene_to_tx_id.is_some() + } + + /// Return the picked transcript IDs for a gene. + /// + /// # Args + /// + /// * `hgnc_id` - The gene HGNC ID. + /// + /// # Returns + /// + /// The picked transcript IDs, or None if the gene is not found. + pub fn get_picked_transcripts(&self, hgnc_id: &str) -> Option> { + self.gene_map.get(hgnc_id).map(|gene_idx| { + let gene_to_tx = if let Some(picked_gene_to_tx_id) = self.picked_gene_to_tx_id.as_ref() + { + picked_gene_to_tx_id + } else { + &self.tx_seq_db.tx_db.as_ref().expect("no tx_db?").gene_to_tx + }; + + // tracing::trace!( + // "get_picked_transcripts({}) = {:?}", + // hgnc_id, + // &gene_to_tx[*gene_idx as usize].tx_ids + // ); + gene_to_tx[*gene_idx as usize].tx_ids.clone() + }) + } + + /// Return `Transcript` for the given transcript accession. + /// + /// # Args + /// + /// * `tx_id` - The transcript accession. + /// + /// # Returns + /// + /// The `Transcript` for the given accession, or None if the accession was not found.
pub fn get_tx(&self, tx_id: &str) -> Option { self.tx_map.get(tx_id).map(|idx| { - self.tx_seq_db + let result = self + .tx_seq_db .tx_db .as_ref() .expect("no tx_db?") .transcripts[*idx as usize] - .clone() + .clone(); + + // tracing::trace!("get_tx({}) = {:?}", tx_id, &result); + result }) } } -impl ProviderInterface for MehariProvider { +impl ProviderInterface for Provider { fn data_version(&self) -> &str { panic!("not implemented"); } @@ -253,7 +422,7 @@ impl ProviderInterface for MehariProvider { .exons .iter() .map(|exon| TxExonsRecord { - hgnc: tx.gene_name.clone(), + hgnc: tx.gene_id.clone(), tx_ac: tx_ac.to_string(), alt_ac: alt_ac.to_string(), alt_aln_method: ALT_ALN_METHOD.to_string(), @@ -349,7 +518,7 @@ impl ProviderInterface for MehariProvider { .expect("no tx_db?") .transcripts[tx_idx]; - let hgnc = tx.gene_name.clone(); + let hgnc = tx.gene_id.clone(); let mut tmp = tx .genome_alignments @@ -401,7 +570,7 @@ impl ProviderInterface for MehariProvider { for genome_alignment in &tx.genome_alignments { if genome_alignment.contig == alt_ac { return Ok(TxInfoRecord { - hgnc: tx.gene_name.clone(), + hgnc: tx.gene_id.clone(), cds_start_i: genome_alignment.cds_start, cds_end_i: genome_alignment.cds_end, tx_ac: tx.id.clone(), @@ -445,6 +614,6 @@ mod test { #[test] fn test_sync() { fn is_sync() {} - is_sync::(); + is_sync::(); } } diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41196309-G-C.snap b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41196309-G-C.snap index b716e80f..e153a5f2 100644 --- a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41196309-G-C.snap +++ b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41196309-G-C.snap @@ -14,7 +14,9 @@ expression: res SoTerm: term: Transcript feature_id: NM_007294.4 - 
feature_biotype: Coding + feature_biotype: + - Coding + - ManeSelect rank: ~ hgvs_t: ~ hgvs_p: ~ @@ -35,7 +37,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007297.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ~ hgvs_t: ~ hgvs_p: ~ @@ -56,7 +59,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007298.3 - feature_biotype: Coding + feature_biotype: + - Coding rank: ~ hgvs_t: ~ hgvs_p: ~ @@ -77,7 +81,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007299.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ~ hgvs_t: ~ hgvs_p: ~ @@ -98,7 +103,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007300.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ~ hgvs_t: ~ hgvs_p: ~ diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41196310-G-C.snap b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41196310-G-C.snap index 519e3e90..8f7f9ad1 100644 --- a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41196310-G-C.snap +++ b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41196310-G-C.snap @@ -14,7 +14,9 @@ expression: res SoTerm: term: Transcript feature_id: NM_007294.4 - feature_biotype: Coding + feature_biotype: + - Coding + - ManeSelect rank: ~ hgvs_t: ~ hgvs_p: ~ @@ -35,7 +37,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007297.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ~ hgvs_t: ~ hgvs_p: ~ @@ -56,7 +59,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007298.3 - feature_biotype: Coding + feature_biotype: + - Coding rank: ~ hgvs_t: ~ hgvs_p: ~ @@ -77,7 +81,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007299.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ~ hgvs_t: ~ hgvs_p: ~ @@ 
-98,7 +103,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007300.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ~ hgvs_t: ~ hgvs_p: ~ diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41196311-G-C.snap b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41196311-G-C.snap index 6a5aed2b..2f49d6eb 100644 --- a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41196311-G-C.snap +++ b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41196311-G-C.snap @@ -14,7 +14,9 @@ expression: res SoTerm: term: Transcript feature_id: NM_007294.4 - feature_biotype: Coding + feature_biotype: + - Coding + - ManeSelect rank: ~ hgvs_t: ~ hgvs_p: ~ @@ -35,7 +37,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007297.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ~ hgvs_t: ~ hgvs_p: ~ @@ -56,7 +59,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007298.3 - feature_biotype: Coding + feature_biotype: + - Coding rank: ~ hgvs_t: ~ hgvs_p: ~ @@ -77,7 +81,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007299.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ~ hgvs_t: ~ hgvs_p: ~ @@ -98,7 +103,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007300.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ~ hgvs_t: ~ hgvs_p: ~ diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41196312-G-C.snap b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41196312-G-C.snap index e51bf3b7..b8573ac3 100644 --- a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41196312-G-C.snap +++ 
b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41196312-G-C.snap @@ -14,7 +14,9 @@ expression: res SoTerm: term: Transcript feature_id: NM_007294.4 - feature_biotype: Coding + feature_biotype: + - Coding + - ManeSelect rank: ord: 23 total: 23 @@ -41,7 +43,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007297.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 22 total: 22 @@ -68,7 +71,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007298.3 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 22 total: 22 @@ -95,7 +99,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007299.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 22 total: 22 @@ -122,7 +127,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007300.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 24 total: 24 diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41196313-G-C.snap b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41196313-G-C.snap index bf05793a..eca2c00e 100644 --- a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41196313-G-C.snap +++ b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41196313-G-C.snap @@ -14,7 +14,9 @@ expression: res SoTerm: term: Transcript feature_id: NM_007294.4 - feature_biotype: Coding + feature_biotype: + - Coding + - ManeSelect rank: ord: 23 total: 23 @@ -41,7 +43,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007297.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 22 total: 22 @@ -68,7 +71,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007298.3 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 22 total: 
22 @@ -95,7 +99,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007299.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 22 total: 22 @@ -122,7 +127,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007300.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 24 total: 24 diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197701-G-C.snap b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197701-G-C.snap index 5816cbda..0ca67620 100644 --- a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197701-G-C.snap +++ b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197701-G-C.snap @@ -14,7 +14,9 @@ expression: res SoTerm: term: Transcript feature_id: NM_007294.4 - feature_biotype: Coding + feature_biotype: + - Coding + - ManeSelect rank: ord: 23 total: 23 @@ -43,7 +45,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007297.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 22 total: 22 @@ -72,7 +75,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007298.3 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 22 total: 22 @@ -101,7 +105,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007299.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 22 total: 22 @@ -128,7 +133,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007300.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 24 total: 24 diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197818-G-C.snap b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197818-G-C.snap index ab281196..34ca5e97 100644 
--- a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197818-G-C.snap +++ b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197818-G-C.snap @@ -14,7 +14,9 @@ expression: res SoTerm: term: Transcript feature_id: NM_007294.4 - feature_biotype: Coding + feature_biotype: + - Coding + - ManeSelect rank: ord: 23 total: 23 @@ -43,7 +45,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007297.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 22 total: 22 @@ -72,7 +75,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007298.3 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 22 total: 22 @@ -101,7 +105,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007299.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 22 total: 22 @@ -130,7 +135,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007300.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 24 total: 24 diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197819-G-C.snap b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197819-G-C.snap index 637f789c..624dfffa 100644 --- a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197819-G-C.snap +++ b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197819-G-C.snap @@ -14,7 +14,9 @@ expression: res SoTerm: term: Transcript feature_id: NM_007294.4 - feature_biotype: Coding + feature_biotype: + - Coding + - ManeSelect rank: ord: 23 total: 23 @@ -43,7 +45,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007297.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 22 total: 22 @@ -72,7 +75,8 @@ 
expression: res SoTerm: term: Transcript feature_id: NM_007298.3 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 22 total: 22 @@ -101,7 +105,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007299.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 22 total: 22 @@ -130,7 +135,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007300.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 24 total: 24 diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197820-G-C.snap b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197820-G-C.snap index a9728e0c..c06620e3 100644 --- a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197820-G-C.snap +++ b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197820-G-C.snap @@ -15,7 +15,9 @@ expression: res SoTerm: term: Transcript feature_id: NM_007294.4 - feature_biotype: Coding + feature_biotype: + - Coding + - ManeSelect rank: ord: 22 total: 22 @@ -43,7 +45,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007297.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 21 total: 21 @@ -71,7 +74,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007298.3 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 21 total: 21 @@ -99,7 +103,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007299.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 21 total: 21 @@ -127,7 +132,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007300.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 23 total: 23 diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197821-G-C.snap 
b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197821-G-C.snap index 9916c3a6..027ef2ac 100644 --- a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197821-G-C.snap +++ b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197821-G-C.snap @@ -15,7 +15,9 @@ expression: res SoTerm: term: Transcript feature_id: NM_007294.4 - feature_biotype: Coding + feature_biotype: + - Coding + - ManeSelect rank: ord: 22 total: 22 @@ -43,7 +45,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007297.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 21 total: 21 @@ -71,7 +74,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007298.3 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 21 total: 21 @@ -99,7 +103,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007299.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 21 total: 21 @@ -127,7 +132,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007300.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 23 total: 23 diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197822-G-C.snap b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197822-G-C.snap index 14d4637d..d3217e1b 100644 --- a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197822-G-C.snap +++ b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197822-G-C.snap @@ -15,7 +15,9 @@ expression: res SoTerm: term: Transcript feature_id: NM_007294.4 - feature_biotype: Coding + feature_biotype: + - Coding + - ManeSelect rank: ord: 22 total: 22 @@ -43,7 +45,8 @@ expression: res 
SoTerm: term: Transcript feature_id: NM_007297.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 21 total: 21 @@ -71,7 +74,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007298.3 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 21 total: 21 @@ -99,7 +103,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007299.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 21 total: 21 @@ -127,7 +132,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007300.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 23 total: 23 diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197823-G-C.snap b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197823-G-C.snap index 7225add4..b7ad0dad 100644 --- a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197823-G-C.snap +++ b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41197823-G-C.snap @@ -15,7 +15,9 @@ expression: res SoTerm: term: Transcript feature_id: NM_007294.4 - feature_biotype: Coding + feature_biotype: + - Coding + - ManeSelect rank: ord: 22 total: 22 @@ -43,7 +45,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007297.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 21 total: 21 @@ -71,7 +74,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007298.3 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 21 total: 21 @@ -99,7 +103,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007299.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 21 total: 21 @@ -127,7 +132,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007300.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 23 total: 23 
diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41277379-A-C.snap b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41277379-A-C.snap index f33e673a..f44ac352 100644 --- a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41277379-A-C.snap +++ b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41277379-A-C.snap @@ -14,7 +14,9 @@ expression: res SoTerm: term: Transcript feature_id: NM_007294.4 - feature_biotype: Coding + feature_biotype: + - Coding + - ManeSelect rank: ord: 1 total: 23 @@ -41,7 +43,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007297.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 1 total: 22 @@ -68,7 +71,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007298.3 - feature_biotype: Coding + feature_biotype: + - Coding rank: ~ hgvs_t: ~ hgvs_p: ~ @@ -89,7 +93,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007299.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 1 total: 22 @@ -116,7 +121,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007300.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 1 total: 24 diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41277380-G-C.snap b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41277380-G-C.snap index f5560b5e..f9c4398e 100644 --- a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41277380-G-C.snap +++ b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41277380-G-C.snap @@ -14,7 +14,9 @@ expression: res SoTerm: term: Transcript feature_id: 
NM_007294.4 - feature_biotype: Coding + feature_biotype: + - Coding + - ManeSelect rank: ord: 1 total: 23 @@ -41,7 +43,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007297.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 1 total: 22 @@ -68,7 +71,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007298.3 - feature_biotype: Coding + feature_biotype: + - Coding rank: ~ hgvs_t: ~ hgvs_p: ~ @@ -89,7 +93,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007299.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 1 total: 22 @@ -116,7 +121,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007300.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 1 total: 24 diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41277381-G-T.snap b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41277381-G-T.snap index 815f6c89..20f1b83f 100644 --- a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41277381-G-T.snap +++ b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41277381-G-T.snap @@ -14,7 +14,9 @@ expression: res SoTerm: term: Transcript feature_id: NM_007294.4 - feature_biotype: Coding + feature_biotype: + - Coding + - ManeSelect rank: ord: 1 total: 23 @@ -41,7 +43,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007297.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 1 total: 22 @@ -68,7 +71,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007298.3 - feature_biotype: Coding + feature_biotype: + - Coding rank: ~ hgvs_t: ~ hgvs_p: ~ @@ -89,7 +93,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007299.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 1 total: 22 @@ -116,7 +121,8 @@ 
expression: res SoTerm: term: Transcript feature_id: NM_007300.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ord: 1 total: 24 diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41277382-G-C.snap b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41277382-G-C.snap index fb8afb18..9b9dd01e 100644 --- a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41277382-G-C.snap +++ b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41277382-G-C.snap @@ -14,7 +14,9 @@ expression: res SoTerm: term: Transcript feature_id: NM_007294.4 - feature_biotype: Coding + feature_biotype: + - Coding + - ManeSelect rank: ~ hgvs_t: ~ hgvs_p: ~ @@ -35,7 +37,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007297.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ~ hgvs_t: ~ hgvs_p: ~ @@ -56,7 +59,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007298.3 - feature_biotype: Coding + feature_biotype: + - Coding rank: ~ hgvs_t: ~ hgvs_p: ~ @@ -77,7 +81,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007299.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ~ hgvs_t: ~ hgvs_p: ~ @@ -98,7 +103,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007300.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ~ hgvs_t: ~ hgvs_p: ~ diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41277383-A-C.snap b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41277383-A-C.snap index 1335a6d8..57e7098a 100644 --- a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41277383-A-C.snap +++ 
b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41277383-A-C.snap @@ -14,7 +14,9 @@ expression: res SoTerm: term: Transcript feature_id: NM_007294.4 - feature_biotype: Coding + feature_biotype: + - Coding + - ManeSelect rank: ~ hgvs_t: ~ hgvs_p: ~ @@ -35,7 +37,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007297.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ~ hgvs_t: ~ hgvs_p: ~ @@ -56,7 +59,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007298.3 - feature_biotype: Coding + feature_biotype: + - Coding rank: ~ hgvs_t: ~ hgvs_p: ~ @@ -77,7 +81,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007299.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ~ hgvs_t: ~ hgvs_p: ~ @@ -98,7 +103,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007300.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ~ hgvs_t: ~ hgvs_p: ~ diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41277384-G-C.snap b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41277384-G-C.snap index da121093..0cfed492 100644 --- a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41277384-G-C.snap +++ b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_one_variant@17-41277384-G-C.snap @@ -14,7 +14,9 @@ expression: res SoTerm: term: Transcript feature_id: NM_007294.4 - feature_biotype: Coding + feature_biotype: + - Coding + - ManeSelect rank: ~ hgvs_t: ~ hgvs_p: ~ @@ -35,7 +37,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007297.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ~ hgvs_t: ~ hgvs_p: ~ @@ -56,7 +59,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007298.3 - feature_biotype: Coding + feature_biotype: + - 
Coding rank: ~ hgvs_t: ~ hgvs_p: ~ @@ -77,7 +81,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007299.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ~ hgvs_t: ~ hgvs_p: ~ @@ -98,7 +103,8 @@ expression: res SoTerm: term: Transcript feature_id: NM_007300.4 - feature_biotype: Coding + feature_biotype: + - Coding rank: ~ hgvs_t: ~ hgvs_p: ~ diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_transcript_picking_reporting@17-41197701-G-C-false-false.snap b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_transcript_picking_reporting@17-41197701-G-C-false-false.snap new file mode 100644 index 00000000..6bd42d97 --- /dev/null +++ b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_transcript_picking_reporting@17-41197701-G-C-false-false.snap @@ -0,0 +1,36 @@ +--- +source: src/annotate/seqvars/csq.rs +expression: res +--- +- allele: + Alt: + alternative: C + consequences: + - missense_variant + putative_impact: MODERATE + gene_symbol: BRCA1 + gene_id: "HGNC:1100" + feature_type: + SoTerm: + term: Transcript + feature_id: NM_007294.4 + feature_biotype: + - Coding + - ManeSelect + rank: + ord: 23 + total: 23 + hgvs_t: c.5586C>G + hgvs_p: p.H1862Q + tx_pos: + ord: 5699 + total: 7088 + cds_pos: + ord: 5586 + total: 5592 + protein_pos: + ord: 1862 + total: 1864 + distance: 0 + messages: ~ + diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_transcript_picking_reporting@17-41197701-G-C-false-true.snap b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_transcript_picking_reporting@17-41197701-G-C-false-true.snap new file mode 100644 index 00000000..0ca67620 --- /dev/null +++ b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_transcript_picking_reporting@17-41197701-G-C-false-true.snap @@ -0,0 +1,154 
@@ +--- +source: src/annotate/seqvars/csq.rs +expression: res +--- +- allele: + Alt: + alternative: C + consequences: + - missense_variant + putative_impact: MODERATE + gene_symbol: BRCA1 + gene_id: "HGNC:1100" + feature_type: + SoTerm: + term: Transcript + feature_id: NM_007294.4 + feature_biotype: + - Coding + - ManeSelect + rank: + ord: 23 + total: 23 + hgvs_t: c.5586C>G + hgvs_p: p.H1862Q + tx_pos: + ord: 5699 + total: 7088 + cds_pos: + ord: 5586 + total: 5592 + protein_pos: + ord: 1862 + total: 1864 + distance: 0 + messages: ~ +- allele: + Alt: + alternative: C + consequences: + - missense_variant + putative_impact: MODERATE + gene_symbol: BRCA1 + gene_id: "HGNC:1100" + feature_type: + SoTerm: + term: Transcript + feature_id: NM_007297.4 + feature_biotype: + - Coding + rank: + ord: 22 + total: 22 + hgvs_t: c.5445C>G + hgvs_p: p.H1815Q + tx_pos: + ord: 5639 + total: 7028 + cds_pos: + ord: 5445 + total: 5451 + protein_pos: + ord: 1815 + total: 1817 + distance: 0 + messages: ~ +- allele: + Alt: + alternative: C + consequences: + - missense_variant + putative_impact: MODERATE + gene_symbol: BRCA1 + gene_id: "HGNC:1100" + feature_type: + SoTerm: + term: Transcript + feature_id: NM_007298.3 + feature_biotype: + - Coding + rank: + ord: 22 + total: 22 + hgvs_t: c.2274C>G + hgvs_p: p.H758Q + tx_pos: + ord: 2293 + total: 3682 + cds_pos: + ord: 2274 + total: 2280 + protein_pos: + ord: 758 + total: 760 + distance: 0 + messages: ~ +- allele: + Alt: + alternative: C + consequences: + - 3_prime_UTR_variant + putative_impact: MODIFIER + gene_symbol: BRCA1 + gene_id: "HGNC:1100" + feature_type: + SoTerm: + term: Transcript + feature_id: NM_007299.4 + feature_biotype: + - Coding + rank: + ord: 22 + total: 22 + hgvs_t: c.*100C>G + hgvs_p: p.? 
+ tx_pos: + ord: 2307 + total: 3696 + cds_pos: + ord: 100 + total: 2100 + protein_pos: ~ + distance: 0 + messages: ~ +- allele: + Alt: + alternative: C + consequences: + - missense_variant + putative_impact: MODERATE + gene_symbol: BRCA1 + gene_id: "HGNC:1100" + feature_type: + SoTerm: + term: Transcript + feature_id: NM_007300.4 + feature_biotype: + - Coding + rank: + ord: 24 + total: 24 + hgvs_t: c.5649C>G + hgvs_p: p.H1883Q + tx_pos: + ord: 5762 + total: 7151 + cds_pos: + ord: 5649 + total: 5655 + protein_pos: + ord: 1883 + total: 1885 + distance: 0 + messages: ~ + diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_transcript_picking_reporting@17-41197701-G-C-true-false.snap b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_transcript_picking_reporting@17-41197701-G-C-true-false.snap new file mode 100644 index 00000000..6bd42d97 --- /dev/null +++ b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_transcript_picking_reporting@17-41197701-G-C-true-false.snap @@ -0,0 +1,36 @@ +--- +source: src/annotate/seqvars/csq.rs +expression: res +--- +- allele: + Alt: + alternative: C + consequences: + - missense_variant + putative_impact: MODERATE + gene_symbol: BRCA1 + gene_id: "HGNC:1100" + feature_type: + SoTerm: + term: Transcript + feature_id: NM_007294.4 + feature_biotype: + - Coding + - ManeSelect + rank: + ord: 23 + total: 23 + hgvs_t: c.5586C>G + hgvs_p: p.H1862Q + tx_pos: + ord: 5699 + total: 7088 + cds_pos: + ord: 5586 + total: 5592 + protein_pos: + ord: 1862 + total: 1864 + distance: 0 + messages: ~ + diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_transcript_picking_reporting@17-41197701-G-C-true-true.snap b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_transcript_picking_reporting@17-41197701-G-C-true-true.snap new file mode 100644 index 
00000000..6bd42d97 --- /dev/null +++ b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_brca1_transcript_picking_reporting@17-41197701-G-C-true-true.snap @@ -0,0 +1,36 @@ +--- +source: src/annotate/seqvars/csq.rs +expression: res +--- +- allele: + Alt: + alternative: C + consequences: + - missense_variant + putative_impact: MODERATE + gene_symbol: BRCA1 + gene_id: "HGNC:1100" + feature_type: + SoTerm: + term: Transcript + feature_id: NM_007294.4 + feature_biotype: + - Coding + - ManeSelect + rank: + ord: 23 + total: 23 + hgvs_t: c.5586C>G + hgvs_p: p.H1862Q + tx_pos: + ord: 5699 + total: 7088 + cds_pos: + ord: 5586 + total: 5592 + protein_pos: + ord: 1862 + total: 1864 + distance: 0 + messages: ~ + diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179393094-C-T-false-false.snap b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179393094-C-T-false-false.snap new file mode 100644 index 00000000..27883e2f --- /dev/null +++ b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179393094-C-T-false-false.snap @@ -0,0 +1,36 @@ +--- +source: src/annotate/seqvars/csq.rs +expression: res +--- +- allele: + Alt: + alternative: T + consequences: + - synonymous_variant + putative_impact: LOW + gene_symbol: TTN + gene_id: "HGNC:12403" + feature_type: + SoTerm: + term: Transcript + feature_id: NM_001267550.2 + feature_biotype: + - Coding + - ManeSelect + rank: + ord: 361 + total: 363 + hgvs_t: c.107284C>A + hgvs_p: p.R35762= + tx_pos: + ord: 107509 + total: 109224 + cds_pos: + ord: 107284 + total: 107976 + protein_pos: + ord: 35762 + total: 35992 + distance: 0 + messages: ~ + diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179393094-C-T-false-true.snap 
b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179393094-C-T-false-true.snap new file mode 100644 index 00000000..41381b20 --- /dev/null +++ b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179393094-C-T-false-true.snap @@ -0,0 +1,186 @@ +--- +source: src/annotate/seqvars/csq.rs +expression: res +--- +- allele: + Alt: + alternative: T + consequences: + - synonymous_variant + putative_impact: LOW + gene_symbol: TTN + gene_id: "HGNC:12403" + feature_type: + SoTerm: + term: Transcript + feature_id: NM_001256850.1 + feature_biotype: + - Coding + rank: + ord: 311 + total: 313 + hgvs_t: c.102361C>A + hgvs_p: p.R34121= + tx_pos: + ord: 102586 + total: 104301 + cds_pos: + ord: 102361 + total: 103053 + protein_pos: + ord: 34121 + total: 34351 + distance: 0 + messages: ~ +- allele: + Alt: + alternative: T + consequences: + - synonymous_variant + putative_impact: LOW + gene_symbol: TTN + gene_id: "HGNC:12403" + feature_type: + SoTerm: + term: Transcript + feature_id: NM_001267550.2 + feature_biotype: + - Coding + - ManeSelect + rank: + ord: 361 + total: 363 + hgvs_t: c.107284C>A + hgvs_p: p.R35762= + tx_pos: + ord: 107509 + total: 109224 + cds_pos: + ord: 107284 + total: 107976 + protein_pos: + ord: 35762 + total: 35992 + distance: 0 + messages: ~ +- allele: + Alt: + alternative: T + consequences: + - synonymous_variant + putative_impact: LOW + gene_symbol: TTN + gene_id: "HGNC:12403" + feature_type: + SoTerm: + term: Transcript + feature_id: NM_003319.4 + feature_biotype: + - Coding + rank: + ord: 189 + total: 191 + hgvs_t: c.80089C>A + hgvs_p: p.R26697= + tx_pos: + ord: 80314 + total: 82029 + cds_pos: + ord: 80089 + total: 80781 + protein_pos: + ord: 26697 + total: 26927 + distance: 0 + messages: ~ +- allele: + Alt: + alternative: T + consequences: + - synonymous_variant + putative_impact: LOW + gene_symbol: TTN + gene_id: 
"HGNC:12403" + feature_type: + SoTerm: + term: Transcript + feature_id: NM_133378.4 + feature_biotype: + - Coding + rank: + ord: 310 + total: 312 + hgvs_t: c.99580C>A + hgvs_p: p.R33194= + tx_pos: + ord: 99805 + total: 101520 + cds_pos: + ord: 99580 + total: 100272 + protein_pos: + ord: 33194 + total: 33424 + distance: 0 + messages: ~ +- allele: + Alt: + alternative: T + consequences: + - synonymous_variant + putative_impact: LOW + gene_symbol: TTN + gene_id: "HGNC:12403" + feature_type: + SoTerm: + term: Transcript + feature_id: NM_133432.3 + feature_biotype: + - Coding + rank: + ord: 190 + total: 192 + hgvs_t: c.80464C>A + hgvs_p: p.R26822= + tx_pos: + ord: 80689 + total: 82404 + cds_pos: + ord: 80464 + total: 81156 + protein_pos: + ord: 26822 + total: 27052 + distance: 0 + messages: ~ +- allele: + Alt: + alternative: T + consequences: + - synonymous_variant + putative_impact: LOW + gene_symbol: TTN + gene_id: "HGNC:12403" + feature_type: + SoTerm: + term: Transcript + feature_id: NM_133437.4 + feature_biotype: + - Coding + rank: + ord: 190 + total: 192 + hgvs_t: c.80665C>A + hgvs_p: p.R26889= + tx_pos: + ord: 80890 + total: 82605 + cds_pos: + ord: 80665 + total: 81357 + protein_pos: + ord: 26889 + total: 27119 + distance: 0 + messages: ~ + diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179393094-C-T-true-false.snap b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179393094-C-T-true-false.snap new file mode 100644 index 00000000..27883e2f --- /dev/null +++ b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179393094-C-T-true-false.snap @@ -0,0 +1,36 @@ +--- +source: src/annotate/seqvars/csq.rs +expression: res +--- +- allele: + Alt: + alternative: T + consequences: + - synonymous_variant + putative_impact: LOW + gene_symbol: TTN + gene_id: 
"HGNC:12403" + feature_type: + SoTerm: + term: Transcript + feature_id: NM_001267550.2 + feature_biotype: + - Coding + - ManeSelect + rank: + ord: 361 + total: 363 + hgvs_t: c.107284C>A + hgvs_p: p.R35762= + tx_pos: + ord: 107509 + total: 109224 + cds_pos: + ord: 107284 + total: 107976 + protein_pos: + ord: 35762 + total: 35992 + distance: 0 + messages: ~ + diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179631246-G-A-false-false.snap b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179631246-G-A-false-false.snap new file mode 100644 index 00000000..0f9ab45e --- /dev/null +++ b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179631246-G-A-false-false.snap @@ -0,0 +1,36 @@ +--- +source: src/annotate/seqvars/csq.rs +expression: res +--- +- allele: + Alt: + alternative: A + consequences: + - stop_gained + putative_impact: HIGH + gene_symbol: TTN + gene_id: "HGNC:12403" + feature_type: + SoTerm: + term: Transcript + feature_id: NM_001267550.2 + feature_biotype: + - Coding + - ManeSelect + rank: + ord: 41 + total: 363 + hgvs_t: c.9565C>T + hgvs_p: p.Q3189* + tx_pos: + ord: 9790 + total: 109224 + cds_pos: + ord: 9565 + total: 107976 + protein_pos: + ord: 3189 + total: 35992 + distance: 0 + messages: ~ + diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179631246-G-A-false-true.snap b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179631246-G-A-false-true.snap new file mode 100644 index 00000000..90cb276f --- /dev/null +++ b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179631246-G-A-false-true.snap @@ -0,0 +1,217 @@ +--- +source: 
src/annotate/seqvars/csq.rs +expression: res +--- +- allele: + Alt: + alternative: A + consequences: + - stop_gained + putative_impact: HIGH + gene_symbol: TTN + gene_id: "HGNC:12403" + feature_type: + SoTerm: + term: Transcript + feature_id: NM_001256850.1 + feature_biotype: + - Coding + rank: + ord: 41 + total: 313 + hgvs_t: c.9565C>T + hgvs_p: p.Q3189* + tx_pos: + ord: 9790 + total: 104301 + cds_pos: + ord: 9565 + total: 103053 + protein_pos: + ord: 3189 + total: 34351 + distance: 0 + messages: ~ +- allele: + Alt: + alternative: A + consequences: + - stop_gained + putative_impact: HIGH + gene_symbol: TTN + gene_id: "HGNC:12403" + feature_type: + SoTerm: + term: Transcript + feature_id: NM_001267550.2 + feature_biotype: + - Coding + - ManeSelect + rank: + ord: 41 + total: 363 + hgvs_t: c.9565C>T + hgvs_p: p.Q3189* + tx_pos: + ord: 9790 + total: 109224 + cds_pos: + ord: 9565 + total: 107976 + protein_pos: + ord: 3189 + total: 35992 + distance: 0 + messages: ~ +- allele: + Alt: + alternative: A + consequences: + - stop_gained + putative_impact: HIGH + gene_symbol: TTN + gene_id: "HGNC:12403" + feature_type: + SoTerm: + term: Transcript + feature_id: NM_003319.4 + feature_biotype: + - Coding + rank: + ord: 40 + total: 191 + hgvs_t: c.9427C>T + hgvs_p: p.Q3143* + tx_pos: + ord: 9652 + total: 82029 + cds_pos: + ord: 9427 + total: 80781 + protein_pos: + ord: 3143 + total: 26927 + distance: 0 + messages: ~ +- allele: + Alt: + alternative: A + consequences: + - stop_gained + putative_impact: HIGH + gene_symbol: TTN + gene_id: "HGNC:12403" + feature_type: + SoTerm: + term: Transcript + feature_id: NM_133378.4 + feature_biotype: + - Coding + rank: + ord: 41 + total: 312 + hgvs_t: c.9565C>T + hgvs_p: p.Q3189* + tx_pos: + ord: 9790 + total: 101520 + cds_pos: + ord: 9565 + total: 100272 + protein_pos: + ord: 3189 + total: 33424 + distance: 0 + messages: ~ +- allele: + Alt: + alternative: A + consequences: + - stop_gained + putative_impact: HIGH + gene_symbol: TTN + gene_id: 
"HGNC:12403" + feature_type: + SoTerm: + term: Transcript + feature_id: NM_133379.5 + feature_biotype: + - Coding + - ManePlusClinical + rank: + ord: 41 + total: 46 + hgvs_t: c.9565C>T + hgvs_p: p.Q3189* + tx_pos: + ord: 9790 + total: 18220 + cds_pos: + ord: 9565 + total: 16815 + protein_pos: + ord: 3189 + total: 5605 + distance: 0 + messages: ~ +- allele: + Alt: + alternative: A + consequences: + - stop_gained + putative_impact: HIGH + gene_symbol: TTN + gene_id: "HGNC:12403" + feature_type: + SoTerm: + term: Transcript + feature_id: NM_133432.3 + feature_biotype: + - Coding + rank: + ord: 40 + total: 192 + hgvs_t: c.9427C>T + hgvs_p: p.Q3143* + tx_pos: + ord: 9652 + total: 82404 + cds_pos: + ord: 9427 + total: 81156 + protein_pos: + ord: 3143 + total: 27052 + distance: 0 + messages: ~ +- allele: + Alt: + alternative: A + consequences: + - stop_gained + putative_impact: HIGH + gene_symbol: TTN + gene_id: "HGNC:12403" + feature_type: + SoTerm: + term: Transcript + feature_id: NM_133437.4 + feature_biotype: + - Coding + rank: + ord: 40 + total: 192 + hgvs_t: c.9427C>T + hgvs_p: p.Q3143* + tx_pos: + ord: 9652 + total: 82605 + cds_pos: + ord: 9427 + total: 81357 + protein_pos: + ord: 3143 + total: 27119 + distance: 0 + messages: ~ + diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179631246-G-A-true-false.snap b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179631246-G-A-true-false.snap new file mode 100644 index 00000000..0f9ab45e --- /dev/null +++ b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179631246-G-A-true-false.snap @@ -0,0 +1,36 @@ +--- +source: src/annotate/seqvars/csq.rs +expression: res +--- +- allele: + Alt: + alternative: A + consequences: + - stop_gained + putative_impact: HIGH + gene_symbol: TTN + gene_id: "HGNC:12403" + 
feature_type: + SoTerm: + term: Transcript + feature_id: NM_001267550.2 + feature_biotype: + - Coding + - ManeSelect + rank: + ord: 41 + total: 363 + hgvs_t: c.9565C>T + hgvs_p: p.Q3189* + tx_pos: + ord: 9790 + total: 109224 + cds_pos: + ord: 9565 + total: 107976 + protein_pos: + ord: 3189 + total: 35992 + distance: 0 + messages: ~ + diff --git a/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179631246-G-A-true-true.snap b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179631246-G-A-true-true.snap new file mode 100644 index 00000000..f376b40a --- /dev/null +++ b/src/annotate/seqvars/snapshots/mehari__annotate__seqvars__csq__test__annotate_snv_ttn_transcript_picking_reporting@2-179631246-G-A-true-true.snap @@ -0,0 +1,67 @@ +--- +source: src/annotate/seqvars/csq.rs +expression: res +--- +- allele: + Alt: + alternative: A + consequences: + - stop_gained + putative_impact: HIGH + gene_symbol: TTN + gene_id: "HGNC:12403" + feature_type: + SoTerm: + term: Transcript + feature_id: NM_001267550.2 + feature_biotype: + - Coding + - ManeSelect + rank: + ord: 41 + total: 363 + hgvs_t: c.9565C>T + hgvs_p: p.Q3189* + tx_pos: + ord: 9790 + total: 109224 + cds_pos: + ord: 9565 + total: 107976 + protein_pos: + ord: 3189 + total: 35992 + distance: 0 + messages: ~ +- allele: + Alt: + alternative: A + consequences: + - stop_gained + putative_impact: HIGH + gene_symbol: TTN + gene_id: "HGNC:12403" + feature_type: + SoTerm: + term: Transcript + feature_id: NM_133379.5 + feature_biotype: + - Coding + - ManePlusClinical + rank: + ord: 41 + total: 46 + hgvs_t: c.9565C>T + hgvs_p: p.Q3189* + tx_pos: + ord: 9790 + total: 18220 + cds_pos: + ord: 9565 + total: 16815 + protein_pos: + ord: 3189 + total: 5605 + distance: 0 + messages: ~ + diff --git a/src/annotate/strucvars/csq.rs b/src/annotate/strucvars/csq.rs index da6a8cca..a7c10984 100644 --- 
a/src/annotate/strucvars/csq.rs +++ b/src/annotate/strucvars/csq.rs @@ -6,8 +6,8 @@ use biocommons_bioutils::assemblies::Assembly; use hgvs::data::interface::Provider; use crate::{ - annotate::seqvars::provider::{MehariProvider, TxIntervalTrees}, - db::create::txs::data::{Strand, Transcript, TxSeqDatabase}, + annotate::seqvars::provider::{Provider as MehariProvider, TxIntervalTrees}, + db::create::data::{Strand, Transcript, TxSeqDatabase}, }; /// Enumeration for effect on transcript. @@ -383,9 +383,8 @@ fn compute_tx_effects_for_linear( let tree = &mehari_tx_idx.trees[*idx]; for it in tree.find(query) { let tx = &tx_db.transcripts[*it.data() as usize]; - let hgnc_id = format!("HGNC:{}", &tx.gene_id); effects_by_gene - .entry(hgnc_id) + .entry(tx.gene_id.clone()) .or_default() .extend(gene_tx_effect_for_range(tx, sv.start(), sv.stop())); } diff --git a/src/annotate/strucvars/mod.rs b/src/annotate/strucvars/mod.rs index 62e8abc2..024d726d 100644 --- a/src/annotate/strucvars/mod.rs +++ b/src/annotate/strucvars/mod.rs @@ -1442,7 +1442,7 @@ impl TryInto for VarFishStrucvarTsvRecord { } /// Enumeration for the supported variant callers. 
-#[derive(Debug, Clone, PartialEq, EnumIter)] +#[derive(Debug, Clone, PartialEq, EnumIter, Serialize, Deserialize)] pub enum SvCaller { Delly { version: String }, DragenSv { version: String }, @@ -2973,7 +2973,7 @@ pub async fn run(_common: &crate::common::Args, args: &Args) -> Result<(), anyho GenomeRelease::Grch38 => Assembly::Grch38, }); let (header, assembly) = { - let mut reader = VariantReaderBuilder.build_from_path( + let mut reader = VariantReaderBuilder::default().build_from_path( args.path_input_vcf .first() .expect("must have at least input VCF"), @@ -3224,7 +3224,7 @@ mod test { let temp = TempDir::default(); let out_jsonl = File::create(temp.join(out_file_name))?; - let mut reader = noodles_vcf::reader::Builder.build_from_path(path_input_vcf)?; + let mut reader = noodles_vcf::reader::Builder::default().build_from_path(path_input_vcf)?; let header_in = reader.read_header()?; // Setup deterministic bytes for UUID generation. @@ -3259,7 +3259,7 @@ mod test { /// Helper that returns sample names from VCF. 
fn vcf_samples(path: &str) -> Result, anyhow::Error> { - let mut reader = noodles_vcf::reader::Builder.build_from_path(path)?; + let mut reader = noodles_vcf::reader::Builder::default().build_from_path(path)?; let header: VcfHeader = reader.read_header()?; Ok(header .sample_names() @@ -3443,7 +3443,7 @@ mod test { async fn guess_sv_caller_delly() -> Result<(), anyhow::Error> { let mut reader = open_vcf_reader("tests/data/annotate/strucvars/delly2-min.vcf").await?; let sv_caller = guess_sv_caller(&mut reader).await?; - insta::assert_debug_snapshot!(sv_caller); + insta::assert_yaml_snapshot!(sv_caller); Ok(()) } @@ -3452,7 +3452,7 @@ mod test { async fn guess_sv_caller_dragen_sv() -> Result<(), anyhow::Error> { let mut reader = open_vcf_reader("tests/data/annotate/strucvars/dragen-sv-min.vcf").await?; let sv_caller = guess_sv_caller(&mut reader).await?; - insta::assert_debug_snapshot!(sv_caller); + insta::assert_yaml_snapshot!(sv_caller); Ok(()) } @@ -3462,7 +3462,7 @@ mod test { let mut reader = open_vcf_reader("tests/data/annotate/strucvars/dragen-cnv-min.vcf").await?; let sv_caller = guess_sv_caller(&mut reader).await?; - insta::assert_debug_snapshot!(sv_caller); + insta::assert_yaml_snapshot!(sv_caller); Ok(()) } @@ -3471,7 +3471,7 @@ mod test { async fn guess_sv_caller_gcnv() -> Result<(), anyhow::Error> { let mut reader = open_vcf_reader("tests/data/annotate/strucvars/gcnv-min.vcf").await?; let sv_caller = guess_sv_caller(&mut reader).await?; - insta::assert_debug_snapshot!(sv_caller); + insta::assert_yaml_snapshot!(sv_caller); Ok(()) } @@ -3480,7 +3480,7 @@ mod test { async fn guess_sv_caller_manta() -> Result<(), anyhow::Error> { let mut reader = open_vcf_reader("tests/data/annotate/strucvars/manta-min.vcf").await?; let sv_caller = guess_sv_caller(&mut reader).await?; - insta::assert_debug_snapshot!(sv_caller); + insta::assert_yaml_snapshot!(sv_caller); Ok(()) } @@ -3489,7 +3489,7 @@ mod test { async fn guess_sv_caller_melt() -> Result<(), anyhow::Error> { 
let mut reader = open_vcf_reader("tests/data/annotate/strucvars/melt-min.vcf").await?; let sv_caller = guess_sv_caller(&mut reader).await?; - insta::assert_debug_snapshot!(sv_caller); + insta::assert_yaml_snapshot!(sv_caller); Ok(()) } @@ -3498,7 +3498,7 @@ mod test { async fn guess_sv_caller_popdel() -> Result<(), anyhow::Error> { let mut reader = open_vcf_reader("tests/data/annotate/strucvars/popdel-min.vcf").await?; let sv_caller = guess_sv_caller(&mut reader).await?; - insta::assert_debug_snapshot!(sv_caller); + insta::assert_yaml_snapshot!(sv_caller); Ok(()) } diff --git a/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_delly.snap b/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_delly.snap index 17e61936..10567093 100644 --- a/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_delly.snap +++ b/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_delly.snap @@ -2,6 +2,6 @@ source: src/annotate/strucvars/mod.rs expression: sv_caller --- -Delly { - version: "1.1.3", -} +Delly: + version: 1.1.3 + diff --git a/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_dragen_cnv.snap b/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_dragen_cnv.snap index e462857b..9265355a 100644 --- a/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_dragen_cnv.snap +++ b/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_dragen_cnv.snap @@ -2,6 +2,6 @@ source: src/annotate/strucvars/mod.rs expression: sv_caller --- -DragenCnv { - version: "07.021.624.3.10.4", -} +DragenCnv: + version: 07.021.624.3.10.4 + diff --git a/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_dragen_sv.snap b/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_dragen_sv.snap index 
c93fd386..88b9f1ae 100644 --- a/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_dragen_sv.snap +++ b/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_dragen_sv.snap @@ -2,6 +2,6 @@ source: src/annotate/strucvars/mod.rs expression: sv_caller --- -DragenSv { - version: "07.021.624.3.10.4", -} +DragenSv: + version: 07.021.624.3.10.4 + diff --git a/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_gcnv.snap b/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_gcnv.snap index ad528f19..2f625ef2 100644 --- a/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_gcnv.snap +++ b/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_gcnv.snap @@ -2,6 +2,6 @@ source: src/annotate/strucvars/mod.rs expression: sv_caller --- -Gcnv { - version: "4.3.0.0", -} +Gcnv: + version: 4.3.0.0 + diff --git a/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_manta.snap b/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_manta.snap index ab880087..902b9a73 100644 --- a/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_manta.snap +++ b/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_manta.snap @@ -2,6 +2,6 @@ source: src/annotate/strucvars/mod.rs expression: sv_caller --- -Manta { - version: "1.6.0", -} +Manta: + version: 1.6.0 + diff --git a/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_melt.snap b/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_melt.snap index d59c1e4b..05be52de 100644 --- a/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_melt.snap +++ b/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_melt.snap @@ 
-2,6 +2,6 @@ source: src/annotate/strucvars/mod.rs expression: sv_caller --- -Melt { - version: "2.2.2", -} +Melt: + version: 2.2.2 + diff --git a/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_popdel.snap b/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_popdel.snap index 089ef02c..409551f4 100644 --- a/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_popdel.snap +++ b/src/annotate/strucvars/snapshots/mehari__annotate__strucvars__test__guess_sv_caller_popdel.snap @@ -2,6 +2,6 @@ source: src/annotate/strucvars/mod.rs expression: sv_caller --- -Popdel { - version: "1.1.2", -} +Popdel: + version: 1.1.2 + diff --git a/src/db/create/mod.rs b/src/db/create/mod.rs index a46ea50d..84480302 100644 --- a/src/db/create/mod.rs +++ b/src/db/create/mod.rs @@ -1,3 +1,958 @@ -//! Creation of mehari internal databases. +//! Transcript database. -pub mod txs; +use std::collections::HashSet; +use std::fs::File; +use std::path::Path; +use std::{collections::HashMap, io::Write, path::PathBuf, time::Instant}; + +use anyhow::anyhow; +use clap::Parser; +use hgvs::data::cdot::json::models; +use hgvs::sequences::{translate_cds, TranslationTable}; +use indicatif::{ProgressBar, ProgressStyle}; +use prost::Message; +use seqrepo::{AliasOrSeqId, Interface, SeqRepo}; +use thousands::Separable; + +use crate::common::{trace_rss_now, GenomeRelease}; + +lazy_static::lazy_static! { + /// Progress bar style to use. + pub static ref PROGRESS_STYLE: ProgressStyle = ProgressStyle::with_template( + "[{elapsed_precise}] [{wide_bar:.cyan/blue}] {human_pos}/{human_len} ({eta})", + ) + .unwrap(); +} + +/// Data structures for (de-)serialization as generated by `prost-build`. +pub mod data { + include!(concat!(env!("OUT_DIR"), "/mehari.db.create.txs.data.rs")); +} + +/// Command line arguments for `db create txs` sub command. 
+#[derive(Parser, Debug)] +#[command(about = "Construct mehari transcripts and sequence database", long_about = None)] +pub struct Args { + /// Genome release to extract transcripts for. + #[arg(long)] + pub genome_release: GenomeRelease, + /// Path to output protobuf file to write to. + #[arg(long)] + pub path_out: PathBuf, + /// Paths to the cdot JSON transcripts to import. + #[arg(long, required = true)] + pub path_cdot_json: Vec, + /// Path to the seqrepo instance directory to use. + #[arg(long)] + pub path_seqrepo_instance: PathBuf, + /// Path to TSV file for label transfer of transcripts. Columns are + /// transcript id (without version), (unused) gene symbol, and label. + #[arg(long)] + pub path_mane_txs_tsv: Option, + /// Maximal number of transcripts to process. + #[arg(long)] + pub max_txs: Option, + /// Limit transcript database to the following HGNC symbols. Useful for + /// building test databases. + #[arg(long)] + pub gene_symbols: Option>, +} + +/// Helper struct for parsing the label TSV file. +#[derive(Debug, Clone, PartialEq, Eq, serde::Deserialize)] +struct LabelEntry { + /// Transcript identifier without version. + transcript_id: String, + /// Gene symbol (unused). + _gene_symbol: String, + /// Label to transfer. + label: String, +} + +/// Load and extract from cdot JSON. 
+#[allow(clippy::too_many_arguments)] +fn load_and_extract( + json_path: &Path, + label_tsv_path: &Option<&Path>, + transcript_ids_for_gene: &mut HashMap>, + genes: &mut HashMap, + transcripts: &mut HashMap, + genome_release: GenomeRelease, + cdot_version: &mut String, + report_file: &mut File, +) -> Result<(), anyhow::Error> { + writeln!(report_file, "genome_release\t{:?}", genome_release)?; + let txid_to_label = if let Some(label_tsv_path) = label_tsv_path { + tracing::info!("Loading label TSV file..."); + writeln!(report_file, "label_tsv_path\t{:?}", label_tsv_path)?; + + let mut rdr = csv::ReaderBuilder::new() + .delimiter(b'\t') + .comment(Some(b'#')) + .has_headers(false) + .from_path(label_tsv_path)?; + + let mut txid_to_label = HashMap::new(); + for result in rdr.deserialize() { + let entry: LabelEntry = result?; + txid_to_label.insert( + entry.transcript_id, + entry + .label + .split(',') + .map(models::str_to_tag) + .collect::>(), + ); + } + + tracing::info!( + "...done loading label TSV file ({} entries)", + txid_to_label.len() + ); + Some(txid_to_label) + } else { + None + }; + + tracing::info!("Loading cdot transcripts from {:?}", json_path); + writeln!(report_file, "cdot_json_path\t{:?}", json_path)?; + let start = Instant::now(); + let models::Container { + genes: c_genes, + transcripts: c_txs, + cdot_version: c_version, + .. + } = if json_path.extension().unwrap_or_default() == "gz" { + tracing::info!("(from gzip compressed file)"); + serde_json::from_reader(std::io::BufReader::new(flate2::read::GzDecoder::new( + File::open(json_path)?, + )))? + } else { + tracing::info!("(from uncompressed file)"); + serde_json::from_reader(std::io::BufReader::new(File::open(json_path)?))? 
+ }; + *cdot_version = c_version; + tracing::info!( + "loading / deserializing {} genes and {} transcripts from cdot took {:?}", + c_genes.len().separate_with_commas(), + c_txs.len().separate_with_commas(), + start.elapsed() + ); + + let start = Instant::now(); + writeln!(report_file, "total_genes\t{}", c_genes.len())?; + c_genes + .values() + .filter(|gene| { + gene.hgnc.is_some() + && !gene.hgnc.as_ref().unwrap().is_empty() + && gene.map_location.is_some() + && !gene.map_location.as_ref().unwrap().is_empty() + && gene.hgnc.is_some() + && !gene.hgnc.as_ref().unwrap().is_empty() + }) + .for_each(|gene| { + let hgnc_id = format!("HGNC:{}", gene.hgnc.as_ref().unwrap()); + transcript_ids_for_gene.entry(hgnc_id.clone()).or_default(); + genes.insert(hgnc_id, gene.clone()); + }); + writeln!( + report_file, + "genes with gene_symbol, map_location, hgnc\t{}", + genes.len() + )?; + tracing::info!( + "Processed {} genes; total gene count: {}", + c_genes.len().separate_with_commas(), + genes.len() + ); + tracing::debug!( + "some 10 genes (HGNC IDs): {:?}", + genes.keys().take(10).collect::>() + ); + tracing::debug!( + "some 10 genes (symbols): {:?}", + genes + .values() + .take(10) + .map(|tx| tx.gene_symbol.clone().unwrap_or_default()) + .collect::>() + ); + + tracing::info!("Processing transcripts"); + writeln!(report_file, "total_transcripts\t{}", c_txs.len())?; + c_txs + .values() + .map(|tx| models::Transcript { + genome_builds: tx + .genome_builds + .iter() + .filter(|(key, _)| { + matches!( + (key.as_str(), genome_release), + ("GRCh37", GenomeRelease::Grch37) | ("GRCh38", GenomeRelease::Grch38) + ) + }) + .map(|(k, v)| (k.clone(), v.clone())) + .collect(), + ..tx.clone() + }) + .filter(|tx| { + tx.hgnc.is_some() + && !tx.hgnc.as_ref().unwrap().is_empty() + && genes.contains_key(&format!("HGNC:{}", tx.hgnc.as_ref().unwrap())) + && !tx.genome_builds.is_empty() + }) + .for_each(|tx| { + let hgnc_id = &format!("HGNC:{}", tx.hgnc.as_ref().unwrap()); + 
transcript_ids_for_gene + .get_mut(hgnc_id) + .unwrap_or_else(|| panic!("tx {:?} for unknown gene {:?}", tx.id, hgnc_id)) + .push(tx.id.clone()); + let mut tx_out = tx.clone(); + if let Some(txid_to_tags) = txid_to_label.as_ref() { + let tx_id_no_version = tx.id.split('.').next().unwrap(); + if let Some(tags) = txid_to_tags.get(tx_id_no_version) { + tx_out.tag = Some(tags.clone()); + } + } + transcripts.insert(tx.id.clone(), tx_out); + }); + writeln!( + report_file, + "transcripts with alignment on genome and link to selected gene\t{}", + transcripts.len() + )?; + tracing::info!( + "Processed {} genes; total transcript count: {}", + c_txs.len().separate_with_commas(), + transcripts.len().separate_with_commas() + ); + tracing::info!("extracting datastructures took {:?}", start.elapsed()); + Ok(()) +} + +/// Perform protobuf file construction. +/// +/// This can be done by simply converting the models from HGVS to the prost generated data structures. +fn build_protobuf( + path_out: &Path, + seqrepo: SeqRepo, + tx_data: TranscriptData, + is_silent: bool, + genome_release: GenomeRelease, + report_file: &mut File, +) -> Result<(), anyhow::Error> { + let TranscriptData { + genes, + transcripts, + transcript_ids_for_gene, + } = tx_data; + + tracing::info!("Constructing protobuf data structures ..."); + trace_rss_now(); + + // Construct sequence database. + tracing::info!(" Constructing sequence database ..."); + let mut tx_skipped_noseq = HashSet::new(); // skipped because of missing sequence + let mut tx_skipped_nostop = HashSet::new(); // skipped because of missing stop codon + let seq_db = { + // Insert into protobuf and keep track of pointers in `Vec`s. 
+ let mut aliases = Vec::new(); + let mut aliases_idx = Vec::new(); + let mut seqs = Vec::new(); + let pb = if is_silent { + ProgressBar::hidden() + } else { + ProgressBar::new(transcripts.len() as u64) + }; + pb.set_style(PROGRESS_STYLE.clone()); + for (tx_id, tx) in &transcripts { + pb.inc(1); + let namespace = if tx_id.starts_with("ENST") { + Some(String::from("ENSEMBL")) + } else { + Some(String::from("NCBI")) + }; + let res_seq = seqrepo.fetch_sequence(&AliasOrSeqId::Alias { + value: tx_id.clone(), + namespace, + }); + let seq = if let Ok(seq) = res_seq { + seq + } else { + tracing::debug!("Skipping transcript {} because of missing sequence", tx_id); + writeln!( + report_file, + "skip transcript because it has no sequence\t{}", + tx_id + )?; + tx_skipped_noseq.insert(tx_id.clone()); + continue; + }; + + // Skip transcript if it is coding and the translated CDS does not have a stop codon. + if let Some(cds_start) = tx.start_codon { + let cds_start = cds_start as usize; + let cds_end = tx.stop_codon.expect("must be some if start_codon is some") as usize; + if cds_end > seq.len() { + tracing::error!( + "CDS end {} is larger than sequence length {} for {}", + cds_end, + seq.len(), + tx_id + ); + writeln!( + report_file, + "skip transcript CDS end {} is longer than sequence length {} for\t{}", + cds_end, + seq.len(), + tx_id + )?; + continue; + } + let tx_seq_to_translate = &seq[cds_start..cds_end]; + let aa_sequence = + translate_cds(tx_seq_to_translate, true, "*", TranslationTable::Standard)?; + if !aa_sequence.ends_with('*') { + tracing::debug!( + "Skipping transcript {} because of missing stop codon in translated CDS", + tx_id + ); + writeln!( + report_file, + "Skipping transcript {} because of missing stop codon in translated CDS", + tx_id + )?; + tx_skipped_nostop.insert(tx_id.clone()); + continue; + } + } + + // Register sequence into protobuf. 
+ aliases.push(tx_id.clone()); + aliases_idx.push(seqs.len() as u32); + seqs.push(seq.clone()); + } + pb.finish_and_clear(); + // Finalize by creating `SequenceDb`. + data::SequenceDb { + aliases, + aliases_idx, + seqs, + } + }; + tracing::info!( + " ... done constructing sequence database (no seq for {} transcripts, \ + no stop codon for {}, will be skipped)", + tx_skipped_noseq.len().separate_with_commas(), + tx_skipped_nostop.len().separate_with_commas(), + ); + + trace_rss_now(); + + tracing::info!(" Creating transcript records for each gene..."); + let data_transcripts = { + let gene_symbols = { + let mut gene_symbols: Vec<_> = genes.keys().cloned().collect(); + gene_symbols.sort(); + gene_symbols + }; + let mut data_transcripts = Vec::new(); + // For each gene (in lexicographic symbol order) ... + for gene_symbol in &gene_symbols { + let gene = genes.get(gene_symbol).unwrap(); + let tx_ids = transcript_ids_for_gene + .get(gene_symbol.as_str()) + .unwrap_or_else(|| panic!("No transcripts for gene {:?}", &gene_symbol)); + let tx_ids = tx_ids + .iter() + .filter(|tx_id| { + !tx_skipped_noseq.contains(*tx_id) && !tx_skipped_nostop.contains(*tx_id) + }) + .collect::>(); + if tx_ids.is_empty() { + tracing::debug!( + "Skipping gene {} as all transcripts have been removed.", + gene_symbol + ); + writeln!( + report_file, + "skip gene from protobuf because all transcripts have been removed\t{}", + gene_symbol + )?; + continue; + } + + // ... for each transcript of the gene ... + for tx_id in tx_ids { + let tx_model = transcripts + .get(tx_id) + .unwrap_or_else(|| panic!("No transcript model for id {:?}", tx_id)); + // ... 
build genome alignment for selected: + let mut genome_alignments = Vec::new(); + for (genome_build, alignment) in &tx_model.genome_builds { + // obtain basic properties + let genome_build = match genome_build.as_ref() { + "GRCh37" => data::GenomeBuild::Grch37, + "GRCh38" => data::GenomeBuild::Grch38, + _ => panic!("Unknown genome build {:?}", genome_build), + }; + let models::GenomeAlignment { + contig, + cds_start, + cds_end, + .. + } = alignment.clone(); + let strand = match alignment.strand { + models::Strand::Plus => data::Strand::Plus, + models::Strand::Minus => data::Strand::Minus, + }; + // and construct vector of all exons + let exons: Vec<_> = alignment + .exons + .iter() + .map(|exon| { + let models::Exon { + alt_start_i, + alt_end_i, + ord, + alt_cds_start_i, + alt_cds_end_i, + cigar, + } = exon.clone(); + data::ExonAlignment { + alt_start_i, + alt_end_i, + ord, + alt_cds_start_i: if alt_cds_start_i == -1 { + None + } else { + Some(alt_cds_start_i) + }, + alt_cds_end_i: if alt_cds_end_i == -1 { + None + } else { + Some(alt_cds_end_i) + }, + cigar, + } + }) + .collect(); + // and finally push the genome alignment + genome_alignments.push(data::GenomeAlignment { + genome_build: genome_build.into(), + contig, + cds_start, + cds_end, + strand: strand.into(), + exons, + }); + } + + // Now, just obtain the basic properties and create a new `data::Transcript`. + let models::Gene { + biotype, + hgnc, + gene_symbol, + .. 
+ } = gene.clone(); + let biotype = if biotype.unwrap().contains(&models::BioType::ProteinCoding) { + data::TranscriptBiotype::Coding.into() + } else { + data::TranscriptBiotype::NonCoding.into() + }; + let mut tags = Vec::new(); + if let Some(tag) = tx_model.tag.as_ref() { + for t in tag { + let elem = match t { + models::Tag::Basic => data::TranscriptTag::Basic.into(), + models::Tag::EnsemblCanonical => { + data::TranscriptTag::EnsemblCanonical.into() + } + models::Tag::ManeSelect => data::TranscriptTag::ManeSelect.into(), + models::Tag::ManePlusClinical => { + data::TranscriptTag::ManePlusClinical.into() + } + models::Tag::RefSeqSelect => data::TranscriptTag::RefSeqSelect.into(), + }; + tags.push(elem); + } + } + let models::Transcript { + protein, + start_codon, + stop_codon, + .. + } = tx_model.clone(); + + data_transcripts.push(data::Transcript { + id: tx_id.clone(), + gene_symbol: gene_symbol.expect("missing gene symbol"), + gene_id: format!("HGNC:{}", hgnc.expect("missing HGNC ID")), + biotype, + tags, + protein, + start_codon, + stop_codon, + genome_alignments, + }); + } + } + + data_transcripts + }; + tracing::info!(" ... done creating transcripts"); + + trace_rss_now(); + + // Build mapping of gene HGNC symbol to transcript IDs. + tracing::info!(" Build gene symbol to transcript ID mapping ..."); + let gene_to_tx = transcript_ids_for_gene + .into_iter() + .map(|(gene_id, tx_ids)| data::GeneToTxId { gene_id, tx_ids }) + .collect::>(); + tracing::info!(" ... done building gene symbol to transcript ID mapping"); + + trace_rss_now(); + + // Compose transcript database from transcripts and gene to transcript mapping. + tracing::info!(" Composing transcript database ..."); + let tx_db = data::TranscriptDb { + transcripts: data_transcripts, + gene_to_tx, + }; + tracing::info!(" ... done composing transcript database"); + + trace_rss_now(); + + // Compose the final transcript and sequence database. 
+ tracing::info!(" Constructing final tx and seq database ..."); + let tx_seq_db = data::TxSeqDatabase { + tx_db: Some(tx_db), + seq_db: Some(seq_db), + version: Some(crate::common::version().to_string()), + genome_release: Some(genome_release.name()), + }; + let mut buf = Vec::new(); + buf.reserve(tx_seq_db.encoded_len()); + tx_seq_db + .encode(&mut buf) + .map_err(|e| anyhow!("failed to encode: {}", e))?; + tracing::info!(" ... done constructing final tx and seq database"); + + trace_rss_now(); + + // Write out the final transcript and sequence database. + tracing::info!(" Writing out final database ..."); + // Open file and if necessary, wrap in a decompressor. + let file = std::fs::File::create(path_out) + .map_err(|e| anyhow!("failed to create file {}: {}", path_out.display(), e))?; + let ext = path_out.extension().map(|s| s.to_str()); + let mut writer: Box = if ext == Some(Some("gz")) { + Box::new(flate2::write::GzEncoder::new( + file, + flate2::Compression::default(), + )) + } else if ext == Some(Some("zst")) { + Box::new( + zstd::Encoder::new(file, 0) + .map_err(|e| { + anyhow!( + "failed to open zstd encoder for {}: {}", + path_out.display(), + e + ) + })? + .auto_finish(), + ) + } else { + Box::new(file) + }; + writer + .write_all(&buf) + .map_err(|e| anyhow!("failed to write to {}: {}", path_out.display(), e))?; + tracing::info!(" ... done writing out final database"); + + trace_rss_now(); + + tracing::info!("... done with constructing protobuf file"); + Ok(()) +} + +/// Data as loaded from cdot after processing. +#[derive(Debug)] +struct TranscriptData { + pub genes: HashMap, + pub transcripts: HashMap, + pub transcript_ids_for_gene: HashMap>, +} + +/// Filter transcripts for gene. +/// +/// We employ the following rules: +/// +/// - Remove redundant transcripts with the same identifier and pick only the +/// transcripts that have the highest version number for one assembly. +/// - Do not pick any `XM_`/`XR_` (NCBI predicted only) transcripts. 
+/// - Do not pick any `NR_` transcripts when there are coding `NM_` transcripts. +fn filter_transcripts( + tx_data: TranscriptData, + max_genes: Option, + gene_symbols: &Option>, + report_file: &mut File, +) -> Result { + tracing::info!("Filtering transcripts ..."); + let start = Instant::now(); + let selected_hgnc_ids = gene_symbols.as_ref().map(|gene_symbols| { + let symbol_to_hgnc: HashMap<_, _> = + HashMap::from_iter(tx_data.genes.iter().flat_map(|(hgnc_id, g)| { + g.gene_symbol + .as_ref() + .map(|gene_symbol| (gene_symbol.clone(), hgnc_id.clone())) + })); + let result = gene_symbols + .iter() + .map(|s| symbol_to_hgnc.get(s).unwrap_or(s).clone()) + .collect::>(); + tracing::info!("Will limit to {:?}", &result); + result + }); + + let TranscriptData { + genes, + transcripts, + transcript_ids_for_gene, + } = tx_data; + + // Potentially limit number of genes. + let transcript_ids_for_gene = if let Some(max_genes) = max_genes { + tracing::warn!("Limiting to {} genes!", max_genes); + transcript_ids_for_gene + .into_iter() + .take(max_genes as usize) + .collect() + } else { + transcript_ids_for_gene + }; + + // We keep track of the chosen transcript identifiers. + let mut chosen = HashSet::new(); + // Filter map from gene symbol to Vec of chosen transcript identifiers. + let transcript_ids_for_gene = { + let mut tmp = HashMap::new(); + + for (hgnc_id, tx_ids) in &transcript_ids_for_gene { + // Skip transcripts where the gene symbol is not contained in `selected_hgnc_ids`. + if !selected_hgnc_ids + .as_ref() + .map(|ids| ids.contains(hgnc_id)) + .unwrap_or(true) + { + tracing::trace!("skipping {} / {:?}, because not selected", hgnc_id, tx_ids); + continue; + } + + // Only select the highest version of each transcript. + // + // First, split off transcript versions from accessions and look for NM transcript. 
+ let mut seen_nm = false; + let mut versioned: Vec<_> = tx_ids + .iter() + .map(|tx_id| { + if tx_id.starts_with("NM_") { + seen_nm = true; + } + let s: Vec<_> = tx_id.split('.').collect(); + (s[0], s[1].parse::().expect("invalid version")) + }) + .collect(); + // Sort descendingly by version. + versioned.sort_by(|a, b| b.1.cmp(&a.1)); + + // Build `next_tx_ids`. + let mut seen_ac = HashSet::new(); + let mut next_tx_ids = Vec::new(); + for (ac, version) in versioned { + let full_ac = format!("{}.{}", &ac, version); + let ac = ac.to_string(); + + let releases = transcripts + .get(&full_ac) + .map(|tx| tx.genome_builds.keys().cloned().collect::>()) + .unwrap_or_default(); + + for release in releases { + #[allow(clippy::if_same_then_else)] + if seen_ac.contains(&(ac.clone(), release.clone())) { + writeln!( + report_file, + "skipped transcript {} because we have a later version already", + &full_ac + )?; + continue; // skip, already have later version + } else if ac.starts_with("NR_") && seen_nm { + writeln!( + report_file, + "skipped transcript {} because we have a NM transcript", + &full_ac + )?; + continue; // skip NR transcript as we have NM one + } else if ac.starts_with('X') { + writeln!( + report_file, + "skipped transcript {} because it is an XR/XM transcript", + &full_ac + )?; + continue; // skip XR/XM transcript + } else { + // Check transcript's CDS length for being multiple of 3 and skip unless it is. 
+ let tx = transcripts + .get(&full_ac) + .expect("must exist; accession taken from map earlier"); + if let Some(cds_start) = tx.start_codon { + let cds_end = + tx.stop_codon.expect("must be some if start_codon is some"); + let cds_len = cds_end - cds_start; + if cds_len % 3 != 0 { + tracing::debug!("skipping transcript {} because its CDS length is not a multiple of 3", &full_ac); + writeln!(report_file, "skipped transcript {} because its CDS length {} is not a multiple of 3", &full_ac, cds_len)?; + continue; + } + } + + // Otherwise, mark transcript as included by storing its accession. + next_tx_ids.push(full_ac.clone()); + seen_ac.insert((ac.clone(), release)); + } + } + } + + next_tx_ids.sort(); + next_tx_ids.dedup(); + chosen.extend(next_tx_ids.iter().cloned()); + + if !next_tx_ids.is_empty() { + tmp.insert(hgnc_id.clone(), next_tx_ids); + } else { + writeln!( + report_file, + "skipped gene {} because we have no transcripts left", + hgnc_id + )?; + } + } + + tmp + }; + + let transcripts: HashMap<_, _> = transcripts + .into_iter() + .filter(|(tx_id, _)| chosen.contains(tx_id)) + .collect(); + tracing::debug!( + " => {} transcripts left", + transcripts.len().separate_with_commas() + ); + writeln!(report_file, "total transcripts\t{}", transcripts.len())?; + + let genes: HashMap<_, _> = genes + .into_iter() + .filter(|(gene_id, _)| transcript_ids_for_gene.contains_key(gene_id)) + .collect(); + tracing::debug!(" => {} genes left", genes.len().separate_with_commas()); + + tracing::info!("... done filtering transcripts in {:?}", start.elapsed()); + Ok(TranscriptData { + genes, + transcripts, + transcript_ids_for_gene, + }) +} + +/// Create file-backed `SeqRepo`. +fn open_seqrepo(args: &Args) -> Result { + tracing::info!("Opening seqrepo..."); + let start = Instant::now(); + let seqrepo = PathBuf::from(&args.path_seqrepo_instance); + let path = seqrepo + .parent() + .ok_or(anyhow::anyhow!( + "Could not get parent from {:?}", + &args.path_seqrepo_instance + ))? 
+ .to_str() + .unwrap() + .to_string(); + let instance = seqrepo + .file_name() + .ok_or(anyhow::anyhow!( + "Could not get basename from {:?}", + &args.path_seqrepo_instance + ))? + .to_str() + .unwrap() + .to_string(); + let seqrepo = SeqRepo::new(path, &instance)?; + tracing::info!("... seqrepo opened in {:?}", start.elapsed()); + Ok(seqrepo) +} + +/// Load the cdot JSON files. +fn load_cdot_files(args: &Args, report_file: &mut File) -> Result { + tracing::info!("Loading cdot JSON files ..."); + let start = Instant::now(); + let mut genes = HashMap::new(); + let mut transcripts = HashMap::new(); + let mut transcript_ids_for_gene = HashMap::new(); + let mut cdot_version = String::new(); + for json_path in &args.path_cdot_json { + load_and_extract( + json_path, + &args.path_mane_txs_tsv.as_ref().map(|p| p.as_ref()), + &mut transcript_ids_for_gene, + &mut genes, + &mut transcripts, + args.genome_release, + &mut cdot_version, + report_file, + )?; + } + tracing::info!( + "... done loading cdot JSON files in {:?} -- #genes = {}, #transcripts = {}, #transcript_ids_for_gene = {}", + start.elapsed(), + genes.len().separate_with_commas(), + transcripts.len().separate_with_commas(), + transcript_ids_for_gene.len().separate_with_commas() + ); + writeln!( + report_file, + "total genes\t{}\ntotal transcripts\t{}", + transcripts.len(), + transcript_ids_for_gene.len() + )?; + + Ok(TranscriptData { + genes, + transcripts, + transcript_ids_for_gene, + }) +} + +/// Main entry point for `db create txs` sub command. 
+pub fn run(common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Error> { + let mut report_file = File::create(format!("{}.report", args.path_out.display()))?; + tracing::info!( + "Building transcript and sequence database file\ncommon args: {:#?}\nargs: {:#?}", + common, + args + ); + + // Open seqrepo, + let seqrepo = open_seqrepo(args)?; + // then load cdot files, + let tx_data = load_cdot_files(args, &mut report_file)?; + // then remove redundant onces, and + let tx_data = filter_transcripts(tx_data, args.max_txs, &args.gene_symbols, &mut report_file)?; + // finally build protobuf file. + build_protobuf( + &args.path_out, + seqrepo, + tx_data, + common.verbose.is_silent(), + args.genome_release, + &mut report_file, + )?; + + tracing::info!("Done building transcript and sequence database file"); + Ok(()) +} + +#[cfg(test)] +pub mod test { + use std::collections::HashMap; + use std::fs::File; + use std::path::{Path, PathBuf}; + + use clap_verbosity_flag::Verbosity; + use temp_testdir::TempDir; + + use crate::common::{Args as CommonArgs, GenomeRelease}; + use crate::db::create::TranscriptData; + + use super::{filter_transcripts, load_and_extract, run, Args}; + + #[test] + fn filter_transcripts_brca1() -> Result<(), anyhow::Error> { + let tmp_dir = TempDir::default(); + let mut report_file = File::create(tmp_dir.join("report"))?; + + let mut genes = HashMap::new(); + let mut transcripts = HashMap::new(); + let mut transcript_ids_for_gene = HashMap::new(); + let mut cdot_version = String::new(); + let path_tsv = Path::new("tests/data/db/create/txs/txs_main.tsv"); + load_and_extract( + Path::new("tests/data/db/create/txs/cdot-0.2.21.refseq.grch37_grch38.brca1_opa1.json"), + &Some(path_tsv), + &mut transcript_ids_for_gene, + &mut genes, + &mut transcripts, + GenomeRelease::Grch37, + &mut cdot_version, + &mut report_file, + )?; + + let tx_data = TranscriptData { + genes, + transcripts, + transcript_ids_for_gene, + }; + + eprintln!("{:#?}", 
&tx_data.transcript_ids_for_gene); + insta::assert_yaml_snapshot!(tx_data + .transcript_ids_for_gene + .get("HGNC:1100") + .unwrap() + .iter() + .map(|s| s.as_str()) + .collect::>()); + + let filtered = filter_transcripts(tx_data, None, &None, &mut report_file)?; + insta::assert_yaml_snapshot!(filtered + .transcript_ids_for_gene + .get("HGNC:1100") + .unwrap() + .iter() + .map(|s| s.as_str()) + .collect::>()); + + insta::assert_snapshot!(&cdot_version); + + Ok(()) + } + + #[test] + fn run_smoke() -> Result<(), anyhow::Error> { + let tmp_dir = TempDir::default(); + + let common_args = CommonArgs { + verbose: Verbosity::new(0, 1), + }; + let args = Args { + path_out: tmp_dir.join("out.bin.zst"), + path_cdot_json: vec![PathBuf::from( + "tests/data/db/create/txs/cdot-0.2.21.refseq.grch37_grch38.brca1_opa1.json", + )], + path_mane_txs_tsv: Some(PathBuf::from("tests/data/db/create/txs/txs_main.tsv")), + path_seqrepo_instance: PathBuf::from("tests/data/db/create/txs/latest"), + genome_release: GenomeRelease::Grch38, + max_txs: None, + gene_symbols: None, + }; + + run(&common_args, &args)?; + + Ok(()) + } +} diff --git a/src/db/create/snapshots/mehari__db__create__test__filter_transcripts_brca1-2.snap b/src/db/create/snapshots/mehari__db__create__test__filter_transcripts_brca1-2.snap new file mode 100644 index 00000000..6cdf9da5 --- /dev/null +++ b/src/db/create/snapshots/mehari__db__create__test__filter_transcripts_brca1-2.snap @@ -0,0 +1,10 @@ +--- +source: src/db/create/mod.rs +expression: "filtered.transcript_ids_for_gene.get(\"HGNC:1100\").unwrap().iter().map(|s|\n s.as_str()).collect::>()" +--- +- NM_007294.4 +- NM_007297.4 +- NM_007298.3 +- NM_007299.4 +- NM_007300.4 + diff --git a/src/db/create/snapshots/mehari__db__create__test__filter_transcripts_brca1-3.snap b/src/db/create/snapshots/mehari__db__create__test__filter_transcripts_brca1-3.snap new file mode 100644 index 00000000..afa5ae6e --- /dev/null +++ 
b/src/db/create/snapshots/mehari__db__create__test__filter_transcripts_brca1-3.snap @@ -0,0 +1,5 @@ +--- +source: src/db/create/mod.rs +expression: "&cdot_version" +--- +0.2.21 diff --git a/src/db/create/txs/snapshots/mehari__db__create__txs__test__filter_transcripts_brca1.snap b/src/db/create/snapshots/mehari__db__create__test__filter_transcripts_brca1.snap similarity index 50% rename from src/db/create/txs/snapshots/mehari__db__create__txs__test__filter_transcripts_brca1.snap rename to src/db/create/snapshots/mehari__db__create__test__filter_transcripts_brca1.snap index 0138e702..0c5ca246 100644 --- a/src/db/create/txs/snapshots/mehari__db__create__txs__test__filter_transcripts_brca1.snap +++ b/src/db/create/snapshots/mehari__db__create__test__filter_transcripts_brca1.snap @@ -1,6 +1,6 @@ --- -source: src/db/create/txs/mod.rs -expression: "tx_data.transcript_ids_for_gene.get(\"BRCA1\").unwrap().iter().map(|s|\n s.as_str()).collect::>()" +source: src/db/create/mod.rs +expression: "tx_data.transcript_ids_for_gene.get(\"HGNC:1100\").unwrap().iter().map(|s|\n s.as_str()).collect::>()" --- - NM_007294.3 - NM_007294.4 diff --git a/src/db/create/txs/mod.rs b/src/db/create/txs/mod.rs deleted file mode 100644 index 3089af6f..00000000 --- a/src/db/create/txs/mod.rs +++ /dev/null @@ -1,868 +0,0 @@ -//! Transcript database. - -use std::collections::HashSet; -use std::fs::File; -use std::path::Path; -use std::{collections::HashMap, io::Write, path::PathBuf, time::Instant}; - -use anyhow::anyhow; -use clap::Parser; -use hgvs::data::cdot::json::models; -use hgvs::sequences::{translate_cds, TranslationTable}; -use indicatif::{ProgressBar, ProgressStyle}; -use prost::Message; -use seqrepo::{AliasOrSeqId, Interface, SeqRepo}; -use thousands::Separable; - -use crate::common::{trace_rss_now, GenomeRelease}; - -lazy_static::lazy_static! { - /// Progress bar style to use. 
- pub static ref PROGRESS_STYLE: ProgressStyle = ProgressStyle::with_template( - "[{elapsed_precise}] [{wide_bar:.cyan/blue}] {human_pos}/{human_len} ({eta})", - ) - .unwrap(); -} - -/// Data structures for (de-)serialization as generated by `prost-build`. -pub mod data { - include!(concat!(env!("OUT_DIR"), "/mehari.db.create.txs.data.rs")); -} - -/// Command line arguments for `db create txs` sub command. -#[derive(Parser, Debug)] -#[command(about = "Construct mehari transcripts and sequence database", long_about = None)] -pub struct Args { - /// Genome release to extract transcripts for. - #[arg(long)] - pub genome_release: GenomeRelease, - /// Path to output protobuf file to write to. - #[arg(long)] - pub path_out: PathBuf, - /// Paths to the cdot JSON transcripts to import. - #[arg(long, required = true)] - pub path_cdot_json: Vec, - /// Path to the seqrepo instance directory to use. - #[arg(long)] - pub path_seqrepo_instance: PathBuf, - /// Maximal number of transcripts to process. - #[arg(long)] - pub max_txs: Option, - /// Limit transcript database to the following HGNC symbols. Useful for - /// building test databases. - #[arg(long)] - pub gene_symbols: Option>, -} - -/// Load and extract from cdot JSON. -fn load_and_extract( - json_path: &Path, - transcript_ids_for_gene: &mut HashMap>, - genes: &mut HashMap, - transcripts: &mut HashMap, - genome_release: GenomeRelease, - cdot_version: &mut String, - report_file: &mut File, -) -> Result<(), anyhow::Error> { - writeln!(report_file, "genome_release\t{:?}", genome_release)?; - - tracing::info!("Loading cdot transcripts from {:?}", json_path); - writeln!(report_file, "cdot_json_path\t{:?}", json_path)?; - let start = Instant::now(); - let models::Container { - genes: c_genes, - transcripts: c_txs, - cdot_version: c_version, - .. 
- } = if json_path.extension().unwrap_or_default() == "gz" { - tracing::info!("(from gzip compressed file)"); - serde_json::from_reader(std::io::BufReader::new(flate2::read::GzDecoder::new( - File::open(json_path)?, - )))? - } else { - tracing::info!("(from uncompressed file)"); - serde_json::from_reader(std::io::BufReader::new(File::open(json_path)?))? - }; - *cdot_version = c_version; - tracing::info!( - "loading / deserializing {} genes and {} transcripts from cdot took {:?}", - c_genes.len().separate_with_commas(), - c_txs.len().separate_with_commas(), - start.elapsed() - ); - - let start = Instant::now(); - writeln!(report_file, "total_genes\t{}", c_genes.len())?; - c_genes - .values() - .filter(|gene| { - gene.gene_symbol.is_some() - && !gene.gene_symbol.as_ref().unwrap().is_empty() - && gene.map_location.is_some() - && !gene.map_location.as_ref().unwrap().is_empty() - && gene.hgnc.is_some() - && !gene.hgnc.as_ref().unwrap().is_empty() - }) - .for_each(|gene| { - let gene_symbol = gene.gene_symbol.as_ref().unwrap().clone(); - transcript_ids_for_gene - .entry(gene_symbol.clone()) - .or_default(); - genes.insert(gene_symbol, gene.clone()); - }); - writeln!( - report_file, - "genes with gene_symbol, map_location, hgnc\t{}", - genes.len() - )?; - tracing::info!( - "Processed {} genes; total gene count: {}", - c_genes.len().separate_with_commas(), - genes.len() - ); - tracing::debug!( - "some 10 genes: {:?}", - genes.keys().take(10).collect::>() - ); - - tracing::info!("Processing transcripts"); - writeln!(report_file, "total_transcripts\t{}", c_txs.len())?; - c_txs - .values() - .map(|tx| models::Transcript { - genome_builds: tx - .genome_builds - .iter() - .filter(|(key, _)| { - matches!( - (key.as_str(), genome_release), - ("GRCh37", GenomeRelease::Grch37) | ("GRCh38", GenomeRelease::Grch38) - ) - }) - .map(|(k, v)| (k.clone(), v.clone())) - .collect(), - ..tx.clone() - }) - .filter(|tx| { - tx.gene_name.is_some() - && !tx.gene_name.as_ref().unwrap().is_empty() 
- && genes.contains_key(tx.gene_name.as_ref().unwrap()) - && !tx.genome_builds.is_empty() - }) - .for_each(|tx| { - let gene_name = tx.gene_name.as_ref().unwrap(); - transcript_ids_for_gene - .get_mut(gene_name) - .unwrap_or_else(|| panic!("tx {:?} for unknown gene {:?}", tx.id, gene_name)) - .push(tx.id.clone()); - transcripts.insert(tx.id.clone(), tx.clone()); - }); - writeln!( - report_file, - "transcripts with alignment on genome and link to selected gene\t{}", - transcripts.len() - )?; - tracing::info!( - "Processed {} genes; total transcript count: {}", - c_txs.len().separate_with_commas(), - transcripts.len().separate_with_commas() - ); - tracing::info!("extracting datastructures took {:?}", start.elapsed()); - Ok(()) -} - -/// Perform protobuf file construction. -/// -/// This can be done by simply converting the models from HGVS to the prost generated data structures. -fn build_protobuf( - path_out: &Path, - seqrepo: SeqRepo, - tx_data: TranscriptData, - is_silent: bool, - genome_release: GenomeRelease, - report_file: &mut File, -) -> Result<(), anyhow::Error> { - let TranscriptData { - genes, - transcripts, - transcript_ids_for_gene, - } = tx_data; - - tracing::info!("Constructing protobuf data structures ..."); - trace_rss_now(); - - // Construct sequence database. - tracing::info!(" Constructing sequence database ..."); - let mut tx_skipped_noseq = HashSet::new(); // skipped because of missing sequence - let mut tx_skipped_nostop = HashSet::new(); // skipped because of missing stop codon - let seq_db = { - // Insert into protobuf and keep track of pointers in `Vec`s. 
- let mut aliases = Vec::new(); - let mut aliases_idx = Vec::new(); - let mut seqs = Vec::new(); - let pb = if is_silent { - ProgressBar::hidden() - } else { - ProgressBar::new(transcripts.len() as u64) - }; - pb.set_style(PROGRESS_STYLE.clone()); - for (tx_id, tx) in &transcripts { - pb.inc(1); - let res_seq = seqrepo.fetch_sequence(&AliasOrSeqId::Alias { - value: tx_id.clone(), - namespace: None, - }); - let seq = if let Ok(seq) = res_seq { - seq - } else { - tracing::debug!("Skipping transcript {} because of missing sequence", tx_id); - writeln!( - report_file, - "skip transcript because it has no sequence\t{}", - tx_id - )?; - tx_skipped_noseq.insert(tx_id.clone()); - continue; - }; - - // Skip transcript if it is coding and the translated CDS does not have a stop codon. - if let Some(cds_start) = tx.start_codon { - let cds_start = cds_start as usize; - let cds_end = tx.stop_codon.expect("must be some if start_codon is some") as usize; - if cds_end > seq.len() { - tracing::error!( - "CDS end {} is larger than sequence length {} for {}", - cds_end, - seq.len(), - tx_id - ); - writeln!( - report_file, - "skip transcript CDS end {} is longer than sequence length {} for\t{}", - cds_end, - seq.len(), - tx_id - )?; - continue; - } - let tx_seq_to_translate = &seq[cds_start..cds_end]; - let aa_sequence = - translate_cds(tx_seq_to_translate, true, "*", TranslationTable::Standard)?; - if !aa_sequence.ends_with('*') { - tracing::debug!( - "Skipping transcript {} because of missing stop codon in translated CDS", - tx_id - ); - writeln!( - report_file, - "Skipping transcript {} because of missing stop codon in translated CDS", - tx_id - )?; - tx_skipped_nostop.insert(tx_id.clone()); - continue; - } - } - - // Register sequence into protobuf. - aliases.push(tx_id.clone()); - aliases_idx.push(seqs.len() as u32); - seqs.push(seq.clone()); - } - pb.finish_and_clear(); - // Finalize by creating `SequenceDb`. 
- data::SequenceDb { - aliases, - aliases_idx, - seqs, - } - }; - tracing::info!( - " ... done constructing sequence database (no seq for {} transcripts, \ - no stop codon for {}, will be skipped)", - tx_skipped_noseq.len().separate_with_commas(), - tx_skipped_nostop.len().separate_with_commas(), - ); - - trace_rss_now(); - - tracing::info!(" Creating transcript records for each gene..."); - let data_transcripts = { - let gene_symbols = { - let mut gene_symbols: Vec<_> = genes.keys().cloned().collect(); - gene_symbols.sort(); - gene_symbols - }; - let mut data_transcripts = Vec::new(); - // For each gene (in lexicographic symbol order) ... - for gene_symbol in &gene_symbols { - let gene = genes.get(gene_symbol).unwrap(); - let tx_ids = transcript_ids_for_gene - .get(gene_symbol.as_str()) - .unwrap_or_else(|| panic!("No transcripts for gene {:?}", &gene_symbol)); - let tx_ids = tx_ids - .iter() - .filter(|tx_id| { - !tx_skipped_noseq.contains(*tx_id) && !tx_skipped_nostop.contains(*tx_id) - }) - .collect::>(); - if tx_ids.is_empty() { - tracing::debug!( - "Skipping gene {} as all transcripts have been removed.", - gene_symbol - ); - writeln!( - report_file, - "skip gene from protobuf because all transcripts have been removed\t{}", - gene_symbol - )?; - continue; - } - - // ... for each transcript of the gene ... - for tx_id in tx_ids { - let tx_model = transcripts - .get(tx_id) - .unwrap_or_else(|| panic!("No transcript model for id {:?}", tx_id)); - // ... build genome alignment for selected: - let mut genome_alignments = Vec::new(); - for (genome_build, alignment) in &tx_model.genome_builds { - // obtain basic properties - let genome_build = match genome_build.as_ref() { - "GRCh37" => data::GenomeBuild::Grch37, - "GRCh38" => data::GenomeBuild::Grch38, - _ => panic!("Unknown genome build {:?}", genome_build), - }; - let models::GenomeAlignment { - contig, - cds_start, - cds_end, - .. 
- } = alignment.clone(); - let strand = match alignment.strand { - models::Strand::Plus => data::Strand::Plus, - models::Strand::Minus => data::Strand::Minus, - }; - // and construct vector of all exons - let exons: Vec<_> = alignment - .exons - .iter() - .map(|exon| { - let models::Exon { - alt_start_i, - alt_end_i, - ord, - alt_cds_start_i, - alt_cds_end_i, - cigar, - } = exon.clone(); - data::ExonAlignment { - alt_start_i, - alt_end_i, - ord, - alt_cds_start_i: if alt_cds_start_i == -1 { - None - } else { - Some(alt_cds_start_i) - }, - alt_cds_end_i: if alt_cds_end_i == -1 { - None - } else { - Some(alt_cds_end_i) - }, - cigar, - } - }) - .collect(); - // and finally push the genome alignment - genome_alignments.push(data::GenomeAlignment { - genome_build: genome_build.into(), - contig, - cds_start, - cds_end, - strand: strand.into(), - exons, - }); - } - - // Now, just obtain the basic properties and create a new `data::Transcript`. - let models::Gene { - biotype, - hgnc, - gene_symbol, - .. - } = gene.clone(); - let biotype = if biotype.unwrap().contains(&models::BioType::ProteinCoding) { - data::TranscriptBiotype::Coding.into() - } else { - data::TranscriptBiotype::NonCoding.into() - }; - let mut tags = Vec::new(); - if let Some(tag) = tx_model.tag.as_ref() { - for t in tag { - let elem = match t { - models::Tag::Basic => data::TranscriptTag::Basic.into(), - models::Tag::EnsemblCanonical => { - data::TranscriptTag::EnsemblCanonical.into() - } - models::Tag::ManeSelect => data::TranscriptTag::ManeSelect.into(), - models::Tag::ManePlusClinical => { - data::TranscriptTag::ManePlusClinical.into() - } - models::Tag::RefSeqSelect => data::TranscriptTag::RefSeqSelect.into(), - }; - tags.push(elem); - } - } - let models::Transcript { - protein, - start_codon, - stop_codon, - .. 
- } = tx_model.clone(); - - data_transcripts.push(data::Transcript { - id: tx_id.clone(), - gene_name: gene_symbol.expect("missing gene symbol"), - gene_id: hgnc.expect("missing HGNC ID"), - biotype, - tags, - protein, - start_codon, - stop_codon, - genome_alignments, - }); - } - } - - data_transcripts - }; - tracing::info!(" ... done creating transcripts"); - - trace_rss_now(); - - // Build mapping of gene HGNC symbol to transcript IDs. - tracing::info!(" Build gene symbol to transcript ID mapping ..."); - let gene_to_tx = transcript_ids_for_gene - .into_iter() - .map(|(gene_name, tx_ids)| data::GeneToTxId { gene_name, tx_ids }) - .collect::>(); - tracing::info!(" ... done building gene symbol to transcript ID mapping"); - - trace_rss_now(); - - // Compose transcript database from transcripts and gene to transcript mapping. - tracing::info!(" Composing transcript database ..."); - let tx_db = data::TranscriptDb { - transcripts: data_transcripts, - gene_to_tx, - }; - tracing::info!(" ... done composing transcript database"); - - trace_rss_now(); - - // Compose the final transcript and sequence database. - tracing::info!(" Constructing final tx and seq database ..."); - let tx_seq_db = data::TxSeqDatabase { - tx_db: Some(tx_db), - seq_db: Some(seq_db), - version: Some(crate::common::version().to_string()), - genome_release: Some(genome_release.name()), - }; - let mut buf = Vec::new(); - buf.reserve(tx_seq_db.encoded_len()); - tx_seq_db - .encode(&mut buf) - .map_err(|e| anyhow!("failed to encode: {}", e))?; - tracing::info!(" ... done constructing final tx and seq database"); - - trace_rss_now(); - - // Write out the final transcript and sequence database. - tracing::info!(" Writing out final database ..."); - // Open file and if necessary, wrap in a decompressor. 
- let file = std::fs::File::create(path_out) - .map_err(|e| anyhow!("failed to create file {}: {}", path_out.display(), e))?; - let ext = path_out.extension().map(|s| s.to_str()); - let mut writer: Box = if ext == Some(Some("gz")) { - Box::new(flate2::write::GzEncoder::new( - file, - flate2::Compression::default(), - )) - } else if ext == Some(Some("zst")) { - Box::new( - zstd::Encoder::new(file, 0) - .map_err(|e| { - anyhow!( - "failed to open zstd encoder for {}: {}", - path_out.display(), - e - ) - })? - .auto_finish(), - ) - } else { - Box::new(file) - }; - writer - .write_all(&buf) - .map_err(|e| anyhow!("failed to write to {}: {}", path_out.display(), e))?; - tracing::info!(" ... done writing out final database"); - - trace_rss_now(); - - tracing::info!("... done with constructing protobuf file"); - Ok(()) -} - -/// Data as loaded from cdot after processing. -struct TranscriptData { - pub genes: HashMap, - pub transcripts: HashMap, - pub transcript_ids_for_gene: HashMap>, -} - -/// Filter transcripts for gene. -/// -/// We employ the following rules: -/// -/// - Remove redundant transcripts with the same identifier and pick only the -/// transcripts that have the highest version number for one assembly. -/// - Do not pick any `XM_`/`XR_` (NCBI predicted only) transcripts. -/// - Do not pick any `NR_` transcripts when there are coding `NM_` transcripts. -fn filter_transcripts( - tx_data: TranscriptData, - max_genes: Option, - gene_symbols: &Option>, - report_file: &mut File, -) -> Result { - tracing::info!("Filtering transcripts ..."); - let start = Instant::now(); - let gene_symbols = gene_symbols.clone().unwrap_or_default(); - - let TranscriptData { - genes, - transcripts, - transcript_ids_for_gene, - } = tx_data; - - // Potentially limit number of genes. 
- let transcript_ids_for_gene = if let Some(max_genes) = max_genes { - tracing::warn!("Limiting to {} genes!", max_genes); - transcript_ids_for_gene - .into_iter() - .take(max_genes as usize) - .collect() - } else { - transcript_ids_for_gene - }; - - // We keep track of the chosen transcript identifiers. - let mut chosen = HashSet::new(); - // Filter map from gene symbol to Vec of chosen transcript identifiers. - let transcript_ids_for_gene = { - let mut tmp = HashMap::new(); - - for (gene_symbol, tx_ids) in &transcript_ids_for_gene { - // Skip transcripts where the gene symbol is not contained in `gene_symbols`. - if !gene_symbols.is_empty() && !gene_symbols.contains(gene_symbol) { - continue; - } - - // Only select the highest version of each transcript. - // - // First, split off transcript versions from accessions and look for NM transcript. - let mut seen_nm = false; - let mut versioned: Vec<_> = tx_ids - .iter() - .map(|tx_id| { - if tx_id.starts_with("NM_") { - seen_nm = true; - } - let s: Vec<_> = tx_id.split('.').collect(); - (s[0], s[1].parse::().expect("invalid version")) - }) - .collect(); - // Sort descendingly by version. - versioned.sort_by(|a, b| b.1.cmp(&a.1)); - - // Build `next_tx_ids`. 
- let mut seen_ac = HashSet::new(); - let mut next_tx_ids = Vec::new(); - for (ac, version) in versioned { - let full_ac = format!("{}.{}", &ac, version); - let ac = ac.to_string(); - - let releases = transcripts - .get(&full_ac) - .map(|tx| tx.genome_builds.keys().cloned().collect::>()) - .unwrap_or_default(); - - for release in releases { - #[allow(clippy::if_same_then_else)] - if seen_ac.contains(&(ac.clone(), release.clone())) { - writeln!( - report_file, - "skipped transcript {} because we have a later version already", - &full_ac - )?; - continue; // skip, already have later version - } else if ac.starts_with("NR_") && seen_nm { - writeln!( - report_file, - "skipped transcript {} because we have a NR transcript", - &full_ac - )?; - continue; // skip NR transcript as we have NM one - } else if ac.starts_with('X') { - writeln!( - report_file, - "skipped transcript {} because it is an XR/XM transcript", - &full_ac - )?; - continue; // skip XR/XM transcript - } else { - // Check transcript's CDS length for being multiple of 3 and skip unless it is. - let tx = transcripts - .get(&full_ac) - .expect("must exist; accession taken from map earlier"); - if let Some(cds_start) = tx.start_codon { - let cds_end = - tx.stop_codon.expect("must be some if start_codon is some"); - let cds_len = cds_end - cds_start; - if cds_len % 3 != 0 { - tracing::debug!("skipping transcript {} because its CDS length is not a multiple of 3", &full_ac); - writeln!(report_file, "skipped transcript {} because its CDS length {} is not a multiple of 3", &full_ac, cds_len)?; - continue; - } - } - - // Otherwise, mark transcript as included by storing its accession. 
- next_tx_ids.push(full_ac.clone()); - seen_ac.insert((ac.clone(), release)); - } - } - } - - next_tx_ids.sort(); - next_tx_ids.dedup(); - chosen.extend(next_tx_ids.iter().cloned()); - - if !next_tx_ids.is_empty() { - tmp.insert(gene_symbol.clone(), next_tx_ids); - } else { - writeln!( - report_file, - "skipped gene {} because we have no transcripts left", - gene_symbol - )?; - } - } - - tmp - }; - - let transcripts: HashMap<_, _> = transcripts - .into_iter() - .filter(|(tx_id, _)| chosen.contains(tx_id)) - .collect(); - tracing::debug!( - " => {} transcripts left", - transcripts.len().separate_with_commas() - ); - writeln!(report_file, "total transcripts\t{}", transcripts.len())?; - - let genes: HashMap<_, _> = genes - .into_iter() - .filter(|(gene_id, _)| transcript_ids_for_gene.contains_key(gene_id)) - .collect(); - tracing::debug!(" => {} genes left", genes.len().separate_with_commas()); - - tracing::info!("... done filtering transcripts in {:?}", start.elapsed()); - Ok(TranscriptData { - genes, - transcripts, - transcript_ids_for_gene, - }) -} - -/// Create file-backed `SeqRepo`. -fn open_seqrepo(args: &Args) -> Result { - tracing::info!("Opening seqrepo..."); - let start = Instant::now(); - let seqrepo = PathBuf::from(&args.path_seqrepo_instance); - let path = seqrepo - .parent() - .ok_or(anyhow::anyhow!( - "Could not get parent from {:?}", - &args.path_seqrepo_instance - ))? - .to_str() - .unwrap() - .to_string(); - let instance = seqrepo - .file_name() - .ok_or(anyhow::anyhow!( - "Could not get basename from {:?}", - &args.path_seqrepo_instance - ))? - .to_str() - .unwrap() - .to_string(); - let seqrepo = SeqRepo::new(path, &instance)?; - tracing::info!("... seqrepo opened in {:?}", start.elapsed()); - Ok(seqrepo) -} - -/// Load the cdot JSON files. 
-fn load_cdot_files(args: &Args, report_file: &mut File) -> Result { - tracing::info!("Loading cdot JSON files ..."); - let start = Instant::now(); - let mut genes = HashMap::new(); - let mut transcripts = HashMap::new(); - let mut transcript_ids_for_gene = HashMap::new(); - let mut cdot_version = String::new(); - for json_path in &args.path_cdot_json { - load_and_extract( - json_path, - &mut transcript_ids_for_gene, - &mut genes, - &mut transcripts, - args.genome_release, - &mut cdot_version, - report_file, - )?; - } - tracing::info!( - "... done loading cdot JSON files in {:?} -- #genes = {}, #transcripts = {}, #transcript_ids_for_gene = {}", - start.elapsed(), - genes.len().separate_with_commas(), - transcripts.len().separate_with_commas(), - transcript_ids_for_gene.len().separate_with_commas() - ); - writeln!( - report_file, - "total genes\t{}\ntotal transcripts\t{}", - transcripts.len(), - transcript_ids_for_gene.len() - )?; - - Ok(TranscriptData { - genes, - transcripts, - transcript_ids_for_gene, - }) -} - -/// Main entry point for `db create txs` sub command. -pub fn run(common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Error> { - let mut report_file = File::create(format!("{}.report", args.path_out.display()))?; - tracing::info!( - "Building transcript and sequence database file\ncommon args: {:#?}\nargs: {:#?}", - common, - args - ); - - // Open seqrepo, - let seqrepo = open_seqrepo(args)?; - // then load cdot files, - let tx_data = load_cdot_files(args, &mut report_file)?; - // then remove redundant onces, and - let tx_data = filter_transcripts(tx_data, args.max_txs, &args.gene_symbols, &mut report_file)?; - // finally build protobuf file. 
- build_protobuf( - &args.path_out, - seqrepo, - tx_data, - common.verbose.is_silent(), - args.genome_release, - &mut report_file, - )?; - - tracing::info!("Done building transcript and sequence database file"); - Ok(()) -} - -#[cfg(test)] -pub mod test { - use std::collections::HashMap; - use std::fs::File; - use std::path::{Path, PathBuf}; - - use clap_verbosity_flag::Verbosity; - use temp_testdir::TempDir; - - use crate::common::{Args as CommonArgs, GenomeRelease}; - use crate::db::create::txs::TranscriptData; - - use super::{filter_transcripts, load_and_extract, run, Args}; - - #[test] - fn filter_transcripts_brca1() -> Result<(), anyhow::Error> { - let tmp_dir = TempDir::default(); - let mut report_file = File::create(tmp_dir.join("report"))?; - - let mut genes = HashMap::new(); - let mut transcripts = HashMap::new(); - let mut transcript_ids_for_gene = HashMap::new(); - let mut cdot_version = String::new(); - load_and_extract( - Path::new("tests/data/db/create/txs/cdot-0.2.12.refseq.grch37_grch38.brca1_opa1.json"), - &mut transcript_ids_for_gene, - &mut genes, - &mut transcripts, - GenomeRelease::Grch37, - &mut cdot_version, - &mut report_file, - )?; - - let tx_data = TranscriptData { - genes, - transcripts, - transcript_ids_for_gene, - }; - - insta::assert_yaml_snapshot!(tx_data - .transcript_ids_for_gene - .get("BRCA1") - .unwrap() - .iter() - .map(|s| s.as_str()) - .collect::>()); - - let filtered = filter_transcripts(tx_data, None, &None, &mut report_file)?; - insta::assert_yaml_snapshot!(filtered - .transcript_ids_for_gene - .get("BRCA1") - .unwrap() - .iter() - .map(|s| s.as_str()) - .collect::>()); - - insta::assert_snapshot!(&cdot_version); - - Ok(()) - } - - #[test] - fn run_smoke() -> Result<(), anyhow::Error> { - let tmp_dir = TempDir::default(); - - let common_args = CommonArgs { - verbose: Verbosity::new(0, 1), - }; - let args = Args { - path_out: tmp_dir.join("out.bin.zst"), - path_cdot_json: vec![PathBuf::from( -
"tests/data/db/create/txs/cdot-0.2.12.refseq.grch37_grch38.brca1_opa1.json", - )], - path_seqrepo_instance: PathBuf::from("tests/data/db/create/txs/latest"), - genome_release: GenomeRelease::Grch38, - max_txs: None, - gene_symbols: None, - }; - - run(&common_args, &args)?; - - Ok(()) - } -} diff --git a/src/db/create/txs/snapshots/mehari__db__create__txs__test__filter_transcripts_brca1-2.snap b/src/db/create/txs/snapshots/mehari__db__create__txs__test__filter_transcripts_brca1-2.snap deleted file mode 100644 index 82d18d4b..00000000 --- a/src/db/create/txs/snapshots/mehari__db__create__txs__test__filter_transcripts_brca1-2.snap +++ /dev/null @@ -1,10 +0,0 @@ ---- -source: src/db/create/txs/mod.rs -expression: "filtered.transcript_ids_for_gene.get(\"BRCA1\").unwrap().iter().map(|s|\n s.as_str()).collect::>()" ---- -- NM_007294.4 -- NM_007297.4 -- NM_007298.3 -- NM_007299.4 -- NM_007300.4 - diff --git a/src/db/create/txs/snapshots/mehari__db__create__txs__test__filter_transcripts_brca1-3.snap b/src/db/create/txs/snapshots/mehari__db__create__txs__test__filter_transcripts_brca1-3.snap deleted file mode 100644 index 7255b3b1..00000000 --- a/src/db/create/txs/snapshots/mehari__db__create__txs__test__filter_transcripts_brca1-3.snap +++ /dev/null @@ -1,5 +0,0 @@ ---- -source: src/db/create/txs/mod.rs -expression: "&cdot_version" ---- -0.2.12 diff --git a/src/db/dump/mod.rs b/src/db/dump/mod.rs new file mode 100644 index 00000000..75f197bf --- /dev/null +++ b/src/db/dump/mod.rs @@ -0,0 +1,27 @@ +//! Dump transcript database. + +use std::path::PathBuf; + +use clap::Parser; + +use crate::annotate::seqvars::load_tx_db; + +/// Command line arguments for `db dump` sub command. +#[derive(Parser, Debug)] +#[command(about = "Dump transcript database", long_about = None)] +pub struct Args { + /// Path to database file to dump + #[arg(long)] + pub path_db: PathBuf, +} + +/// Main entry point for `db dump` sub command.
+pub fn run(_common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Error> { + tracing::info!("Opening transcript database"); + let tx_db = load_tx_db(&format!("{}", args.path_db.display()))?; + tracing::info!("Dumping ..."); + serde_yaml::to_writer(std::io::stdout(), &tx_db)?; + tracing::info!("... done"); + + Ok(()) +} diff --git a/src/db/mod.rs b/src/db/mod.rs index 6d6b4192..a4efc931 100644 --- a/src/db/mod.rs +++ b/src/db/mod.rs @@ -1,3 +1,4 @@ //! Database construction and introspection tools. pub mod create; +pub mod dump; diff --git a/src/main.rs b/src/main.rs index e477bace..08e0fe88 100644 --- a/src/main.rs +++ b/src/main.rs @@ -118,22 +118,8 @@ struct Db { /// Enum supporting the parsing of "db *" sub commands. #[derive(Debug, Subcommand)] enum DbCommands { - Create(DbCreate), -} - -/// Parsing of "db create *" sub commands. -#[derive(Debug, Args)] -#[command(args_conflicts_with_subcommands = true)] -struct DbCreate { - /// The sub command to run - #[command(subcommand)] - command: DbCreateCommands, -} - -/// Enum supporting the parsing of "db create *" sub commands. -#[derive(Debug, Subcommand)] -enum DbCreateCommands { - Txs(db::create::txs::Args), + Create(db::create::Args), + Dump(db::dump::Args), } /// Parsing of "annotate *" sub commands. @@ -173,6 +159,7 @@ async fn main() -> Result<(), anyhow::Error> { // Build a tracing subscriber according to the configuration in `cli.common`. 
let collector = tracing_subscriber::fmt() + .with_writer(std::io::stderr) .with_target(false) .with_max_level(match cli.common.verbose.log_level() { Some(level) => match level { @@ -193,9 +180,8 @@ async fn main() -> Result<(), anyhow::Error> { match &cli.command { Commands::Db(db) => match &db.command { - DbCommands::Create(db_create) => match &db_create.command { - DbCreateCommands::Txs(args) => db::create::txs::run(&cli.common, args)?, - }, + DbCommands::Create(args) => db::create::run(&cli.common, args)?, + DbCommands::Dump(args) => db::dump::run(&cli.common, args)?, }, Commands::Annotate(annotate) => match &annotate.command { AnnotateCommands::Seqvars(args) => annotate::seqvars::run(&cli.common, args)?, diff --git a/src/db/create/txs/data.proto3 b/src/proto/data.proto3 similarity index 97% rename from src/db/create/txs/data.proto3 rename to src/proto/data.proto3 index d0c4c758..49eac510 100644 --- a/src/db/create/txs/data.proto3 +++ b/src/proto/data.proto3 @@ -19,8 +19,8 @@ message SequenceDb { // Mapping from gene to transcript ID. message GeneToTxId { - // Gene HGNC symbol; serves as gene identifier. - string gene_name = 1; + // Gene HGNC ID; serves as gene identifier. + string gene_id = 1; // Vector of all transcript IDs. repeated string tx_ids = 2; } @@ -53,7 +53,7 @@ message Transcript { // Transcript accession with version, e.g., `"NM_007294.3"` or `"ENST00000461574.1"` for BRCA1. string id = 1; // HGNC symbol, e.g., `"BRCA1"` - string gene_name = 2; + string gene_symbol = 2; // HGNC gene identifier, e.g., `"1100"` for BRCA1. string gene_id = 3; // Transcript biotype. diff --git a/src/server/actix_server/seqvars_csq.rs b/src/server/actix_server/seqvars_csq.rs index 99ec5808..2eae9280 100644 --- a/src/server/actix_server/seqvars_csq.rs +++ b/src/server/actix_server/seqvars_csq.rs @@ -54,6 +54,8 @@ struct ResultEntry { pub feature_id: String, /// The feature biotype. pub feature_biotype: FeatureBiotype, + /// The feature tags. 
+ pub feature_tag: Vec, /// The exon / intron rank. pub rank: Option, /// HGVS c. notation. @@ -153,7 +155,16 @@ async fn handle( gene_id, feature_type, feature_id, - feature_biotype, + feature_biotype: if feature_biotype.contains(&FeatureBiotype::Coding) { + FeatureBiotype::Coding + } else { + FeatureBiotype::Noncoding + }, + feature_tag: feature_biotype + .iter() + .cloned() + .filter(|b| *b != FeatureBiotype::Coding && *b != FeatureBiotype::Noncoding) + .collect(), rank, hgvs_t, hgvs_p, diff --git a/src/server/mod.rs b/src/server/mod.rs index fc761e53..76a21e4d 100644 --- a/src/server/mod.rs +++ b/src/server/mod.rs @@ -4,7 +4,7 @@ use crate::{ annotate::{ seqvars::{ csq::ConsequencePredictor as SeqvarConsequencePredictor, load_tx_db, path_component, - provider::MehariProvider, + provider::Provider, }, strucvars::csq::ConsequencePredictor as StrucvarConsequencePredictor, }, @@ -106,11 +106,11 @@ pub async fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), a tracing::info!(" - loading {}", &path); let tx_db = load_tx_db(&path)?; tracing::info!(" - building interval trees"); - let provider = Arc::new(MehariProvider::new(tx_db, assembly)); + let provider = Arc::new(Provider::new(tx_db, assembly, Default::default())); tracing::info!(" - building seqvars predictors"); data.seqvars_predictors.insert( genome_release, - SeqvarConsequencePredictor::new(provider.clone(), assembly), + SeqvarConsequencePredictor::new(provider.clone(), assembly, Default::default()), ); tracing::info!(" - building strucvars predictors"); data.strucvars_predictors.insert( diff --git a/src/verify/seqvars.rs b/src/verify/seqvars.rs index 3812333e..9f82058a 100644 --- a/src/verify/seqvars.rs +++ b/src/verify/seqvars.rs @@ -13,9 +13,9 @@ use noodles_core::{Position, Region}; use quick_cache::unsync::Cache; use crate::annotate::seqvars::{ - csq::{ConsequencePredictor, VcfVariant}, + csq::{ConfigBuilder as ConsequencePredictorConfigBuilder, ConsequencePredictor, VcfVariant}, 
load_tx_db, path_component, - provider::MehariProvider, + provider::{ConfigBuilder as MehariProviderConfigBuilder, Provider as MehariProvider}, }; /// Command line arguments for `verify seqvars` sub command. @@ -36,6 +36,14 @@ pub struct Args { #[arg(long)] pub path_output_tsv: String, + /// Whether to report for all picked transcripts. + #[arg(long, default_value_t = true)] + pub report_all_transcripts: bool, + /// Limit transcripts to (a) ManeSelect+ManePlusClinical, (b) ManeSelect, + /// (c) longest transcript for the gene - the first available. + #[arg(long, default_value_t = false)] + pub transcript_picking: bool, + /// For debug purposes, maximal number of variants to annotate. #[arg(long)] pub max_var_count: Option, @@ -125,8 +133,22 @@ pub fn run(_common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Err path_component(assembly) ))?; tracing::info!("Building transcript interval trees ..."); - let provider = Arc::new(MehariProvider::new(tx_db, assembly)); - let predictor = ConsequencePredictor::new(provider, assembly); + let provider = Arc::new(MehariProvider::new( + tx_db, + assembly, + MehariProviderConfigBuilder::default() + .transcript_picking(args.transcript_picking) + .build() + .unwrap(), + )); + let predictor = ConsequencePredictor::new( + provider, + assembly, + ConsequencePredictorConfigBuilder::default() + .report_all_transcripts(args.report_all_transcripts) + .build() + .unwrap(), + ); tracing::info!("... done building transcript interval trees"); // LRU caches used below to avoid re-reading from FAI and prediction. 
diff --git a/tests/data/annotate/db/grch37/bootstrap.sh b/tests/data/annotate/db/grch37/bootstrap.sh index 112301df..02c77097 100644 --- a/tests/data/annotate/db/grch37/bootstrap.sh +++ b/tests/data/annotate/db/grch37/bootstrap.sh @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17019918b0c639a6aed24001ef177a66965d2d3638a14b392181794e06cc2c75 -size 468 +oid sha256:5a0f1c06683ac9fdc97a45a3a7551ced5f71879a515ae43581b7d1c0efc56061 +size 569 diff --git a/tests/data/annotate/db/grch37/tx-mane.tsv b/tests/data/annotate/db/grch37/tx-mane.tsv new file mode 100644 index 00000000..57e32e75 --- /dev/null +++ b/tests/data/annotate/db/grch37/tx-mane.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c3e311fe48eebebc8e79c71b5ecf1cf461335424416def379c2e902954df68a +size 146 diff --git a/tests/data/annotate/db/grch37/txs.bin.zst b/tests/data/annotate/db/grch37/txs.bin.zst index 8ee106f5..db59fd43 100644 --- a/tests/data/annotate/db/grch37/txs.bin.zst +++ b/tests/data/annotate/db/grch37/txs.bin.zst @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bbd1ede1ba0dd9c1d338d876cced202494ff7420e78fc40df5e996f621bcf1a9 -size 10127 +oid sha256:de4c495314ba6bce7fb346b3012af464a53886ac5b9a1707c91c3d46aa06f09d +size 64771 diff --git a/tests/data/annotate/db/grch37/txs.bin.zst.report b/tests/data/annotate/db/grch37/txs.bin.zst.report index 7dd98898..42e018c8 100644 --- a/tests/data/annotate/db/grch37/txs.bin.zst.report +++ b/tests/data/annotate/db/grch37/txs.bin.zst.report @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b5f4d793ccdb99a8d47e1c48c3d038d65aad63a25d66e3feea7d9c73c78de12f -size 2152 +oid sha256:90bb8919fe098885b6aa99e4413cb09a6262112e2d4851a6142880e5f08e1cb6 +size 451 diff --git a/tests/data/annotate/vars/postproc-snpeff.sh b/tests/data/annotate/vars/postproc-snpeff.sh index cdcc083a..dcc8ab79 100644 --- a/tests/data/annotate/vars/postproc-snpeff.sh +++ 
b/tests/data/annotate/vars/postproc-snpeff.sh @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f3944b9efc0e1b08a6fca9be5fd314a8e618f061edd9667b283d3d3c80afba44 +oid sha256:ac3408a023374fb8314c9a68e03bfc59abe7e5267ead48ae2d3caa65ce78ca27 size 1235 diff --git a/tests/data/db/create/txs/bootstrap.py b/tests/data/db/create/txs/bootstrap.py deleted file mode 100644 index 9a80d1ed..00000000 --- a/tests/data/db/create/txs/bootstrap.py +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8fb27ae7880f932a1d6a273ee9d979166bf6be543e45f63b5a186067324a9671 -size 557 diff --git a/tests/data/db/create/txs/bootstrap.sh b/tests/data/db/create/txs/bootstrap.sh index d1cdeb64..88b33104 100644 --- a/tests/data/db/create/txs/bootstrap.sh +++ b/tests/data/db/create/txs/bootstrap.sh @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:374f3733df8b408aaeaab688ccc8297149a6e52af727c5160e3a3506865dd02b +oid sha256:dea38896f551bec46d3973b9364765e54a8ca03faacb5d5d1844b60f90198b94 size 1538 diff --git a/tests/data/db/create/txs/brca1.fasta b/tests/data/db/create/txs/brca1.fasta index f1cf335f..30912355 100644 --- a/tests/data/db/create/txs/brca1.fasta +++ b/tests/data/db/create/txs/brca1.fasta @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f877e2a811d529d05e469930d6f354f516002d6a7bf3f196f3f6c9dc6675ae7 -size 25266 +oid sha256:e0cf90d9e9954d3185d75439d54762c9505500d902198e027e2a3451d01e863b +size 2364983 diff --git a/tests/data/db/create/txs/cdot-0.2.21.refseq.grch37_grch38.brca1_opa1.json b/tests/data/db/create/txs/cdot-0.2.21.refseq.grch37_grch38.brca1_opa1.json new file mode 100644 index 00000000..34763cc1 --- /dev/null +++ b/tests/data/db/create/txs/cdot-0.2.21.refseq.grch37_grch38.brca1_opa1.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c30c2e0bd21c8a092d3b1b0a61b691bfbce6c57956ac0b3aa335bb0fd6f4a7af +size 1923119 diff --git 
a/tests/data/db/create/txs/latest/aliases.sqlite3 b/tests/data/db/create/txs/latest/aliases.sqlite3 index 0a8a130d..99a47dc3 100644 --- a/tests/data/db/create/txs/latest/aliases.sqlite3 +++ b/tests/data/db/create/txs/latest/aliases.sqlite3 @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ba20e29649ca2e36b8937c20c9704b687586aacd126ad9d5c9a5f32c0e34de5d -size 77824 +oid sha256:1f16ab6003ade8eac53bfc663046e60736dc86103aa6f6cd95bd2359e325895e +size 614400 diff --git a/tests/data/db/create/txs/latest/sequences/2023/0327/0944/1679910246.7746782.fa.bgz b/tests/data/db/create/txs/latest/sequences/2023/0327/0944/1679910246.7746782.fa.bgz deleted file mode 100644 index e9b97d26..00000000 --- a/tests/data/db/create/txs/latest/sequences/2023/0327/0944/1679910246.7746782.fa.bgz +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5afe8079dff1e0659d0917c7ee7989c3b3c0bafc123961ec5dd81c81325b7ca7 -size 3938 diff --git a/tests/data/db/create/txs/latest/sequences/2023/0327/0944/1679910246.7746782.fa.bgz.fai b/tests/data/db/create/txs/latest/sequences/2023/0327/0944/1679910246.7746782.fa.bgz.fai deleted file mode 100644 index 36363231..00000000 --- a/tests/data/db/create/txs/latest/sequences/2023/0327/0944/1679910246.7746782.fa.bgz.fai +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2ead419c5cefec85dba8362003a3eb55fe42341c3339f2efcff8a8b152db5a6f -size 204 diff --git a/tests/data/db/create/txs/latest/sequences/2023/0327/0944/1679910246.7746782.fa.bgz.gzi b/tests/data/db/create/txs/latest/sequences/2023/0327/0944/1679910246.7746782.fa.bgz.gzi deleted file mode 100644 index a2b71017..00000000 --- a/tests/data/db/create/txs/latest/sequences/2023/0327/0944/1679910246.7746782.fa.bgz.gzi +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:af5570f5a1810b7af78caf4bc70a660f0df51e42baf91d4de5b2328de0e83dfc -size 8 diff --git 
a/tests/data/db/create/txs/latest/sequences/2023/0327/0944/1679910247.3800669.fa.bgz b/tests/data/db/create/txs/latest/sequences/2023/0327/0944/1679910247.3800669.fa.bgz deleted file mode 100644 index 23a9923a..00000000 --- a/tests/data/db/create/txs/latest/sequences/2023/0327/0944/1679910247.3800669.fa.bgz +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:486e2a8fd38a09ba3518f14a684ae2060845e0c937a55ea7627534f49f1ae6df -size 4957 diff --git a/tests/data/db/create/txs/latest/sequences/2023/0327/0944/1679910247.3800669.fa.bgz.fai b/tests/data/db/create/txs/latest/sequences/2023/0327/0944/1679910247.3800669.fa.bgz.fai deleted file mode 100644 index 81211bc6..00000000 --- a/tests/data/db/create/txs/latest/sequences/2023/0327/0944/1679910247.3800669.fa.bgz.fai +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:913e82073a2adc50d790360d404a7ec7edda6093ae975fdb7cca97fcc2f5df80 -size 308 diff --git a/tests/data/db/create/txs/latest/sequences/2023/0327/0944/1679910247.3800669.fa.bgz.gzi b/tests/data/db/create/txs/latest/sequences/2023/0327/0944/1679910247.3800669.fa.bgz.gzi deleted file mode 100644 index a2b71017..00000000 --- a/tests/data/db/create/txs/latest/sequences/2023/0327/0944/1679910247.3800669.fa.bgz.gzi +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:af5570f5a1810b7af78caf4bc70a660f0df51e42baf91d4de5b2328de0e83dfc -size 8 diff --git a/tests/data/db/create/txs/latest/sequences/2023/1108/0919/1699435149.8201442.fa.bgz b/tests/data/db/create/txs/latest/sequences/2023/1108/0919/1699435149.8201442.fa.bgz new file mode 100644 index 00000000..b9f4cf22 --- /dev/null +++ b/tests/data/db/create/txs/latest/sequences/2023/1108/0919/1699435149.8201442.fa.bgz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b4c33a9616798a6c0956f660689ae987b21ea598b2cb964afdd31f3104cbc1b +size 251241 diff --git 
a/tests/data/db/create/txs/latest/sequences/2023/1108/0919/1699435149.8201442.fa.bgz.fai b/tests/data/db/create/txs/latest/sequences/2023/1108/0919/1699435149.8201442.fa.bgz.fai new file mode 100644 index 00000000..a29adc7f --- /dev/null +++ b/tests/data/db/create/txs/latest/sequences/2023/1108/0919/1699435149.8201442.fa.bgz.fai @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e76ae8d6840c99a144c6c5a3a33abad1babddfeb4022a11d827990598f0c49b2 +size 20739 diff --git a/tests/data/db/create/txs/latest/sequences/2023/1108/0919/1699435149.8201442.fa.bgz.gzi b/tests/data/db/create/txs/latest/sequences/2023/1108/0919/1699435149.8201442.fa.bgz.gzi new file mode 100644 index 00000000..aa655a4f --- /dev/null +++ b/tests/data/db/create/txs/latest/sequences/2023/1108/0919/1699435149.8201442.fa.bgz.gzi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b53d125a8df26a8721522fec113c28719ffa726f4cc430fcddabc0c3c96891f +size 568 diff --git a/tests/data/db/create/txs/latest/sequences/2023/1108/0919/1699435152.474355.fa.bgz b/tests/data/db/create/txs/latest/sequences/2023/1108/0919/1699435152.474355.fa.bgz new file mode 100644 index 00000000..f6efd170 --- /dev/null +++ b/tests/data/db/create/txs/latest/sequences/2023/1108/0919/1699435152.474355.fa.bgz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f607dff24557d46043b4dc9a6e4e6abb1b5759c23610133790d24fed9e781de8 +size 27316 diff --git a/tests/data/db/create/txs/latest/sequences/2023/1108/0919/1699435152.474355.fa.bgz.fai b/tests/data/db/create/txs/latest/sequences/2023/1108/0919/1699435152.474355.fa.bgz.fai new file mode 100644 index 00000000..10b88f48 --- /dev/null +++ b/tests/data/db/create/txs/latest/sequences/2023/1108/0919/1699435152.474355.fa.bgz.fai @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5b34dbaf87777539f73ad19a3677ca1f3545dfbda17dcaf6485a8cd53e69556 +size 2471 diff --git 
a/tests/data/db/create/txs/latest/sequences/2023/1108/0919/1699435152.474355.fa.bgz.gzi b/tests/data/db/create/txs/latest/sequences/2023/1108/0919/1699435152.474355.fa.bgz.gzi new file mode 100644 index 00000000..70fe3a20 --- /dev/null +++ b/tests/data/db/create/txs/latest/sequences/2023/1108/0919/1699435152.474355.fa.bgz.gzi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4343a3235e2c73b3264ef0edaf4beb9e80b379cb1171ebb5c6f3e8d8ee60151 +size 56 diff --git a/tests/data/db/create/txs/latest/sequences/db.sqlite3 b/tests/data/db/create/txs/latest/sequences/db.sqlite3 index 666a2228..de13a4c1 100644 --- a/tests/data/db/create/txs/latest/sequences/db.sqlite3 +++ b/tests/data/db/create/txs/latest/sequences/db.sqlite3 @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b0c1d65f44e709c046515f17d04c424ade4d46432265d2d5ba4114e1cd30836 -size 61440 +oid sha256:eab46c5b294309ef8fcea27d776f490595f138e9011f085145bec1719c423dad +size 151552 diff --git a/tests/data/db/create/txs/opa1.fasta b/tests/data/db/create/txs/opa1.fasta index 5eb79744..9d12b5e0 100644 --- a/tests/data/db/create/txs/opa1.fasta +++ b/tests/data/db/create/txs/opa1.fasta @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6cbdb62a07a725ba59a4dbf4c4c45597c08b8d7446959acdad8faa1dd40efd58 -size 38171 +oid sha256:b6dcdb5515ab8bfbf15c44b31ec4f6de724e89d75bacf99184987462a4cce9e5 +size 239879 diff --git a/tests/data/db/create/txs/txs_main.tsv b/tests/data/db/create/txs/txs_main.tsv new file mode 100644 index 00000000..95cc22e4 --- /dev/null +++ b/tests/data/db/create/txs/txs_main.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10939bec73d3848928bfb9a08905d00dfec81f21d3842909054aca63a5c2e0e9 +size 55