Skip to content

Commit

Permalink
feat: REST server for variant consequence prediction (#18) (#90)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Jun 12, 2023
1 parent d727391 commit b8c5a7b
Show file tree
Hide file tree
Showing 15 changed files with 460 additions and 40 deletions.
33 changes: 18 additions & 15 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,32 +21,34 @@ all-features = true
name = "mehari"

[dependencies]
annonars = "0.8"
actix-web = "4.3"
annonars = "0.10"
anyhow = "1.0"
bgzip = "0.3"
bincode = "1.3"
bio = "1.1"
byteorder = "1.4"
byte-unit = "4.0"
byteorder = "1.4"
chrono = "0.4"
clap-verbosity-flag = "2.0"
clap = { version = "4.1", features = ["derive"] }
clap-verbosity-flag = "2.0"
csv = "1.2"
derivative = "2.2"
enumset = { version = "1.0", features = ["serde"] }
env_logger = "0.10"
flate2 = "1.0"
hgvs = "0.8"
hgvs = "0.9"
indexmap = "1.9"
indicatif = "0.17"
jsonl = "4.0"
lazy_static = "1.4"
log = "0.4"
memmap2 = "0.6"
nom = "7.1"
noodles-bcf = "0.28"
noodles-bgzf = "0.22"
noodles-core = "0.11"
noodles-csi = "0.19"
noodles-fasta = "0.23"
noodles-fasta = "0.24"
noodles-tabix = "0.22"
noodles-vcf = "0.31"
parse-display = "0.8"
Expand All @@ -57,23 +59,24 @@ rand = "0.8"
rand_core = "0.6"
rocksdb = { version = "0.21", features = ["multi-threaded-cf"] }
rustc-hash = "1.1"
seqrepo = "0.5"
serde_json = "1.0"
seqrepo = "0.6"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
serde_with = "3.0"
strum = { version = "0.24", features = ["derive"] }
tempdir = "0.3"
thousands = "0.2"
tracing-subscriber = "0.3"
tracing = { version = "0.1", features = ["log"] }
tracing-subscriber = "0.3"
uuid = { version = "1.3", features = ["fast-rng", "serde"] }
zstd = "0.12"

[build-dependencies]
prost-build = "0.11.9"
prost-build = "0.11"

[dev-dependencies]
csv = "1.2.0"
insta = { version = "1.29.0", features = ["yaml"] }
pretty_assertions = "1.3.0"
rstest = "0.17.0"
temp_testdir = "0.2.3"
csv = "1.2"
insta = { version = "1.29", features = ["yaml"] }
pretty_assertions = "1.3"
rstest = "0.17"
temp_testdir = "0.2"
1 change: 0 additions & 1 deletion build.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
// The custom build script, needed as we use prost.

fn main() {
println!("cargo:rerun-if-changed=src/db/create/txs/data.proto3");
prost_build::compile_protos(&["src/db/create/txs/data.proto3"], &["src/"]).unwrap();
}
40 changes: 28 additions & 12 deletions src/annotate/seqvars/csq.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
//! Compute molecular consequence of variants.
use std::{collections::HashMap, rc::Rc};
use std::{collections::HashMap, sync::Arc};

use hgvs::{
data::interface::{Provider, TxForRegionRecord},
mapper::{
assembly::{Config as AssemblyConfig, Mapper as AssemblyMapper},
Error,
},
mapper::{assembly, Error},
parser::{
Accession, CdsFrom, GenomeInterval, GenomeLocEdit, HgvsVariant, Mu, NaEdit, ProtLocEdit,
},
Expand All @@ -34,12 +31,18 @@ pub struct VcfVariant {
pub alternative: String,
}

/// Wrap mapper, provider, and map for consequence prediction.
#[derive(derivative::Derivative)]
#[derivative(Debug)]
pub struct ConsequencePredictor {
/// The internal transcript provider for locating transcripts.
provider: Rc<MehariProvider>,
#[derivative(Debug = "ignore")]
provider: Arc<MehariProvider>,
/// Assembly mapper for variant consequence prediction.
mapper: AssemblyMapper,
#[derivative(Debug = "ignore")]
mapper: assembly::Mapper,
/// Mapping from chromosome name to accession.
#[derivative(Debug = "ignore")]
chrom_to_acc: HashMap<String, String>,
}

Expand All @@ -49,7 +52,7 @@ pub const PADDING: i32 = 5_000;
pub const ALT_ALN_METHOD: &str = "splign";

impl ConsequencePredictor {
pub fn new(provider: Rc<MehariProvider>, assembly: Assembly) -> Self {
pub fn new(provider: Arc<MehariProvider>, assembly: Assembly) -> Self {
let acc_to_chrom = provider.get_assembly_map(assembly);
let mut chrom_to_acc = HashMap::new();
for (acc, chrom) in &acc_to_chrom {
Expand All @@ -62,14 +65,14 @@ impl ConsequencePredictor {
chrom_to_acc.insert(format!("chr{}", chrom), acc.clone());
}

let config = AssemblyConfig {
let config = assembly::Config {
replace_reference: false,
strict_bounds: false,
renormalize_g: false,
genome_seq_available: false,
..Default::default()
};
let mapper = AssemblyMapper::new(config, provider.clone());
let mapper = assembly::Mapper::new(config, provider.clone());

ConsequencePredictor {
provider,
Expand Down Expand Up @@ -687,6 +690,13 @@ impl ConsequencePredictor {
}
}

impl ConsequencePredictor {
/// Return data version string (if set).
pub fn data_version(&self) -> Option<String> {
self.provider.as_ref().tx_seq_db.version.clone()
}
}

#[cfg(test)]
mod test {
use std::{fs::File, io::BufReader};
Expand All @@ -699,11 +709,17 @@ mod test {

use super::*;

#[test]
fn test_sync() {
fn is_sync<T: Sync>() {}
is_sync::<super::ConsequencePredictor>();
}

#[test]
fn annotate_snv_brca1_one_variant() -> Result<(), anyhow::Error> {
let tx_path = "tests/data/annotate/db/grch37/txs.bin.zst";
let tx_db = load_tx_db(tx_path)?;
let provider = Rc::new(MehariProvider::new(tx_db, Assembly::Grch37p10));
let provider = Arc::new(MehariProvider::new(tx_db, Assembly::Grch37p10));

let predictor = ConsequencePredictor::new(provider, Assembly::Grch37p10);

Expand Down Expand Up @@ -797,7 +813,7 @@ mod test {
fn annotate_vars(path_tsv: &str, txs: &[String]) -> Result<(), anyhow::Error> {
let tx_path = "tests/data/annotate/db/grch37/txs.bin.zst";
let tx_db = load_tx_db(tx_path)?;
let provider = Rc::new(MehariProvider::new(tx_db, Assembly::Grch37p10));
let provider = Arc::new(MehariProvider::new(tx_db, Assembly::Grch37p10));
let predictor = ConsequencePredictor::new(provider, Assembly::Grch37p10);

let mut reader = ReaderBuilder::new()
Expand Down
3 changes: 1 addition & 2 deletions src/annotate/seqvars/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ use std::fs::File;
use std::io::{BufWriter, Cursor, Read, Write};
use std::ops::Deref;
use std::path::Path;
use std::rc::Rc;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Instant;
Expand Down Expand Up @@ -1428,7 +1427,7 @@ fn run_with_writer(writer: &mut dyn AnnotatedVcfWriter, args: &Args) -> Result<(
path_component(assembly)
))?;
tracing::info!("Building transcript interval trees ...");
let provider = Rc::new(MehariProvider::new(tx_db, assembly));
let provider = Arc::new(MehariProvider::new(tx_db, assembly));
let predictor = ConsequencePredictor::new(provider, assembly);
tracing::info!("... done building transcript interval trees");

Expand Down
9 changes: 9 additions & 0 deletions src/annotate/seqvars/provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -439,3 +439,12 @@ impl ProviderInterface for MehariProvider {
}])
}
}

#[cfg(test)]
mod test {
#[test]
fn test_sync() {
fn is_sync<T: Sync>() {}
is_sync::<super::MehariProvider>();
}
}
40 changes: 39 additions & 1 deletion src/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@ pub fn trace_rss_now() {
}

/// Select the genome release to use.
#[derive(clap::ValueEnum, Clone, Copy, Debug, PartialEq, Eq)]
#[derive(
clap::ValueEnum, serde::Serialize, serde::Deserialize, Clone, Copy, Debug, PartialEq, Eq, Hash,
)]
#[serde(rename_all = "kebab-case")]
pub enum GenomeRelease {
Grch37,
Grch38,
Expand Down Expand Up @@ -76,3 +79,38 @@ pub fn reciprocal_overlap(lhs: Range<i32>, rhs: Range<i32>) -> f32 {
x1.min(x2)
}
}

/// The version of `viguno` package.
#[cfg(not(test))]
const VERSION: &str = env!("CARGO_PKG_VERSION");

/// This allows us to override the version to `0.0.0` in tests.
pub fn version() -> &'static str {
#[cfg(test)]
return "0.0.0";
#[cfg(not(test))]
return VERSION;
}

/// Version information that is returned by the HTTP server.
#[derive(serde::Serialize, serde::Deserialize, Default, Debug, Clone)]
#[serde_with::skip_serializing_none]
#[serde(rename_all = "kebab-case")]
pub struct Version {
/// Version of the transcript database data.
pub tx_db: Option<String>,
/// Version of the `mehari` package.
pub mehari: String,
}

impl Version {
/// Construct a new version.
///
/// The viguno version is filed automatically.
pub fn new(tx_db: Option<String>) -> Self {
Self {
tx_db,
mehari: version().to_string(),
}
}
}
4 changes: 4 additions & 0 deletions src/db/create/txs/data.proto3
Original file line number Diff line number Diff line change
Expand Up @@ -120,4 +120,8 @@ message TxSeqDatabase {
TranscriptDb tx_db = 1;
// Store sequence with their aliases.
SequenceDb seq_db = 2;
// The version of the database.
optional string version = 3;
// The reference assembly that this database refers to.
optional string genome_release = 4;
}
19 changes: 16 additions & 3 deletions src/db/create/txs/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ fn load_and_extract(
genes: &mut HashMap<String, models::Gene>,
transcripts: &mut HashMap<String, models::Transcript>,
genome_release: GenomeRelease,
cdot_version: &mut String,
report_file: &mut File,
) -> Result<(), anyhow::Error> {
writeln!(report_file, "genome_release\t{:?}", genome_release)?;
Expand All @@ -71,6 +72,7 @@ fn load_and_extract(
let models::Container {
genes: c_genes,
transcripts: c_txs,
cdot_version: c_version,
..
} = if json_path.extension().unwrap_or_default() == "gz" {
tracing::info!("(from gzip compressed file)");
Expand All @@ -81,6 +83,7 @@ fn load_and_extract(
tracing::info!("(from uncompressed file)");
serde_json::from_reader(std::io::BufReader::new(File::open(json_path)?))?
};
*cdot_version = c_version;
tracing::info!(
"loading / deserializing {} genes and {} transcripts from cdot took {:?}",
c_genes.len().separate_with_commas(),
Expand Down Expand Up @@ -176,6 +179,7 @@ fn build_protobuf(
seqrepo: SeqRepo,
tx_data: TranscriptData,
is_silent: bool,
genome_release: GenomeRelease,
report_file: &mut File,
) -> Result<(), anyhow::Error> {
let TranscriptData {
Expand All @@ -192,7 +196,7 @@ fn build_protobuf(
let mut tx_skipped_noseq = HashSet::new(); // skipped because of missing sequence
let mut tx_skipped_nostop = HashSet::new(); // skipped because of missing stop codon
let seq_db = {
// Insert into flatbuffer and keep track of pointers in `Vec`s.
// Insert into protobuf and keep track of pointers in `Vec`s.
let mut aliases = Vec::new();
let mut aliases_idx = Vec::new();
let mut seqs = Vec::new();
Expand Down Expand Up @@ -259,7 +263,7 @@ fn build_protobuf(
}
}

// Register sequence into flatbuffer.
// Register sequence into protobuf.
aliases.push(tx_id.clone());
aliases_idx.push(seqs.len() as u32);
seqs.push(seq.clone());
Expand Down Expand Up @@ -308,7 +312,7 @@ fn build_protobuf(
);
writeln!(
report_file,
"skip gene from flatbuffers because all transcripts have been removed\t{}",
"skip gene from protobuf because all transcripts have been removed\t{}",
gene_symbol
)?;
continue;
Expand Down Expand Up @@ -461,6 +465,8 @@ fn build_protobuf(
let tx_seq_db = data::TxSeqDatabase {
tx_db: Some(tx_db),
seq_db: Some(seq_db),
version: Some(crate::common::version().to_string()),
genome_release: Some(genome_release.name()),
};
let mut buf = Vec::new();
buf.reserve(tx_seq_db.encoded_len());
Expand Down Expand Up @@ -714,13 +720,15 @@ fn load_cdot_files(args: &Args, report_file: &mut File) -> Result<TranscriptData
let mut genes = HashMap::new();
let mut transcripts = HashMap::new();
let mut transcript_ids_for_gene = HashMap::new();
let mut cdot_version = String::new();
for json_path in &args.path_cdot_json {
load_and_extract(
json_path,
&mut transcript_ids_for_gene,
&mut genes,
&mut transcripts,
args.genome_release,
&mut cdot_version,
report_file,
)?;
}
Expand Down Expand Up @@ -766,6 +774,7 @@ pub fn run(common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Erro
seqrepo,
tx_data,
common.verbose.is_silent(),
args.genome_release,
&mut report_file,
)?;

Expand Down Expand Up @@ -795,12 +804,14 @@ pub mod test {
let mut genes = HashMap::new();
let mut transcripts = HashMap::new();
let mut transcript_ids_for_gene = HashMap::new();
let mut cdot_version = String::new();
load_and_extract(
Path::new("tests/data/db/create/txs/cdot-0.2.12.refseq.grch37_grch38.brca1_opa1.json"),
&mut transcript_ids_for_gene,
&mut genes,
&mut transcripts,
GenomeRelease::Grch37,
&mut cdot_version,
&mut report_file,
)?;

Expand All @@ -827,6 +838,8 @@ pub mod test {
.map(|s| s.as_str())
.collect::<Vec<_>>());

insta::assert_snapshot!(&cdot_version);

Ok(())
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
source: src/db/create/txs/mod.rs
expression: "&cdot_version"
---
0.2.12
Loading

0 comments on commit b8c5a7b

Please sign in to comment.