Skip to content

Commit

Permalink
fix: issues occuring when annotating real data (#28)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Mar 31, 2023
1 parent fd4c523 commit 0636d49
Show file tree
Hide file tree
Showing 8 changed files with 239 additions and 60 deletions.
4 changes: 2 additions & 2 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

version: 2
updates:
- package-ecosystem: "" # See documentation for possible values
directory: "/" # Location of package manifests
- package-ecosystem: "cargo"
directory: "/"
schedule:
interval: "weekly"
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,14 @@ clap-verbosity-flag = "2.0.0"
csv = "1.2.0"
flatbuffers = "23.1.21"
flate2 = "1.0.25"
hgvs = "0.4.0"
hgvs = "0.5.0"
lazy_static = "1.4.0"
log = "0.4.17"
noodles = { version = "0.33.0", features = ["vcf", "bcf", "csi", "fasta", "bgzf", "tabix"] }
noodles-util = { version = "0.5.0", features = ["noodles-bcf", "noodles-bgzf", "noodles-vcf", "variant"] }
procfs = "0.15.1"
rocksdb = "0.20.1"
seqrepo = "0.2.3"
seqrepo = "0.3.0"
serde = { version = "1.0.152", features = ["derive"] }
serde_json = "1.0.94"
tracing = { version = "0.1.37", features = ["log"] }
Expand Down
2 changes: 1 addition & 1 deletion docs/db_build.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ $ conda activate seqrepo
Then, download the transcript FASTA files into the current directory:

```text
$ wget https://ftp.ensembl.org/pub/grch37/release-108/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh37.cdna.all.fa.gz
$ wget https://ftp.ensembl.org/pub/grch37/release-109/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh37.cdna.all.fa.gz
$ wget https://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/human.{1..12}.rna.fna.gz \
https://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/human.files.installed
```
Expand Down
2 changes: 1 addition & 1 deletion src/annotate/seqvars/ann.rs
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,7 @@ pub enum Message {
ErrorChromosomeNotFound,
ErrorOutOfChromosomeRange,
WarningRefDoesNotMatchGenome,
WarningSequnenceNotAvailable,
WarningSequenceNotAvailable,
WarningTranscriptIncomplete,
WarningTranscriptMultipleStopCodons,
WarningTranscriptsNoStartCodon,
Expand Down
23 changes: 20 additions & 3 deletions src/annotate/seqvars/csq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ impl ConsequencePredictor {
replace_reference: false,
strict_bounds: false,
renormalize_g: false,
genome_seq_available: false,
..Default::default()
};
let mapper = AssemblyMapper::new(config, provider.clone());
Expand Down Expand Up @@ -114,7 +115,7 @@ impl ConsequencePredictor {
.map(|tx| self.build_ann_field(&var, tx, chrom_acc.clone(), var_start, var_end))
.collect::<Result<Vec<_>, _>>()?
.into_iter()
.filter_map(|x| x)
.flatten()
.collect::<Vec<_>>(),
))
}
Expand Down Expand Up @@ -363,7 +364,23 @@ impl ConsequencePredictor {

let (rank, hgvs_t, hgvs_p, tx_pos, cds_pos, protein_pos) = if !is_upstream && !is_downstream
{
let var_n = self.mapper.g_to_n(&var_g, &tx.id)?;
// Gracefully handle problems in the projection (in this case "Non-adjacent exons for ...").
// TODO: do not include such transcripts when building the tx database.
let var_n = self.mapper.g_to_n(&var_g, &tx.id).map_or_else(
|e| {
if e.to_string().contains("Non-adjacent exons for") {
Ok(None)
} else {
Err(e)
}
},
|v| Ok(Some(v)),
)?;
if var_n.is_none() {
return Ok(None);
}
let var_n = var_n.unwrap();

let tx_pos = match &var_n {
HgvsVariant::TxVariant { loc_edit, .. } => Some(Pos {
ord: loc_edit.loc.inner().start.base,
Expand Down Expand Up @@ -779,7 +796,7 @@ mod test {
annotate_vars(path_tsv, &txs)
}

fn annotate_vars(path_tsv: &str, txs: &Vec<String>) -> Result<(), anyhow::Error> {
fn annotate_vars(path_tsv: &str, txs: &[String]) -> Result<(), anyhow::Error> {
let tx_path = "tests/data/annotate/db/seqvars/grch37/txs.bin";
let tx_db = load_tx_db(tx_path, 5_000_000)?;
let provider = Rc::new(MehariProvider::new(tx_db, Assembly::Grch37p10));
Expand Down
51 changes: 30 additions & 21 deletions src/annotate/seqvars/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ use crate::annotate::seqvars::csq::{ConsequencePredictor, VcfVariant};
use crate::annotate::seqvars::provider::MehariProvider;
use crate::common::GenomeRelease;
use crate::db::create::seqvar_clinvar::serialize::Record as ClinvarRecord;
use crate::db::create::seqvar_freqs::reading::guess_assembly;
use crate::db::create::seqvar_freqs::reading::{guess_assembly, is_canonical};
use crate::db::create::seqvar_freqs::serialized::vcf::Var as VcfVar;
use crate::db::create::seqvar_freqs::serialized::{
auto::Record as AutoRecord, mt::Record as MtRecord, xy::Record as XyRecord,
Expand Down Expand Up @@ -384,7 +384,7 @@ where
);
vcf_record
.info_mut()
.insert(keys::CLINVAR_VCV.clone(), Some(Value::String(vcv.clone())));
.insert(keys::CLINVAR_VCV.clone(), Some(Value::String(vcv)));
}

Ok(())
Expand Down Expand Up @@ -653,6 +653,7 @@ fn run_with_writer<Inner: Write>(
// Perform the VCf annotation.
tracing::info!("Annotating VCF ...");
let start = Instant::now();
let mut prev = Instant::now();
let mut total_written = 0usize;

writer.write_header(&header_out)?;
Expand All @@ -664,26 +665,35 @@ fn run_with_writer<Inner: Write>(
// TODO: ignores all but the first alternative allele!
let vcf_var = VcfVar::from_vcf(&vcf_record);

// Build key for RocksDB database from `vcf_var`.
let key: Vec<u8> = vcf_var.clone().into();

// Annotate with frequency.
if CHROM_AUTO.contains(vcf_var.chrom.as_str()) {
annotate_record_auto(&db_freq, cf_autosomal, &key, &mut vcf_record)?;
} else if CHROM_XY.contains(vcf_var.chrom.as_str()) {
annotate_record_xy(&db_freq, cf_gonosomal, &key, &mut vcf_record)?;
} else if CHROM_MT.contains(vcf_var.chrom.as_str()) {
annotate_record_mt(&db_freq, cf_mtdna, &key, &mut vcf_record)?;
} else {
tracing::trace!(
"Record @{:?} on non-canonical chromosome, skipping.",
&vcf_var
);
if prev.elapsed().as_secs() >= 60 {
tracing::info!("at {:?}", &vcf_var);
prev = Instant::now();
}

// Annotate with ClinVar.
annotate_record_clinvar(&db_clinvar, cf_clinvar, &key, &mut vcf_record)?;
// Only attempt lookups into RocksDB for canonical contigs.
if is_canonical(vcf_var.chrom.as_str()) {
// Build key for RocksDB database from `vcf_var`.
let key: Vec<u8> = vcf_var.clone().into();

// Annotate with frequency.
if CHROM_AUTO.contains(vcf_var.chrom.as_str()) {
annotate_record_auto(&db_freq, cf_autosomal, &key, &mut vcf_record)?;
} else if CHROM_XY.contains(vcf_var.chrom.as_str()) {
annotate_record_xy(&db_freq, cf_gonosomal, &key, &mut vcf_record)?;
} else if CHROM_MT.contains(vcf_var.chrom.as_str()) {
annotate_record_mt(&db_freq, cf_mtdna, &key, &mut vcf_record)?;
} else {
tracing::trace!(
"Record @{:?} on non-canonical chromosome, skipping.",
&vcf_var
);
}

// Annotate with ClinVar information.
annotate_record_clinvar(&db_clinvar, cf_clinvar, &key, &mut vcf_record)?;
}

tracing::trace!("var = {:?}", &vcf_var);
let VcfVar {
chrom,
pos,
Expand All @@ -708,8 +718,7 @@ fn run_with_writer<Inner: Write>(
}
}

// Annotate with ClinVar annotation.

// Write out the record.
writer.write_record(&vcf_record)?;
} else {
break; // all done
Expand Down
8 changes: 8 additions & 0 deletions src/db/create/seqvar_freqs/reading.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,14 @@ pub const CANONICAL: &[&str] = &[
"18", "19", "20", "21", "22", "X", "Y", "M", "MT",
];

/// Return whether the given chromosome name is a canonical one.
///
/// The prefix `"chr"` is stripped from the name before checking.
pub fn is_canonical(chrom: &str) -> bool {
let chrom = chrom.strip_prefix("chr").unwrap_or(chrom);
CANONICAL.contains(&chrom)
}

/// Guess the assembly from the given header.
///
/// If the header only contains chrM, for example, the result may be ambiguous. Use `ambiguous_ok`
Expand Down
Loading

0 comments on commit 0636d49

Please sign in to comment.