diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index 5b4a924..f32cd71 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -53,8 +53,8 @@ jobs: - name: Run cargo fmt run: cargo fmt --all -- --check --verbose - # - name: rust tests - # run: cargo test --verbose --no-fail-fast +# - name: rust tests +# run: cargo test --verbose --no-fail-fast - name: build shell: bash -l {0} diff --git a/README.md b/README.md index e004d12..20fd991 100644 --- a/README.md +++ b/README.md @@ -153,29 +153,34 @@ options: ``` ## `urlsketch` -download and sketch directly from a url +download and sketch directly from URL(s) + ### Create an input file First, create a file, e.g. `acc-url.csv`, with identifiers, sketch names, and other required info. ``` -accession,name,moltype,md5sum,download_filename,url -GCA_000961135.2,GCA_000961135.2 Candidatus Aramenus sulfurataquae isolate AZ1-454,dna,47b9fb20c51f0552b87db5d44d5d4566,GCA_000961135.2_genomic.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_genomic.fna.gz -GCA_000961135.2,GCA_000961135.2 Candidatus Aramenus sulfurataquae isolate AZ1-454,protein,fb7920fb8f3cf5d6ab9b6b754a5976a4,GCA_000961135.2_protein.urlsketch.faa.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_protein.faa.gz -GCA_000175535.1,GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14,dna,a1a8f1c6dc56999c73fe298871c963d1,GCA_000175535.1_genomic.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz +accession,name,moltype,md5sum,download_filename,url,range +GCA_000961135.2,GCA_000961135.2 Candidatus Aramenus sulfurataquae isolate AZ1-454,dna,47b9fb20c51f0552b87db5d44d5d4566,GCA_000961135.2_genomic.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_genomic.fna.gz, +GCA_000961135.2,GCA_000961135.2 Candidatus Aramenus sulfurataquae isolate AZ1-454,protein,fb7920fb8f3cf5d6ab9b6b754a5976a4,GCA_000961135.2_protein.urlsketch.faa.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_protein.faa.gz, +GCA_000175535.1,GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14,dna,a1a8f1c6dc56999c73fe298871c963d1,GCA_000175535.1_genomic.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz, ``` -> Six columns must be present: +> Seven columns must be present: > - `accession` - an accession or unique identifier. Ideally no spaces. > - `name` - full name for the sketch. > - `moltype` - is the file 'dna' or 'protein'? -> - `md5sum` - expected md5sum (optional, will be checked after download if provided) +> - `md5sum` - expected md5sum(s). Optional; will be checked after download if provided. > - `download_filename` - filename for FASTA download. Required if `--keep-fastas`, but useful for signatures, too (saved in sig data). -> - `url` - direct link for the file +> - `url` - direct link(s) for the file(s) +> - `range` - if desired, include base pair range(s), e.g. 500-10000. This range will be selected from the record(s) and sketched (and/or saved to the download_filename). If there are multiple records in a FASTA file, the range will be applied to each record.
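+ +For example, this row (taken from `tests/test-data/acc-url-range.csv` in this repository) downloads a genome but sketches only its first 50kb; note that the `md5sum` is checked against the full downloaded file, before the range is applied: +``` +accession,name,moltype,md5sum,download_filename,url,range +GCA_000175535.1_first50kb,GCA_000175535.1_first50kb Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14,dna,a1a8f1c6dc56999c73fe298871c963d1,GCA_000175535.1_genomic_first50kb.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz,1-50000 +```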
+ +#### Note: Merging files into the same signature +As of v0.5.0, `urlsketch` can download and sketch multiple URLs into a single signature. If you provide multiple URLs for a single accession/name, the `md5sum` and `range` columns must either be empty or contain exactly one entry per URL. In each case, separate the entries with ';' -- e.g. "abc;def" for two md5sums. ### Run: -To run the test accession file at `tests/test-data/acc-url.csv`, run: +To run with the input file created above: ``` -sourmash scripts urlsketch tests/test-data/acc-url.csv -o test-urlsketch.zip -f out_fastas -k --failed test.failed.csv -p dna,k=21,k=31,scaled=1000,abund -p protein,k=10,scaled=100,abund -r 1 +sourmash scripts urlsketch acc-url.csv -o test-urlsketch.zip -f out_fastas -k --failed test.failed.csv -p dna,k=21,k=31,scaled=1000,abund -p protein,k=10,scaled=100,abund -r 1 ``` Full Usage: diff --git a/src/directsketch.rs b/src/directsketch.rs index 56fb5af..a5d0d96 100644 --- a/src/directsketch.rs +++ b/src/directsketch.rs @@ -1,6 +1,7 @@ use anyhow::{anyhow, bail, Context, Error, Result}; use async_zip::base::write::ZipFileWriter; use camino::Utf8PathBuf as PathBuf; +use needletail::parser::SequenceRecord; use regex::Regex; use reqwest::Client; use sourmash::collection::Collection; @@ -9,7 +10,7 @@ use std::collections::HashMap; use std::fs::{self, create_dir_all}; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; -use tokio::fs::File; +use tokio::fs::{File, OpenOptions}; use tokio::io::{AsyncWriteExt, BufWriter}; use tokio::sync::Semaphore; use tokio_util::compat::Compat; @@ -17,8 +18,8 @@ use pyo3::prelude::*; use crate::utils::{ - load_accession_info, load_gbassembly_info, AccessionData, GBAssemblyData, GenBankFileType, - InputMolType, MultiCollection, + load_accession_info, load_gbassembly_info, AccessionData, FailedChecksum, FailedDownload, + GBAssemblyData, GenBankFileType, InputMolType, MultiCollection, }; use crate::utils::buildutils::{BuildCollection, BuildManifest, MultiSelect, MultiSelection}; @@ -207,26 +208,6 @@ async fn download_with_retry( })) } -pub struct FailedDownload { - accession: String, - name: String, - moltype: String, - md5sum: Option<String>, - download_filename: Option<String>, - url: Option<Url>, -} - -pub struct FailedChecksum { - accession: String, - name: String, - moltype: String, - md5sum_url: Option<Url>, - download_filename: Option<String>, - url: Option<Url>, - expected_md5sum: Option<String>, - reason: String, -} - #[allow(clippy::too_many_arguments)] async fn dl_sketch_assembly_accession( client: &Client, @@ -245,36 +226,38 @@ async fn dl_sketch_assembly_accession( let mut download_failures = Vec::<FailedDownload>::new(); let mut checksum_failures = Vec::<FailedChecksum>::new(); - let name = accinfo.name; - let accession = accinfo.accession; + let name = accinfo.name.clone(); + let accession = accinfo.accession.clone(); // keep track of any accessions for which we fail to find URLs let (base_url, full_name) = - match fetch_genbank_filename(client, accession.as_str(), accinfo.url).await { + match fetch_genbank_filename(client, accession.as_str(), accinfo.url.clone()).await { Ok(result) => result, Err(_err) => { // Add accession to failed downloads with each moltype if !proteomes_only { - let failed_download_dna = FailedDownload { - accession: accession.clone(), - name: name.clone(), - moltype: "dna".to_string(), - md5sum: None, - download_filename: None, - url: None, - }; + let failed_download_dna = FailedDownload::from_gbassembly(
accession.clone(), + name.clone(), + "dna".to_string(), + None, // No MD5 checksum + None, // No Download filename + None, // URL of the file + None, // No range in this case + ); download_failures.push(failed_download_dna); } if !genomes_only { - let failed_download_protein = FailedDownload { - accession: accession.clone(), - name: name.clone(), - moltype: "protein".to_string(), - md5sum: None, - download_filename: None, - url: None, - }; - download_failures.push(failed_download_protein); + let failed_download_protein = FailedDownload::from_gbassembly( + accession.clone(), + name.clone(), + "protein".to_string(), + None, // No MD5 checksum + None, // No Download filename + None, // URL of the file + None, // No range in this case + ); + download_failures.push(failed_download_protein) } return Ok((empty_coll, download_failures, checksum_failures)); @@ -303,16 +286,16 @@ async fn dl_sketch_assembly_accession( // get filename, filetype info to facilitate downstream let url = file_type.url(&base_url, &full_name); let file_name = file_type.filename_to_write(&accession); - let failed_checksum_download: FailedChecksum = FailedChecksum { - accession: accession.clone(), - name: name.clone(), - moltype: file_type.moltype(), - md5sum_url: Some(md5sum_url.clone()), - download_filename: Some(file_name), - url: Some(url), - expected_md5sum: None, - reason: error_message.clone(), // write full error message - }; + let failed_checksum_download: FailedChecksum = FailedChecksum::new( + accession.clone(), + name.clone(), + file_type.moltype(), + Some(md5sum_url.clone()), + Some(file_name), + Some(url), + None, + error_message.clone(), // write full error message + ); checksum_failures.push(failed_checksum_download); } // return early from function b/c we can't check any checksums @@ -334,26 +317,27 @@ async fn dl_sketch_assembly_accession( // did we have a checksum error or a download error? 
// here --> keep track of accession errors + filetype if error_message.contains("MD5 hash does not match") { - let checksum_mismatch: FailedChecksum = FailedChecksum { - accession: accession.clone(), - name: name.clone(), - moltype: file_type.moltype(), - md5sum_url: Some(md5sum_url.clone()), - download_filename: Some(file_name.clone()), - url: Some(url.clone()), - expected_md5sum: expected_md5.cloned(), - reason: error_message.clone(), - }; + let checksum_mismatch: FailedChecksum = FailedChecksum::new( + accession.clone(), + name.clone(), + file_type.moltype(), + Some(md5sum_url.clone()), + Some(file_name.clone()), + Some(url.clone()), + expected_md5.cloned(), + error_message.clone(), + ); checksum_failures.push(checksum_mismatch); } else { - let failed_download = FailedDownload { - accession: accession.clone(), - name: name.clone(), - moltype: file_type.moltype(), - md5sum: expected_md5.map(|x| x.to_string()), - download_filename: Some(file_name), - url: Some(url), - }; + let failed_download = FailedDownload::from_gbassembly( + accession.clone(), + name.clone(), + file_type.moltype(), + expected_md5.map(|x| x.to_string()), // single MD5 checksum + Some(file_name), // intended download filename + Some(url), // URL of the file + None, // No range + ); download_failures.push(failed_download); } continue; @@ -368,10 +352,16 @@ async fn dl_sketch_assembly_accession( // sketch data match file_type { GenBankFileType::Genomic => { - sigs.build_sigs_from_data(data, "DNA", name.clone(), file_name.clone())?; + sigs.build_sigs_from_data(data, "DNA", name.clone(), file_name.clone(), None)?; } GenBankFileType::Protein => { - sigs.build_sigs_from_data(data, "protein", name.clone(), file_name.clone())?; + sigs.build_sigs_from_data( + data, + "protein", + name.clone(), + file_name.clone(), + None, + )?; } _ => {} // Do nothing for other file types }; @@ -381,83 +371,205 @@ async fn dl_sketch_assembly_accession( Ok((sigs, download_failures, checksum_failures)) } +/// Extracts the specified range from sequences in data and writes to the file in FASTA format. +async fn process_and_write_range( + data: &[u8], + file: &mut File, + range: Option<(usize, usize)>, +) -> Result<()> { + let cursor = std::io::Cursor::new(data); + let mut fastx_reader = + needletail::parse_fastx_reader(cursor).context("Failed to parse FASTA/FASTQ data")?; + + while let Some(record) = fastx_reader.next() { + let record = record.context("Failed to read record")?; + let sequence_to_write = extract_range_from_record(&record, range) + .context("Failed to extract range from record")?; + + // Use the `id` and `seq` fields directly to construct the FASTA entry + let fasta_entry = format!( + ">{}\n{}\n", + String::from_utf8_lossy(record.id()), + String::from_utf8_lossy(&sequence_to_write) + ); + + // Write the FASTA entry to the file + file.write_all(fasta_entry.as_bytes()) + .await + .context("Failed to write FASTA entry to file")?; + } + + Ok(()) +} + +/// Extracts a range from a `SequenceRecord`. Returns the specified sequence slice as a `Vec<u8>`.
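+/// Ranges are interpreted as 1-based, inclusive coordinates: `(start, end)` selects bases +/// `start..=end`, so `1-50000` keeps the first 50,000 bases; a `None` range returns the full sequence.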
+fn extract_range_from_record( + record: &SequenceRecord, + range: Option<(usize, usize)>, +) -> Result<Vec<u8>> { + let full_sequence = record.seq(); + if let Some((start, end)) = range { + let adjusted_start = start.saturating_sub(1); // Adjust for 1-based indexing + if adjusted_start >= end || end > full_sequence.len() { + return Err(anyhow::anyhow!( + "Invalid range: start={}, end={}, sequence length={}", + start, + end, + full_sequence.len() + )); + } + Ok(full_sequence[adjusted_start..end].to_vec()) + } else { + Ok(full_sequence.to_vec()) + } +} + +/// Opens a file for writing, creating necessary directories and truncating it if it exists. +/// Returns an `Option<File>` if a filename is provided, or `None` if the filename is `None`. +async fn open_file_for_writing( + location: &PathBuf, + filename: Option<&String>, +) -> Result<Option<File>> { + if let Some(download_filename) = filename { + let path = location.join(download_filename); + + // Create subdirectories if needed + if let Some(parent) = path.parent() { + create_dir_all(parent).with_context(|| { + format!( + "Failed to create directories for download filename path {}", + &path + ) + })?; + } + + // Open the file in write mode (truncate if it exists) + let file = OpenOptions::new() + .create(true) // Create the file if it doesn't exist + .write(true) // Enable write mode + .truncate(true) // Clear existing content + .open(&path) + .await + .with_context(|| format!("Failed to open file at {}", path))?; + Ok(Some(file)) + } else { + Ok(None) + } +} + #[allow(clippy::too_many_arguments)] async fn dl_sketch_url( client: &Client, accinfo: AccessionData, location: &PathBuf, retry: Option<u32>, - _keep_fastas: bool, + keep_fastas: bool, mut sigs: BuildCollection, _genomes_only: bool, _proteomes_only: bool, download_only: bool, + write_checksum_fail: bool, ) -> Result<(BuildCollection, Vec<FailedDownload>, Vec<FailedChecksum>)> { let retry_count = retry.unwrap_or(3); // Default retry count let empty_coll = BuildCollection::new(); let mut download_failures = Vec::<FailedDownload>::new(); let mut checksum_failures = Vec::<FailedChecksum>::new(); - let name = accinfo.name; - let accession = accinfo.accession; - let url = accinfo.url; - let expected_md5 = accinfo.expected_md5sum; - let download_filename = accinfo.download_filename; - let moltype = accinfo.moltype; - - match download_with_retry(client, &url, expected_md5.as_deref(), retry_count).await { - Ok(data) => { - // check keep_fastas instead?? - if let Some(ref download_filename) = download_filename { - let path = location.join(download_filename); - fs::write(path, &data).context("Failed to write data to file")?; - } - if !download_only { - let filename = download_filename.clone().unwrap_or("".to_string()); - // sketch data + let name = accinfo.name.clone(); + let accession = accinfo.accession.clone(); + let download_filename = &accinfo.download_filename; + let filename = download_filename.clone().unwrap_or("".to_string()); + let moltype = &accinfo.moltype; - match moltype { - InputMolType::Dna => { - sigs.build_sigs_from_data(data, "DNA", name.clone(), filename.clone())?; - } - InputMolType::Protein => { - sigs.build_sigs_from_data(data, "protein", name.clone(), filename.clone())?; + let mut file: Option<File> = if keep_fastas { + open_file_for_writing(location, download_filename.as_ref()).await? + } else { + None + }; + + // are we merging files?
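+ // If multiple URLs were provided for this row, each download below is appended to the same + // output file and added to the same BuildCollection, so the resulting signature covers all + // of the URLs together.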
+ let merged_sample: bool = accinfo.url_info.len() > 1; + for uinfo in &accinfo.url_info { + let url = &uinfo.url; + let expected_md5 = &uinfo.md5sum; + let range = uinfo.range; + match download_with_retry(client, url, expected_md5.as_deref(), retry_count).await { + Ok(data) => { + // Write to file if keep_fastas is true and a file is open + // note, if multiple urls are provided, this will append to the same file + if let Some(file) = file.as_mut() { + if range.is_some() { + process_and_write_range(&data, file, range) + .await + .context("Failed to process and write range to file")?; + } else { + // Write the entire data if no range is provided + file.write_all(&data) + .await + .context("Failed to write data to file")?; } - }; + } + + if !download_only { + // sketch data + + match moltype { + InputMolType::Dna => { + sigs.build_sigs_from_data( + data, + "DNA", + name.clone(), + filename.clone(), + range, + )?; + } + InputMolType::Protein => { + sigs.build_sigs_from_data( + data, + "protein", + name.clone(), + filename.clone(), + range, + )?; + } + }; + } } - } - Err(err) => { - let error_message = err.to_string(); - // did we have a checksum error or a download error? - // here --> keep track of accession errors + filetype - if error_message.contains("MD5 hash does not match") { - let checksum_mismatch: FailedChecksum = FailedChecksum { - accession: accession.clone(), - name: name.clone(), - moltype: moltype.to_string(), - md5sum_url: None, - download_filename, - url: Some(url.clone()), - expected_md5sum: expected_md5.clone(), - reason: error_message.clone(), - }; - checksum_failures.push(checksum_mismatch); - sigs = empty_coll; - } else { - let failed_download = FailedDownload { - accession: accession.clone(), - name: name.clone(), - moltype: moltype.to_string(), - md5sum: expected_md5.map(|x| x.to_string()), - download_filename, - url: Some(url), - }; - download_failures.push(failed_download); - sigs = empty_coll; + Err(err) => { + let error_message = err.to_string(); + // did we have a checksum error or a download error? + // here --> keep track of accession errors + filetype + if error_message.contains("MD5 hash does not match") && write_checksum_fail { + let checksum_mismatch: FailedChecksum = FailedChecksum::new( + accession.clone(), + name.clone(), + moltype.to_string(), + None, + download_filename.clone(), + Some(url.clone()), + expected_md5.clone(), + error_message.clone(), + ); + checksum_failures.push(checksum_mismatch); + // if this is a merged sample, the checksum failure is only for one part of it. + // also write a download failure, which is the full entry. + // The checksum failures file is mostly for debugging, while the failure csv + // can be used to re-run urlsketch. 
+ if merged_sample { + download_failures.push(FailedDownload::from_accession_data(&accinfo)); + } + } else { + download_failures.push(FailedDownload::from_accession_data(&accinfo)); + } + // Clear signatures and return immediately on failure + return Ok((empty_coll, download_failures, checksum_failures)); } } } + // Update signature info + sigs.update_info(name, filename); + Ok((sigs, download_failures, checksum_failures)) } @@ -705,43 +817,25 @@ pub fn failures_handle( Ok(file) => { let mut writer = BufWriter::new(file); - // Attempt to write CSV headers + // Write CSV header if let Err(e) = writer - .write_all(b"accession,name,moltype,md5sum,download_filename,url\n") + .write_all(FailedDownload::csv_header().as_bytes()) .await { let error = Error::new(e).context("Failed to write headers"); let _ = error_sender.send(error).await; - return; // Exit the task early after reporting the error + return; } - - while let Some(FailedDownload { - accession, - name, - md5sum, - download_filename, - url, - moltype, - }) = recv_failed.recv().await - { - let record = format!( - "{},{},{},{},{},{}\n", - accession, - name, - moltype, - md5sum.unwrap_or("".to_string()), - download_filename.unwrap_or("".to_string()), - url.map(|u| u.to_string()).unwrap_or("".to_string()) - ); - // Attempt to write each record - if let Err(e) = writer.write_all(record.as_bytes()).await { + while let Some(failed_download) = recv_failed.recv().await { + // Write the FailedDownload to the CSV writer + if let Err(e) = failed_download.to_writer(&mut writer).await { let error = Error::new(e).context("Failed to write record"); let _ = error_sender.send(error).await; - continue; // Optionally continue to try to write next records + continue; } } - // Attempt to flush the writer + // Flush the writer if let Err(e) = writer.flush().await { let error = Error::new(e).context("Failed to flush writer"); let _ = error_sender.send(error).await; @@ -768,41 +862,20 @@ pub fn checksum_failures_handle( // Attempt to write CSV headers if let Err(e) = writer - .write_all(b"accession,name,moltype,md5sum_url,download_filename,url,expected_md5sum,reason\n") + .write_all(FailedChecksum::csv_header().as_bytes()) .await { let error = Error::new(e).context("Failed to write headers"); let _ = error_sender.send(error).await; - return; // Exit the task early after reporting the error + return; } - while let Some(FailedChecksum { - accession, - name, - moltype, - md5sum_url, - download_filename, - url, - expected_md5sum, - reason, - }) = recv_failed.recv().await - { - let record = format!( - "{},{},{},{},{},{},{},{}\n", - accession, - name, - moltype, - md5sum_url.map(|u| u.to_string()).unwrap_or("".to_string()), - download_filename.unwrap_or("".to_string()), - url.map(|u| u.to_string()).unwrap_or("".to_string()), - expected_md5sum.unwrap_or("".to_string()), - reason, - ); - // Attempt to write each record - if let Err(e) = writer.write_all(record.as_bytes()).await { + // Write each failed checksum record + while let Some(failed_checksum) = recv_failed.recv().await { + if let Err(e) = failed_checksum.to_writer(&mut writer).await { let error = Error::new(e).context("Failed to write failed checksum record"); let _ = error_sender.send(error).await; - continue; // continue to try to write next records + continue; } } @@ -1233,6 +1306,16 @@ pub async fn urlsketch( sigs.filter_by_manifest(existing_manifest); } } + // eliminate sigs that won't be added to based on moltype + // this assumes no translation --> modify as needed if adding that. 
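+ // e.g. a 'dna' row can only contribute to DNA-based sketches, so protein param strings are + // dropped here (and vice versa); if nothing is left to build and we are not in download-only + // mode, skip this row entirely.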
+ if accinfo.moltype == InputMolType::Dna { + sigs.select(&dna_multiselection)?; + } else { + sigs.select(&protein_multiselection)?; + } + if sigs.is_empty() && !download_only { + continue; + } // eliminate sigs that won't be added to based on moltype // this assumes no translation --> modify as needed if adding that. @@ -1252,6 +1335,7 @@ pub async fn urlsketch( let checksum_send_failed = send_failed_checksums.clone(); let download_path_clone = download_path.clone(); // Clone the path for each task let send_errors = error_sender.clone(); + let write_checksum_fail = write_failed_checksums; tokio::spawn(async move { let _permit = semaphore_clone.acquire().await; @@ -1276,6 +1360,7 @@ pub async fn urlsketch( genomes_only, proteomes_only, download_only, + write_checksum_fail, ) .await; match result { @@ -1292,28 +1377,11 @@ pub async fn urlsketch( let _ = send_errors.send(e.into()).await; // Send the error through the channel } } - if write_failed_checksums { - for fail in failed_checksums { - if let Err(e) = checksum_send_failed.send(fail).await { - eprintln!("Failed to send failed checksum info: {}", e); - let _ = send_errors.send(e.into()).await; // Send the error through the channel - } - } - } else { - // if we don't have a failed checksum file, convert to failed downloads + write there - for fail in failed_checksums { - let dl_fail: FailedDownload = FailedDownload { - accession: fail.accession, - name: fail.name, - moltype: fail.moltype, - md5sum: fail.expected_md5sum, - download_filename: fail.download_filename, - url: fail.url, - }; - if let Err(e) = send_failed.send(dl_fail).await { - eprintln!("Failed to send failed download info: {}", e); - let _ = send_errors.send(e.into()).await; // Send the error through the channel - } + for fail in failed_checksums { + if let Err(e) = checksum_send_failed.send(fail).await { + eprintln!("Failed to send failed checksum info: {}", e); + let _ = send_errors.send(e.into()).await; // Send the error through the channel } } } diff --git a/src/utils/buildutils.rs b/src/utils/buildutils.rs index c99ed26..595bfd8 100644 --- a/src/utils/buildutils.rs +++ b/src/utils/buildutils.rs @@ -759,14 +759,33 @@ impl BuildCollection { &mut self, input_moltype: &str, record: &SequenceRecord, + range: Option<(usize, usize)>, ) -> Result<()> { + // Get the full sequence and apply the range if provided + let full_sequence = record.seq(); + let sequence_to_process = if let Some((start, end)) = range { + // Adjust for 1-based input: start - 1, end remains unchanged + let adjusted_start = start.saturating_sub(1); // Ensure no underflow + if adjusted_start >= end || end > full_sequence.len() { + return Err(anyhow::anyhow!( + "Invalid range: start={}, end={}, sequence length={}", + start, + end, + full_sequence.len() + )); + } + &full_sequence[adjusted_start..end] + } else { + &full_sequence + }; + // add seq to sigs self.iter_mut().try_for_each(|(rec, sig)| { if input_moltype == "protein" && (rec.moltype() == HashFunctions::Murmur64Protein || rec.moltype() == HashFunctions::Murmur64Dayhoff || rec.moltype() == HashFunctions::Murmur64Hp) { - sig.add_protein(&record.seq()) + sig.add_protein(sequence_to_process) .context("Failed to add protein")?; if !rec.sequence_added { rec.sequence_added = true; @@ -776,7 +795,7 @@ impl BuildCollection { || rec.moltype() == HashFunctions::Murmur64Skipm2n3 || rec.moltype() == HashFunctions::Murmur64Skipm1n3) { - sig.add_sequence(&record.seq(), true) + sig.add_sequence(sequence_to_process, true)
.context("Failed to add sequence")?; if !rec.sequence_added { rec.sequence_added = true; @@ -792,6 +811,7 @@ impl BuildCollection { input_moltype: &str, name: String, filename: String, + range: Option<(usize, usize)>, ) -> Result<()> { let cursor = Cursor::new(data); let mut fastx_reader = @@ -800,7 +820,7 @@ impl BuildCollection { // Iterate over FASTA records and add sequences/proteins to sigs while let Some(record) = fastx_reader.next() { let record = record.context("Failed to read record")?; - self.build_sigs_from_record(input_moltype, &record)?; + self.build_sigs_from_record(input_moltype, &record, range)?; } // After processing sequences, update sig, record information @@ -814,6 +834,7 @@ impl BuildCollection { input_moltype: &str, // "protein" or "DNA" name: String, filename: String, + range: Option<(usize, usize)>, ) -> Result { // Create a FASTX reader from the file or stdin let mut fastx_reader = if filename == "-" { @@ -830,7 +851,7 @@ impl BuildCollection { while let Some(record_result) = fastx_reader.next() { let record = record_result.context("Failed to read a record from input")?; - self.build_sigs_from_record(input_moltype, &record)?; + self.build_sigs_from_record(input_moltype, &record, range)?; record_count += 1; } @@ -847,8 +868,9 @@ impl BuildCollection { record: SequenceRecord, input_moltype: &str, // (protein/dna); todo - use hashfns? filename: String, + range: Option<(usize, usize)>, ) -> Result<()> { - self.build_sigs_from_record(input_moltype, &record)?; + self.build_sigs_from_record(input_moltype, &record, range)?; // After processing sequences, update sig, record information let record_name = std::str::from_utf8(record.id()) .expect("could not get record id") diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 6c7fa18..efa326c 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -3,6 +3,7 @@ use reqwest::Url; use sourmash::collection::Collection; use std::collections::HashMap; use std::fmt; +use tokio::io::AsyncWriteExt; pub mod buildutils; use crate::utils::buildutils::{BuildManifest, BuildRecord}; @@ -85,15 +86,22 @@ impl GenBankFileType { } } } + #[allow(dead_code)] #[derive(Clone)] pub struct AccessionData { pub accession: String, pub name: String, pub moltype: InputMolType, + pub url_info: Vec, + pub download_filename: Option, // Need to require this if --keep-fastas are used +} + +#[derive(Clone)] +pub struct UrlInfo { pub url: reqwest::Url, - pub expected_md5sum: Option, - pub download_filename: Option, // need to require this if --keep-fastas are used + pub md5sum: Option, + pub range: Option<(usize, usize)>, } #[derive(Clone)] @@ -174,6 +182,116 @@ pub fn load_gbassembly_info(input_csv: String) -> Result<(Vec, u Ok((results, row_count)) } +fn parse_urls(url_field: Option<&str>) -> Result, anyhow::Error> { + let url_field = url_field.ok_or_else(|| anyhow!("Missing 'url' field"))?; + + let mut urls = Vec::new(); + + for s in url_field.split(';').map(|s| s.trim()) { + if s.is_empty() { + return Err(anyhow!("Empty URL entry found in 'url' field")); + } + + let parsed_url = + reqwest::Url::parse(s).map_err(|e| anyhow!("Invalid URL '{}': {}", s, e))?; + urls.push(parsed_url); + } + + if urls.is_empty() { + return Err(anyhow!("No valid URLs found in 'url' field")); + } + + Ok(urls) +} + +fn parse_md5sums( + md5sum_field: &str, + expected_num_urls: usize, + accession: &str, +) -> Result<(Vec>, usize), anyhow::Error> { + if md5sum_field.trim().is_empty() { + // Return a vector of None for each expected URL and a count of 0 + return Ok((vec![None; 
expected_num_urls], 0)); + } + + let md5sums: Vec<Option<String>> = md5sum_field + .split(';') + .map(|s| { + let trimmed = s.trim(); + if trimmed.is_empty() { + None + } else { + Some(trimmed.to_string()) + } + }) + .collect(); + + // Validate the number of MD5 sums matches the expected number of URLs + if md5sums.len() != expected_num_urls { + return Err(anyhow::anyhow!( + "Number of MD5 sums ({}) does not match the number of URLs ({}) for accession '{}'", + md5sums.len(), + expected_num_urls, + accession + )); + } + + // Count the number of non-None MD5 sums + let count = md5sums.iter().filter(|md5| md5.is_some()).count(); + + Ok((md5sums, count)) +} + +fn parse_ranges( + range_field: &str, + expected_num_ranges: usize, +) -> Result<Vec<Option<(usize, usize)>>, String> { + if range_field.trim().is_empty() { + // Return a vector of None for each expected range + return Ok(vec![None; expected_num_ranges]); + } + + let ranges: Vec<&str> = range_field.split(';').collect(); + + // Check if the number of ranges matches expected_num_ranges + if ranges.len() != expected_num_ranges { + return Err(format!( + "Number of ranges ({}) does not match expected number of ranges ({})", + ranges.len(), + expected_num_ranges + )); + } + + ranges + .into_iter() + .map(|s| { + let s = s.trim(); // Trim whitespace + if s.is_empty() { + return Ok(None); // Treat empty range as None + } + let parts: Vec<&str> = s.split('-').collect(); + if parts.len() == 2 { + let start = parts[0] + .parse::<usize>() + .map_err(|_| format!("Invalid start value in range: {}", s))?; + let end = parts[1] + .parse::<usize>() + .map_err(|_| format!("Invalid end value in range: {}", s))?; + if start < end { + Ok(Some((start, end))) // Return Some for valid ranges + } else { + Err(format!( + "Start value must be less than end value in range: {}", + s + )) + } + } else { + Err(format!("Invalid range format: {}", s)) + } + }) + .collect() +} + pub fn load_accession_info( input_csv: String, keep_fasta: bool, @@ -195,6 +313,7 @@ pub fn load_accession_info( "md5sum", "download_filename", "url", + "range", ]; if header != expected_header { return Err(anyhow!( @@ -227,36 +346,47 @@ pub fn load_accession_info( .ok_or_else(|| anyhow!("Missing 'moltype' field"))? .parse::<InputMolType>() .map_err(|_| anyhow!("Invalid 'moltype' value"))?; - let expected_md5sum = record.get(3).map(|s| s.to_string()); + + // Parse URLs + let url_result = parse_urls(record.get(5)); + let urls = match url_result { + Ok(urls) => { + if urls.is_empty() { + return Err(anyhow!("No valid URLs found in 'url' field")); + } + urls + } + Err(e) => return Err(e), // Propagate the error if parsing fails + }; + + // Parse MD5sums (optional) + let (md5sums, md5sum_count_in_row) = + parse_md5sums(record.get(3).unwrap_or(""), urls.len(), &acc)?; + // Update the overall MD5 sum count + md5sum_count += md5sum_count_in_row; + + // Parse ranges (optional) + let range_field = record.get(6).unwrap_or(""); + let ranges = parse_ranges(range_field, urls.len()).map_err(|e| anyhow!("{}", e))?; + + // Combine URLs, MD5 sums, and ranges into UrlInfo + let url_info: Vec<UrlInfo> = urls + .into_iter() + .zip(md5sums) + .zip(ranges) + .map(|((url, md5sum), range)| UrlInfo { url, md5sum, range }) + .collect(); + let download_filename = record.get(4).map(|s| s.to_string()); if keep_fasta && download_filename.is_none() { return Err(anyhow!("Missing 'download_filename' field")); } - let url = record - .get(5) - .ok_or_else(|| anyhow!("Missing 'url' field"))?
- .split(',') - .filter_map(|s| { - if s.starts_with("http://") || s.starts_with("https://") || s.starts_with("ftp://") - { - reqwest::Url::parse(s).ok() - } else { - None - } - }) - .next() - .ok_or_else(|| anyhow!("Invalid 'url' value"))?; - // count entries with url and md5sum - if expected_md5sum.is_some() { - md5sum_count += 1; - } // store accession data results.push(AccessionData { accession: acc, name, moltype, - url, - expected_md5sum, + url_info, download_filename, }); } @@ -312,3 +442,620 @@ impl MultiCollection { records_map } } +#[derive(Clone)] +pub struct FailedDownload { + accession: String, + name: String, + moltype: String, + md5sum: String, + download_filename: String, + url: String, + range: String, +} + +impl FailedDownload { + /// Build a `FailedDownload` from `GBAssemblyData` with detailed information + pub fn from_gbassembly( + accession: String, + name: String, + moltype: String, + md5sum: Option<String>, // Single MD5 checksum + download_filename: Option<String>, // Download filename + url: Option<Url>, // URL for the file + range: Option<(usize, usize)>, // Optional range for the download + ) -> Self { + Self { + accession, + name, + moltype, + md5sum: md5sum.unwrap_or_default(), + download_filename: download_filename.unwrap_or_default(), + url: url.map(|u| u.to_string()).unwrap_or_default(), + range: range + .map(|(start, end)| format!("{}-{}", start, end)) + .unwrap_or_default(), // Format range or use "" + } + } + + fn parse_to_separated_string<F, T>(url_info: &[UrlInfo], mut extractor: F) -> String + where + F: FnMut(&UrlInfo) -> Option<T>, + T: ToString, + { + let results: Vec<String> = url_info + .iter() + .map(|info| extractor(info).map_or("".to_string(), |v| v.to_string())) // Map `None` to empty string + .collect(); + + if results.iter().all(|entry| entry.is_empty()) { + "".to_string() // If all entries are empty, return `""` + } else { + results.join(";") // Otherwise, join with `;` + } + } + + /// Build a `FailedDownload` from `AccessionData` + pub fn from_accession_data(acc_data: &AccessionData) -> Self { + Self { + accession: acc_data.accession.clone(), + name: acc_data.name.clone(), + moltype: acc_data.moltype.to_string(), + md5sum: Self::parse_to_separated_string(&acc_data.url_info, |info| info.md5sum.clone()), + download_filename: acc_data.download_filename.clone().unwrap_or_default(), + url: Self::parse_to_separated_string(&acc_data.url_info, |info| { + Some(info.url.to_string()) + }), + range: Self::parse_to_separated_string(&acc_data.url_info, |info| { + info.range.map(|(start, end)| format!("{}-{}", start, end)) + }), + } + } + + pub fn to_csv_record(&self) -> String { + format!( + "{},{},{},{},{},{},{}\n", + self.accession, + self.name, + self.moltype, + self.md5sum, + self.download_filename, + self.url, + self.range, + ) + } + + pub fn csv_header() -> &'static str { + "accession,name,moltype,md5sum,download_filename,url,range\n" + } + + /// Write a `FailedDownload` to a CSV writer + pub async fn to_writer<W: AsyncWriteExt + Unpin>( + &self, + writer: &mut W, + ) -> Result<(), std::io::Error> { + writer.write_all(self.to_csv_record().as_bytes()).await + } +} + +pub struct FailedChecksum { + accession: String, + name: String, + moltype: String, + md5sum_url: Option<Url>, + download_filename: Option<String>, + url: Option<Url>, + expected_md5sum: Option<String>, + reason: String, +} + +impl FailedChecksum { + #[allow(clippy::too_many_arguments)] + pub fn new( + accession: String, + name: String, + moltype: String, + md5sum_url: Option<Url>, + download_filename: Option<String>, + url: Option<Url>, + expected_md5sum: Option<String>, + reason: String, + ) ->
Self { + Self { + accession, + name, + moltype, + md5sum_url, + download_filename, + url, + expected_md5sum, + reason, + } + } + + /// Convert a `FailedChecksum` to a CSV-formatted string + pub fn to_csv_record(&self) -> String { + let md5sum_url_str = self + .md5sum_url + .as_ref() + .map(|u| u.to_string()) + .unwrap_or_default(); + + let url_str = self.url.as_ref().map(|u| u.to_string()).unwrap_or_default(); + + format!( + "{},{},{},{},{},{},{},{}\n", + self.accession, + self.name, + self.moltype, + md5sum_url_str, + self.download_filename.clone().unwrap_or_default(), + url_str, + self.expected_md5sum.clone().unwrap_or_default(), + self.reason, + ) + } + + /// Get the CSV header for a `FailedChecksum` + pub fn csv_header() -> &'static str { + "accession,name,moltype,md5sum_url,download_filename,url,expected_md5sum,reason\n" + } + + /// Write a `FailedChecksum` to a CSV writer + pub async fn to_writer<W: AsyncWriteExt + Unpin>( + &self, + writer: &mut W, + ) -> Result<(), std::io::Error> { + writer.write_all(self.to_csv_record().as_bytes()).await + } +} + +#[cfg(test)] +mod tests { + use super::*; + use reqwest::Url; + + #[test] + fn test_parse_urls_valid_urls() { + let url_field = Some("http://example.com; https://example.org"); + let result = parse_urls(url_field).unwrap(); + + assert_eq!( + result, + vec![ + Url::parse("http://example.com").unwrap(), + Url::parse("https://example.org").unwrap() + ] + ); + } + + #[test] + fn test_parse_urls_with_whitespace() { + let url_field = Some(" http://example.com ; https://example.org "); + let result = parse_urls(url_field).unwrap(); + + assert_eq!( + result, + vec![ + Url::parse("http://example.com").unwrap(), + Url::parse("https://example.org").unwrap() + ] + ); + } + + #[test] + fn test_parse_urls_with_empty_entries() { + let url_field = Some("http://example.com;;https://example.org"); + let result = parse_urls(url_field); + + assert!(result.is_err()); + assert_eq!( + result.unwrap_err().to_string(), + "Empty URL entry found in 'url' field" + ); + } + + #[test] + fn test_parse_urls_invalid_url() { + let url_field = Some("http://example.com; invalid-url; https://example.org"); + let result = parse_urls(url_field); + + assert!(result.is_err()); + assert_eq!( + result.unwrap_err().to_string(), + "Invalid URL 'invalid-url': relative URL without a base" + ); + } + + #[test] + fn test_parse_urls_empty_field() { + let url_field = Some(""); + let result = parse_urls(url_field); + + assert!(result.is_err()); + assert_eq!( + result.unwrap_err().to_string(), + "Empty URL entry found in 'url' field" + ); + } + + #[test] + fn test_parse_urls_missing_field() { + let url_field = None; + let result = parse_urls(url_field); + + assert!(result.is_err()); + assert_eq!(result.unwrap_err().to_string(), "Missing 'url' field"); + } + + #[test] + fn test_parse_urls_all_invalid() { + let url_field = Some("invalid-url; still-not-a-url"); + let result = parse_urls(url_field); + + assert!(result.is_err()); + assert_eq!( + result.unwrap_err().to_string(), + "Invalid URL 'invalid-url': relative URL without a base" + ); + } + + #[test] + fn test_parse_ranges_valid() { + let range_field = "1-10;20-30;40-50"; + let expected_num_ranges = 3; + let result = parse_ranges(range_field, expected_num_ranges); + + assert!(result.is_ok()); + assert_eq!( + result.unwrap(), + vec![Some((1, 10)), Some((20, 30)), Some((40, 50))] + ); + } + + #[test] + fn test_parse_ranges_empty_field() { + let range_field = " "; + let expected_num_ranges = 3; + let result = parse_ranges(range_field, expected_num_ranges); + 
assert!(result.is_ok()); + assert_eq!(result.unwrap(), vec![None, None, None]); + } + + #[test] + fn test_parse_ranges_neg_start() { + let range_field = "1-10;-20-30"; + let expected_num_ranges = 2; + let result = parse_ranges(range_field, expected_num_ranges); + + assert!(result.is_err()); + assert_eq!(result.unwrap_err(), "Invalid range format: -20-30"); + } + + #[test] + fn test_parse_ranges_invalid_start() { + let range_field = "1-10;bar-30"; + let expected_num_ranges = 2; + let result = parse_ranges(range_field, expected_num_ranges); + + assert!(result.is_err()); + assert_eq!(result.unwrap_err(), "Invalid start value in range: bar-30"); + } + + #[test] + fn test_parse_ranges_invalid_end() { + let range_field = "1-10;20-bar"; + let expected_num_ranges = 2; + let result = parse_ranges(range_field, expected_num_ranges); + + assert!(result.is_err()); + assert_eq!(result.unwrap_err(), "Invalid end value in range: 20-bar"); + } + + #[test] + fn test_parse_ranges_start_not_less_than_end() { + let range_field = "30-10"; + let expected_num_ranges = 1; + let result = parse_ranges(range_field, expected_num_ranges); + + assert!(result.is_err()); + assert_eq!( + result.unwrap_err(), + "Start value must be less than end value in range: 30-10" + ); + } + + #[test] + fn test_parse_ranges_extra_ranges() { + let range_field = "1-10;20-30;40-50"; + let expected_num_ranges = 5; // Expecting more ranges than provided + let result = parse_ranges(range_field, expected_num_ranges); + + assert!(result.is_err()); // Now expecting an error + assert_eq!( + result.unwrap_err(), + "Number of ranges (3) does not match expected number of ranges (5)" + ); + } + + #[test] + fn test_parse_ranges_fewer_ranges() { + let range_field = "1-10;20-30"; + let expected_num_ranges = 3; // Expecting more ranges than provided + let result = parse_ranges(range_field, expected_num_ranges); + + assert!(result.is_err()); // Now expecting an error + assert_eq!( + result.unwrap_err(), + "Number of ranges (2) does not match expected number of ranges (3)" + ); + } + + #[test] + fn test_parse_ranges_with_empty_values() { + let range_field = "1-10;;20-30"; + let expected_num_ranges = 3; + let result = parse_ranges(range_field, expected_num_ranges).unwrap(); + + assert_eq!(result, vec![Some((1, 10)), None, Some((20, 30))]); + } + + #[test] + fn test_parse_md5sums_valid() { + let md5sum_field = "abcd1234;efgh5678;ijkl9012"; + let expected_num_urls = 3; + let accession = "ACC123"; + + let result = parse_md5sums(md5sum_field, expected_num_urls, accession).unwrap(); + assert_eq!( + result, + ( + vec![ + Some("abcd1234".to_string()), + Some("efgh5678".to_string()), + Some("ijkl9012".to_string()) + ], + 3 + ) + ); + } + + #[test] + fn test_parse_md5sums_empty_field() { + let md5sum_field = ""; + let expected_num_urls = 2; + let accession = "ACC123"; + + let result = parse_md5sums(md5sum_field, expected_num_urls, accession).unwrap(); + assert_eq!(result, (vec![None, None], 0)); + } + + #[test] + fn test_parse_md5sums_mismatched_count_more_md5s() { + let md5sum_field = "abcd1234;efgh5678;ijkl9012"; + let expected_num_urls = 2; // Fewer URLs than MD5 sums + let accession = "ACC123"; + + let result = parse_md5sums(md5sum_field, expected_num_urls, accession); + assert!(result.is_err()); + assert_eq!( + result.unwrap_err().to_string(), + "Number of MD5 sums (3) does not match the number of URLs (2) for accession 'ACC123'" + ); + } + + #[test] + fn test_parse_md5sums_mismatched_count_fewer_md5s() { + let md5sum_field = "abcd1234;efgh5678"; + let 
expected_num_urls = 3; // More URLs than MD5 sums + let accession = "ACC123"; + + let result = parse_md5sums(md5sum_field, expected_num_urls, accession); + assert!(result.is_err()); + assert_eq!( + result.unwrap_err().to_string(), + "Number of MD5 sums (2) does not match the number of URLs (3) for accession 'ACC123'" + ); + } + + #[test] + fn test_parse_md5sums_with_whitespace() { + let md5sum_field = " abcd1234 ; efgh5678 ; ijkl9012 "; + let expected_num_urls = 3; + let accession = "ACC123"; + + let result = parse_md5sums(md5sum_field, expected_num_urls, accession).unwrap(); + assert_eq!( + result, + ( + vec![ + Some("abcd1234".to_string()), + Some("efgh5678".to_string()), + Some("ijkl9012".to_string()) + ], + 3 + ) + ); + } + + #[test] + fn test_parse_md5sums_some_empty_entries() { + let md5sum_field = "abcd1234;;ijkl9012"; + let expected_num_urls = 3; + let accession = "ACC123"; + + let result = parse_md5sums(md5sum_field, expected_num_urls, accession).unwrap(); + assert_eq!( + result, + ( + vec![ + Some("abcd1234".to_string()), + None, // Empty MD5 sum + Some("ijkl9012".to_string()) + ], + 2 // Only count non-empty Some values + ) + ); + } + + #[test] + fn test_failed_download_from_gbassembly_valid() { + let accession = "ACC123".to_string(); + let name = "Sample Name".to_string(); + let moltype = "DNA".to_string(); + let md5sum = Some("abcd1234".to_string()); + let download_filename = Some("file.fasta".to_string()); + let url = Some(Url::parse("http://example.com/file.fasta").unwrap()); + let range = Some((10, 20)); + + let failed_download = FailedDownload::from_gbassembly( + accession.clone(), + name.clone(), + moltype.clone(), + md5sum.clone(), + download_filename.clone(), + url.clone(), + range.clone(), + ); + + assert_eq!(failed_download.accession, accession); + assert_eq!(failed_download.name, name); + assert_eq!(failed_download.moltype, moltype); + assert_eq!(failed_download.md5sum, "abcd1234"); + assert_eq!(failed_download.download_filename, "file.fasta"); + assert_eq!(failed_download.url, "http://example.com/file.fasta"); + assert_eq!(failed_download.range, "10-20"); + } + + #[test] + fn test_failed_download_from_gbassembly_defaults() { + let accession = "ACC123".to_string(); + let name = "Sample Name".to_string(); + let moltype = "DNA".to_string(); + + let failed_download = FailedDownload::from_gbassembly( + accession.clone(), + name.clone(), + moltype.clone(), + None, // No MD5 checksum + None, // No filename + None, // No URL + None, // No range + ); + + assert_eq!(failed_download.accession, accession); + assert_eq!(failed_download.name, name); + assert_eq!(failed_download.moltype, moltype); + assert_eq!(failed_download.md5sum, ""); + assert_eq!(failed_download.download_filename, ""); + assert_eq!(failed_download.url, ""); + assert_eq!(failed_download.range, ""); + } + + #[test] + fn test_failed_download_from_accession_data() { + let url_info = vec![ + UrlInfo { + url: Url::parse("http://example.com/file1").unwrap(), + md5sum: Some("abcd1234".to_string()), + range: Some((10, 20)), + }, + UrlInfo { + url: Url::parse("http://example.com/file2").unwrap(), + md5sum: None, + range: Some((30, 40)), + }, + ]; + + let acc_data = AccessionData { + accession: "ACC123".to_string(), + name: "Sample Name".to_string(), + moltype: InputMolType::Dna, + url_info, + download_filename: Some("file.fasta".to_string()), + }; + + let failed_download = FailedDownload::from_accession_data(&acc_data); + + assert_eq!(failed_download.accession, "ACC123"); + assert_eq!(failed_download.name, "Sample 
Name"); + assert_eq!(failed_download.moltype, "DNA"); + assert_eq!(failed_download.md5sum, "abcd1234;"); + assert_eq!(failed_download.download_filename, "file.fasta"); + assert_eq!( + failed_download.url, + "http://example.com/file1;http://example.com/file2" + ); + assert_eq!(failed_download.range, "10-20;30-40"); + } + + #[test] + fn test_parse_to_separated_string() { + let url_info = vec![ + UrlInfo { + url: Url::parse("http://example.com/file1").unwrap(), + md5sum: Some("abcd1234".to_string()), + range: Some((10, 20)), + }, + UrlInfo { + url: Url::parse("http://example.com/file2").unwrap(), + md5sum: None, + range: Some((30, 40)), + }, + ]; + + let md5sum_result = + FailedDownload::parse_to_separated_string(&url_info, |info| info.md5sum.clone()); + assert_eq!(md5sum_result, "abcd1234;"); + + let url_result = + FailedDownload::parse_to_separated_string(&url_info, |info| Some(info.url.to_string())); + assert_eq!( + url_result, + "http://example.com/file1;http://example.com/file2" + ); + + let range_result = FailedDownload::parse_to_separated_string(&url_info, |info| { + info.range.map(|(start, end)| format!("{}-{}", start, end)) + }); + assert_eq!(range_result, "10-20;30-40"); + } + + #[test] + fn test_parse_to_separated_string_2() { + let url_info = vec![ + UrlInfo { + url: Url::parse("http://example.com/file1").unwrap(), + md5sum: Some("abcd1234".to_string()), + range: Some((10, 20)), + }, + UrlInfo { + url: Url::parse("http://example.org/file2").unwrap(), + md5sum: Some("efgh5678".to_string()), + range: Some((30, 40)), + }, + UrlInfo { + url: Url::parse("http://example.net/file3").unwrap(), + md5sum: Some("ijkl9012".to_string()), + range: Some((50, 60)), + }, + ]; + + let md5sum_result = + FailedDownload::parse_to_separated_string(&url_info, |info| info.md5sum.clone()); + assert_eq!(md5sum_result, "abcd1234;efgh5678;ijkl9012"); + + let url_result = + FailedDownload::parse_to_separated_string(&url_info, |info| Some(info.url.to_string())); + assert_eq!( + url_result, + "http://example.com/file1;http://example.org/file2;http://example.net/file3" + ); + + let range_result = FailedDownload::parse_to_separated_string(&url_info, |info| { + info.range.map(|(start, end)| format!("{}:{}", start, end)) + }); + assert_eq!(range_result, "10:20;30:40;50:60"); + } +} diff --git a/tests/sourmash_tst_utils.py b/tests/sourmash_tst_utils.py index 4bbc87f..aaaa37d 100644 --- a/tests/sourmash_tst_utils.py +++ b/tests/sourmash_tst_utils.py @@ -7,8 +7,7 @@ import collections import pprint -import pkg_resources -from pkg_resources import Requirement, resource_filename, ResolutionError +import importlib.metadata import traceback from io import open # pylint: disable=redefined-builtin from io import StringIO @@ -43,23 +42,13 @@ def _runscript(scriptname): namespace = {"__name__": "__main__"} namespace['sys'] = globals()['sys'] - try: - pkg_resources.load_entry_point("sourmash", 'console_scripts', scriptname)() - return 0 - except pkg_resources.ResolutionError: - pass - - path = scriptpath() - - scriptfile = os.path.join(path, scriptname) - if os.path.isfile(scriptfile): - if os.path.isfile(scriptfile): - exec( # pylint: disable=exec-used - compile(open(scriptfile).read(), scriptfile, 'exec'), - namespace) - return 0 - - return -1 + entry_points = importlib.metadata.entry_points( + group="console_scripts", name="sourmash" + ) + assert len(entry_points) == 1 + smash_cli = tuple(entry_points)[0].load() + smash_cli() + return 0 ScriptResults = collections.namedtuple('ScriptResults', diff --git 
a/tests/test-data/GCA_000175535.1_ASM17553v1_genomic.1-50000.fna.gz b/tests/test-data/GCA_000175535.1_ASM17553v1_genomic.1-50000.fna.gz new file mode 100644 index 0000000..c693f97 Binary files /dev/null and b/tests/test-data/GCA_000175535.1_ASM17553v1_genomic.1-50000.fna.gz differ diff --git a/tests/test-data/GCA_000175535.1_ASM17553v1_genomic.50000-100000.fna.gz b/tests/test-data/GCA_000175535.1_ASM17553v1_genomic.50000-100000.fna.gz new file mode 100644 index 0000000..a872cef Binary files /dev/null and b/tests/test-data/GCA_000175535.1_ASM17553v1_genomic.50000-100000.fna.gz differ diff --git a/tests/test-data/acc-merged-md5sums.csv b/tests/test-data/acc-merged-md5sums.csv new file mode 100644 index 0000000..b7bd78a --- /dev/null +++ b/tests/test-data/acc-merged-md5sums.csv @@ -0,0 +1,3 @@ +accession,name,moltype,md5sum,download_filename,url,range +both,both name,dna,47b9fb20c51f0552b87db5d44d5d4566;a1a8f1c6dc56999c73fe298871c963d1,both.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_genomic.fna.gz; https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz, + diff --git a/tests/test-data/acc-merged.csv b/tests/test-data/acc-merged.csv new file mode 100644 index 0000000..dd06da1 --- /dev/null +++ b/tests/test-data/acc-merged.csv @@ -0,0 +1,3 @@ +accession,name,moltype,md5sum,download_filename,url,range +both,both name,dna,,both.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_genomic.fna.gz; https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz, + diff --git a/tests/test-data/acc-url-md5sum.csv b/tests/test-data/acc-url-md5sum.csv index 542cc8a..ab39a15 100644 --- a/tests/test-data/acc-url-md5sum.csv +++ b/tests/test-data/acc-url-md5sum.csv @@ -1,3 +1,3 @@ -accession,name,moltype,md5sum,download_filename,url -GCA_000961135.2,GCA_000961135.2 Candidatus Aramenus sulfurataquae isolate AZ1-454,dna,47b9fb20c51f0552b87db5d44d5d4566,GCA_000961135.2_genomic.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_genomic.fna.gz -GCA_000175535.1,GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14,dna,b1234567,GCA_000175535.1_genomic.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz +accession,name,moltype,md5sum,download_filename,url,range +GCA_000961135.2,GCA_000961135.2 Candidatus Aramenus sulfurataquae isolate AZ1-454,dna,47b9fb20c51f0552b87db5d44d5d4566,GCA_000961135.2_genomic.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_genomic.fna.gz, +GCA_000175535.1,GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14,dna,b1234567,GCA_000175535.1_genomic.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz, diff --git a/tests/test-data/acc-url-range.csv b/tests/test-data/acc-url-range.csv new file mode 100644 index 0000000..b4069df --- /dev/null +++ b/tests/test-data/acc-url-range.csv @@ -0,0 +1,3 @@ +accession,name,moltype,md5sum,download_filename,url,range 
+GCA_000175535.1_first50kb,GCA_000175535.1_first50kb Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14,dna,a1a8f1c6dc56999c73fe298871c963d1,GCA_000175535.1_genomic_first50kb.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz,1-50000 +GCA_000175535.1_second50kb,GCA_000175535.1_second50kb Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14,dna,a1a8f1c6dc56999c73fe298871c963d1,GCA_000175535.1_genomic_second50kb.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz,50000-100000 diff --git a/tests/test-data/acc-url.csv b/tests/test-data/acc-url.csv index 8c3a87e..133d968 100644 --- a/tests/test-data/acc-url.csv +++ b/tests/test-data/acc-url.csv @@ -1,4 +1,4 @@ -accession,name,moltype,md5sum,download_filename,url -GCA_000961135.2,GCA_000961135.2 Candidatus Aramenus sulfurataquae isolate AZ1-454,dna,47b9fb20c51f0552b87db5d44d5d4566,GCA_000961135.2_genomic.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_genomic.fna.gz -GCA_000961135.2,GCA_000961135.2 Candidatus Aramenus sulfurataquae isolate AZ1-454,protein,fb7920fb8f3cf5d6ab9b6b754a5976a4,GCA_000961135.2_protein.urlsketch.faa.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_protein.faa.gz -GCA_000175535.1,GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14,dna,a1a8f1c6dc56999c73fe298871c963d1,GCA_000175535.1_genomic.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz +accession,name,moltype,md5sum,download_filename,url,range +GCA_000961135.2,GCA_000961135.2 Candidatus Aramenus sulfurataquae isolate AZ1-454,dna,47b9fb20c51f0552b87db5d44d5d4566,GCA_000961135.2_genomic.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_genomic.fna.gz, +GCA_000961135.2,GCA_000961135.2 Candidatus Aramenus sulfurataquae isolate AZ1-454,protein,fb7920fb8f3cf5d6ab9b6b754a5976a4,GCA_000961135.2_protein.urlsketch.faa.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_protein.faa.gz, +GCA_000175535.1,GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14,dna,a1a8f1c6dc56999c73fe298871c963d1,GCA_000175535.1_genomic.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz, diff --git a/tests/test-data/subseqs.zip b/tests/test-data/subseqs.zip new file mode 100644 index 0000000..56d7c79 Binary files /dev/null and b/tests/test-data/subseqs.zip differ diff --git a/tests/test_gbsketch.py b/tests/test_gbsketch.py index 9f03e7e..895390c 100644 --- a/tests/test_gbsketch.py +++ b/tests/test_gbsketch.py @@ -66,13 +66,14 @@ def test_gbsketch_simple(runtmp, capfd): fail_lines = failF.readlines() print(fail_lines) assert len(fail_lines) == 2 - assert fail_lines[0] == "accession,name,moltype,md5sum,download_filename,url\n" - acc, name, moltype, md5sum, download_filename, url = fail_lines[1].strip().split(',') + assert fail_lines[0] == "accession,name,moltype,md5sum,download_filename,url,range\n" + acc, name, moltype, md5sum, 
download_filename, url, range = fail_lines[1].strip().split(',') assert acc == "GCA_000175535.1" assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14" assert moltype == "protein" assert download_filename == "GCA_000175535.1_protein.faa.gz" assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz" + assert range == "" def test_gbsketch_manifest(runtmp, capfd): @@ -605,13 +606,14 @@ def test_gbsketch_protein_dayhoff_hp(runtmp): fail_lines = failF.readlines() print(fail_lines) assert len(fail_lines) == 2 - assert fail_lines[0] == "accession,name,moltype,md5sum,download_filename,url\n" - acc, name, moltype, md5sum, download_filename, url = fail_lines[1].strip().split(',') + assert fail_lines[0] == "accession,name,moltype,md5sum,download_filename,url,range\n" + acc, name, moltype, md5sum, download_filename, url, range = fail_lines[1].strip().split(',') assert acc == "GCA_000175535.1" assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14" assert moltype == "protein" assert download_filename == "GCA_000175535.1_protein.faa.gz" assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz" + assert range == "" def test_gbsketch_simple_batched_single(runtmp, capfd): diff --git a/tests/test_urlsketch.py b/tests/test_urlsketch.py index 4a25557..7e9b704 100644 --- a/tests/test_urlsketch.py +++ b/tests/test_urlsketch.py @@ -3,6 +3,8 @@ """ import os import pytest +import gzip +import screed import csv import sourmash @@ -60,15 +62,16 @@ def test_urlsketch_simple(runtmp): assert os.path.exists(failed) with open(failed, 'r') as failF: header = next(failF).strip() - assert header == "accession,name,moltype,md5sum,download_filename,url" + assert header == "accession,name,moltype,md5sum,download_filename,url,range" for line in failF: print(line) - acc, name, moltype, md5sum, download_filename, url = line.strip().split(',') + acc, name, moltype, md5sum, download_filename, url, range = line.strip().split(',') assert acc == "GCA_000175535.1" assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14" assert moltype == "protein" assert download_filename == "GCA_000175535.1_protein.faa.gz" assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz" + assert range == "" def test_urlsketch_manifest(runtmp, capfd): @@ -151,6 +154,46 @@ def test_urlsketch_save_fastas(runtmp): assert sig.md5sum() == ss3.md5sum() +def test_urlsketch_save_fastas_no_append_across_runs(runtmp): + # make sure we overwrite files on subsequent runs (not append to existing) + acc_csv = get_test_data('acc-url.csv') + output = runtmp.output('simple.zip') + failed = runtmp.output('failed.csv') + out_dir = runtmp.output('out_fastas') + + # run once + runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, + '--failed', failed, '-r', '1', '--fastas', out_dir, '--keep-fasta', + '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") + + # check out fastas exist + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + fa_files = os.listdir(out_dir) + assert set(fa_files) == set(['GCA_000175535.1_genomic.urlsketch.fna.gz', 'GCA_000961135.2_protein.urlsketch.faa.gz', 
'GCA_000961135.2_genomic.urlsketch.fna.gz']) + + # Get the file size for each file + fsizes = set() + for fa_file in fa_files: + file_path = os.path.join(out_dir, fa_file) + file_size = os.path.getsize(file_path) + print(f"File: {fa_file}, Size: {file_size} bytes") + fsizes.add(file_size) + + # run a second time + runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, + '--failed', failed, '-r', '1', '--fastas', out_dir, '--keep-fasta', + '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") + + fa_files2 = os.listdir(out_dir) + assert set(fa_files2) == set(['GCA_000175535.1_genomic.urlsketch.fna.gz', 'GCA_000961135.2_protein.urlsketch.faa.gz', 'GCA_000961135.2_genomic.urlsketch.fna.gz']) + for fa_file in fa_files2: + file_path = os.path.join(out_dir, fa_file) + file_size = os.path.getsize(file_path) + print(f"File: {fa_file}, Size: {file_size} bytes") + assert file_size in fsizes + + def test_urlsketch_download_only(runtmp, capfd): acc_csv = get_test_data('acc-url.csv') output = runtmp.output('simple.zip') @@ -268,7 +311,7 @@ def test_urlsketch_empty_accfile(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error: Invalid column names in CSV file. Columns should be: ["accession", "name", "moltype", "md5sum", "download_filename", "url"]' in captured.err + assert 'Error: Invalid column names in CSV file. Columns should be: ["accession", "name", "moltype", "md5sum", "download_filename", "url", "range"]' in captured.err def test_urlsketch_bad_acc_fail(runtmp, capfd): @@ -326,13 +369,14 @@ def test_urlsketch_from_gbsketch_failed(runtmp, capfd): with open(failed, 'r') as failF: fail_lines = failF.readlines() assert len(fail_lines) == 2 - assert fail_lines[0] == "accession,name,moltype,md5sum,download_filename,url\n" - acc, name, moltype, md5sum, download_filename, url = fail_lines[1].strip().split(',') + assert fail_lines[0] == "accession,name,moltype,md5sum,download_filename,url,range\n" + acc, name, moltype, md5sum, download_filename, url, range = fail_lines[1].strip().split(',') assert acc == "GCA_000175535.1" assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14" assert moltype == "protein" assert download_filename == "GCA_000175535.1_protein.faa.gz" assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz" + assert range == "" assert not runtmp.last_result.out # stdout should be empty out2 = runtmp.output('failed-retry.zip') @@ -352,15 +396,16 @@ def test_urlsketch_from_gbsketch_failed(runtmp, capfd): assert os.path.exists(fail2) with open(fail2, 'r') as failF: header = next(failF).strip() - assert header == "accession,name,moltype,md5sum,download_filename,url" + assert header == "accession,name,moltype,md5sum,download_filename,url,range" for line in failF: print(line) - acc, name, moltype, md5sum, download_filename, url = line.strip().split(',') + acc, name, moltype, md5sum, download_filename, url, range = line.strip().split(',') assert acc == "GCA_000175535.1" assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14" assert moltype == "protein" assert download_filename == "GCA_000175535.1_protein.faa.gz" assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz" + assert range == "" def test_zip_file_permissions(runtmp): @@ -435,7 +480,7 @@ def 
test_urlsketch_protein_dayhoff_hp(runtmp): fail_lines = failF.readlines() print(fail_lines) assert len(fail_lines) == 1 - assert fail_lines[0] == "accession,name,moltype,md5sum,download_filename,url\n" + assert fail_lines[0] == "accession,name,moltype,md5sum,download_filename,url,range\n" def test_urlsketch_md5sum_mismatch_checksum_file(runtmp, capfd): @@ -513,16 +558,17 @@ def test_urlsketch_md5sum_mismatch_no_checksum_file(runtmp, capfd): assert os.path.exists(failed) with open(failed, 'r') as failF: header = next(failF).strip() - assert header == "accession,name,moltype,md5sum,download_filename,url" + assert header == "accession,name,moltype,md5sum,download_filename,url,range" for line in failF: print(line) - acc, name, moltype, md5sum, download_filename, url= line.strip().split(',') + acc, name, moltype, md5sum, download_filename, url, range = line.strip().split(',') assert acc == "GCA_000175535.1" assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14" assert moltype == "DNA" assert md5sum == "b1234567" assert download_filename == "GCA_000175535.1_genomic.urlsketch.fna.gz" assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz" + assert range == "" def test_urlsketch_simple_batched(runtmp, capfd): @@ -534,6 +580,7 @@ def test_urlsketch_simple_batched(runtmp, capfd): out1 = runtmp.output('simple.1.zip') out2 = runtmp.output('simple.2.zip') out3 = runtmp.output('simple.3.zip') + out4 = runtmp.output('simple.4.zip') sig1 = get_test_data('GCA_000175535.1.sig.gz') sig2 = get_test_data('GCA_000961135.2.sig.gz') @@ -550,6 +597,7 @@ def test_urlsketch_simple_batched(runtmp, capfd): assert os.path.exists(out1) assert os.path.exists(out2) assert os.path.exists(out3) + assert not os.path.exists(out4) assert not os.path.exists(output) # for now, orig output file should be empty. captured = capfd.readouterr() print(captured.err) @@ -581,7 +629,7 @@ def test_urlsketch_simple_batch_restart(runtmp, capfd): out1 = runtmp.output('simple.1.zip') out2 = runtmp.output('simple.2.zip') out3 = runtmp.output('simple.3.zip') - + out4 = runtmp.output('simple.4.zip') sig1 = get_test_data('GCA_000175535.1.sig.gz') sig2 = get_test_data('GCA_000961135.2.sig.gz') @@ -592,17 +640,20 @@ def test_urlsketch_simple_batch_restart(runtmp, capfd): ss4 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') # first, cat sig2 into an output file that will trick urlsketch into thinking it's a prior batch + # write it to the batch 1 filename via `sig cat` first, so it matches the sig that would have been written + runtmp.sourmash('sig', 'cat', sig2, '-o', out1) assert os.path.exists(out1) runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, + '--failed', failed, '-r', '5', '-n', "1", '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000,abund", '-p', "protein,k=10,scaled=200", '--batch-size', '1') assert os.path.exists(out1) assert os.path.exists(out2) assert os.path.exists(out3) + assert not os.path.exists(out4) assert not os.path.exists(output) # for now, orig output file should be empty.
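The batched tests above rely on the naming convention for batched zip outputs: requesting `simple.zip` with `--batch-size` produces `simple.1.zip`, `simple.2.zip`, and so on, and the `out4` checks confirm that no extra batch is written. For reference, here is a minimal Python sketch of that convention; `batch_paths` and `expected_batches` are hypothetical names used only for illustration, not part of the plugin:
```
# Sketch: enumerate the batch zipfiles a run like the above is expected to
# produce, following the "<base>.<N>.zip" (1-indexed) naming these tests check.
import os

def batch_paths(base_zip, expected_batches):
    root, ext = os.path.splitext(base_zip)  # ("simple", ".zip")
    return [f"{root}.{n}{ext}" for n in range(1, expected_batches + 1)]

paths = batch_paths("simple.zip", 3)
assert paths == ["simple.1.zip", "simple.2.zip", "simple.3.zip"]
# On restart, batch files already present (and readable as zips) are reused.
missing = [p for p in paths if not os.path.exists(p)]
```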
captured = capfd.readouterr() print(captured.err) @@ -642,14 +693,14 @@ def test_urlsketch_negative_batch_size(runtmp): def test_urlsketch_simple_batch_restart_with_incomplete_zip(runtmp, capfd): # test restart with complete + incomplete zipfile batches acc_csv = get_test_data('acc-url.csv') - output = runtmp.output('simple.zip') + output = runtmp.output('restart.zip') failed = runtmp.output('failed.csv') ch_fail = runtmp.output('checksum_dl_failed.csv') - out1 = runtmp.output('simple.1.zip') - out2 = runtmp.output('simple.2.zip') - out3 = runtmp.output('simple.3.zip') - + out1 = runtmp.output('restart.1.zip') + out2 = runtmp.output('restart.2.zip') + out3 = runtmp.output('restart.3.zip') + out4 = runtmp.output('restart.4.zip') sig1 = get_test_data('GCA_000175535.1.sig.gz') sig2 = get_test_data('GCA_000961135.2.sig.gz') @@ -668,13 +719,14 @@ def test_urlsketch_simple_batch_restart_with_incomplete_zip(runtmp, capfd): f.write(b"This is not a valid zip file!") runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, + '--failed', failed, '-r', '5', '-n', "1", '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000,abund", '-p', "protein,k=10,scaled=200", '--batch-size', '1') assert os.path.exists(out1) assert os.path.exists(out2) assert os.path.exists(out3) + assert not os.path.exists(out4) assert not os.path.exists(output) # for now, orig output file should be empty. captured = capfd.readouterr() print(captured.err) @@ -787,3 +839,618 @@ def test_urlsketch_simple_skipmer(runtmp, capfd): assert ( siginfo["molecule"] == expected["moltype"] ), f"Moltype mismatch: {siginfo['molecule']}" + + +def test_urlsketch_simple_merged(runtmp): + acc_csv = get_test_data('acc-merged.csv') + output = runtmp.output('merged.zip') + failed = runtmp.output('failed.csv') + + sig1 = get_test_data('GCA_000175535.1.sig.gz') + sig2 = get_test_data('GCA_000961135.2.sig.gz') + merged_sig = runtmp.output("sigmerge.zip") + + # create merged signature + runtmp.sourmash("sig", "merge", "-k", "31", sig1, sig2, "--set-name", "both name", '-o', merged_sig) + msigidx = sourmash.load_file_as_index(merged_sig) + msig = list(msigidx.signatures())[0] + print(msig.name) + + runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, + '--failed', failed, '-r', '1', + '--param-str', "dna,k=31,scaled=1000") + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + + assert len(sigs) == 1 + sig = sigs[0] + assert sig.name == msig.name == "both name" + print(msig.md5sum()) + assert sig.md5sum() == msig.md5sum() + assert sig.minhash.moltype == msig.minhash.moltype == "DNA" + assert os.path.exists(failed) + + +def test_urlsketch_simple_merged_with_md5sums(runtmp): + acc_csv = get_test_data('acc-merged-md5sums.csv') + output = runtmp.output('merged.zip') + failed = runtmp.output('failed.csv') + + sig1 = get_test_data('GCA_000175535.1.sig.gz') + sig2 = get_test_data('GCA_000961135.2.sig.gz') + merged_sig = runtmp.output("sigmerge.zip") + + # create merged signature + runtmp.sourmash("sig", "merge", "-k", "31", sig1, sig2, "--set-name", "both name", '-o', merged_sig) + msigidx = sourmash.load_file_as_index(merged_sig) + msig = list(msigidx.signatures())[0] + print(msig.name) + + runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, + '--failed', failed, '-r', '1', + '--param-str', "dna,k=31,scaled=1000") + + assert os.path.exists(output) + 
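The merged-signature tests build their expected values with the `sourmash sig merge` CLI. The same expectation can be sketched with the sourmash Python API; this is a minimal sketch assuming flat (non-abundance) scaled DNA sketches at k=31, with placeholder file paths:
```
# Sketch: Python-API analogue of the `sig merge` calls in these tests.
import sourmash
from sourmash import SourmashSignature

def merge_two(path1, path2, name):
    ss1 = sourmash.load_one_signature(path1, ksize=31, select_moltype='DNA')
    ss2 = sourmash.load_one_signature(path2, ksize=31, select_moltype='DNA')
    combined = ss1.minhash + ss2.minhash  # union of hashes; params must match
    return SourmashSignature(combined, name=name)

# merged = merge_two('GCA_000175535.1.sig.gz', 'GCA_000961135.2.sig.gz', 'both name')
# merged.md5sum() should then match the urlsketch-built merged signature.
```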
assert not runtmp.last_result.out # stdout should be empty + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + + assert len(sigs) == 1 + sig = sigs[0] + assert sig.name == msig.name == "both name" + print(msig.md5sum()) + assert sig.md5sum() == msig.md5sum() + assert sig.minhash.moltype == msig.minhash.moltype == "DNA" + assert os.path.exists(failed) + + +def test_urlsketch_simple_merged_keep_fasta(runtmp): + acc_csv = get_test_data('acc-merged.csv') + output = runtmp.output('merged.zip') + failed = runtmp.output('failed.csv') + out_dir = runtmp.output('out_fastas') + + sig1 = get_test_data('GCA_000175535.1.sig.gz') + sig2 = get_test_data('GCA_000961135.2.sig.gz') + merged_sig = runtmp.output("sigmerge.zip") + + # create merged signature + runtmp.sourmash("sig", "merge", "-k", "31", sig1, sig2, "--set-name", "both name", '-o', merged_sig) + msigidx = sourmash.load_file_as_index(merged_sig) + msig = list(msigidx.signatures())[0] + print(msig.name) + + runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, + '--failed', failed, '-r', '1', '--keep-fasta', + '--fastas', out_dir, + '--param-str', "dna,k=31,scaled=1000") + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + # check fasta files are present + fa_files = os.listdir(out_dir) + print(fa_files) + assert fa_files == ['both.urlsketch.fna.gz'] + + # check fasta files have records from both entries + n_expected_records = 104 + n_records = 0 + # check one record from each + expected_names = ["ACUJ01000001.1 Chlamydia muridarum MopnTet14 chromosome, whole genome shotgun sequence", + "JZWS02000016.1 MAG: Candidatus Aramenus sulfurataquae isolate AZ1-454 NODE_87_length_15535_cov_30.701232, whole genome shotgun sequence"] + rec_names = [] + with screed.open(os.path.join(out_dir, fa_files[0])) as inF: + for rec in inF: + n_records +=1 + rec_names.append(rec.name) + + assert n_records == n_expected_records + assert all(n in rec_names for n in expected_names) + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + + assert len(sigs) == 1 + sig = sigs[0] + assert sig.name == msig.name == "both name" + print(msig.md5sum()) + assert sig.md5sum() == msig.md5sum() + assert sig.minhash.moltype == msig.minhash.moltype == "DNA" + assert os.path.exists(failed) + + +def test_urlsketch_simple_merged_keep_fasta_path_in_filename(runtmp): + acc_csv = get_test_data('acc-merged.csv') + mod_csv = runtmp.output('acc-merged-filepath.csv') + output = runtmp.output('merged.zip') + failed = runtmp.output('failed.csv') + out_dir = runtmp.output('out_fastas') + + # open acc-merged.csv and prepend "/unavailable-path/subdir/" to the "download_filename" column + with open(acc_csv, 'r') as infile, open(mod_csv, 'w', newline='') as outfile: + reader = csv.DictReader(infile) + fieldnames = reader.fieldnames + writer = csv.DictWriter(outfile, fieldnames=fieldnames) + writer.writeheader() + for row in reader: + row['download_filename'] = f"unavailable-path/subdir/{row['download_filename']}" + writer.writerow(row) + + sig1 = get_test_data('GCA_000175535.1.sig.gz') + sig2 = get_test_data('GCA_000961135.2.sig.gz') + merged_sig = runtmp.output("sigmerge.zip") + + # create merged signature + runtmp.sourmash("sig", "merge", "-k", "31", sig1, sig2, "--set-name", "both name", '-o', merged_sig) + msigidx = sourmash.load_file_as_index(merged_sig) + msig = list(msigidx.signatures())[0] + print(msig.name) + + runtmp.sourmash('scripts', 'urlsketch', mod_csv, '-o', output, + '--failed', 
failed, '-r', '1', '--keep-fasta', + '--fastas', out_dir, + '--param-str', "dna,k=31,scaled=1000") + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + # check fasta files are present + fa_files = [] + for root, dirs, files in os.walk(out_dir): + for file in files: + if file.endswith('fna.gz'): + fa_files.append(os.path.relpath(os.path.join(root, file), out_dir)) + print(fa_files) + assert fa_files == ['unavailable-path/subdir/both.urlsketch.fna.gz'] + + +def test_urlsketch_simple_merged_incorrect_md5sum_checksum_failure(runtmp): + acc_csv = get_test_data('acc-merged-md5sums.csv') + mod_csv = runtmp.output('acc-merged_incorrect_md5.csv') + output = runtmp.output('merged.zip') + failed = runtmp.output('failed.csv') + ch_failed = runtmp.output('ch-failed.csv') + out_dir = runtmp.output('out_fastas') + + # open file and write incorrect md5sum + with open(acc_csv, 'r') as infile, open(mod_csv, 'w', newline='') as outfile: + reader = csv.DictReader(infile) + fieldnames = reader.fieldnames + writer = csv.DictWriter(outfile, fieldnames=fieldnames) + writer.writeheader() + for row in reader: + row['md5sum'] = row['md5sum'][2:] # strip the first two characters from the first md5sum + print(row) + writer.writerow(row) + + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'urlsketch', mod_csv, '-o', output, + '--failed', failed, '-r', '1', '--keep-fasta', + '--fastas', out_dir, '--checksum-fail', ch_failed, + '--param-str', "dna,k=31,scaled=1000") + + assert not os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + # check failure file + assert os.path.exists(ch_failed) + with open(ch_failed, 'r') as failF: + header = next(failF).strip() + print(header) + assert header == "accession,name,moltype,md5sum_url,download_filename,url,expected_md5sum,reason" + for line in failF: + print(line) + acc, name, moltype, md5sum_url, download_filename, url, expected_md5sum, reason = line.strip().split(',') + assert acc == "both" + assert name == "both name" + assert moltype == "DNA" + assert download_filename == "both.urlsketch.fna.gz" + assert expected_md5sum == "b9fb20c51f0552b87db5d44d5d4566" + assert reason == "MD5 hash does not match.
Expected: 'b9fb20c51f0552b87db5d44d5d4566'; Found: '47b9fb20c51f0552b87db5d44d5d4566'" + assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_genomic.fna.gz" + with open(failed, 'r') as fails: + header = next(fails).strip() + print(header) + assert header == "accession,name,moltype,md5sum,download_filename,url,range" + for line in fails: + print(line) + acc, name, moltype, md5sum, download_filename, url, range = line.strip().split(',') + assert acc == "both" + assert name == "both name" + assert moltype == "DNA" + assert md5sum == "b9fb20c51f0552b87db5d44d5d4566;a1a8f1c6dc56999c73fe298871c963d1" + assert download_filename == "both.urlsketch.fna.gz" + assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_genomic.fna.gz;https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz" + assert range == "" + + +def test_urlsketch_with_range(runtmp): + acc_csv = get_test_data('acc-url-range.csv') + subseqs = get_test_data('subseqs.zip') + output = runtmp.output('range.zip') + failed = runtmp.output('failed.csv') + + # open subseq sigs + idx = sourmash.load_file_as_index(subseqs) + siglist = list(idx.signatures()) + ss1 = siglist[0] + ss2 = siglist[1] + + runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, + '--failed', failed, '-r', '1', + '--param-str', "dna,k=31,scaled=100") + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + + assert len(sigs) == 2 + for sig in sigs: + ident = sig.name.split(' ')[0] + assert ident in ["GCA_000175535.1_first50kb", "GCA_000175535.1_second50kb"] + print(ident) + if ident == "GCA_000175535.1_first50kb": + assert sig.md5sum() == ss1.md5sum() + if ident == "GCA_000175535.1_second50kb": + assert sig.md5sum() == ss2.md5sum() + assert os.path.exists(failed) + + +def test_urlsketch_with_range_keep_fasta(runtmp): + acc_csv = get_test_data('acc-url-range.csv') + subseqs = get_test_data('subseqs.zip') + first50kb = get_test_data('GCA_000175535.1_ASM17553v1_genomic.1-50000.fna.gz') + second50kb = get_test_data('GCA_000175535.1_ASM17553v1_genomic.50000-100000.fna.gz') + output = runtmp.output('range.zip') + failed = runtmp.output('failed.csv') + out_dir = runtmp.output('out_fastas') + + # open subseq sigs + idx = sourmash.load_file_as_index(subseqs) + siglist = list(idx.signatures()) + ss1 = siglist[0] + ss2 = siglist[1] + + runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, + '--failed', failed, '-r', '1', '--keep-fasta', + '--fastas', out_dir, + '--param-str', "dna,k=31,scaled=100") + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + # check fasta files are present + fa_files = os.listdir(out_dir) + print(fa_files) + assert set(fa_files) == set(['GCA_000175535.1_genomic_first50kb.urlsketch.fna.gz', 'GCA_000175535.1_genomic_second50kb.urlsketch.fna.gz']) + + # Compare the contents of the generated FASTA files to the expected ones + for generated_file, expected_file in [ + ('GCA_000175535.1_genomic_first50kb.urlsketch.fna.gz', first50kb), + ('GCA_000175535.1_genomic_second50kb.urlsketch.fna.gz', second50kb) + ]: + generated_path = os.path.join(out_dir, generated_file) + + # Read the records from both files using screed + gen_records = set((record.name, record.sequence) for record in 
screed.open(generated_path)) + exp_records = set((record.name, record.sequence) for record in screed.open(expected_file)) + + # Assert that the records are identical + assert gen_records == exp_records + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + + assert len(sigs) == 2 + for sig in sigs: + ident = sig.name.split(' ')[0] + assert ident in ["GCA_000175535.1_first50kb", "GCA_000175535.1_second50kb"] + print(ident) + if ident == "GCA_000175535.1_first50kb": + assert sig.md5sum() == ss1.md5sum() + if ident == "GCA_000175535.1_second50kb": + assert sig.md5sum() == ss2.md5sum() + assert os.path.exists(failed) + + +def test_urlsketch_with_range_improper_range_1(runtmp, capfd): + acc_csv = get_test_data('acc-url-range.csv') + acc_mod = runtmp.output("acc-url-range-mod.csv") + subseqs = get_test_data('subseqs.zip') + output = runtmp.output('range.zip') + failed = runtmp.output('failed.csv') + + # Modify the range in the acc_csv file + with open(acc_csv, 'r') as infile, open(acc_mod, 'w', newline='') as outfile: + reader = csv.DictReader(infile) + fieldnames = reader.fieldnames + writer = csv.DictWriter(outfile, fieldnames=fieldnames) + writer.writeheader() + for row in reader: + if row['accession'] == 'GCA_000175535.1_second50kb': + row['range'] = '100000-10000000' + writer.writerow(row) + + # open subseq sigs + idx = sourmash.load_file_as_index(subseqs) + siglist = list(idx.signatures()) + ss1 = siglist[0] + + runtmp.sourmash('scripts', 'urlsketch', acc_mod, '-o', output, + '--failed', failed, '-r', '1', + '--param-str', "dna,k=31,scaled=100") + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + assert os.path.exists(failed) + captured = capfd.readouterr() + print(captured.err) + assert "Error: Invalid range: start=100000, end=10000000, sequence length=1088736" in captured.err + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + + assert len(sigs) == 1 + for sig in sigs: + ident = sig.name.split(' ')[0] + assert ident == "GCA_000175535.1_first50kb" + assert sig.md5sum() == ss1.md5sum() + + with open(failed, 'r') as failF: + header = next(failF).strip() + print(header) + assert header == "accession,name,moltype,md5sum,download_filename,url,range" + for line in failF: + print(line) + acc, name, moltype, md5sum, download_filename, url, range = line.strip().split(',') + assert acc == "GCA_000175535.1_second50kb" + assert name == "GCA_000175535.1_second50kb Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14" + assert moltype == "DNA" + assert md5sum == "a1a8f1c6dc56999c73fe298871c963d1" + assert download_filename == "GCA_000175535.1_genomic_second50kb.urlsketch.fna.gz" + assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz" + assert range == "100000-10000000" + + +def test_urlsketch_with_range_improper_range_2(runtmp, capfd): + acc_csv = get_test_data('acc-url-range.csv') + acc_mod = runtmp.output("acc-url-range-mod.csv") + subseqs = get_test_data('subseqs.zip') + output = runtmp.output('range.zip') + failed = runtmp.output('failed.csv') + + # Modify the range in the acc_csv file + with open(acc_csv, 'r') as infile, open(acc_mod, 'w', newline='') as outfile: + reader = csv.DictReader(infile) + fieldnames = reader.fieldnames + writer = csv.DictWriter(outfile, fieldnames=fieldnames) + writer.writeheader() + for row in reader: + if row['accession'] ==
'GCA_000175535.1_second50kb': + row['range'] = '-1-10000000' + writer.writerow(row) + + # open subseq sigs + idx = sourmash.load_file_as_index(subseqs) + siglist = list(idx.signatures()) + ss1 = siglist[0] + + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'urlsketch', acc_mod, '-o', output, + '--failed', failed, '-r', '1', + '--param-str', "dna,k=31,scaled=100") + + captured = capfd.readouterr() + print(captured.err) + assert "Error: Invalid range format: -1-10000000" in captured.err + + +def test_urlsketch_merged_ranged(runtmp): + acc_csv = get_test_data('acc-merged-md5sums.csv') + acc_mod = runtmp.output('acc-merged-md5sums-ranges.csv') + subseqs = get_test_data('subseqs.zip') + output = runtmp.output('range.zip') + failed = runtmp.output('failed.csv') + sketch_out = runtmp.output('sketch-subseqs.zip') + merged_out = runtmp.output('merged-subseqs.zip') + f1 = get_test_data("GCA_000175535.1_ASM17553v1_genomic.1-50000.fna.gz") + f2 = get_test_data("GCA_000175535.1_ASM17553v1_genomic.50000-100000.fna.gz") + + # Modify the acc_csv file to add range values + with open(acc_csv, 'r') as infile, open(acc_mod, 'w', newline='') as outfile: + reader = csv.DictReader(infile) + fieldnames = reader.fieldnames + writer = csv.DictWriter(outfile, fieldnames=fieldnames) + writer.writeheader() + for row in reader: + row['url'] = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz;https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz" + row['md5sum'] = "a1a8f1c6dc56999c73fe298871c963d1;a1a8f1c6dc56999c73fe298871c963d1" + row['range'] = '1-50000;50000-100000' + writer.writerow(row) + print(row) + + # sketch subseq files + runtmp.sourmash('sketch', "dna", f1, f2, '--name', + 'both name', '-o', sketch_out, + '-p', "dna,k=31,scaled=100") + + idx = sourmash.load_file_as_index(sketch_out) + sigs1 = list(idx.signatures()) + assert len(sigs1) == 1 + sketchsig = sigs1[0] + + # merge subset sketches + runtmp.sourmash('sig', "merge", subseqs,'--set-name', + 'both name', '-o', merged_out) + idx = sourmash.load_file_as_index(merged_out) + sigs = list(idx.signatures()) + assert len(sigs) == 1 + mergesig = sigs[0] + + # # run urlsketch + runtmp.sourmash('scripts', 'urlsketch', acc_mod, '-o', output, + '--failed', failed, '-r', '1', + '--param-str', "dna,k=31,scaled=100") + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + assert len(sigs) == 1 + sig = sigs[0] + assert sig.name == "both name" + print(sig.md5sum()) + assert sig.md5sum() == sketchsig.md5sum() == mergesig.md5sum() == "5feeed4c8a75c8b3fe67af1270fa92c4" + + +def test_urlsketch_merged_ranged_md5sum_fail_no_checksum_file(runtmp): + acc_csv = get_test_data('acc-merged-md5sums.csv') + acc_mod = runtmp.output('acc-merged-md5sums-ranges.csv') + output = runtmp.output('range.zip') + failed = runtmp.output('failed.csv') + + # Modify the acc_csv file to add range values + with open(acc_csv, 'r') as infile, open(acc_mod, 'w', newline='') as outfile: + reader = csv.DictReader(infile) + fieldnames = reader.fieldnames + writer = csv.DictWriter(outfile, fieldnames=fieldnames) + writer.writeheader() + for row in reader: + row['url'] = 
"https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz;https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz" + row['md5sum'] = "a1a8f1c6dc56999c73fe298871c963d1;b2" # second md5sum is incorrect + row['range'] = '1-50000;50000-100000' + writer.writerow(row) + print(row) + + # # run urlsketch + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'urlsketch', acc_mod, '-o', output, + '--failed', failed, '-r', '1', + '--param-str', "dna,k=31,scaled=100") + + assert not os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + + with open(failed, 'r') as failF: + header = next(failF).strip() + assert header == "accession,name,moltype,md5sum,download_filename,url,range" + for line in failF: + print(line) + acc, name, moltype, md5sum, download_filename, url, range = line.strip().split(',') + assert acc == "both" + assert name == "both name" + assert moltype == "DNA" + assert md5sum == "a1a8f1c6dc56999c73fe298871c963d1;b2" + assert download_filename == "both.urlsketch.fna.gz" + assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz;https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz" + assert range == "1-50000;50000-100000" + + +def test_urlsketch_merged_ranged_md5sum_fail_with_checksum_file(runtmp): + acc_csv = get_test_data('acc-merged-md5sums.csv') + acc_mod = runtmp.output('acc-merged-md5sums-ranges.csv') + output = runtmp.output('range.zip') + failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('ch_failed.csv') + + # Modify the acc_csv file to add range values + with open(acc_csv, 'r') as infile, open(acc_mod, 'w', newline='') as outfile: + reader = csv.DictReader(infile) + fieldnames = reader.fieldnames + writer = csv.DictWriter(outfile, fieldnames=fieldnames) + writer.writeheader() + for row in reader: + row['url'] = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz;https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz" + row['md5sum'] = "a1a8f1c6dc56999c73fe298871c963d1;b2" # second md5sum is incorrect + row['range'] = '1-50000;50000-100000' + writer.writerow(row) + print(row) + + # # run urlsketch + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'urlsketch', acc_mod, '-o', output, + '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, + '--param-str', "dna,k=31,scaled=100") + + assert not os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + + # since this is a merged dataset, we write both checksum fail and regular fail. 
+ with open(failed, 'r') as failF: + header = next(failF).strip() + assert header == "accession,name,moltype,md5sum,download_filename,url,range" + for line in failF: + print(line) + acc, name, moltype, md5sum, download_filename, url, range = line.strip().split(',') + assert acc == "both" + assert name == "both name" + assert moltype == "DNA" + assert md5sum == "a1a8f1c6dc56999c73fe298871c963d1;b2" + assert download_filename == "both.urlsketch.fna.gz" + assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz;https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz" + assert range == "1-50000;50000-100000" + + assert os.path.exists(ch_fail) + with open(ch_fail, 'r') as failF: + header = next(failF).strip() + assert header == "accession,name,moltype,md5sum_url,download_filename,url,expected_md5sum,reason" + for line in failF: + print(line) + acc, name, moltype, md5sum_url, download_filename, url, expected_md5, reason= line.strip().split(',') + assert acc == "both" + assert name == "both name" + assert moltype == "DNA" + assert md5sum_url == "" + assert download_filename == "both.urlsketch.fna.gz" + assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz" + assert expected_md5 == "b2" + assert reason == "MD5 hash does not match. Expected: 'b2'; Found: 'a1a8f1c6dc56999c73fe298871c963d1'" + + +def test_urlsketch_merged_ranged_fail(runtmp): + acc_csv = get_test_data('acc-merged-md5sums.csv') + acc_mod = runtmp.output('acc-merged-md5sums-ranges.csv') + output = runtmp.output('range.zip') + failed = runtmp.output('failed.csv') + + # Modify the acc_csv file to add range values + with open(acc_csv, 'r') as infile, open(acc_mod, 'w', newline='') as outfile: + reader = csv.DictReader(infile) + fieldnames = reader.fieldnames + writer = csv.DictWriter(outfile, fieldnames=fieldnames) + writer.writeheader() + for row in reader: + # first url is incorrect + row['url'] = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1;https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz" + row['md5sum'] = "a1a8f1c6dc56999c73fe298871c963d1;" + row['range'] = '1-50000;50000-100000' + writer.writerow(row) + print(row) + + # # run urlsketch + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'urlsketch', acc_mod, '-o', output, + '--failed', failed, '-r', '1', + '--param-str', "dna,k=31,scaled=100") + + assert not os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + + with open(failed, 'r') as failF: + header = next(failF).strip() + assert header == "accession,name,moltype,md5sum,download_filename,url,range" + for line in failF: + print(line) + acc, name, moltype, md5sum, download_filename, url, range = line.strip().split(',') + assert acc == "both" + assert name == "both name" + assert moltype == "DNA" + assert md5sum == "a1a8f1c6dc56999c73fe298871c963d1;" + assert download_filename == "both.urlsketch.fna.gz" + assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1;https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz" + assert range == "1-50000;50000-100000"
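Two closing reference sketches for readers working with these CSVs. First, the two failure reports asserted throughout these tests have different schemas: the `--failed` file keeps the original input columns (including the `range` column and any ';'-separated lists) so it can be fed straight back into `urlsketch`, while the `--checksum-fail` file records one row per failing URL with the expected and found checksums. A minimal reader, with placeholder paths:
```
# Sketch: read both failure reports using the headers asserted in these tests.
import csv

def read_failures(failed_csv, checksum_failed_csv):
    # columns: accession,name,moltype,md5sum,download_filename,url,range
    with open(failed_csv, newline='') as f:
        retryable = list(csv.DictReader(f))
    # columns: accession,name,moltype,md5sum_url,download_filename,url,expected_md5sum,reason
    with open(checksum_failed_csv, newline='') as f:
        checksum = list(csv.DictReader(f))
    return retryable, checksum
```
Second, the `md5sum` column values exercised in these tests are MD5 digests of the downloaded files as served (i.e., the compressed `.fna.gz`/`.faa.gz` bytes). A minimal sketch for computing such an entry when building an input CSV; the helper name is illustrative:
```
# Sketch: compute an md5sum column entry for a downloaded file.
import hashlib

def md5_of_file(path, chunk_size=1 << 20):
    h = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()

# For merged rows, join one digest per URL with ';':
# md5_column = ';'.join(md5_of_file(p) for p in downloaded_files)
```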