From 7a46cfb27a8b05c1b1dd31e605a25798e077c55e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ant=C3=B4nio=20Camargo?= Date: Thu, 19 Dec 2024 07:33:25 -0800 Subject: [PATCH 01/24] Add dunder methods for the Python Record class --- src/python.rs | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/src/python.rs b/src/python.rs index 35b2822..ad2c0d7 100644 --- a/src/python.rs +++ b/src/python.rs @@ -1,5 +1,7 @@ //! Python bindings for needletail +use std::collections::hash_map::DefaultHasher; +use std::hash::{Hash, Hasher}; use std::io::Cursor; use pyo3::prelude::*; @@ -72,6 +74,59 @@ impl Record { } Ok(()) } + + pub fn __hash__(&self) -> PyResult { + let mut hasher = DefaultHasher::new(); + self.id.hash(&mut hasher); + self.seq.hash(&mut hasher); + if !self.qual.is_none() { + self.qual.hash(&mut hasher); + } + Ok(hasher.finish()) + } + + pub fn __eq__(&self, other: &Record) -> PyResult { + Ok(self.id == other.id && self.seq == other.seq && self.qual == other.qual) + } + + pub fn __len__(&self) -> PyResult { + Ok(self.seq.len()) + } + + pub fn __str__(&self) -> PyResult { + if self.qual.is_none() { + let wrapped_seq = self + .seq + .as_bytes() + .chunks(60) + .map(|chunk| String::from_utf8_lossy(chunk).to_string()) + .collect::>() + .join("\n"); + Ok(format!(">{}\n{}", self.id, wrapped_seq)) + } else { + Ok(format!( + "@{}\n{}\n+\n{}", + self.id, + self.seq, + self.qual.clone().unwrap() + )) + } + } + + fn __repr__(&self) -> PyResult { + let seq_preview = if self.seq.len() > 40 { + let start = &self.seq[..34]; + let end = &self.seq[self.seq.len() - 3..]; + format!("{}...{}", start, end) + } else { + self.seq.clone() + }; + let has_quality = self.qual.is_some(); + Ok(format!( + "Record(id={}, sequence={}, has_quality={})", + self.id, seq_preview, has_quality + )) + } } #[pyclass] From afe778c99bd5b5b80e4bfa9eadb48e82507d9efc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ant=C3=B4nio=20Camargo?= Date: Thu, 19 Dec 2024 13:13:24 
-0800 Subject: [PATCH 02/24] Improve `Record.__repr__()` --- src/python.rs | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/src/python.rs b/src/python.rs index ad2c0d7..84478ee 100644 --- a/src/python.rs +++ b/src/python.rs @@ -1,17 +1,14 @@ //! Python bindings for needletail -use std::collections::hash_map::DefaultHasher; -use std::hash::{Hash, Hasher}; -use std::io::Cursor; - -use pyo3::prelude::*; -use pyo3::{create_exception, wrap_pyfunction}; - use crate::sequence::{complement, normalize}; use crate::{ parse_fastx_file as rs_parse_fastx_file, parse_fastx_reader, parser::SequenceRecord, FastxReader, }; +use pyo3::prelude::*; +use pyo3::{create_exception, wrap_pyfunction}; +use std::hash::{DefaultHasher, Hash, Hasher}; +use std::io::Cursor; create_exception!(needletail, NeedletailError, pyo3::exceptions::PyException); @@ -79,8 +76,9 @@ impl Record { let mut hasher = DefaultHasher::new(); self.id.hash(&mut hasher); self.seq.hash(&mut hasher); - if !self.qual.is_none() { - self.qual.hash(&mut hasher); + match &self.qual { + Some(qual) => qual.hash(&mut hasher), + None => {} } Ok(hasher.finish()) } @@ -114,17 +112,28 @@ impl Record { } fn __repr__(&self) -> PyResult { - let seq_preview = if self.seq.len() > 40 { - let start = &self.seq[..34]; + let seq_preview = if self.seq.len() > 30 { + let start = &self.seq[..26]; let end = &self.seq[self.seq.len() - 3..]; - format!("{}...{}", start, end) + format!("{}…{}", start, end) } else { self.seq.clone() }; - let has_quality = self.qual.is_some(); + let quality_preview = match &self.qual { + Some(qual) => { + if qual.len() > 30 { + let start = &qual[..26]; + let end = &qual[qual.len() - 3..]; + format!("{}…{}", start, end) + } else { + qual.clone() + } + } + None => "None".to_string(), + }; Ok(format!( - "Record(id={}, sequence={}, has_quality={})", - self.id, seq_preview, has_quality + "Record(id={}, sequence={}, quality={})", + self.id, seq_preview, 
quality_preview )) } } From 19a604075c17f893b078ed8d5cd312714a2af4cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ant=C3=B4nio=20Camargo?= Date: Thu, 19 Dec 2024 13:23:58 -0800 Subject: [PATCH 03/24] Add a constructor to `Record` --- src/python.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/python.rs b/src/python.rs index 84478ee..2a4464f 100644 --- a/src/python.rs +++ b/src/python.rs @@ -5,6 +5,8 @@ use crate::{ parse_fastx_file as rs_parse_fastx_file, parse_fastx_reader, parser::SequenceRecord, FastxReader, }; + +use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use pyo3::{create_exception, wrap_pyfunction}; use std::hash::{DefaultHasher, Hash, Hasher}; @@ -72,6 +74,20 @@ impl Record { Ok(()) } + #[new] + #[pyo3(signature = (id, seq, qual=None))] + fn new(id: String, seq: String, qual: Option) -> PyResult { + // If `qual` is not None, check if it has the same length as `seq` + if let Some(qual) = &qual { + if qual.len() != seq.len() { + return Err(PyValueError::new_err( + "Sequence and quality strings must have the same length", + )); + } + } + Ok(Record { id, seq, qual }) + } + pub fn __hash__(&self) -> PyResult { let mut hasher = DefaultHasher::new(); self.id.hash(&mut hasher); From 3497b4e09d79ce1fa242fcc09f26ca14235747eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ant=C3=B4nio=20Camargo?= Date: Thu, 19 Dec 2024 13:30:57 -0800 Subject: [PATCH 04/24] Expose the `Record` class --- src/python.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python.rs b/src/python.rs index 2a4464f..92dff78 100644 --- a/src/python.rs +++ b/src/python.rs @@ -210,11 +210,11 @@ pub fn reverse_complement(seq: &str) -> String { #[pymodule] fn needletail(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; + m.add_class::()?; m.add_wrapped(wrap_pyfunction!(parse_fastx_file))?; m.add_wrapped(wrap_pyfunction!(parse_fastx_string))?; m.add_wrapped(wrap_pyfunction!(normalize_seq))?; 
m.add_wrapped(wrap_pyfunction!(reverse_complement))?; m.add("NeedletailError", py.get_type_bound::())?; - Ok(()) } From c417c3d19dde0bc44dd658e6f85785bc82e4a57e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ant=C3=B4nio=20Camargo?= Date: Thu, 19 Dec 2024 13:48:56 -0800 Subject: [PATCH 05/24] Turn `is_fasta` and `is_fastq` into properties --- src/python.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/python.rs b/src/python.rs index 92dff78..8669c65 100644 --- a/src/python.rs +++ b/src/python.rs @@ -59,10 +59,12 @@ impl Record { #[pymethods] impl Record { + #[getter] pub fn is_fasta(&self) -> PyResult { Ok(self.qual.is_none()) } + #[getter] pub fn is_fastq(&self) -> PyResult { Ok(self.qual.is_some()) } From 786d6b8c20c6e11db47ef3922b30b7f7adc533ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ant=C3=B4nio=20Camargo?= Date: Thu, 19 Dec 2024 16:34:59 -0800 Subject: [PATCH 06/24] Refactor snippet generation --- src/python.rs | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/src/python.rs b/src/python.rs index 8669c65..1a4389a 100644 --- a/src/python.rs +++ b/src/python.rs @@ -26,6 +26,16 @@ pub struct PyFastxReader { reader: Box, } +fn get_string_snippet(seq: &str, max_len: usize) -> String { + if seq.len() > max_len { + let start = &seq[..max_len - 4]; + let end = &seq[seq.len() - 3..]; + format!("{}…{}", start, end) + } else { + seq.to_string() + } +} + #[pymethods] impl PyFastxReader { fn __repr__(&self) -> PyResult { @@ -130,28 +140,14 @@ impl Record { } fn __repr__(&self) -> PyResult { - let seq_preview = if self.seq.len() > 30 { - let start = &self.seq[..26]; - let end = &self.seq[self.seq.len() - 3..]; - format!("{}…{}", start, end) - } else { - self.seq.clone() - }; - let quality_preview = match &self.qual { - Some(qual) => { - if qual.len() > 30 { - let start = &qual[..26]; - let end = &qual[qual.len() - 3..]; - format!("{}…{}", start, end) - } else { - qual.clone() - } - } + let seq_snippet = 
get_string_snippet(&self.seq, 30); + let quality_snippet = match &self.qual { + Some(qual) => get_string_snippet(qual, 30), None => "None".to_string(), }; Ok(format!( "Record(id={}, sequence={}, quality={})", - self.id, seq_preview, quality_preview + self.id, seq_snippet, quality_snippet )) } } From b39ab088d1357b08fe1e93fc68c71dfebeba124a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ant=C3=B4nio=20Camargo?= Date: Thu, 19 Dec 2024 17:19:36 -0800 Subject: [PATCH 07/24] Add the `name` and `description` properties --- src/python.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/python.rs b/src/python.rs index 1a4389a..517d157 100644 --- a/src/python.rs +++ b/src/python.rs @@ -69,6 +69,24 @@ impl Record { #[pymethods] impl Record { + #[getter] + pub fn name(&self) -> PyResult<&str> { + if let Some(pos) = self.id.find(char::is_whitespace) { + Ok(&self.id[..pos]) + } else { + Ok(&self.id) + } + } + + #[getter] + pub fn description(&self) -> PyResult> { + if let Some(pos) = self.id.find(char::is_whitespace) { + Ok(Some(&self.id[pos..].trim_start())) + } else { + Ok(None) + } + } + #[getter] pub fn is_fasta(&self) -> PyResult { Ok(self.qual.is_none()) From 3f348ecf1aac398f2d7ba12392006c35690b08e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ant=C3=B4nio=20Camargo?= Date: Thu, 19 Dec 2024 17:20:23 -0800 Subject: [PATCH 08/24] Condense long strings in Record's __repr__ --- src/python.rs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/python.rs b/src/python.rs index 517d157..1f5271a 100644 --- a/src/python.rs +++ b/src/python.rs @@ -26,7 +26,7 @@ pub struct PyFastxReader { reader: Box, } -fn get_string_snippet(seq: &str, max_len: usize) -> String { +fn get_seq_snippet(seq: &str, max_len: usize) -> String { if seq.len() > max_len { let start = &seq[..max_len - 4]; let end = &seq[seq.len() - 3..]; @@ -158,14 +158,19 @@ impl Record { } fn __repr__(&self) -> PyResult { - let seq_snippet = get_string_snippet(&self.seq, 30); + 
let id_snippet = match self.name() { + Ok(name) if name != self.id => format!("{}…", name), + Ok(name) => name.to_string(), + Err(_) => self.id.clone(), + }; + let seq_snippet = get_seq_snippet(&self.seq, 30); let quality_snippet = match &self.qual { - Some(qual) => get_string_snippet(qual, 30), + Some(qual) => get_seq_snippet(qual, 30), None => "None".to_string(), }; Ok(format!( "Record(id={}, sequence={}, quality={})", - self.id, seq_snippet, quality_snippet + id_snippet, seq_snippet, quality_snippet )) } } From a4687e891cc0d5a13a017a00699752c7c1b5e4dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ant=C3=B4nio=20Camargo?= Date: Thu, 19 Dec 2024 17:25:58 -0800 Subject: [PATCH 09/24] Shorten Record's __repr__ --- src/python.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/python.rs b/src/python.rs index 1f5271a..e52ba58 100644 --- a/src/python.rs +++ b/src/python.rs @@ -163,13 +163,13 @@ impl Record { Ok(name) => name.to_string(), Err(_) => self.id.clone(), }; - let seq_snippet = get_seq_snippet(&self.seq, 30); + let seq_snippet = get_seq_snippet(&self.seq, 25); let quality_snippet = match &self.qual { - Some(qual) => get_seq_snippet(qual, 30), + Some(qual) => get_seq_snippet(qual, 25), None => "None".to_string(), }; Ok(format!( - "Record(id={}, sequence={}, quality={})", + "Record(id={}, seq={}, qual={})", id_snippet, seq_snippet, quality_snippet )) } From ce44c50641d39b7091c3e078fdc273d218c55127 Mon Sep 17 00:00:00 2001 From: Antonio Camargo Date: Wed, 22 Jan 2025 12:27:29 -0800 Subject: [PATCH 10/24] Do not wrap sequences in __str__ --- src/python.rs | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/python.rs b/src/python.rs index e52ba58..2f94bfe 100644 --- a/src/python.rs +++ b/src/python.rs @@ -139,14 +139,7 @@ impl Record { pub fn __str__(&self) -> PyResult { if self.qual.is_none() { - let wrapped_seq = self - .seq - .as_bytes() - .chunks(60) - .map(|chunk| String::from_utf8_lossy(chunk).to_string()) - 
.collect::>() - .join("\n"); - Ok(format!(">{}\n{}", self.id, wrapped_seq)) + Ok(format!(">{}\n{}", self.id, self.seq)) } else { Ok(format!( "@{}\n{}\n+\n{}", From 3770636f915ecf6af2c32062e67d0e52af4c3f9b Mon Sep 17 00:00:00 2001 From: Antonio Camargo Date: Wed, 22 Jan 2025 12:30:10 -0800 Subject: [PATCH 11/24] Include a newline at the end of FASTX strings --- src/python.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python.rs b/src/python.rs index 2f94bfe..dd5ca53 100644 --- a/src/python.rs +++ b/src/python.rs @@ -139,10 +139,10 @@ impl Record { pub fn __str__(&self) -> PyResult { if self.qual.is_none() { - Ok(format!(">{}\n{}", self.id, self.seq)) + Ok(format!(">{}\n{}\n", self.id, self.seq)) } else { Ok(format!( - "@{}\n{}\n+\n{}", + "@{}\n{}\n+\n{}\n", self.id, self.seq, self.qual.clone().unwrap() From 9cdf8e5e3e138ac2101b2294ff52906e8f7fc1b8 Mon Sep 17 00:00:00 2001 From: Antonio Camargo Date: Wed, 22 Jan 2025 12:43:05 -0800 Subject: [PATCH 12/24] Make is_fasta and is_fastq regular methods again --- src/python.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/python.rs b/src/python.rs index dd5ca53..3ea2fb8 100644 --- a/src/python.rs +++ b/src/python.rs @@ -87,12 +87,10 @@ impl Record { } } - #[getter] pub fn is_fasta(&self) -> PyResult { Ok(self.qual.is_none()) } - #[getter] pub fn is_fastq(&self) -> PyResult { Ok(self.qual.is_some()) } From 01fe91ea55bfd15b6b25cbda43f1e679c63befb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ant=C3=B4nio=20Camargo?= Date: Wed, 22 Jan 2025 16:33:00 -0800 Subject: [PATCH 13/24] Add docstrings to the Python classes and functions --- src/python.rs | 165 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 160 insertions(+), 5 deletions(-) diff --git a/src/python.rs b/src/python.rs index 3cb2e2a..a4074a5 100644 --- a/src/python.rs +++ b/src/python.rs @@ -21,11 +21,6 @@ macro_rules! 
py_try { }; } -#[pyclass] -pub struct PyFastxReader { - reader: Box, -} - fn get_seq_snippet(seq: &str, max_len: usize) -> String { if seq.len() > max_len { let start = &seq[..max_len - 4]; @@ -36,6 +31,26 @@ fn get_seq_snippet(seq: &str, max_len: usize) -> String { } } +/// An iterator that yields sequence records. +/// +/// Yields +/// ------ +/// Record +/// A `Record` object representing a sequence record. +/// +/// See also +/// -------- +/// parse_fastx_file: +/// A function to parse sequence records from a FASTA/FASTQ file. +/// parse_fastx_string: +/// A function to parse sequence records from a FASTA/FASTQ string. +/// Record: +/// A class representing a FASTA/FASTQ sequence record. +#[pyclass] +pub struct PyFastxReader { + reader: Box, +} + #[pymethods] impl PyFastxReader { fn __repr__(&self) -> PyResult { @@ -56,6 +71,43 @@ impl PyFastxReader { } } +/// A record representing a biological sequence. +/// +/// Parameters +/// ---------- +/// id : str +/// The identifier of the sequence record. +/// seq : str +/// A string representing the sequence. +/// +/// Attributes +/// ---------- +/// id : str +/// The identifier of the sequence record. In a FASTA file, this is the +/// string containing all characters (including whitespaces) after the +/// leading '>' character. In a FASTQ file, this is the string containing +/// all characters (including whitespaces) after the leading '@' character. +/// seq : str +/// A string representing the sequence. +/// qual : str, optional +/// A string representing the quality scores of the sequence. If the object +/// represents a FASTA record, this attribute will be `None`. +/// name : str +/// The name of the sequence record. This is the string before the first +/// whitespace character in the `id` attribute. +/// description : str, optional +/// The description of the sequence record. This is the string after the +/// first whitespace character in the `id` attribute. 
If the `id` attribute +/// contains no whitespace characters, this attribute will be `None`. +/// +/// Methods +/// ------- +/// is_fasta +/// Check if the object represents a FASTA record. +/// is_fastq +/// Check if the object represents a FASTQ record. +/// normalize(iupac) +/// Normalize the sequence stored in the `seq` attribute of the object. #[pyclass] pub struct Record { #[pyo3(get)] @@ -96,14 +148,31 @@ impl Record { } } + /// Check if the object represents a FASTA record. + /// + /// Returns + /// ------- + /// bool + /// `True` if the record lacks quality information, otherwise `False`. pub fn is_fasta(&self) -> PyResult { Ok(self.qual.is_none()) } + /// Check if the object represents a FASTQ record. + /// + /// Returns + /// ------- + /// bool + /// `True` if the record has quality information, otherwise `False`. pub fn is_fastq(&self) -> PyResult { Ok(self.qual.is_some()) } + /// Normalize the sequence stored in the `seq` attribute of the object. + /// + /// See also + /// -------- + /// normalize_seq: A function to normalize nucleotide sequence strings. pub fn normalize(&mut self, iupac: bool) -> PyResult<()> { if let Some(s) = normalize(self.seq.as_bytes(), iupac) { self.seq = String::from_utf8_lossy(&s).to_string(); @@ -178,18 +247,93 @@ impl Record { // TODO: what would be really nice is to detect the type of pyobject so it would on file object etc // not for initial release though +/// An iterator that reads sequence records from a FASTA/FASTQ file. +/// +/// Parameters +/// ---------- +/// path : str +/// The path to a FASTA/FASTQ file. +/// +/// Returns +/// ------- +/// PyFastxReader +/// A `PyFastxReader` iterator that yields `Record` objects representing +/// sequences from the input file. +/// +/// Raises +/// ------ +/// NeedletailError +/// If an error occurs while reading and parsing the input file. +/// +/// See also +/// -------- +/// parse_fastx_string: +/// A function to parse sequence records from a FASTA/FASTQ string. 
+/// PyFastxReader: +/// A class with instances that are iterators that yield `Record` objects. #[pyfunction] fn parse_fastx_file(path: &str) -> PyResult { let reader = py_try!(rs_parse_fastx_file(path)); Ok(PyFastxReader { reader }) } +/// Parse sequence records from a FASTA/FASTQ string. +/// +/// Parameters +/// ---------- +/// content : str +/// A string containing FASTA/FASTQ-formatted sequence records. +/// +/// Returns +/// ------- +/// PyFastxReader +/// A `PyFastxReader` iterator that yields `Record` objects representing +/// sequences from the input string. +/// +/// Raises +/// ------ +/// NeedletailError +/// If an error occurs while parsing the input string. +/// +/// See also +/// -------- +/// parse_fastx_file: +/// A function to parse sequence records from a FASTA/FASTQ file. +/// PyFastxReader: +/// A class with instances that are iterators that yield `Record` objects. #[pyfunction] fn parse_fastx_string(content: &str) -> PyResult { let reader = py_try!(parse_fastx_reader(Cursor::new(content.to_owned()))); Ok(PyFastxReader { reader }) } +/// Normalize the sequence string of nucleotide records by: +/// - Converting lowercase characters to uppercase. +/// - Removing whitespace and newline characters. +/// - Replacing 'U' with 'T'. +/// - Replacing '.' and '~' with '-'. +/// - Replacing characters not in 'ACGTN-' with 'N'. +/// +/// Parameters +/// ---------- +/// seq : str +/// A string representing a nucleotide sequence. +/// iupac : bool +/// If `True`, characters representing nucleotide ambiguity ('B', 'D', +/// 'H', 'V', 'R', 'Y', 'S', 'W', 'K', and 'M', and their lowercase +/// forms) will not be converted to 'N'. Lowercase characters will still +/// be converted to uppercase. +/// +/// Returns +/// ------- +/// str +/// The normalized sequence string. +/// +/// Notes +/// ----- +/// The `normalize` method is designed for nucleotide sequences only. 
If +/// used with protein sequences, it will incorrectly process amino acid +/// characters as if they were nucleotides. #[pyfunction] pub fn normalize_seq(seq: &str, iupac: bool) -> PyResult { if let Some(s) = normalize(seq.as_bytes(), iupac) { @@ -199,6 +343,17 @@ pub fn normalize_seq(seq: &str, iupac: bool) -> PyResult { } } +/// Compute the reverse complement of a nucleotide sequence. +/// +/// Parameters: +/// ----------- +/// seq : str +/// A string representing a nucleotide sequence. +/// +/// Returns: +/// -------- +/// str +/// The reverse complement of the input nucleotide sequence. #[pyfunction] pub fn reverse_complement(seq: &str) -> String { let comp: Vec = seq From 78d8a7f1f2f31b033f07e2ddffd43a27095244e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ant=C3=B4nio=20Camargo?= Date: Wed, 22 Jan 2025 16:39:28 -0800 Subject: [PATCH 14/24] Add a to-do list to the beginning of the file --- src/python.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/python.rs b/src/python.rs index a4074a5..f70fe55 100644 --- a/src/python.rs +++ b/src/python.rs @@ -1,5 +1,10 @@ //! Python bindings for needletail +// TODO: +// - Add support for `pathlib.Path` objects in `parse_fastx_file`. +// - Make `normalize_seq` and `reverse_complement` functions able to handle +// `Record` objects as input. + use crate::sequence::{complement, normalize}; use crate::{ parse_fastx_file as rs_parse_fastx_file, parse_fastx_reader, parser::SequenceRecord, @@ -244,9 +249,6 @@ impl Record { } } -// TODO: what would be really nice is to detect the type of pyobject so it would on file object etc -// not for initial release though - /// An iterator that reads sequence records from a FASTA/FASTQ file. 
/// /// Parameters From 7cd2c47e8bcd66e021b6be9d12d1734288f42c9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ant=C3=B4nio=20Camargo?= Date: Wed, 22 Jan 2025 16:41:26 -0800 Subject: [PATCH 15/24] Add item to to-do list --- src/python.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/python.rs b/src/python.rs index f70fe55..e7d5f15 100644 --- a/src/python.rs +++ b/src/python.rs @@ -1,6 +1,7 @@ //! Python bindings for needletail // TODO: +// - Make the return values of `__repr__` and `__str__` show up as raw strings. // - Add support for `pathlib.Path` objects in `parse_fastx_file`. // - Make `normalize_seq` and `reverse_complement` functions able to handle // `Record` objects as input. From b900fb806de57fee5f6d7e1cb00892de3f6de742 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ant=C3=B4nio=20Camargo?= Date: Thu, 23 Jan 2025 17:46:49 -0800 Subject: [PATCH 16/24] Fix indentation in `normalize_seq` docstring --- src/python.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/python.rs b/src/python.rs index e7d5f15..a8fddd7 100644 --- a/src/python.rs +++ b/src/python.rs @@ -334,9 +334,9 @@ fn parse_fastx_string(content: &str) -> PyResult { /// /// Notes /// ----- -/// The `normalize` method is designed for nucleotide sequences only. If -/// used with protein sequences, it will incorrectly process amino acid -/// characters as if they were nucleotides. +/// The `normalize` method is designed for nucleotide sequences only. If +/// used with protein sequences, it will incorrectly process amino acid +/// characters as if they were nucleotides. 
#[pyfunction] pub fn normalize_seq(seq: &str, iupac: bool) -> PyResult { if let Some(s) = normalize(seq.as_bytes(), iupac) { From eaba5b1bf81dbc61726224c95a29aaf9ce38f285 Mon Sep 17 00:00:00 2001 From: Antonio Camargo Date: Thu, 23 Jan 2025 20:27:42 -0800 Subject: [PATCH 17/24] Update `parse_fastx_file` to accept `pathlib.Path` objects --- src/python.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/python.rs b/src/python.rs index a8fddd7..d97f3da 100644 --- a/src/python.rs +++ b/src/python.rs @@ -2,7 +2,6 @@ // TODO: // - Make the return values of `__repr__` and `__str__` show up as raw strings. -// - Add support for `pathlib.Path` objects in `parse_fastx_file`. // - Make `normalize_seq` and `reverse_complement` functions able to handle // `Record` objects as input. @@ -17,6 +16,7 @@ use pyo3::prelude::*; use pyo3::{create_exception, wrap_pyfunction}; use std::hash::{DefaultHasher, Hash, Hasher}; use std::io::Cursor; +use std::path::PathBuf; create_exception!(needletail, NeedletailError, pyo3::exceptions::PyException); @@ -254,7 +254,7 @@ impl Record { /// /// Parameters /// ---------- -/// path : str +/// path : str or pathlib.Path /// The path to a FASTA/FASTQ file. /// /// Returns @@ -275,7 +275,7 @@ impl Record { /// PyFastxReader: /// A class with instances that are iterators that yield `Record` objects. 
#[pyfunction] -fn parse_fastx_file(path: &str) -> PyResult { +fn parse_fastx_file(path: PathBuf) -> PyResult { let reader = py_try!(rs_parse_fastx_file(path)); Ok(PyFastxReader { reader }) } From 4fcf73b161bc344643e044fb1c94132a6ef173e9 Mon Sep 17 00:00:00 2001 From: Antonio Camargo Date: Fri, 24 Jan 2025 14:16:50 -0800 Subject: [PATCH 18/24] Set the default value of the iupac parameter to False --- src/python.rs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/python.rs b/src/python.rs index d97f3da..7db5dbd 100644 --- a/src/python.rs +++ b/src/python.rs @@ -179,6 +179,7 @@ impl Record { /// See also /// -------- /// normalize_seq: A function to normalize nucleotide sequence strings. + #[pyo3(signature = (iupac=false))] pub fn normalize(&mut self, iupac: bool) -> PyResult<()> { if let Some(s) = normalize(self.seq.as_bytes(), iupac) { self.seq = String::from_utf8_lossy(&s).to_string(); @@ -311,17 +312,20 @@ fn parse_fastx_string(content: &str) -> PyResult { } /// Normalize the sequence string of nucleotide records by: -/// - Converting lowercase characters to uppercase. -/// - Removing whitespace and newline characters. -/// - Replacing 'U' with 'T'. -/// - Replacing '.' and '~' with '-'. -/// - Replacing characters not in 'ACGTN-' with 'N'. +/// +/// - Converting lowercase characters to uppercase. +/// - Removing whitespace and newline characters. +/// - Replacing 'U' with 'T'. +/// - Replacing '.' and '~' with '-'. +/// - Replacing characters not in 'ACGTN-' with 'N', unless `iupac` is set +/// to `True`, in which case characters representing nucleotide ambiguity +/// are not replaced. /// /// Parameters /// ---------- /// seq : str /// A string representing a nucleotide sequence. -/// iupac : bool +/// iupac : bool, default: False /// If `True`, characters representing nucleotide ambiguity ('B', 'D', /// 'H', 'V', 'R', 'Y', 'S', 'W', 'K', and 'M', and their lowercase /// forms) will not be converted to 'N'. 
Lowercase characters will still @@ -338,6 +342,7 @@ fn parse_fastx_string(content: &str) -> PyResult { /// used with protein sequences, it will incorrectly process amino acid /// characters as if they were nucleotides. #[pyfunction] +#[pyo3(signature = (seq, iupac=false))] pub fn normalize_seq(seq: &str, iupac: bool) -> PyResult { if let Some(s) = normalize(seq.as_bytes(), iupac) { Ok(String::from_utf8_lossy(&s).to_string()) From 89d94f2613bddc0550968ff4d7ead6b52886f803 Mon Sep 17 00:00:00 2001 From: Antonio Camargo Date: Fri, 24 Jan 2025 14:20:47 -0800 Subject: [PATCH 19/24] Add new task to to-do --- src/python.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/python.rs b/src/python.rs index 7db5dbd..882e1b1 100644 --- a/src/python.rs +++ b/src/python.rs @@ -1,6 +1,8 @@ //! Python bindings for needletail // TODO: +// - Add a property to the `Record` class that returns the quality scores as a +// list of integers. // - Make the return values of `__repr__` and `__str__` show up as raw strings. // - Make `normalize_seq` and `reverse_complement` functions able to handle // `Record` objects as input. 
From 1e8b7f41dabdccda8c466710ec8aaa90f958c3f3 Mon Sep 17 00:00:00 2001 From: Antonio Camargo Date: Fri, 24 Jan 2025 14:56:44 -0800 Subject: [PATCH 20/24] Update Python tests --- src/python.rs | 4 +- test_python.py | 186 +++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 153 insertions(+), 37 deletions(-) diff --git a/src/python.rs b/src/python.rs index 882e1b1..93a9d1f 100644 --- a/src/python.rs +++ b/src/python.rs @@ -241,9 +241,9 @@ impl Record { Ok(name) => name.to_string(), Err(_) => self.id.clone(), }; - let seq_snippet = get_seq_snippet(&self.seq, 25); + let seq_snippet = get_seq_snippet(&self.seq, 20); let quality_snippet = match &self.qual { - Some(qual) => get_seq_snippet(qual, 25), + Some(qual) => get_seq_snippet(qual, 20), None => "None".to_string(), }; Ok(format!( diff --git a/test_python.py b/test_python.py index 6d2bf4d..c154ae5 100644 --- a/test_python.py +++ b/test_python.py @@ -1,13 +1,153 @@ import unittest +from pathlib import Path -from needletail import parse_fastx_file, parse_fastx_string, NeedletailError, reverse_complement, normalize_seq - +from needletail import ( + NeedletailError, + Record, + normalize_seq, + parse_fastx_file, + parse_fastx_string, + reverse_complement, +) FASTA_FILE = "./tests/data/test.fa" FASTQ_FILE = "./tests/specimen/FASTQ/example.fastq" -class ParsingTestCase(unittest.TestCase): +class RecordClassTestCase(unittest.TestCase): + def test_fasta_record(self): + record = Record("test description", "AGCTGATCGA") + self.assertEqual(record.id, "test description") + self.assertEqual(record.seq, "AGCTGATCGA") + self.assertIsNone(record.qual) + + def test_fastq_record(self): + record = Record("test description", "AGCTGATCGA", ";**9;;????") + self.assertEqual(record.id, "test description") + self.assertEqual(record.seq, "AGCTGATCGA") + self.assertEqual(record.qual, ";**9;;????") + + def test_record_properties(self): + record = Record("test description", "AGCTGATCGA") + self.assertEqual(record.name, "test") + 
self.assertEqual(record.description, "description") + + def test_record_normalize(self): + record = Record("test", "AGCTGYrtcga") + record.normalize(iupac=True) + self.assertEqual(record.seq, "AGCTGYRTCGA") + record.normalize() + self.assertEqual(record.seq, "AGCTGNNTCGA") + + def test_format_record_method(self): + record = Record("test", "AGCTGATCGA") + self.assertTrue(record.is_fasta()) + self.assertFalse(record.is_fastq()) + record = Record("test", "AGCTGATCGA", ";**9;;????") + self.assertFalse(record.is_fasta()) + self.assertTrue(record.is_fastq()) + + def test_record_eq(self): + record1 = Record("test", "AGCTGATCGA", ";**9;;????") + record2 = Record("test", "AGCTGATCGA", ";**9;;????") + record3 = Record("test2", "AGCTGATCGA", ";**9;;????") + record4 = Record("test", "TCGATCAGCT", ";**9;;????") + record5 = Record("test", "AGCTGATCGA", "????;**9;;") + record6 = Record("test", "AGCTGATCGA") + self.assertEqual(record1, record2) + self.assertNotEqual(record1, record3) + self.assertNotEqual(record1, record4) + self.assertNotEqual(record1, record5) + self.assertNotEqual(record1, record6) + + def test_record_str(self): + self.assertEqual(str(Record("test", "AGCTGATCGA")), ">test\nAGCTGATCGA\n") + self.assertEqual( + str(Record("test", "AGCTGATCGA", ";**9;;????")), + "@test\nAGCTGATCGA\n+\n;**9;;????\n", + ) + + def test_record_repr(self): + self.assertEqual( + repr(Record("test", "AGCTGATCGAAGCTGATCGAA")), + "Record(id=test, seq=AGCTGATCGAAGCTGA…GAA, qual=None)", + ) + self.assertEqual( + repr(Record("test", "AGCTGATCGAAGCTGATCGAA", ";**9;;????;**9;;????;")), + "Record(id=test, seq=AGCTGATCGAAGCTGA…GAA, qual=;**9;;????;**9;;…??;)", + ) + + def test_record_len(self): + self.assertEqual(len(Record("test", "AGCTGATCGA")), 10) + + def test_record_hash(self): + record1 = Record("test", "AGCTGATCGA") + record2 = Record("test", "AGCTGATCGA") + record3 = Record("test", "AGCTGATCGA", ";**9;;????") + record4 = Record("test", "AGCTGATCGA", ";**9;;????") + record5 = 
Record("test", "TCGATCAGCT") + record6 = Record("test2", "AGCTGATCGA") + record7 = Record("test", "AGCTGATCGA", "????;**9;;") + self.assertEqual(hash(record1), hash(record2)) + self.assertNotEqual(hash(record1), hash(record3)) + self.assertNotEqual(hash(record1), hash(record5)) + self.assertNotEqual(hash(record1), hash(record6)) + self.assertNotEqual(hash(record1), hash(record3)) + self.assertEqual(hash(record3), hash(record4)) + self.assertNotEqual(hash(record3), hash(record7)) + + +class NormalizeTestCase(unittest.TestCase): + def test_no_normalization_needed(self): + self.assertEqual(normalize_seq("ACGTU", iupac=False), "ACGTT") + + def test_capitalization(self): + self.assertEqual(normalize_seq("acgtu", iupac=False), "ACGTT") + + def test_default_parameters(self): + self.assertEqual( + normalize_seq("BDHVRYSWKM"), normalize_seq("BDHVRYSWKM", iupac=False) + ) + + def test_iupac_parameter(self): + self.assertEqual(normalize_seq("BDHVRYSWKM", iupac=False), "NNNNNNNNNN") + self.assertEqual(normalize_seq("BDHVRYSWKM", iupac=True), "BDHVRYSWKM") + self.assertEqual(normalize_seq("bdhvryswkm", iupac=True), "BDHVRYSWKM") + + def test_gap_normalization(self): + self.assertEqual(normalize_seq("N-N-N-N", iupac=False), "N-N-N-N") + self.assertEqual(normalize_seq("N.N.N.N", iupac=False), "N-N-N-N") + self.assertEqual(normalize_seq("N~N~N~N", iupac=False), "N-N-N-N") + + def test_whitespace_removal(self): + self.assertEqual(normalize_seq("N N N N", iupac=False), "NNNN") + self.assertEqual(normalize_seq("N\tN\tN\tN", iupac=False), "NNNN") + self.assertEqual(normalize_seq("N\nN\nN\nN", iupac=False), "NNNN") + self.assertEqual(normalize_seq("N\rN\rN\rN", iupac=False), "NNNN") + + def test_non_alphabet_characters(self): + self.assertEqual(normalize_seq("N!N!N!N", iupac=False), "NNNNNNN") + self.assertEqual(normalize_seq("N@N@N@N", iupac=False), "NNNNNNN") + self.assertEqual(normalize_seq("N#N#N#N", iupac=False), "NNNNNNN") + self.assertEqual(normalize_seq("N$N$N$N", iupac=False), 
"NNNNNNN") + self.assertEqual(normalize_seq("N%N%N%N", iupac=False), "NNNNNNN") + self.assertEqual(normalize_seq("N^N^N^N", iupac=False), "NNNNNNN") + self.assertEqual(normalize_seq("N&N&N&N", iupac=False), "NNNNNNN") + self.assertEqual(normalize_seq("N*N*N*N", iupac=False), "NNNNNNN") + self.assertEqual(normalize_seq("N|N|N|N", iupac=False), "NNNNNNN") + self.assertEqual(normalize_seq("N9N5N1N", iupac=False), "NNNNNNN") + + +class ReverseComplementTestCase(unittest.TestCase): + def test_reverse_complement(self): + self.assertEqual(reverse_complement("a"), "t") + self.assertEqual(reverse_complement("c"), "g") + self.assertEqual(reverse_complement("g"), "c") + self.assertEqual(reverse_complement("n"), "n") + self.assertEqual(reverse_complement("atcg"), "cgat") + + +class FileParsingTestCase(unittest.TestCase): def get_fasta_reader(self): return parse_fastx_file(FASTA_FILE) @@ -20,17 +160,10 @@ def test_can_parse_fasta_file(self): self.assertEqual(record.id, "test") self.assertEqual(record.seq, "AGCTGATCGA") self.assertIsNone(record.qual) - record.normalize(iupac=False) - self.assertEqual(record.seq, "AGCTGATCGA") - self.assertTrue(record.is_fasta()) if i == 1: self.assertEqual(record.id, "test2") self.assertEqual(record.seq, "TAGC") self.assertIsNone(record.qual) - record.normalize(iupac=False) - self.assertEqual(record.seq, "TAGC") - self.assertTrue(record.is_fasta()) - self.assertTrue(i <= 1) def test_can_parse_fastq_file(self): @@ -39,21 +172,17 @@ def test_can_parse_fastq_file(self): self.assertEqual(record.id, "EAS54_6_R1_2_1_413_324") self.assertEqual(record.seq, "CCCTTCTTGTCTTCAGCGTTTCTCC") self.assertEqual(record.qual, ";;3;;;;;;;;;;;;7;;;;;;;88") - record.normalize(iupac=False) - self.assertEqual(record.seq, "CCCTTCTTGTCTTCAGCGTTTCTCC") - self.assertTrue(record.is_fastq()) if i == 1: self.assertEqual(record.id, "EAS54_6_R1_2_1_540_792") self.assertEqual(record.seq, "TTGGCAGGCCAAGGCCGATGGATCA") self.assertEqual(record.qual, ";;;;;;;;;;;7;;;;;-;;;3;83") - 
record.normalize(iupac=False) - self.assertEqual(record.seq, "TTGGCAGGCCAAGGCCGATGGATCA") - self.assertTrue(record.is_fastq()) - self.assertTrue(i <= 2) + def test_pathlib_path_input(self): + parse_fastx_file(Path(FASTA_FILE)) + -class ParsingStrTestCase(ParsingTestCase): +class StrParsingTestCase(FileParsingTestCase): def get_fasta_reader(self): with open(FASTA_FILE) as f: content = f.read() @@ -64,22 +193,8 @@ def get_fastq_reader(self): content = f.read() return parse_fastx_string(content) - -class MiscelleanousTestCase(unittest.TestCase): - def test_normalize_seq(self): - self.assertEqual(normalize_seq("ACGTU", iupac=False), "ACGTT") - self.assertEqual(normalize_seq("acgtu", iupac=False), "ACGTT") - self.assertEqual(normalize_seq("N.N-N~N N", iupac=False), "N-N-N-NN") - self.assertEqual(normalize_seq("BDHVRYSWKM", iupac=True), "BDHVRYSWKM") - self.assertEqual(normalize_seq("bdhvryswkm", iupac=True), "BDHVRYSWKM") - - def test_reverse_complement(self): - self.assertEqual(reverse_complement("a"), "t") - self.assertEqual(reverse_complement("c"), "g") - self.assertEqual(reverse_complement("g"), "c") - self.assertEqual(reverse_complement("n"), "n") - - self.assertEqual(reverse_complement("atcg"), "cgat") + def test_pathlib_path_input(self): + pass class ErroringTestCase(unittest.TestCase): @@ -92,5 +207,6 @@ def test_invalid_record(self): for i, record in enumerate(parse_fastx_string("Not a valid file")): print(i) -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() From 4de47d9053c477d2f16fce4d85a03445b59e2993 Mon Sep 17 00:00:00 2001 From: Antonio Camargo Date: Fri, 24 Jan 2025 15:01:39 -0800 Subject: [PATCH 21/24] Fix linting issue --- src/python.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/python.rs b/src/python.rs index 93a9d1f..6888c24 100644 --- a/src/python.rs +++ b/src/python.rs @@ -315,13 +315,13 @@ fn parse_fastx_string(content: &str) -> PyResult { /// Normalize the sequence string of 
nucleotide records by: /// -/// - Converting lowercase characters to uppercase. -/// - Removing whitespace and newline characters. -/// - Replacing 'U' with 'T'. -/// - Replacing '.' and '~' with '-'. -/// - Replacing characters not in 'ACGTN-' with 'N', unless `iupac` is set -/// to `True`, in which case characters representing nucleotide ambiguity -/// are not replaced. +/// - Converting lowercase characters to uppercase. +/// - Removing whitespace and newline characters. +/// - Replacing 'U' with 'T'. +/// - Replacing '.' and '~' with '-'. +/// - Replacing characters not in 'ACGTN-' with 'N', unless `iupac` is `True`, +/// in which case characters representing nucleotide ambiguity are not +/// replaced. /// /// Parameters /// ---------- From f16c10dffd14d2fd2a45c0176216d2aa7f05369c Mon Sep 17 00:00:00 2001 From: Antonio Camargo Date: Mon, 27 Jan 2025 12:40:39 -0800 Subject: [PATCH 22/24] Use assertIsInstance in test_pathlib_path_input --- test_python.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test_python.py b/test_python.py index c154ae5..a8c4aaf 100644 --- a/test_python.py +++ b/test_python.py @@ -3,6 +3,7 @@ from needletail import ( NeedletailError, + PyFastxReader, Record, normalize_seq, parse_fastx_file, @@ -179,7 +180,7 @@ def test_can_parse_fastq_file(self): self.assertTrue(i <= 2) def test_pathlib_path_input(self): - parse_fastx_file(Path(FASTA_FILE)) + self.assertIsInstance(parse_fastx_file(Path(FASTA_FILE)), PyFastxReader) class StrParsingTestCase(FileParsingTestCase): From 3435cf5e1b2c3f938ca2559b2a8f8e2f3277a176 Mon Sep 17 00:00:00 2001 From: Antonio Camargo Date: Mon, 27 Jan 2025 12:40:58 -0800 Subject: [PATCH 23/24] Add __pycache__ to .gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 1cb70a9..7484571 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ Cargo.lock venv/ .DS_Store .idea/ -test.py \ No newline at end of file +test.py +__pycache__/ 
From 94423ff08fdd02861668305d5bf66d079ef0a217 Mon Sep 17 00:00:00 2001 From: Antonio Camargo Date: Mon, 27 Jan 2025 12:56:24 -0800 Subject: [PATCH 24/24] Make FileParsingTestCase inherit from StrParsingTestCase --- test_python.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/test_python.py b/test_python.py index a8c4aaf..f644e50 100644 --- a/test_python.py +++ b/test_python.py @@ -3,7 +3,6 @@ from needletail import ( NeedletailError, - PyFastxReader, Record, normalize_seq, parse_fastx_file, @@ -148,12 +147,16 @@ def test_reverse_complement(self): self.assertEqual(reverse_complement("atcg"), "cgat") -class FileParsingTestCase(unittest.TestCase): +class StrParsingTestCase(unittest.TestCase): def get_fasta_reader(self): - return parse_fastx_file(FASTA_FILE) + with open(FASTA_FILE) as f: + content = f.read() + return parse_fastx_string(content) def get_fastq_reader(self): - return parse_fastx_file(FASTQ_FILE) + with open(FASTQ_FILE) as f: + content = f.read() + return parse_fastx_string(content) def test_can_parse_fasta_file(self): for i, record in enumerate(self.get_fasta_reader()): @@ -179,23 +182,16 @@ def test_can_parse_fastq_file(self): self.assertEqual(record.qual, ";;;;;;;;;;;7;;;;;-;;;3;83") self.assertTrue(i <= 2) - def test_pathlib_path_input(self): - self.assertIsInstance(parse_fastx_file(Path(FASTA_FILE)), PyFastxReader) - -class StrParsingTestCase(FileParsingTestCase): +class FileParsingTestCase(StrParsingTestCase): def get_fasta_reader(self): - with open(FASTA_FILE) as f: - content = f.read() - return parse_fastx_string(content) + return parse_fastx_file(FASTA_FILE) def get_fastq_reader(self): - with open(FASTQ_FILE) as f: - content = f.read() - return parse_fastx_string(content) + return parse_fastx_file(FASTQ_FILE) def test_pathlib_path_input(self): - pass + parse_fastx_file(Path(FASTA_FILE)) class ErroringTestCase(unittest.TestCase):