From 31235a29c7e277577b23648428232326144ecf48 Mon Sep 17 00:00:00 2001 From: Andreas Hartel Date: Wed, 27 Nov 2024 05:39:18 +0100 Subject: [PATCH] Upgrade nom dependency to 7.1.3 (#41) * upgrade nom to 6.2.2 * upgrade nom to 7.1.3 * cargo clippy * fix formatting and clippy lint * bump major version because of Display trait implementation --- Cargo.toml | 4 +- examples/read_file.rs | 6 +-- examples/read_filtered.rs | 2 +- examples/read_gzip.rs | 6 +-- examples/read_raw.rs | 6 +-- src/error.rs | 4 +- src/parser.rs | 44 +++++++++++------- src/record.rs | 9 ++-- src/record_type.rs | 8 ++-- src/truncated_type.rs | 8 ++-- src/warc_reader.rs | 94 ++++++++++++++++++++------------------- src/warc_writer.rs | 8 ++-- 12 files changed, 111 insertions(+), 88 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index dd58079..b0a8f57 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "warc" -version = "0.3.3" +version = "0.4.0" description = "A Rust library for reading and writing WARC files." readme = "README.md" repository = "https://github.com/jedireza/warc" @@ -12,7 +12,7 @@ edition = "2018" [dependencies] chrono = "0.4.11" -nom = "5.1.1" +nom = "7.1.3" url = "2" uuid = { version = "0.8.1", features = ["v4"] } diff --git a/examples/read_file.rs b/examples/read_file.rs index 318da3e..7ec74cb 100644 --- a/examples/read_file.rs +++ b/examples/read_file.rs @@ -10,9 +10,9 @@ fn main() -> Result<(), std::io::Error> { match record { Err(err) => println!("ERROR: {}\r\n", err), Ok(record) => { - println!("{}: {}", WarcHeader::RecordID.to_string(), record.warc_id(),); - println!("{}: {}", WarcHeader::Date.to_string(), record.date(),); - println!(""); + println!("{}: {}", WarcHeader::RecordID, record.warc_id(),); + println!("{}: {}", WarcHeader::Date, record.date(),); + println!(); } } } diff --git a/examples/read_filtered.rs b/examples/read_filtered.rs index 7e53637..0f49a27 100644 --- a/examples/read_filtered.rs +++ b/examples/read_filtered.rs @@ -16,7 +16,7 @@ fn main() -> std::io::Result<()> { let filtered_file_names: Vec<_> = args.map(|s| s.to_string_lossy().to_string()).collect(); if filtered_file_names.is_empty() { - return Err(usage_err!("one or more filtered file names not supplied"))?; + Err(usage_err!("one or more filtered file names not supplied"))?; } let mut file = WarcReader::from_path_gzip(warc_name)?; diff --git a/examples/read_gzip.rs b/examples/read_gzip.rs index a4e0690..a472955 100644 --- a/examples/read_gzip.rs +++ b/examples/read_gzip.rs @@ -10,9 +10,9 @@ fn main() -> Result<(), std::io::Error> { match record { Err(err) => println!("ERROR: {}\r\n", err), Ok(record) => { - println!("{}: {}", WarcHeader::RecordID.to_string(), record.warc_id()); - println!("{}: {}", WarcHeader::Date.to_string(), record.date()); - println!(""); + println!("{}: {}", WarcHeader::RecordID, record.warc_id()); + println!("{}: {}", WarcHeader::Date, record.date()); + println!(); } } } diff --git a/examples/read_raw.rs b/examples/read_raw.rs index c6c4c60..1cdd054 100644 --- a/examples/read_raw.rs +++ b/examples/read_raw.rs @@ -12,15 +12,15 @@ fn main() -> Result<(), std::io::Error> { Ok((headers, _)) => { println!( "{}: {}", - WarcHeader::RecordID.to_string(), + WarcHeader::RecordID, String::from_utf8_lossy(headers.as_ref().get(&WarcHeader::RecordID).unwrap()) ); println!( "{}: {}", - WarcHeader::Date.to_string(), + WarcHeader::Date, String::from_utf8_lossy(headers.as_ref().get(&WarcHeader::Date).unwrap()) ); - println!(""); + println!(); } } } diff --git a/src/error.rs b/src/error.rs index bfdbdfb..fd350b8 100644 --- a/src/error.rs +++ b/src/error.rs @@ -26,9 +26,9 @@ impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Error::ParseHeaders(_) => write!(f, "Error parsing headers."), - Error::MissingHeader(ref h) => write!(f, "Missing required header: {}", h.to_string()), + Error::MissingHeader(ref h) => write!(f, "Missing required header: {}", h), Error::MalformedHeader(ref h, ref r) => { - write!(f, "Malformed header: {}: {}", h.to_string(), r) + write!(f, "Malformed header: {}: {}", h, r) } Error::ReadData(_) => write!(f, "Error reading data source."), Error::ReadOverflow => write!(f, "Read further than expected."), diff --git a/src/parser.rs b/src/parser.rs index a4ab032..4627c85 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -14,7 +14,10 @@ fn version(input: &[u8]) -> IResult<&[u8], &str> { let version_str = match str::from_utf8(version) { Err(_) => { - return Err(nom::Err::Error((input, ErrorKind::Verify))); + return Err(nom::Err::Error(nom::error::Error::new( + input, + ErrorKind::Verify, + ))); } Ok(version) => version, }; @@ -23,8 +26,7 @@ fn version(input: &[u8]) -> IResult<&[u8], &str> { } fn is_header_token_char(chr: u8) -> bool { - match chr { - 0..=31 + !matches!(chr, 0..=31 | 128..=255 | b'(' | b')' @@ -43,9 +45,7 @@ fn is_header_token_char(chr: u8) -> bool { | b'{' | b'}' | b' ' - | b'\\' => false, - _ => true, - } + | b'\\') } fn header(input: &[u8]) -> IResult<&[u8], (&[u8], &[u8])> { @@ -63,6 +63,7 @@ fn header(input: &[u8]) -> IResult<&[u8], (&[u8], &[u8])> { /// Parse a WARC header block. // TODO: evaluate the use of `ErrorKind::Verify` here. +#[allow(clippy::type_complexity)] pub fn headers(input: &[u8]) -> IResult<&[u8], (&str, Vec<(&str, &[u8])>, usize)> { let (input, version) = version(input)?; let (input, headers) = many1(header)(input)?; @@ -73,22 +74,31 @@ pub fn headers(input: &[u8]) -> IResult<&[u8], (&str, Vec<(&str, &[u8])>, usize) for header in headers { let token_str = match str::from_utf8(header.0) { Err(_) => { - return Err(nom::Err::Error((input, ErrorKind::Verify))); + return Err(nom::Err::Error(nom::error::Error::new( + input, + ErrorKind::Verify, + ))); } Ok(token) => token, }; - if content_length == None && token_str.to_lowercase() == "content-length" { + if content_length.is_none() && token_str.to_lowercase() == "content-length" { let value_str = match str::from_utf8(header.1) { Err(_) => { - return Err(nom::Err::Error((input, ErrorKind::Verify))); + return Err(nom::Err::Error(nom::error::Error::new( + input, + ErrorKind::Verify, + ))); } Ok(value) => value, }; match value_str.parse::() { Err(_) => { - return Err(nom::Err::Error((input, ErrorKind::Verify))); + return Err(nom::Err::Error(nom::error::Error::new( + input, + ErrorKind::Verify, + ))); } Ok(len) => { content_length = Some(len); @@ -101,7 +111,7 @@ pub fn headers(input: &[u8]) -> IResult<&[u8], (&str, Vec<(&str, &[u8])>, usize) // TODO: Technically if we didn't find a `content-length` header, the record is invalid. Should // we be returning an error here instead? - if content_length == None { + if content_length.is_none() { content_length = Some(0); } @@ -109,6 +119,7 @@ pub fn headers(input: &[u8]) -> IResult<&[u8], (&str, Vec<(&str, &[u8])>, usize) } /// Parse an entire WARC record. +#[allow(clippy::type_complexity)] pub fn record(input: &[u8]) -> IResult<&[u8], (&str, Vec<(&str, &[u8])>, &[u8])> { let (input, (headers, _)) = tuple((headers, line_ending))(input)?; let (input, (body, _, _)) = tuple((take(headers.2), line_ending, line_ending))(input)?; @@ -125,13 +136,13 @@ mod tests { #[test] fn version_parsing() { - assert_eq!(version(&b"WARC/0.0\r\n"[..]), Ok((&b""[..], &"0.0"[..]))); + assert_eq!(version(&b"WARC/0.0\r\n"[..]), Ok((&b""[..], "0.0"))); - assert_eq!(version(&b"WARC/1.0\r\n"[..]), Ok((&b""[..], &"1.0"[..]))); + assert_eq!(version(&b"WARC/1.0\r\n"[..]), Ok((&b""[..], "1.0"))); assert_eq!( version(&b"WARC/2.0-alpha\r\n"[..]), - Ok((&b""[..], &"2.0-alpha"[..])) + Ok((&b""[..], "2.0-alpha")) ); } @@ -168,7 +179,10 @@ mod tests { assert_eq!( headers(&raw_invalid[..]), - Err(Err::Error((&b"\r\n"[..], ErrorKind::Verify))) + Err(Err::Error(nom::error::Error::new( + &b"\r\n"[..], + ErrorKind::Verify + ))) ); let raw = b"\ diff --git a/src/record.rs b/src/record.rs index d37122f..f4db352 100644 --- a/src/record.rs +++ b/src/record.rs @@ -51,9 +51,8 @@ mod streaming_trait { impl<'t, T: Read + 't> Read for StreamingBody<'t, T> { fn read(&mut self, data: &mut [u8]) -> std::io::Result { let max_read = std::cmp::min(data.len(), *self.1 as usize); - self.0.read(&mut data[..max_read as usize]).map(|n| { + self.0.read(&mut data[..max_read]).inspect(|&n| { *self.1 -= n as u64; - n }) } } @@ -156,7 +155,7 @@ impl std::fmt::Display for RawRecordHeader { fn fmt(&self, w: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { writeln!(w, "WARC/{}", self.version)?; for (key, value) in self.as_ref().iter() { - writeln!(w, "{}: {}", key.to_string(), String::from_utf8_lossy(value))?; + writeln!(w, "{}: {}", key, String::from_utf8_lossy(value))?; } writeln!(w)?; @@ -263,7 +262,7 @@ impl Record { /// The current implementation generates random values based on UUID version 4. /// pub fn generate_record_id() -> String { - format!("<{}>", Uuid::new_v4().to_urn().to_string()) + format!("<{}>", Uuid::new_v4().to_urn()) } fn parse_content_length(len: &str) -> Result { @@ -1058,7 +1057,7 @@ mod raw_tests { let output = headers.to_string(); - let expected_lines = vec![ + let expected_lines = [ "WARC/1.0", "warc-type: dunno", "warc-date: 2024-01-01T00:00:00Z", diff --git a/src/record_type.rs b/src/record_type.rs index 2cac12e..d00052c 100644 --- a/src/record_type.rs +++ b/src/record_type.rs @@ -1,4 +1,6 @@ #![allow(missing_docs)] + +use std::fmt::Display; #[derive(Clone, Debug, PartialEq)] pub enum RecordType { WarcInfo, @@ -12,8 +14,8 @@ pub enum RecordType { Unknown(String), } -impl ToString for RecordType { - fn to_string(&self) -> String { +impl Display for RecordType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let stringified = match *self { RecordType::WarcInfo => "warcinfo", RecordType::Response => "response", @@ -25,7 +27,7 @@ impl ToString for RecordType { RecordType::Continuation => "continuation", RecordType::Unknown(ref val) => val.as_ref(), }; - stringified.to_string() + f.write_str(stringified) } } diff --git a/src/truncated_type.rs b/src/truncated_type.rs index 86cffd6..80ae82b 100644 --- a/src/truncated_type.rs +++ b/src/truncated_type.rs @@ -1,4 +1,6 @@ #![allow(missing_docs)] + +use std::fmt::Display; #[derive(Clone, Debug, PartialEq)] pub enum TruncatedType { Length, @@ -8,8 +10,8 @@ pub enum TruncatedType { Unknown(String), } -impl ToString for TruncatedType { - fn to_string(&self) -> String { +impl Display for TruncatedType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let stringified = match *self { TruncatedType::Length => "length", TruncatedType::Time => "time", @@ -17,7 +19,7 @@ impl ToString for TruncatedType { TruncatedType::Unspecified => "unspecified", TruncatedType::Unknown(ref val) => val.as_ref(), }; - stringified.to_string() + f.write_str(stringified) } } diff --git a/src/warc_reader.rs b/src/warc_reader.rs index 5aba65b..121de8d 100644 --- a/src/warc_reader.rs +++ b/src/warc_reader.rs @@ -54,10 +54,10 @@ impl WarcReader> { pub fn from_path>(path: P) -> io::Result { let file = fs::OpenOptions::new() .read(true) - .write(true) .create(true) + .truncate(false) .open(&path)?; - let reader = BufReader::with_capacity(1 * MB, file); + let reader = BufReader::with_capacity(MB, file); Ok(WarcReader::new(reader)) } @@ -71,7 +71,7 @@ impl WarcReader>>> { pub fn from_path_gzip>(path: P) -> io::Result { let file = fs::File::open(&path)?; - let gzip_stream = GzipReader::new(BufReader::with_capacity(1 * MB, file))?; + let gzip_stream = GzipReader::new(BufReader::with_capacity(MB, file))?; Ok(WarcReader::new(BufReader::new(gzip_stream))) } } @@ -112,14 +112,18 @@ impl Iterator for RawRecordIter { } let headers_parsed = match parser::headers(&header_buffer) { - Err(e) => return Some(Err(Error::ParseHeaders(e.to_owned()))), + Err(e) => { + return Some(Err(Error::ParseHeaders( + e.map(|inner| (inner.input.to_owned(), inner.code)), + ))) + } Ok(parsed) => parsed.1, }; let version_ref = headers_parsed.0; let headers_ref = headers_parsed.1; let expected_body_len = headers_parsed.2; - let mut body_buffer: Vec = Vec::with_capacity(1 * MB); + let mut body_buffer: Vec = Vec::with_capacity(MB); let mut found_body = false; let mut body_bytes_read = 0; let maximum_read_range = expected_body_len + 4; @@ -195,14 +199,19 @@ impl Iterator for RecordIter { } let headers_parsed = match parser::headers(&header_buffer) { - Err(e) => return Some(Err(Error::ParseHeaders(e.to_owned()))), + Err(e) => { + return Some(Err(Error::ParseHeaders( + e.map(|inner| (inner.input.to_owned(), inner.code)), + ))); + } + Ok(parsed) => parsed.1, }; let version_ref = headers_parsed.0; let headers_ref = headers_parsed.1; let expected_body_len = headers_parsed.2; - let mut body_buffer: Vec = Vec::with_capacity(1 * MB); + let mut body_buffer: Vec = Vec::with_capacity(MB); let mut found_body = false; let mut body_bytes_read = 0; let maximum_read_range = expected_body_len + 4; @@ -271,7 +280,7 @@ impl StreamingIter<'_, R> { } fn skip_body(&mut self) -> Result<(), Error> { - let mut read_buffer = [0u8; 1 * MB]; + let mut read_buffer = [0u8; MB]; let maximum_read_range = self.current_item_size; let mut body_bytes_left = maximum_read_range; while body_bytes_left > 0 { @@ -339,7 +348,11 @@ impl StreamingIter<'_, R> { } let headers_parsed = match parser::headers(&header_buffer) { - Err(e) => return Some(Err(Error::ParseHeaders(e.to_owned()))), + Err(e) => { + return Some(Err(Error::ParseHeaders( + e.map(|inner| (inner.input.to_owned(), inner.code)), + ))) + } Ok(parsed) => parsed.1, }; let version_ref = headers_parsed.0; @@ -393,18 +406,15 @@ mod iter_raw_tests { "; let expected_version = "1.0"; - let expected_headers: HashMap> = HashMap::from_iter( - vec![ - (WarcHeader::WarcType, b"dunno".to_vec()), - (WarcHeader::ContentLength, b"5".to_vec()), - ( - WarcHeader::RecordID, - b"".to_vec(), - ), - (WarcHeader::Date, b"2020-07-08T02:52:55Z".to_vec()), - ] - .into_iter(), - ); + let expected_headers: HashMap> = HashMap::from_iter(vec![ + (WarcHeader::WarcType, b"dunno".to_vec()), + (WarcHeader::ContentLength, b"5".to_vec()), + ( + WarcHeader::RecordID, + b"".to_vec(), + ), + (WarcHeader::Date, b"2020-07-08T02:52:55Z".to_vec()), + ]); let expected_body: &[u8] = b"12345"; let mut reader = WarcReader::new(create_reader!(raw)).iter_raw_records(); @@ -438,18 +448,15 @@ mod iter_raw_tests { let mut reader = WarcReader::new(create_reader!(raw)).iter_raw_records(); { let expected_version = "1.0"; - let expected_headers: HashMap> = HashMap::from_iter( - vec![ - (WarcHeader::WarcType, b"dunno".to_vec()), - (WarcHeader::ContentLength, b"5".to_vec()), - ( - WarcHeader::RecordID, - b"".to_vec(), - ), - (WarcHeader::Date, b"2020-07-08T02:52:55Z".to_vec()), - ] - .into_iter(), - ); + let expected_headers: HashMap> = HashMap::from_iter(vec![ + (WarcHeader::WarcType, b"dunno".to_vec()), + (WarcHeader::ContentLength, b"5".to_vec()), + ( + WarcHeader::RecordID, + b"".to_vec(), + ), + (WarcHeader::Date, b"2020-07-08T02:52:55Z".to_vec()), + ]); let expected_body: &[u8] = b"12345"; let (headers, body) = reader.next().unwrap().unwrap(); @@ -460,18 +467,15 @@ mod iter_raw_tests { { let expected_version = "1.0"; - let expected_headers: HashMap> = HashMap::from_iter( - vec![ - (WarcHeader::WarcType, b"another".to_vec()), - (WarcHeader::ContentLength, b"6".to_vec()), - ( - WarcHeader::RecordID, - b"".to_vec(), - ), - (WarcHeader::Date, b"2020-07-08T02:52:56Z".to_vec()), - ] - .into_iter(), - ); + let expected_headers: HashMap> = HashMap::from_iter(vec![ + (WarcHeader::WarcType, b"another".to_vec()), + (WarcHeader::ContentLength, b"6".to_vec()), + ( + WarcHeader::RecordID, + b"".to_vec(), + ), + (WarcHeader::Date, b"2020-07-08T02:52:56Z".to_vec()), + ]); let expected_body: &[u8] = b"123456"; let (headers, body) = reader.next().unwrap().unwrap(); diff --git a/src/warc_writer.rs b/src/warc_writer.rs index 557004b..4514ba7 100644 --- a/src/warc_writer.rs +++ b/src/warc_writer.rs @@ -45,7 +45,7 @@ impl WarcWriter { for (token, value) in headers.as_ref().iter() { bytes_written += self.writer.write(token.to_string().as_bytes())?; bytes_written += self.writer.write(&[58, 32])?; - bytes_written += self.writer.write(&value)?; + bytes_written += self.writer.write(value)?; bytes_written += self.writer.write(&[13, 10])?; } bytes_written += self.writer.write(&[13, 10])?; @@ -83,8 +83,9 @@ impl WarcWriter> { .read(true) .write(true) .create(true) + .truncate(false) .open(&path)?; - let writer = BufWriter::with_capacity(1 * MB, file); + let writer = BufWriter::with_capacity(MB, file); Ok(WarcWriter::new(writer)) } @@ -98,9 +99,10 @@ impl WarcWriter>> { .read(true) .write(true) .create(true) + .truncate(false) .open(&path)?; let gzip_stream = GzipWriter::new(file)?; - let writer = BufWriter::with_capacity(1 * MB, gzip_stream); + let writer = BufWriter::with_capacity(MB, gzip_stream); Ok(WarcWriter::new(writer)) }