-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* starting over * it compiles * working again * begin parsing + thinking about internal references instead of owned * body as bytes * str + -version enum * now helper + syntax * parsing working with &str -- next move to &[u8] * &str => &[u8] * remove extra line ending in example * cleanup, rename record helpers, make internal parsers private * lib api syntax * streaming combinators * example syntax * test copy fix * fix display trait for warc record * non destructive header space parsing * rename header key to token * warc file with write working * remove old http dep * remove read example + minor syntax for warc file * parsed and owner struct variants * syntax, minor writing changes, now reading with iterator * some minor syntax improvements for file * impl from converter for record and header * minor organization change for header * header structs with inner vecs + iter and display trait impl * remove write_ref method + minor cleanup * update todo * remove macro + reference doc comments * introduce error type * tweaks to error handling * update read file example + remove some todo comments * drop left/right delim values * drop warc naming prefixes + drop ref types + hashmap instead of vec for headers * update read example * cargo version bump + readme update * update travis
- Loading branch information
Showing
37 changed files
with
661 additions
and
1,199 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,3 @@ | ||
target | ||
Cargo.lock | ||
warc_example.warc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,14 @@ | ||
language: rust | ||
rust: | ||
- stable | ||
- beta | ||
- nightly | ||
- beta | ||
- stable | ||
matrix: | ||
allow_failures: | ||
- rust: nightly | ||
before_script: | ||
- rustup component add rustfmt | ||
script: | ||
- cargo build | ||
- cargo test | ||
- cargo fmt -- --check |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,15 +1,16 @@ | ||
[package] | ||
name = "warc" | ||
version = "0.0.3" | ||
version = "0.1.0" | ||
description = "A Rust library for reading and writing WARC files." | ||
readme = "README.md" | ||
repository = "https://github.com/jedireza/rust-warc" | ||
repository = "https://github.com/jedireza/warc" | ||
documentation = "https://docs.rs/crate/warc/" | ||
license = "MIT" | ||
authors = ["Reza Akhavan <[email protected]>"] | ||
keywords = ["warc", "web", "archive"] | ||
edition = "2018" | ||
|
||
[dependencies] | ||
chrono = "0.4.0" | ||
hyper = "0.11.0" | ||
uuid = { version = "0.5.0", features = ["v4"] } | ||
chrono = "0.4.11" | ||
nom = "5.1.1" | ||
uuid = { version = "0.8.1", features = ["v4"] } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,16 +1,34 @@ | ||
extern crate warc; | ||
|
||
use warc::WarcRecord; | ||
use warc::header::WarcIpAddress; | ||
use warc::header::WarcRecordType; | ||
use warc::header::WarcType; | ||
use warc::header::{CONTENT_LENGTH, WARC_DATE, WARC_IP_ADDRESS, WARC_RECORD_ID, WARC_TYPE}; | ||
use warc::{Record, RecordType}; | ||
|
||
fn main() { | ||
let mut rec = WarcRecord::new(); | ||
let body = "hello warc! 👋".to_owned().into_bytes(); | ||
|
||
rec.headers.set(WarcType(WarcRecordType::WarcInfo)); | ||
rec.headers.set(WarcIpAddress("127.0.0.1".to_owned())); | ||
rec.set_body("hello world! 👋".to_owned().into_bytes()); | ||
let record = Record { | ||
version: "1.0".to_owned(), | ||
headers: vec![ | ||
( | ||
WARC_RECORD_ID.to_owned(), | ||
Record::make_uuid().to_owned().into_bytes(), | ||
), | ||
( | ||
WARC_TYPE.to_owned(), | ||
RecordType::WarcInfo.to_string().into_bytes(), | ||
), | ||
(WARC_DATE.to_owned(), Record::make_date().into_bytes()), | ||
( | ||
WARC_IP_ADDRESS.to_owned(), | ||
"127.0.0.1".to_owned().into_bytes(), | ||
), | ||
( | ||
CONTENT_LENGTH.to_owned(), | ||
body.len().to_string().into_bytes(), | ||
), | ||
] | ||
.into_iter() | ||
.collect(), | ||
body: body, | ||
}; | ||
|
||
println!("{}", rec); | ||
print!("{}", record); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
use warc::header::{WARC_DATE, WARC_RECORD_ID}; | ||
use warc::File; | ||
|
||
fn main() -> Result<(), std::io::Error> { | ||
let file = File::open("warc_example.warc")?; | ||
|
||
let mut count = 0; | ||
for record in file { | ||
count += 1; | ||
match record { | ||
Err(err) => println!("ERROR: {}\r\n", err), | ||
Ok(record) => { | ||
println!( | ||
"{}: {}", | ||
WARC_RECORD_ID, | ||
String::from_utf8_lossy(record.headers.get(WARC_RECORD_ID).unwrap()) | ||
); | ||
println!( | ||
"{}: {}", | ||
WARC_DATE, | ||
String::from_utf8_lossy(record.headers.get(WARC_DATE).unwrap()) | ||
); | ||
println!(""); | ||
} | ||
} | ||
} | ||
|
||
println!("Total records: {}", count); | ||
|
||
Ok(()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
use warc::header::{CONTENT_LENGTH, WARC_DATE, WARC_IP_ADDRESS, WARC_RECORD_ID, WARC_TYPE}; | ||
use warc::{File, Record, RecordType}; | ||
|
||
fn main() -> Result<(), std::io::Error> { | ||
let date = Record::make_date(); | ||
let body = format!("wrote to the file on {}", date); | ||
let body = body.into_bytes(); | ||
|
||
let record = Record { | ||
version: "1.0".to_owned(), | ||
headers: vec![ | ||
(WARC_RECORD_ID.to_owned(), Record::make_uuid().into_bytes()), | ||
( | ||
WARC_TYPE.to_owned(), | ||
RecordType::WarcInfo.to_string().into_bytes(), | ||
), | ||
(WARC_DATE.to_owned(), date.into_bytes()), | ||
( | ||
WARC_IP_ADDRESS.to_owned(), | ||
"127.0.0.1".to_owned().into_bytes(), | ||
), | ||
( | ||
CONTENT_LENGTH.to_owned(), | ||
body.len().to_string().into_bytes(), | ||
), | ||
] | ||
.into_iter() | ||
.collect(), | ||
body: body, | ||
}; | ||
|
||
let mut file = File::open("warc_example.warc")?; | ||
|
||
let bytes_written = file.write(&record)?; | ||
|
||
println!("{} bytes written.", bytes_written); | ||
|
||
Ok(()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
use std::error; | ||
use std::fmt; | ||
|
||
/// Errors that can be produced while reading records from a WARC data source.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Error {
    /// The header block of a record could not be parsed.
    ParseHeaders,
    /// The underlying data source reported a read failure.
    ReadData,
    /// More bytes were read than the record was expected to contain.
    ReadOverflow,
    /// The data source ended before the body was complete.
    UnexpectedEOB,
}

impl fmt::Display for Error {
    /// Writes the fixed, human-readable message associated with each variant.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let message = match self {
            Error::ParseHeaders => "Error parsing headers.",
            Error::ReadData => "Error reading data source.",
            Error::ReadOverflow => "Read further than expected.",
            Error::UnexpectedEOB => "Unexpected end of body.",
        };
        f.write_str(message)
    }
}

// No variant wraps an underlying error, so the trait's default `source()`
// (which returns `None`) already describes every case.
impl error::Error for Error {}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
use crate::parser; | ||
use crate::{Error, Record}; | ||
use std::fs; | ||
use std::io; | ||
use std::io::{BufRead, BufReader, Seek, SeekFrom, Write}; | ||
use std::path::Path; | ||
|
||
const KB: usize = 1_024; | ||
const MB: usize = 1_048_576; | ||
|
||
pub struct File { | ||
reader: BufReader<fs::File>, | ||
} | ||
|
||
impl File { | ||
pub fn open<P: AsRef<Path>>(path: P) -> io::Result<Self> { | ||
let file = fs::OpenOptions::new() | ||
.read(true) | ||
.write(true) | ||
.create(true) | ||
.open(&path)?; | ||
let reader = BufReader::with_capacity(1 * MB, file); | ||
|
||
Ok(File { reader: reader }) | ||
} | ||
|
||
pub fn write(&mut self, record: &Record) -> io::Result<usize> { | ||
let mut bytes_written = 0; | ||
|
||
let mut file = self.reader.get_ref(); | ||
file.seek(SeekFrom::End(0))?; | ||
|
||
bytes_written += file.write(&[87, 65, 82, 67, 47])?; | ||
bytes_written += file.write(record.version.as_bytes())?; | ||
bytes_written += file.write(&[13, 10])?; | ||
|
||
for (token, value) in record.headers.iter() { | ||
bytes_written += file.write(token.as_bytes())?; | ||
bytes_written += file.write(&[58, 32])?; | ||
bytes_written += file.write(&value)?; | ||
bytes_written += file.write(&[13, 10])?; | ||
} | ||
bytes_written += file.write(&[13, 10])?; | ||
|
||
bytes_written += file.write(&record.body)?; | ||
bytes_written += file.write(&[13, 10])?; | ||
bytes_written += file.write(&[13, 10])?; | ||
|
||
Ok(bytes_written) | ||
} | ||
} | ||
|
||
impl Iterator for File { | ||
type Item = Result<Record, Error>; | ||
|
||
fn next(&mut self) -> Option<Self::Item> { | ||
let mut header_buffer: Vec<u8> = Vec::with_capacity(64 * KB); | ||
let mut found_headers = false; | ||
while !found_headers { | ||
let bytes_read = match self.reader.read_until(b'\n', &mut header_buffer) { | ||
Err(_) => return Some(Err(Error::ReadData)), | ||
Ok(len) => len, | ||
}; | ||
|
||
if bytes_read == 0 { | ||
return None; | ||
} | ||
|
||
if bytes_read == 2 { | ||
let last_two_chars = header_buffer.len() - 2; | ||
if &header_buffer[last_two_chars..] == b"\r\n" { | ||
found_headers = true; | ||
} | ||
} | ||
} | ||
|
||
let headers_parsed = match parser::headers(&header_buffer) { | ||
Err(_) => return Some(Err(Error::ParseHeaders)), | ||
Ok(parsed) => parsed.1, | ||
}; | ||
let version_ref = headers_parsed.0; | ||
let headers_ref = headers_parsed.1; | ||
let expected_body_len = headers_parsed.2; | ||
|
||
let mut body_buffer: Vec<u8> = Vec::with_capacity(1 * MB); | ||
let mut found_body = expected_body_len == 0; | ||
let mut body_bytes_read = 0; | ||
let maximum_read_range = expected_body_len + 4; | ||
while !found_body { | ||
let bytes_read = match self.reader.read_until(b'\n', &mut body_buffer) { | ||
Err(_) => return Some(Err(Error::ReadData)), | ||
Ok(len) => len, | ||
}; | ||
|
||
body_bytes_read += bytes_read; | ||
|
||
// we expect 4 characters (\r\n\r\n) after the body | ||
if bytes_read == 2 && body_bytes_read == maximum_read_range { | ||
found_body = true; | ||
} | ||
|
||
if bytes_read == 0 { | ||
return Some(Err(Error::UnexpectedEOB)); | ||
} | ||
|
||
if body_bytes_read > maximum_read_range { | ||
return Some(Err(Error::ReadOverflow)); | ||
} | ||
} | ||
|
||
let body_ref = &body_buffer[..expected_body_len]; | ||
|
||
let record = Record { | ||
version: version_ref.to_owned(), | ||
headers: headers_ref | ||
.into_iter() | ||
.map(|(token, value)| (token.to_owned(), value.to_owned())) | ||
.collect(), | ||
body: body_ref.to_owned(), | ||
}; | ||
|
||
return Some(Ok(record)); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
// Names of the header fields, spelled in lowercase.
// The examples in this crate use these constants as the keys of a record's
// header collection, both when building a record and when looking a value up.
pub const CONTENT_LENGTH: &str = "content-length"; | ||
pub const CONTENT_TYPE: &str = "content-type"; | ||
pub const WARC_BLOCK_DIGEST: &str = "warc-block-digest"; | ||
pub const WARC_CONCURRENT_TO: &str = "warc-concurrent-to"; | ||
pub const WARC_DATE: &str = "warc-date"; | ||
pub const WARC_FILENAME: &str = "warc-filename"; | ||
pub const WARC_IDENTIFIED_PAYLOAD_TYPE: &str = "warc-identified-payload-type"; | ||
pub const WARC_IP_ADDRESS: &str = "warc-ip-address"; | ||
pub const WARC_PAYLOAD_DIGEST: &str = "warc-payload-digest"; | ||
pub const WARC_PROFILE: &str = "warc-profile"; | ||
pub const WARC_RECORD_ID: &str = "warc-record-id"; | ||
pub const WARC_REFERS_TO: &str = "warc-refers-to"; | ||
pub const WARC_SEGMENT_NUMBER: &str = "warc-segment-number"; | ||
pub const WARC_SEGMENT_ORIGIN_ID: &str = "warc-segment-origin-id"; | ||
pub const WARC_SEGMENT_TOTAL_LENGTH: &str = "warc-segment-total-length"; | ||
pub const WARC_TARGET_URI: &str = "warc-target-uri"; | ||
pub const WARC_TRUNCATED: &str = "warc-truncated"; | ||
pub const WARC_TYPE: &str = "warc-type"; | ||
pub const WARC_WARCINFO_ID: &str = "warc-warcinfo-id"; |
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.