Skip to content

Commit

Permalink
Parsing + Reading + Writing (#5)
Browse files Browse the repository at this point in the history
* starting over
* it compiles
* working again
* begin parsing + thinking about internal references instead of owned
* body as bytes
* str + -version enum
* now helper + syntax
* parsing working with &str -- next move to &[u8]
* &str => &[u8]
* remove extra line ending in example
* cleanup, rename record helpers, make internal parsers private
* lib api syntax
* streaming combinators
* example syntax
* test copy fix
* fix display trait for warc record
* non destructive header space parsing
* rename header key to token
* warc file with write working
* remove old http dep
* remove read example + minor syntax for warc file
* parsed and owner struct variants
* syntax, minor writing changes, now reading with iterator
* some minor syntax improvements for file
* impl from converter for record and header
* minor organization change for header
* header structs with inner vecs + iter and display trait impl
* remove write_ref method + minor cleanup
* update todo
* remove macro + reference doc comments
* introduce error type
* tweaks to error handling
* update read file example + remove some todo comments
* drop left/right delim values
* drop warc naming prefixes + drop ref types + hashmap instead of vec for headers
* update read example
* cargo version bump + readme update
* update travis
  • Loading branch information
jedireza authored Apr 19, 2020
1 parent ce1c682 commit b41e8c1
Show file tree
Hide file tree
Showing 37 changed files with 661 additions and 1,199 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
target
Cargo.lock
warc_example.warc
10 changes: 8 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
language: rust
rust:
- stable
- beta
- nightly
- beta
- stable
matrix:
allow_failures:
- rust: nightly
before_script:
- rustup component add rustfmt
script:
- cargo build
- cargo test
- cargo fmt -- --check
11 changes: 6 additions & 5 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
[package]
name = "warc"
version = "0.0.3"
version = "0.1.0"
description = "A Rust library for reading and writing WARC files."
readme = "README.md"
repository = "https://github.com/jedireza/rust-warc"
repository = "https://github.com/jedireza/warc"
documentation = "https://docs.rs/crate/warc/"
license = "MIT"
authors = ["Reza Akhavan <[email protected]>"]
keywords = ["warc", "web", "archive"]
edition = "2018"

[dependencies]
chrono = "0.4.0"
hyper = "0.11.0"
uuid = { version = "0.5.0", features = ["v4"] }
chrono = "0.4.11"
nom = "5.1.1"
uuid = { version = "0.8.1", features = ["v4"] }
12 changes: 3 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,15 +1,9 @@
# WARC (Web ARChive)

A Rust library for reading and writing WARC files, implementing
[ISO28500][ISO28500].

[ISO28500]: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf


## Work in progress

The project is incomplete and the API is subject to change.
A Rust library for reading and writing WARC files.

- Docs: https://docs.rs/warc/0.1.0/
- Crate: https://crates.io/crates/warc

## License

Expand Down
40 changes: 29 additions & 11 deletions examples/hello_warc.rs
Original file line number Diff line number Diff line change
@@ -1,16 +1,34 @@
extern crate warc;

use warc::WarcRecord;
use warc::header::WarcIpAddress;
use warc::header::WarcRecordType;
use warc::header::WarcType;
use warc::header::{CONTENT_LENGTH, WARC_DATE, WARC_IP_ADDRESS, WARC_RECORD_ID, WARC_TYPE};
use warc::{Record, RecordType};

// NOTE(review): this listing is a merged diff view -- the lines of the removed
// `WarcRecord`-based version and of the added `Record`-based version appear
// together, so it is not expected to compile exactly as shown.
fn main() {
let mut rec = WarcRecord::new();
// Payload bytes of the record; `body.len()` feeds the content-length header below.
let body = "hello warc! 👋".to_owned().into_bytes();

rec.headers.set(WarcType(WarcRecordType::WarcInfo));
rec.headers.set(WarcIpAddress("127.0.0.1".to_owned()));
rec.set_body("hello world! 👋".to_owned().into_bytes());
// Build the record literally: a version string, (token, value-bytes) header
// pairs collected into the `headers` field, and the body bytes.
let record = Record {
version: "1.0".to_owned(),
headers: vec![
(
WARC_RECORD_ID.to_owned(),
Record::make_uuid().to_owned().into_bytes(),
),
(
WARC_TYPE.to_owned(),
RecordType::WarcInfo.to_string().into_bytes(),
),
(WARC_DATE.to_owned(), Record::make_date().into_bytes()),
(
WARC_IP_ADDRESS.to_owned(),
"127.0.0.1".to_owned().into_bytes(),
),
(
CONTENT_LENGTH.to_owned(),
body.len().to_string().into_bytes(),
),
]
.into_iter()
.collect(),
body: body,
};

println!("{}", rec);
// `print!` (no trailing newline) is used for the `Record` variant.
print!("{}", record);
}
31 changes: 31 additions & 0 deletions examples/read_file.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
use warc::header::{WARC_DATE, WARC_RECORD_ID};
use warc::File;

/// Reads `warc_example.warc` and prints the record id and date of every record.
///
/// Every item yielded by the file is counted, including the ones that failed to
/// read; those are reported on stdout. A header that is absent from a record is
/// printed as `<missing>` rather than aborting the whole run with a panic.
///
/// # Errors
///
/// Returns the I/O error produced when the file cannot be opened.
fn main() -> Result<(), std::io::Error> {
    let file = File::open("warc_example.warc")?;

    let mut count = 0;
    for record in file {
        count += 1;
        match record {
            Err(err) => println!("ERROR: {}\r\n", err),
            Ok(record) => {
                // Header values are raw bytes, so decode them lossily for display.
                for token in &[WARC_RECORD_ID, WARC_DATE] {
                    match record.headers.get(*token) {
                        Some(value) => println!("{}: {}", token, String::from_utf8_lossy(value)),
                        None => println!("{}: <missing>", token),
                    }
                }
                println!();
            }
        }
    }

    println!("Total records: {}", count);

    Ok(())
}
39 changes: 39 additions & 0 deletions examples/write_file.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
use warc::header::{CONTENT_LENGTH, WARC_DATE, WARC_IP_ADDRESS, WARC_RECORD_ID, WARC_TYPE};
use warc::{File, Record, RecordType};

/// Builds one record whose body mentions the current date, writes it to
/// `warc_example.warc`, and prints how many bytes were written.
///
/// # Errors
///
/// Returns the I/O error produced when the file cannot be opened or written.
fn main() -> Result<(), std::io::Error> {
    let date = Record::make_date();
    let body = format!("wrote to the file on {}", date).into_bytes();

    // Header tokens paired with their values as raw bytes.
    let headers = vec![
        (WARC_RECORD_ID.to_owned(), Record::make_uuid().into_bytes()),
        (
            WARC_TYPE.to_owned(),
            RecordType::WarcInfo.to_string().into_bytes(),
        ),
        (WARC_DATE.to_owned(), date.into_bytes()),
        (WARC_IP_ADDRESS.to_owned(), String::from("127.0.0.1").into_bytes()),
        (
            CONTENT_LENGTH.to_owned(),
            body.len().to_string().into_bytes(),
        ),
    ]
    .into_iter()
    .collect();

    let record = Record {
        version: String::from("1.0"),
        headers,
        body,
    };

    let mut archive = File::open("warc_example.warc")?;
    let bytes_written = archive.write(&record)?;

    println!("{} bytes written.", bytes_written);

    Ok(())
}
32 changes: 32 additions & 0 deletions src/error.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
use std::error;
use std::fmt;

/// Failures reported while reading records from a data source.
///
/// The variants carry no payload, so values can be cloned and compared freely
/// (handy for matching on a specific failure in tests and callers).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Error {
    /// The header section could not be parsed.
    ParseHeaders,
    /// The underlying data source returned an error while being read.
    ReadData,
    /// More data was read than the record was expected to contain.
    ReadOverflow,
    /// The data source ended before the body was complete.
    UnexpectedEOB,
}

impl fmt::Display for Error {
    /// Writes the fixed, human-readable message for this error.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let message = match self {
            Error::ParseHeaders => "Error parsing headers.",
            Error::ReadData => "Error reading data source.",
            Error::ReadOverflow => "Read further than expected.",
            Error::UnexpectedEOB => "Unexpected end of body.",
        };
        f.write_str(message)
    }
}

// No variant wraps another error, so the trait's default `source` (which
// returns `None`) is exactly the behaviour wanted here.
impl error::Error for Error {}
124 changes: 124 additions & 0 deletions src/file.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
use crate::parser;
use crate::{Error, Record};
use std::fs;
use std::io;
use std::io::{BufRead, BufReader, Seek, SeekFrom, Write};
use std::path::Path;

// Byte-count units (1 KB = 1,024 bytes, 1 MB = 1,024 KB) used to size the
// read buffers in this file.
const KB: usize = 1_024;
const MB: usize = 1_048_576;

/// A file of records that can be appended to with `write` and read back by
/// iterating over it.
pub struct File {
// Buffered reader that owns the underlying `fs::File` handle; writes reach the
// same handle through this reader.
reader: BufReader<fs::File>,
}

impl File {
    /// Opens the file at `path` for reading and writing, creating it if it does
    /// not exist yet.
    ///
    /// # Errors
    ///
    /// Returns the I/O error reported while opening (or creating) the file.
    pub fn open<P: AsRef<Path>>(path: P) -> io::Result<Self> {
        let file = fs::OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .open(path)?;
        let reader = BufReader::with_capacity(MB, file);

        Ok(File { reader })
    }

    /// Appends `record` to the end of the file and returns the number of bytes
    /// written.
    ///
    /// The record is laid out as `WARC/<version>` CRLF, one `<token>: <value>`
    /// CRLF line per header, an empty CRLF line, the body, and two closing
    /// CRLFs. The position used for reading records is left where it was, so
    /// iterating can continue after a write.
    ///
    /// # Errors
    ///
    /// Returns the I/O error reported while seeking or writing. The whole record
    /// is written with `write_all`, so a short write surfaces as an error
    /// instead of silently producing a truncated record.
    pub fn write(&mut self, record: &Record) -> io::Result<usize> {
        // Serialize up front so the record reaches the file in one complete
        // write instead of many small unbuffered ones.
        let mut buffer: Vec<u8> = Vec::with_capacity(record.body.len() + 256);
        buffer.extend_from_slice(b"WARC/");
        buffer.extend_from_slice(record.version.as_bytes());
        buffer.extend_from_slice(b"\r\n");

        for (token, value) in record.headers.iter() {
            buffer.extend_from_slice(token.as_bytes());
            buffer.extend_from_slice(b": ");
            buffer.extend_from_slice(value);
            buffer.extend_from_slice(b"\r\n");
        }
        buffer.extend_from_slice(b"\r\n");

        buffer.extend_from_slice(&record.body);
        buffer.extend_from_slice(b"\r\n\r\n");

        // Seeking through the `BufReader` yields the logical read position and
        // drops its buffered bytes, so moving the underlying cursor below cannot
        // leave the reader out of sync with the file.
        let read_position = self.reader.seek(SeekFrom::Current(0))?;

        let file = self.reader.get_mut();
        let appended = match file.seek(SeekFrom::End(0)) {
            Ok(_) => file.write_all(&buffer),
            Err(err) => Err(err),
        };

        // Restore the read position even when the append failed.
        self.reader.seek(SeekFrom::Start(read_position))?;
        appended?;

        Ok(buffer.len())
    }
}

impl Iterator for File {
type Item = Result<Record, Error>;

fn next(&mut self) -> Option<Self::Item> {
let mut header_buffer: Vec<u8> = Vec::with_capacity(64 * KB);
let mut found_headers = false;
while !found_headers {
let bytes_read = match self.reader.read_until(b'\n', &mut header_buffer) {
Err(_) => return Some(Err(Error::ReadData)),
Ok(len) => len,
};

if bytes_read == 0 {
return None;
}

if bytes_read == 2 {
let last_two_chars = header_buffer.len() - 2;
if &header_buffer[last_two_chars..] == b"\r\n" {
found_headers = true;
}
}
}

let headers_parsed = match parser::headers(&header_buffer) {
Err(_) => return Some(Err(Error::ParseHeaders)),
Ok(parsed) => parsed.1,
};
let version_ref = headers_parsed.0;
let headers_ref = headers_parsed.1;
let expected_body_len = headers_parsed.2;

let mut body_buffer: Vec<u8> = Vec::with_capacity(1 * MB);
let mut found_body = expected_body_len == 0;
let mut body_bytes_read = 0;
let maximum_read_range = expected_body_len + 4;
while !found_body {
let bytes_read = match self.reader.read_until(b'\n', &mut body_buffer) {
Err(_) => return Some(Err(Error::ReadData)),
Ok(len) => len,
};

body_bytes_read += bytes_read;

// we expect 4 characters (\r\n\r\n) after the body
if bytes_read == 2 && body_bytes_read == maximum_read_range {
found_body = true;
}

if bytes_read == 0 {
return Some(Err(Error::UnexpectedEOB));
}

if body_bytes_read > maximum_read_range {
return Some(Err(Error::ReadOverflow));
}
}

let body_ref = &body_buffer[..expected_body_len];

let record = Record {
version: version_ref.to_owned(),
headers: headers_ref
.into_iter()
.map(|(token, value)| (token.to_owned(), value.to_owned()))
.collect(),
body: body_ref.to_owned(),
};

return Some(Ok(record));
}
}
19 changes: 19 additions & 0 deletions src/header.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Header tokens, spelled in lowercase. The examples use them as the keys of a
// record's `headers` collection, both when building and when looking up values.
pub const CONTENT_LENGTH: &str = "content-length";
pub const CONTENT_TYPE: &str = "content-type";
pub const WARC_BLOCK_DIGEST: &str = "warc-block-digest";
pub const WARC_CONCURRENT_TO: &str = "warc-concurrent-to";
pub const WARC_DATE: &str = "warc-date";
pub const WARC_FILENAME: &str = "warc-filename";
pub const WARC_IDENTIFIED_PAYLOAD_TYPE: &str = "warc-identified-payload-type";
pub const WARC_IP_ADDRESS: &str = "warc-ip-address";
pub const WARC_PAYLOAD_DIGEST: &str = "warc-payload-digest";
pub const WARC_PROFILE: &str = "warc-profile";
pub const WARC_RECORD_ID: &str = "warc-record-id";
pub const WARC_REFERS_TO: &str = "warc-refers-to";
pub const WARC_SEGMENT_NUMBER: &str = "warc-segment-number";
pub const WARC_SEGMENT_ORIGIN_ID: &str = "warc-segment-origin-id";
pub const WARC_SEGMENT_TOTAL_LENGTH: &str = "warc-segment-total-length";
pub const WARC_TARGET_URI: &str = "warc-target-uri";
pub const WARC_TRUNCATED: &str = "warc-truncated";
pub const WARC_TYPE: &str = "warc-type";
pub const WARC_WARCINFO_ID: &str = "warc-warcinfo-id";
15 changes: 0 additions & 15 deletions src/header/content_length.rs

This file was deleted.

Loading

0 comments on commit b41e8c1

Please sign in to comment.