Streaming file reads + more #2

Open · wants to merge 6 commits into master
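The par2.py changes below move Par2File away from slurping the whole .par2 file into memory: packets are now read one at a time from a file object, each packet body is MD5-verified against its header, and Main and IFSC (input file slice checksum) packets get their own classes. A minimal usage sketch of the resulting API, assuming the Python 2 code shown in the diff and a hypothetical file name:

    from par2ools.par2 import Par2File

    # From a path (hypothetical file name)...
    pf = Par2File('example.par2')

    # ...or from any readable file object, which is what the new
    # streaming read_packets() consumes directly.
    with open('example.par2', 'rb') as fh:
        pf = Par2File(fh)

    if pf.main_packet is not None:
        print pf.main_packet.slice_size, pf.main_packet.num_files
    for packet in pf.packets:
        print repr(packet.header.type)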
1 change: 1 addition & 0 deletions .gitignore
@@ -6,3 +6,4 @@ tags
docs/_build*
build
dist
/.idea
154 changes: 115 additions & 39 deletions par2ools/par2.py
@@ -6,9 +6,8 @@
This is only intended to be able to read packets in par2, not repair,
verify, or create new par2 files."""

import os
import glob
import struct
import hashlib

from par2ools import fileutil

@@ -20,75 +19,152 @@
"16s" # packet type
)

FILE_DESCRIPTION_PACKET = ("<64s" # PACKET_HEADER
"16s" # fileid, hash of [hash16k, length, name]
"16s" # hashfull; hash of the whole file (which?)
"16s" # hash16k; hash of the first 16k of the file (which?)
FILE_DESCRIPTION_PACKET = ("<"
"16s" # fileid; file this packet belongs to
"16s" # hashfull; md5 hash of the whole file
"16s" # hash16k; md5 hash of the first 16k of the file
"Q" # length of the file
)

MAIN_PACKET = ("<"
"Q" # slice_size; The size of the slices in bytes
"I" # num_files; Number of files in the recovery set
)

MAIN_PACKET_FILEID = ("<"
"16s" # fileid;
)

FILE_CHECKSUM_PACKET = ("<"
"16s" # fileid; file this packet belongs to
)

FILE_CHECKSUM_PACKET_SLICE = ("<"
"16s" # hash; MD5 hash of the slice
"i" # checksum; CRC32 checksum of the slice
)

class Header(object):
fmt = PACKET_HEADER
def __init__(self, par2file, offset=0):
self.raw = par2file[offset:offset+struct.calcsize(self.fmt)]
parts = struct.unpack(self.fmt, self.raw)
size = struct.calcsize(PACKET_HEADER) # Size of just the header

def __init__(self, raw):
parts = struct.unpack(self.fmt, raw)
self.magic = parts[0]
self.length = parts[1]
self.length = parts[1] # Length of the full packet (including header)
self.body_length = self.length - self.size
self.hash = parts[2]
self.setid = parts[3]
self.type = parts[4]

def verify(self):
return self.magic == 'PAR2\x00PKT'

def verify_packet(self, raw_packet):
if len(raw_packet) < self.length:
return False
validate_start = 8 + 8 + 16 # Skip the first 3 fields
raw = raw_packet[validate_start:]
return hashlib.md5(raw).digest() == self.hash

class UnknownPar2Packet(object):
fmt = PACKET_HEADER
def __init__(self, par2file, offset=0):
self.raw = par2file[offset:offset+struct.calcsize(self.fmt)]
self.header = Header(self.raw)
def __init__(self, header, raw):
self.raw = raw
self.header = header

class FileDescriptionPacket(object):
header_type = 'PAR 2.0\x00FileDesc'
fmt = FILE_DESCRIPTION_PACKET

def __init__(self, par2file, offset=0):
name_start = offset+struct.calcsize(self.fmt)
self.raw = par2file[offset:name_start]
parts = struct.unpack(self.fmt, self.raw)
self.header = Header(parts[0])
packet = par2file[offset:offset+self.header.length]
self.fileid = parts[1]
self.file_hashfull = parts[2]
self.file_hash16k = parts[3]
self.file_length = parts[4]
self.name = packet[struct.calcsize(self.fmt):].strip('\x00')

def __init__(self, header, raw):
self.header = header
name_start = struct.calcsize(self.fmt)
parts = struct.unpack(self.fmt, raw[:name_start])
self.fileid = parts[0]
self.file_hashfull = parts[1]
self.file_hash16k = parts[2]
self.file_length = parts[3]
self.name = raw[name_start:].strip('\x00')

class MainPacket(object):
fmt = MAIN_PACKET
fmt_array = MAIN_PACKET_FILEID
header_type = 'PAR 2.0\x00Main\x00\x00\x00\x00'

def __init__(self, header, raw):
self.header = header
array_start = struct.calcsize(self.fmt)
parts = struct.unpack(self.fmt, raw[:array_start])
self.slice_size = parts[0]
self.num_files = parts[1]
hash_size = struct.calcsize(self.fmt_array)
num_ids = (self.header.length - self.header.size - array_start) / hash_size
self.file_ids = []
for idx in range(num_ids):
start = array_start + (hash_size * idx)
parts = struct.unpack(self.fmt_array, raw[start:start+hash_size])
self.file_ids.append(parts[0])
self.num_nonrecovery_files = num_ids - self.num_files

class InputFileSliceChecksumPacket(object):
fmt = FILE_CHECKSUM_PACKET
slice_fmt = FILE_CHECKSUM_PACKET_SLICE
header_type = 'PAR 2.0\x00IFSC\x00\x00\x00\x00'

def __init__(self, header, raw):
self.header = header
body_size = struct.calcsize(self.fmt)
parts = struct.unpack(self.fmt, raw[:body_size])
self.fileid = parts[0]
# Unpack slices
slice_size = struct.calcsize(self.slice_fmt)
self.num_slices = (self.header.length - (body_size + header.size)) / slice_size
self.slice_md5 = []
self.slice_crc = []
for idx in range(self.num_slices):
start = body_size + (slice_size * idx)
parts = struct.unpack(self.slice_fmt, raw[start:start+slice_size])
self.slice_md5.append(parts[0])
self.slice_crc.append(parts[1])

class Par2File(object):
def __init__(self, obj_or_path):
"""A convenient object that reads and makes sense of Par2 blocks."""
self.path = None
self.main_packet = None
if isinstance(obj_or_path, basestring):
with open(obj_or_path) as f:
self.contents = f.read()
self.path = obj_or_path
self.path = obj_or_path
with open(obj_or_path) as fle:
self.packets = self.read_packets(fle)
else:
self.contents = obj_or_path.read()
if getattr(obj_or_path, 'name', None):
self.path = obj_or_path.name
self.packets = self.read_packets()
self.packets = self.read_packets(obj_or_path)

def read_packets(self):
offset = 0
filelen = len(self.contents)
def read_packets(self, fle):
packets = []
while offset < filelen:
header = Header(self.contents, offset)
if header.type == FileDescriptionPacket.header_type:
packets.append(FileDescriptionPacket(self.contents, offset))
while True:
raw_header = fle.read(Header.size)
if not raw_header:
break # catch EOF
header = Header(raw_header)
if not header.verify():
break
raw_body = fle.read(header.body_length)
if not header.verify_packet(raw_header + raw_body):
# If the packet was invalid, we can't trust the length to skip
# to the next packet, so abort with what we have already.
break
if header.type == MainPacket.header_type:
self.main_packet = MainPacket(header, raw_body)
packets.append(self.main_packet)
elif header.type == FileDescriptionPacket.header_type:
packets.append(FileDescriptionPacket(header, raw_body))
elif header.type == InputFileSliceChecksumPacket.header_type:
packets.append(InputFileSliceChecksumPacket(header, raw_body))
else:
packets.append(UnknownPar2Packet(self.contents, offset))
offset += header.length
packets.append(UnknownPar2Packet(header, raw_body))
return packets

def filenames(self):
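As a rough illustration (not part of this PR) of how the new MainPacket and InputFileSliceChecksumPacket could be consumed together, a minimal sketch; the function name, the file-to-fileid matching, and the zero-padding of the final slice are assumptions layered on top of the diff:

    import hashlib

    def verify_file_slices(data_path, main_packet, ifsc_packet):
        # Sketch only: assumes ifsc_packet.fileid really describes data_path.
        # PAR2 hashes each slice padded with zero bytes up to the slice size,
        # so the final short slice is padded before comparing.
        slice_size = main_packet.slice_size
        with open(data_path, 'rb') as fh:
            for expected_md5 in ifsc_packet.slice_md5:
                chunk = fh.read(slice_size)
                if len(chunk) < slice_size:
                    chunk += '\x00' * (slice_size - len(chunk))
                if hashlib.md5(chunk).digest() != expected_md5:
                    return False
        return True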