From 5495964ae252e9975dab244c8fcbacef745878c2 Mon Sep 17 00:00:00 2001 From: Michael Cooper Date: Sun, 10 Jan 2016 18:37:28 +1100 Subject: [PATCH 1/6] Adding MainPacket parsing --- par2ools/par2.py | 40 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/par2ools/par2.py b/par2ools/par2.py index 1fd4294..3e1053d 100644 --- a/par2ools/par2.py +++ b/par2ools/par2.py @@ -21,12 +21,21 @@ ) FILE_DESCRIPTION_PACKET = ("<64s" # PACKET_HEADER - "16s" # fileid, hash of [hash16k, length, name] - "16s" # hashfull; hash of the whole file (which?) - "16s" # hash16k; hash of the first 16k of the file (which?) + "16s" # fileid; file this packet belongs to + "16s" # hashfull; md5 hash of the whole file + "16s" # hash16k; md5 hash of the first 16k of the file "Q" # length of the file ) +MAIN_PACKET = ("<64s" # PACKET_HEADER + "Q" # slice_size; The size of the slices in bytes + "I" # num_files; Number of files in the recovery set +) + +MAIN_PACKET_FILEID = ("<" + "16s" # fileid; +) + class Header(object): fmt = PACKET_HEADER def __init__(self, par2file, offset=0): @@ -63,6 +72,25 @@ def __init__(self, par2file, offset=0): self.file_length = parts[4] self.name = packet[struct.calcsize(self.fmt):].strip('\x00') +class MainPacket(object): + fmt = MAIN_PACKET + fmt_array = MAIN_PACKET_FILEID + header_type = 'PAR 2.0\x00Main\x00\x00\x00\x00' + + def __init__(self, par2file, offset=0): + array_start = struct.calcsize(self.fmt) + parts = struct.unpack(self.fmt, par2file[offset:offset+array_start]) + self.header = Header(parts[0]) + self.slice_size = parts[1] + self.num_files = parts[2] + hash_size = struct.calcsize(self.fmt_array) + num_ids = (self.header.length - array_start) / hash_size + self.file_ids = [] + for idx in range(num_ids): + start = offset + array_start + (hash_size * idx) + parts = struct.unpack(self.fmt_array, par2file[start:start+hash_size]) + self.file_ids.append(parts[0]) + self.num_nonrecovery_files = self.num_files - num_ids class Par2File(object): def __init__(self, obj_or_path): @@ -76,6 +104,7 @@ def __init__(self, obj_or_path): self.contents = obj_or_path.read() if getattr(obj_or_path, 'name', None): self.path = obj_or_path.name + self.main_packet = None self.packets = self.read_packets() def read_packets(self): @@ -84,7 +113,10 @@ def read_packets(self): packets = [] while offset < filelen: header = Header(self.contents, offset) - if header.type == FileDescriptionPacket.header_type: + if header.type == MainPacket.header_type: + self.main_packet = MainPacket(self.contents, offset) + packets.append(self.main_packet) + elif header.type == FileDescriptionPacket.header_type: packets.append(FileDescriptionPacket(self.contents, offset)) else: packets.append(UnknownPar2Packet(self.contents, offset)) From 75e206feafcccf80f48c2cf204665f798d9d5a3e Mon Sep 17 00:00:00 2001 From: Michael Cooper Date: Sun, 10 Jan 2016 18:38:18 +1100 Subject: [PATCH 2/6] Adding Checksum packet parsing --- par2ools/par2.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/par2ools/par2.py b/par2ools/par2.py index 3e1053d..926c18e 100644 --- a/par2ools/par2.py +++ b/par2ools/par2.py @@ -36,6 +36,15 @@ "16s" # fileid; ) +FILE_CHECKSUM_PACKET = ("<64s" # PACKET_HEADER + "16s" # fileid; file this packet belongs to +) + +FILE_CHECKSUM_PACKET_SLICE = ("<" + "16s" # hash; MD5 hash of the slice + "i" # checksum; CRC32 checksum of the slice +) + class Header(object): fmt = PACKET_HEADER def __init__(self, par2file, offset=0): @@ -92,6 +101,27 @@ 
def __init__(self, par2file, offset=0): self.file_ids.append(parts[0]) self.num_nonrecovery_files = self.num_files - num_ids +class InputFileSliceChecksumPacket(object): + fmt = FILE_CHECKSUM_PACKET + slice_fmt = FILE_CHECKSUM_PACKET_SLICE + header_type = 'PAR 2.0\x00IFSC\x00\x00\x00\x00' + + def __init__(self, par2file, offset=0): + header_size = struct.calcsize(self.fmt) + parts = struct.unpack(self.fmt, par2file[offset:offset+header_size]) + self.header = Header(parts[0]) + self.fileid = parts[1] + # Unpack slices + slice_size = struct.calcsize(self.slice_fmt) + self.num_slices = (self.header.length - header_size) / slice_size + self.slice_md5 = [] + self.slice_crc = [] + for idx in range(self.num_slices): + start = offset + header_size + (slice_size * idx) + parts = struct.unpack(self.slice_fmt, par2file[start:start+slice_size]) + self.slice_md5.append(parts[0]) + self.slice_crc.append(parts[1]) + class Par2File(object): def __init__(self, obj_or_path): """A convenient object that reads and makes sense of Par2 blocks.""" @@ -118,6 +148,8 @@ def read_packets(self): packets.append(self.main_packet) elif header.type == FileDescriptionPacket.header_type: packets.append(FileDescriptionPacket(self.contents, offset)) + elif header.type == InputFileSliceChecksumPacket.header_type: + packets.append(InputFileSliceChecksumPacket(self.contents, offset)) else: packets.append(UnknownPar2Packet(self.contents, offset)) offset += header.length From 2afa2ab64911c421decfe8cbcc8d9335238f1e63 Mon Sep 17 00:00:00 2001 From: Michael Cooper Date: Sun, 10 Jan 2016 18:38:45 +1100 Subject: [PATCH 3/6] Fully verify the packet headers --- par2ools/par2.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/par2ools/par2.py b/par2ools/par2.py index 926c18e..edcc24d 100644 --- a/par2ools/par2.py +++ b/par2ools/par2.py @@ -6,9 +6,8 @@ This is only intended to be able to read packets in par2, not repair, verify, or create new par2 files.""" -import os -import glob import struct +import hashlib from par2ools import fileutil @@ -56,8 +55,14 @@ def __init__(self, par2file, offset=0): self.setid = parts[3] self.type = parts[4] - def verify(self): - return self.magic == 'PAR2\x00PKT' + def verify(self, par2file, offset=0): + if self.magic != 'PAR2\x00PKT': + return False + if self.length + offset > len(par2file): + return False + validate_start = 8 + 8 + 16 # Skip the first 3 fields + raw = par2file[offset+validate_start:offset+self.length] + return hashlib.md5(raw).digest() == self.hash class UnknownPar2Packet(object): fmt = PACKET_HEADER @@ -143,6 +148,10 @@ def read_packets(self): packets = [] while offset < filelen: header = Header(self.contents, offset) + if not header.verify(self.contents, offset): + # If the packet was invalid, we cant trust the length + # So we need to abort with what we had. 
+ break if header.type == MainPacket.header_type: self.main_packet = MainPacket(self.contents, offset) packets.append(self.main_packet) From bd7b6f58341b53a5dd0a18cd0b74a9821a69819a Mon Sep 17 00:00:00 2001 From: Michael Cooper Date: Mon, 11 Jan 2016 09:53:55 +1100 Subject: [PATCH 4/6] Ignore pycharm --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index d4bc2bd..e85cfde 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ tags docs/_build* build dist +/.idea From 4d42004149dc050b4bb00a9d59284bcfdb20fa4d Mon Sep 17 00:00:00 2001 From: Michael Cooper Date: Mon, 11 Jan 2016 09:55:03 +1100 Subject: [PATCH 5/6] Don't double parse headers --- par2ools/par2.py | 85 ++++++++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 42 deletions(-) diff --git a/par2ools/par2.py b/par2ools/par2.py index edcc24d..d662a4a 100644 --- a/par2ools/par2.py +++ b/par2ools/par2.py @@ -19,14 +19,14 @@ "16s" # packet type ) -FILE_DESCRIPTION_PACKET = ("<64s" # PACKET_HEADER +FILE_DESCRIPTION_PACKET = ("<" "16s" # fileid; file this packet belongs to "16s" # hashfull; md5 hash of the whole file "16s" # hash16k; md5 hash of the first 16k of the file "Q" # length of the file ) -MAIN_PACKET = ("<64s" # PACKET_HEADER +MAIN_PACKET = ("<" "Q" # slice_size; The size of the slices in bytes "I" # num_files; Number of files in the recovery set ) @@ -35,7 +35,7 @@ "16s" # fileid; ) -FILE_CHECKSUM_PACKET = ("<64s" # PACKET_HEADER +FILE_CHECKSUM_PACKET = ("<" "16s" # fileid; file this packet belongs to ) @@ -46,11 +46,12 @@ class Header(object): fmt = PACKET_HEADER - def __init__(self, par2file, offset=0): - self.raw = par2file[offset:offset+struct.calcsize(self.fmt)] - parts = struct.unpack(self.fmt, self.raw) + size = struct.calcsize(PACKET_HEADER) # Size of just the header + + def __init__(self, raw): + parts = struct.unpack(self.fmt, raw) self.magic = parts[0] - self.length = parts[1] + self.length = parts[1] # Length of the full packet (including header) self.hash = parts[2] self.setid = parts[3] self.type = parts[4] @@ -66,43 +67,41 @@ def verify(self, par2file, offset=0): class UnknownPar2Packet(object): fmt = PACKET_HEADER - def __init__(self, par2file, offset=0): - self.raw = par2file[offset:offset+struct.calcsize(self.fmt)] - self.header = Header(self.raw) + def __init__(self, header, raw): + self.raw = raw + self.header = header class FileDescriptionPacket(object): header_type = 'PAR 2.0\x00FileDesc' fmt = FILE_DESCRIPTION_PACKET - def __init__(self, par2file, offset=0): - name_start = offset+struct.calcsize(self.fmt) - self.raw = par2file[offset:name_start] - parts = struct.unpack(self.fmt, self.raw) - self.header = Header(parts[0]) - packet = par2file[offset:offset+self.header.length] - self.fileid = parts[1] - self.file_hashfull = parts[2] - self.file_hash16k = parts[3] - self.file_length = parts[4] - self.name = packet[struct.calcsize(self.fmt):].strip('\x00') + def __init__(self, header, raw): + self.header = header + name_start = struct.calcsize(self.fmt) + parts = struct.unpack(self.fmt, raw[:name_start]) + self.fileid = parts[0] + self.file_hashfull = parts[1] + self.file_hash16k = parts[2] + self.file_length = parts[3] + self.name = raw[name_start:].strip('\x00') class MainPacket(object): fmt = MAIN_PACKET fmt_array = MAIN_PACKET_FILEID header_type = 'PAR 2.0\x00Main\x00\x00\x00\x00' - def __init__(self, par2file, offset=0): + def __init__(self, header, raw): + self.header = header array_start = struct.calcsize(self.fmt) - parts = 
struct.unpack(self.fmt, par2file[offset:offset+array_start]) - self.header = Header(parts[0]) - self.slice_size = parts[1] - self.num_files = parts[2] + parts = struct.unpack(self.fmt, raw[:array_start]) + self.slice_size = parts[0] + self.num_files = parts[1] hash_size = struct.calcsize(self.fmt_array) - num_ids = (self.header.length - array_start) / hash_size + num_ids = (self.header.length - self.header.size - array_start) / hash_size self.file_ids = [] for idx in range(num_ids): - start = offset + array_start + (hash_size * idx) - parts = struct.unpack(self.fmt_array, par2file[start:start+hash_size]) + start = array_start + (hash_size * idx) + parts = struct.unpack(self.fmt_array, raw[start:start+hash_size]) self.file_ids.append(parts[0]) self.num_nonrecovery_files = self.num_files - num_ids @@ -111,19 +110,19 @@ class InputFileSliceChecksumPacket(object): slice_fmt = FILE_CHECKSUM_PACKET_SLICE header_type = 'PAR 2.0\x00IFSC\x00\x00\x00\x00' - def __init__(self, par2file, offset=0): - header_size = struct.calcsize(self.fmt) - parts = struct.unpack(self.fmt, par2file[offset:offset+header_size]) - self.header = Header(parts[0]) - self.fileid = parts[1] + def __init__(self, header, raw): + self.header = header + body_size = struct.calcsize(self.fmt) + parts = struct.unpack(self.fmt, raw[:body_size]) + self.fileid = parts[0] # Unpack slices slice_size = struct.calcsize(self.slice_fmt) - self.num_slices = (self.header.length - header_size) / slice_size + self.num_slices = (self.header.length - (body_size + header.size)) / slice_size self.slice_md5 = [] self.slice_crc = [] for idx in range(self.num_slices): - start = offset + header_size + (slice_size * idx) - parts = struct.unpack(self.slice_fmt, par2file[start:start+slice_size]) + start = body_size + (slice_size * idx) + parts = struct.unpack(self.slice_fmt, raw[start:start+slice_size]) self.slice_md5.append(parts[0]) self.slice_crc.append(parts[1]) @@ -147,20 +146,22 @@ def read_packets(self): filelen = len(self.contents) packets = [] while offset < filelen: - header = Header(self.contents, offset) + raw_header = self.contents[offset:offset+Header.size] + header = Header(raw_header) if not header.verify(self.contents, offset): # If the packet was invalid, we cant trust the length # So we need to abort with what we had. 
break + raw_body = self.contents[offset+Header.size:offset+header.length] if header.type == MainPacket.header_type: - self.main_packet = MainPacket(self.contents, offset) + self.main_packet = MainPacket(header, raw_body) packets.append(self.main_packet) elif header.type == FileDescriptionPacket.header_type: - packets.append(FileDescriptionPacket(self.contents, offset)) + packets.append(FileDescriptionPacket(header, raw_body)) elif header.type == InputFileSliceChecksumPacket.header_type: - packets.append(InputFileSliceChecksumPacket(self.contents, offset)) + packets.append(InputFileSliceChecksumPacket(header, raw_body)) else: - packets.append(UnknownPar2Packet(self.contents, offset)) + packets.append(UnknownPar2Packet(header, raw_body)) offset += header.length return packets From 2ec713cd47626158aad76676d21c0ff948ba4da3 Mon Sep 17 00:00:00 2001 From: Michael Cooper Date: Mon, 11 Jan 2016 23:45:45 +1100 Subject: [PATCH 6/6] Stream file reading instead of having it all in memory --- par2ools/par2.py | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/par2ools/par2.py b/par2ools/par2.py index d662a4a..955a146 100644 --- a/par2ools/par2.py +++ b/par2ools/par2.py @@ -52,17 +52,19 @@ def __init__(self, raw): parts = struct.unpack(self.fmt, raw) self.magic = parts[0] self.length = parts[1] # Length of the full packet (including header) + self.body_length = self.length - self.size self.hash = parts[2] self.setid = parts[3] self.type = parts[4] - def verify(self, par2file, offset=0): - if self.magic != 'PAR2\x00PKT': - return False - if self.length + offset > len(par2file): + def verify(self): + return self.magic == 'PAR2\x00PKT' + + def verify_packet(self, raw_packet): + if len(raw_packet) < self.length: return False - validate_start = 8 + 8 + 16 # Skip the first 3 fields - raw = par2file[offset+validate_start:offset+self.length] + validate_start = 8 + 8 + 16 # Skip the first 3 fields + raw = raw_packet[validate_start:] return hashlib.md5(raw).digest() == self.hash class UnknownPar2Packet(object): @@ -130,29 +132,30 @@ class Par2File(object): def __init__(self, obj_or_path): """A convenient object that reads and makes sense of Par2 blocks.""" self.path = None + self.main_packet = None if isinstance(obj_or_path, basestring): - with open(obj_or_path) as f: - self.contents = f.read() - self.path = obj_or_path + self.path = obj_or_path + with open(obj_or_path) as fle: + self.packets = self.read_packets(fle) else: - self.contents = obj_or_path.read() if getattr(obj_or_path, 'name', None): self.path = obj_or_path.name - self.main_packet = None - self.packets = self.read_packets() + self.packets = self.read_packets(obj_or_path) - def read_packets(self): - offset = 0 - filelen = len(self.contents) + def read_packets(self, fle): packets = [] - while offset < filelen: - raw_header = self.contents[offset:offset+Header.size] + while True: + raw_header = fle.read(Header.size) + if not raw_header: + break # catch EOF header = Header(raw_header) - if not header.verify(self.contents, offset): - # If the packet was invalid, we cant trust the length - # So we need to abort with what we had. + if not header.verify(): + break + raw_body = fle.read(header.body_length) + if not header.verify_packet(raw_header + raw_body): + # If the packet was invalid, we cant trust the length to skip + # to the next packet, so abort with what we have already. 
break - raw_body = self.contents[offset+Header.size:offset+header.length] if header.type == MainPacket.header_type: self.main_packet = MainPacket(header, raw_body) packets.append(self.main_packet) @@ -162,7 +165,6 @@ def read_packets(self): packets.append(InputFileSliceChecksumPacket(header, raw_body)) else: packets.append(UnknownPar2Packet(header, raw_body)) - offset += header.length return packets def filenames(self):
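
A minimal usage sketch (not taken from the series itself) of par2ools.par2 as it stands after PATCH 6/6. It assumes the pre-existing filenames() helper, whose body is truncated above, still returns the file names recorded in the FileDescription packets, and 'example.par2' is only a placeholder path. Python 2 syntax is used to match the module (basestring, str packet types).

    from par2ools.par2 import Par2File, InputFileSliceChecksumPacket

    # Parsing now streams the file packet by packet and stops at the first
    # packet whose MD5 does not match its header, so p2.packets may be a
    # prefix of the packets physically present in the file.
    p2 = Par2File('example.par2')  # placeholder path; a file-like object also works

    if p2.main_packet is not None:
        print 'slice size: %d bytes' % p2.main_packet.slice_size
        print 'files in recovery set: %d' % p2.main_packet.num_files
        print 'non-recovery files: %d' % p2.main_packet.num_nonrecovery_files

    for pkt in p2.packets:
        if isinstance(pkt, InputFileSliceChecksumPacket):
            # fileid is a raw 16-byte string; hex-encode it for display
            print 'file id %s: %d slice checksums' % (pkt.fileid.encode('hex'),
                                                      pkt.num_slices)

    print p2.filenames()  # assumed unchanged helper, truncated above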