From b2312a3e077629d536e8fbab492ac56a85c7a6d0 Mon Sep 17 00:00:00 2001 From: Alex Willmer Date: Sun, 19 May 2019 23:53:33 +0100 Subject: [PATCH 01/20] Add Python Pickle format --- serialization/pickle.ksy | 443 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 443 insertions(+) create mode 100644 serialization/pickle.ksy diff --git a/serialization/pickle.ksy b/serialization/pickle.ksy new file mode 100644 index 000000000..28c719d88 --- /dev/null +++ b/serialization/pickle.ksy @@ -0,0 +1,443 @@ +meta: + id: pickle + title: Python pickle serialiazation format (protocol 3) + application: Python + file-extension: + - pickle + - pkl + #- pkl0 + #- pkl1 + #- pkl2 + license: CC0-1.0 + endian: le +doc: | + Python Pickle format serializes Python objects to a byte stream, as a sequence + of operations to run on the Pickle Virtual Machine. + + The format is mostly implementation defined, there is no formal specification. + Pickle data types are closely coupled to the Python object model. + Python singletons, and most builtin types (e.g. `None`, `int`,`dict`, `list`) + are serialised using dedicated Pickle opcodes. + Other builtin types, and all classes (e.g. `set`, `datetime.datetime`) are + serialised by encoding the name of a constructor callable. + They are deserialised by importing that constructor, and calling it. +doc-ref: https://github.com/python/cpython/blob/3.3/Lib/pickletools.py +seq: + # TODO is there a way to declare PROTO is optional, but only valid at position 0? + - id: ops + type: op + repeat: eos + # TODO is there a way to declare a trailing STOP is required? +types: + op: + seq: + - id: code + type: u1 + enum: opcode + doc: | + Operation code that determines which action should be + performed next by the Pickel Virtual Machine. Some opcodes + are only available in later versions of the Pickle protocol. 
+ - id: arg + type: + switch-on: code + cases: + 'opcode::int': decimalnl_short + 'opcode::binint': s4 + 'opcode::binint1': u1 + 'opcode::binint2': u2 + 'opcode::long': decimalnl_long + 'opcode::long1': long1 + 'opcode::string': stringnl + 'opcode::binstring': string4 + 'opcode::short_binstring': string1 + 'opcode::binbytes': bytes4 + 'opcode::short_binbytes': bytes1 + #'opcode::none': no_arg + #'opcode::newtrue': no_arg + #'opcode::newfalse': no_arg + 'opcode::unicode': unicodestringnl + 'opcode::binunicode': unicodestring4 + 'opcode::binfloat': f8 + #'opcode::empty_list': no_arg + #'opcode::append': no_arg + #'opcode::appends': no_arg + #'opcode::list': no_arg + #'opcode::empty_tuple': no_arg + #'opcode::tuple': no_arg + #'opcode::tuple1': no_arg + #'opcode::tuple2': no_arg + #'opcode::tuple3': no_arg + #'opcode::empty_dict': no_arg + #'opcode::dict': no_arg + #'opcode::setitem': no_arg + #'opcode::setitem2': no_arg + #'opcode::pop': no_arg + #'opcode::dup': no_arg + #'opcode::mark': no_arg + #'opcode::pop_mark': no_arg + 'opcode::get': decimalnl_short + 'opcode::binget': u1 + 'opcode::long_binget': u4 + 'opcode::put': decimalnl_short + 'opcode::binput': u1 + 'opcode::long_binput': u4 + 'opcode::ext1': u1 + 'opcode::ext2': u2 + 'opcode::ext4': u4 + 'opcode::global': stringnl_noescape_pair + #'opcode::reduce': no_arg + #'opcode::build': no_arg + 'opcode::inst': stringnl_noescape_pair + #'opcode::obj': no_arg + #'opcode::newobj': no_arg + 'opcode::proto': u1 + #'opcode::stop': no_arg + 'opcode::persid': stringnl_noescape + #'opcode::binpersid': no_arg + doc: | + Optional argument for the operation. Data type and length + are determined by the value of the opcode. + + decimalnl_short: + seq: + - id: val + type: str + encoding: ASCII + terminator: 0x0a # "\n" + doc: Integer, encoded with the ASCII characters [0-9-]. 
+ decimalnl_long: + seq: + - id: val + type: str + encoding: ASCII + terminator: 0x0a # "\n" + doc: Integer, encoded with the ASCII chracters [0-9-], followed by 'L'. + # TODO Can kaitai express constraint that these are quoted? + stringnl: + seq: + - id: val + terminator: 0x0a # "\n" + doc: Quoted string, possibly containing Python string escapes. + stringnl_noescape: + seq: + - id: val + terminator: 0x0a # "\n" + doc: Unquoted string, does not contain string escapes. + stringnl_noescape_pair: + seq: + - id: val1 + type: stringnl_noescape + - id: val2 + type: stringnl_noescape + doc: Pair of unquoted, unescaped strings. + unicodestringnl: + seq: + - id: val + type: str + encoding: ASCII + terminator: 0x0a # "\n" + doc: Unquoted string, containing Python Unicode escapes. + floatnl: + seq: + - id: val + type: str + encoding: ASCII + terminator: 0x0a # "\n" + doc: Double float, encoded with the ASCII characters [0-9.eE-], or 'nan'. + long1: + seq: + - id: len + type: u1 + - id: val + size: len + doc: | + Large signed integer, in the range -2**(8*255-1) to 2**(8*255-1)-1, + encoded as two's complement. + long4: + seq: + - id: len + type: u4 + - id: val + size: len + doc: | + Large signed integer, in the range -2**(8*2**32-1) to 2**(8*2**32-1)-1, + encoded as two's complement. + string1: + seq: + - id: len + type: u1 + - id: val + type: str + encoding: latin1 + size: len + doc: Length prefixed string, between 0 and 255 bytes long. + string4: + seq: + - id: len + # Not a typo, the length really is a signed integer + type: s4 + - id: val + type: str + encoding: latin1 + size: len + doc: Length prefixed string, between 0 and 2**31-1 bytes long + bytes1: + seq: + - id: len + type: u1 + - id: val + size: len + doc: Length prefixed byte string, between 0 and 255 bytes long. 
+ bytes4: + seq: + - id: len + type: u4 + - id: val + size: len + doc: Length prefixed string, between 0 and 2**31-1 bytes long + unicodestring4: + seq: + - id: len + type: u4 + - id: val + type: str + encoding: utf8 + size: len + doc: Length prefixed string, between 0 and 2**32-1 bytes long + # Some opcodes have no associated argument + # no_arg: {} + +enums: + opcode: + 0x28: # "(" + id: "mark" + -orig-id: MARK + doc: push special markobject on stack + 0x2e: # "." + id: "stop" + -orig-id: STOP + doc: every pickle ends with STOP + 0x30: # "0" + id: "pop" + -orig-id: POP + doc: discard topmost stack item + 0x31: # "1" + id: "pop_mark" + -orig-id: POP_MARK + doc: discard stack top through topmost markobject + 0x32: # "2" + id: "dup" + -orig-id: DUP + doc: duplicate top stack item + 0x46: # "F" + id: "float" + -orig-id: FLOAT + doc: push float object; decimal string argument + 0x49: # "I" + id: "int" + -orig-id: INT + doc: push integer or bool; decimal string argument + 0x4a: # "J" + id: "binint" + -orig-id: BININT + doc: push four-byte signed int + 0x4b: # "K" + id: "binint1" + -orig-id: BININT1 + doc: push 1-byte unsigned int + 0x4c: # "L" + id: "long" + -orig-id: LONG + doc: push long; decimal string argument + 0x4d: # "M" + id: "binint2" + -orig-id: BININT2 + doc: push 2-byte unsigned int + 0x4e: # "N" + id: "none" + -orig-id: NONE + doc: push None + 0x50: # "P" + id: "persid" + -orig-id: PERSID + doc: push persistent object; id is taken from string arg + 0x51: # "Q" + id: "binpersid" + -orig-id: BINPERSID + doc: push persistent object; id is taken from stack + 0x52: # "R" + id: "reduce" + -orig-id: REDUCE + doc: apply callable to argtuple, both on stack + 0x53: # "S" + id: "string" + -orig-id: STRING + doc: push string; NL-terminated string argument + 0x54: # "T" + id: "binstring" + -orig-id: BINSTRING + doc: push string; counted binary string argument + 0x55: # "U" + id: "short_binstring" + -orig-id: SHORT_BINSTRING + doc: push string; counted binary string 
argument 256 bytes + 0x56: # "V" + id: "unicode" + -orig-id: UNICODE + doc: push Unicode string; raw-unicode-escaped argument + 0x58: # "X" + id: "binunicode" + -orig-id: BINUNICODE + doc: push Unicode string; counted UTF-8 string argument + 0x61: # "a" + id: "append" + -orig-id: APPEND + doc: append stack top to list below it + 0x62: # "b" + id: "build" + -orig-id: BUILD + doc: call __setstate__ or __dict__.update() + 0x63: # "c" + id: "global" + -orig-id: GLOBAL + doc: push self.find_class(modname, name); 2 string args + 0x64: # "d" + id: "dict" + -orig-id: DICT + doc: build a dict from stack items + 0x7d: # "}" + id: "empty_dict" + -orig-id: EMPTY_DICT + doc: push empty dict + 0x65: # "e" + id: "appends" + -orig-id: APPENDS + doc: extend list on stack by topmost stack slice + 0x67: # "g" + id: "get" + -orig-id: GET + doc: push item from memo on stack; index is string arg + 0x68: # "h" + id: "binget" + -orig-id: BINGET + doc: push item from memo on stack; index is 1-byte arg + 0x69: # "i" + id: "inst" + -orig-id: INST + doc: build & push class instance + 0x6a: # "j" + id: "long_binget" + -orig-id: LONG_BINGET + doc: push item from memo on stack; index is 4-byte arg + 0x6c: # "l" + id: "list" + -orig-id: LIST + doc: build list from topmost stack items + 0x5d: # "]" + id: "empty_list" + -orig-id: EMPTY_LIST + doc: push empty list + 0x6f: # "o" + id: "obj" + -orig-id: OBJ + doc: build & push class instance + 0x70: # "p" + id: "put" + -orig-id: PUT + doc: store stack top in memo; index is string arg + 0x71: # "q" + id: "binput" + -orig-id: BINPUT + doc: store stack top in memo; index is 1-byte arg + 0x72: # "r" + id: "long_binput" + -orig-id: LONG_BINPUT + doc: store stack top in memo; index is 4-byte arg + 0x73: # "s" + id: "setitem" + -orig-id: SETITEM + doc: add key+value pair to dict + 0x74: # "t" + id: "tuple" + -orig-id: TUPLE + doc: build tuple from topmost stack items + 0x29: # ")" + id: "empty_tuple" + -orig-id: EMPTY_TUPLE + doc: push empty tuple + 0x75: # 
"u" + id: "setitems" + -orig-id: SETITEMS + doc: modify dict by adding topmost key+value pairs + 0x47: # "G" + id: "binfloat" + -orig-id: BINFLOAT + doc: push float; arg is 8-byte float encoding + + #'I01\n': + # id: "true" + # doc: not an opcode; see INT docs in pickletools.py + #'I00\n': + # id: "false" + # doc: not an opcode; see INT docs in pickletools.py + + # Protocol 2 + 0x80: + id: "proto" + -orig-id: PROTO + doc: identify pickle protocol + 0x81: + id: "newobj" + -orig-id: NEWOBJ + doc: build object by applying cls.__new__ to argtuple + 0x82: + id: "ext1" + -orig-id: EXT1 + doc: push object from extension registry; 1-byte index + 0x83: + id: "ext2" + -orig-id: EXT2 + doc: ditto, but 2-byte index + 0x84: + id: "ext4" + -orig-id: EXT4 + doc: ditto, but 4-byte index + 0x85: + id: "tuple1" + -orig-id: TUPLE1 + doc: build 1-tuple from stack top + 0x86: + id: "tuple2" + -orig-id: TUPLE2 + doc: build 2-tuple from two topmost stack items + 0x87: + id: "tuple3" + -orig-id: TUPLE3 + doc: build 3-tuple from three topmost stack items + 0x88: + id: "newtrue" + -orig-id: NEWTRUE + doc: push True + 0x89: + id: "newfalse" + -orig-id: NEWFALSE + doc: push False + 0x8a: + id: "long1" + -orig-id: LONG1 + doc: push long from < 256 bytes + 0x8b: + id: "long4" + -orig-id: LONG4 + doc: push really big long + + # Protocol 3 (Python 3.x) + 0x42: # "B" + id: "binbytes" + -orig-id: BINBYTES + doc: push bytes; counted binary string argument + 0x43: # "C" + id: "short_binbytes" + -orig-id: SHORT_BINBYTES + doc: push bytes; counted binary string argument < 256 bytes From 31febf98131fa6e76ae11e984f8918ea68fb0ff1 Mon Sep 17 00:00:00 2001 From: Alex Willmer Date: Mon, 20 May 2019 21:26:48 +0100 Subject: [PATCH 02/20] Remove personal file extensions These are not generally used, they were not intented to be in the initial commit. 
--- serialization/pickle.ksy | 3 --- 1 file changed, 3 deletions(-) diff --git a/serialization/pickle.ksy b/serialization/pickle.ksy index 28c719d88..d2c6fb4b1 100644 --- a/serialization/pickle.ksy +++ b/serialization/pickle.ksy @@ -5,9 +5,6 @@ meta: file-extension: - pickle - pkl - #- pkl0 - #- pkl1 - #- pkl2 license: CC0-1.0 endian: le doc: | From 25ccdf62f99f5f64e1d5a2c50d6d4cedd4380c58 Mon Sep 17 00:00:00 2001 From: Alex Willmer Date: Mon, 20 May 2019 21:27:54 +0100 Subject: [PATCH 03/20] Add forgotten long4 and float cases --- serialization/pickle.ksy | 2 ++ 1 file changed, 2 insertions(+) diff --git a/serialization/pickle.ksy b/serialization/pickle.ksy index d2c6fb4b1..e9d6a2e3f 100644 --- a/serialization/pickle.ksy +++ b/serialization/pickle.ksy @@ -45,6 +45,7 @@ types: 'opcode::binint2': u2 'opcode::long': decimalnl_long 'opcode::long1': long1 + 'opcode::long4': long4 'opcode::string': stringnl 'opcode::binstring': string4 'opcode::short_binstring': string1 @@ -55,6 +56,7 @@ types: #'opcode::newfalse': no_arg 'opcode::unicode': unicodestringnl 'opcode::binunicode': unicodestring4 + 'opcode::float': floatnl 'opcode::binfloat': f8 #'opcode::empty_list': no_arg #'opcode::append': no_arg From 281cdced51a7d5e2b45e8bc7690dd189f41b5aaf Mon Sep 17 00:00:00 2001 From: Alex Willmer Date: Mon, 20 May 2019 21:36:00 +0100 Subject: [PATCH 04/20] Enable cases for opcodes that take no argument This means every parsed op will have the same attributes, and that every known opcode is explicitly tested for. So unknown opcodes are unambiguous. Explicit is better than implicit.
--- serialization/pickle.ksy | 56 ++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/serialization/pickle.ksy b/serialization/pickle.ksy index e9d6a2e3f..9c4d47808 100644 --- a/serialization/pickle.ksy +++ b/serialization/pickle.ksy @@ -51,30 +51,30 @@ types: 'opcode::short_binstring': string1 'opcode::binbytes': bytes4 'opcode::short_binbytes': bytes1 - #'opcode::none': no_arg - #'opcode::newtrue': no_arg - #'opcode::newfalse': no_arg + 'opcode::none': no_arg + 'opcode::newtrue': no_arg + 'opcode::newfalse': no_arg 'opcode::unicode': unicodestringnl 'opcode::binunicode': unicodestring4 'opcode::float': floatnl 'opcode::binfloat': f8 - #'opcode::empty_list': no_arg - #'opcode::append': no_arg - #'opcode::appends': no_arg - #'opcode::list': no_arg - #'opcode::empty_tuple': no_arg - #'opcode::tuple': no_arg - #'opcode::tuple1': no_arg - #'opcode::tuple2': no_arg - #'opcode::tuple3': no_arg - #'opcode::empty_dict': no_arg - #'opcode::dict': no_arg - #'opcode::setitem': no_arg - #'opcode::setitem2': no_arg - #'opcode::pop': no_arg - #'opcode::dup': no_arg - #'opcode::mark': no_arg - #'opcode::pop_mark': no_arg + 'opcode::empty_list': no_arg + 'opcode::append': no_arg + 'opcode::appends': no_arg + 'opcode::list': no_arg + 'opcode::empty_tuple': no_arg + 'opcode::tuple': no_arg + 'opcode::tuple1': no_arg + 'opcode::tuple2': no_arg + 'opcode::tuple3': no_arg + 'opcode::empty_dict': no_arg + 'opcode::dict': no_arg + 'opcode::setitem': no_arg + 'opcode::setitems': no_arg + 'opcode::pop': no_arg + 'opcode::dup': no_arg + 'opcode::mark': no_arg + 'opcode::pop_mark': no_arg 'opcode::get': decimalnl_short 'opcode::binget': u1 'opcode::long_binget': u4 @@ -85,15 +85,15 @@ types: 'opcode::ext2': u2 'opcode::ext4': u4 'opcode::global': stringnl_noescape_pair - #'opcode::reduce': no_arg - #'opcode::build': no_arg + 'opcode::reduce': no_arg + 'opcode::build': no_arg 'opcode::inst': stringnl_noescape_pair - #'opcode::obj': no_arg - 
#'opcode::newobj': no_arg + 'opcode::obj': no_arg + 'opcode::newobj': no_arg 'opcode::proto': u1 - #'opcode::stop': no_arg + 'opcode::stop': no_arg 'opcode::persid': stringnl_noescape - #'opcode::binpersid': no_arg + 'opcode::binpersid': no_arg doc: | Optional argument for the operation. Data type and length are determined by the value of the opcode. @@ -204,8 +204,8 @@ types: encoding: utf8 size: len doc: Length prefixed string, between 0 and 2**32-1 bytes long - # Some opcodes have no associated argument - # no_arg: {} + no_arg: + doc: Some opcodes take no argument, this empty type is used for them. enums: opcode: From 5f0479279b30831869088496b91402161e5c83e7 Mon Sep 17 00:00:00 2001 From: Alex Willmer Date: Mon, 20 May 2019 21:38:18 +0100 Subject: [PATCH 05/20] Note that floatnl type can encode infinities Also noticed that a positive exponent would be encoded as 1e+100, not 1e100. --- serialization/pickle.ksy | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/serialization/pickle.ksy b/serialization/pickle.ksy index 9c4d47808..f88a20d52 100644 --- a/serialization/pickle.ksy +++ b/serialization/pickle.ksy @@ -143,7 +143,9 @@ types: type: str encoding: ASCII terminator: 0x0a # "\n" - doc: Double float, encoded with the ASCII characters [0-9.eE-], or 'nan'. + doc: | + Double float, encoded with the ASCII characters [0-9.e+-], '-inf', 'inf', + or 'nan'.
long1: seq: - id: len From 5f473d62c6498c77d4d996f31e09e286d250082e Mon Sep 17 00:00:00 2001 From: Alex Willmer Date: Mon, 20 May 2019 22:43:57 +0100 Subject: [PATCH 06/20] Declare stringnl and stringnl_noescape encodings --- serialization/pickle.ksy | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/serialization/pickle.ksy b/serialization/pickle.ksy index f88a20d52..3e90ede18 100644 --- a/serialization/pickle.ksy +++ b/serialization/pickle.ksy @@ -116,11 +116,15 @@ types: stringnl: seq: - id: val + type: str + encoding: ASCII terminator: 0x0a # "\n" doc: Quoted string, possibly containing Python string escapes. stringnl_noescape: seq: - id: val + type: str + encoding: ASCII terminator: 0x0a # "\n" doc: Unquoted string, does not contain string escapes. stringnl_noescape_pair: From ffa2a9a5cd666a40eb18b21e51f9158ab57485c4 Mon Sep 17 00:00:00 2001 From: Alex Willmer Date: Mon, 20 May 2019 22:45:52 +0100 Subject: [PATCH 07/20] Rename pickle format to python_pickle --- serialization/{pickle.ksy => python_pickle.ksy} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename serialization/{pickle.ksy => python_pickle.ksy} (99%) diff --git a/serialization/pickle.ksy b/serialization/python_pickle.ksy similarity index 99% rename from serialization/pickle.ksy rename to serialization/python_pickle.ksy index 3e90ede18..00dba614d 100644 --- a/serialization/pickle.ksy +++ b/serialization/python_pickle.ksy @@ -1,5 +1,5 @@ meta: - id: pickle + id: python_pickle title: Python pickle serialiazation format (protocol 3) application: Python file-extension: From 7fe651c48bfc95ec05bc6e7d829904328d8682a5 Mon Sep 17 00:00:00 2001 From: Alex Willmer Date: Mon, 20 May 2019 22:49:37 +0100 Subject: [PATCH 08/20] Spelling correction in title --- serialization/python_pickle.ksy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/serialization/python_pickle.ksy b/serialization/python_pickle.ksy index 00dba614d..68dd55538 100644 --- a/serialization/python_pickle.ksy +++ 
b/serialization/python_pickle.ksy @@ -1,6 +1,6 @@ meta: id: python_pickle - title: Python pickle serialiazation format (protocol 3) + title: Python pickle serialization format (protocol 3) application: Python file-extension: - pickle From 0d0c03b32cb58b12c8a01240c667e9e340a41916 Mon Sep 17 00:00:00 2001 From: Alex Willmer Date: Tue, 21 May 2019 20:58:09 +0100 Subject: [PATCH 09/20] Add cross references --- serialization/python_pickle.ksy | 3 +++ 1 file changed, 3 insertions(+) diff --git a/serialization/python_pickle.ksy b/serialization/python_pickle.ksy index 68dd55538..b80a64df8 100644 --- a/serialization/python_pickle.ksy +++ b/serialization/python_pickle.ksy @@ -7,6 +7,9 @@ meta: - pkl license: CC0-1.0 endian: le + xref: + justsolve: Pickle + wikidata: Q7190889 doc: | Python Pickle format serializes Python objects to a byte stream, as a sequence of operations to run on the Pickle Virtual Machine. From 33bc2592212fb69775fe61c9100d504d96a28206 Mon Sep 17 00:00:00 2001 From: Alex Willmer Date: Tue, 21 May 2019 20:58:48 +0100 Subject: [PATCH 10/20] Spelling correction --- serialization/python_pickle.ksy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/serialization/python_pickle.ksy b/serialization/python_pickle.ksy index b80a64df8..5b33cf834 100644 --- a/serialization/python_pickle.ksy +++ b/serialization/python_pickle.ksy @@ -36,7 +36,7 @@ types: enum: opcode doc: | Operation code that determines which action should be - performed next by the Pickel Virtual Machine. Some opcodes + performed next by the Pickle Virtual Machine. Some opcodes are only available in later versions of the Pickle protocol. 
- id: arg type: From 31ea16e013c2627d3c9c80e86e428d63b34441b9 Mon Sep 17 00:00:00 2001 From: Alex Willmer Date: Tue, 21 May 2019 21:00:03 +0100 Subject: [PATCH 11/20] Remove encoding from string1 & string4, add explanation --- serialization/python_pickle.ksy | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/serialization/python_pickle.ksy b/serialization/python_pickle.ksy index 5b33cf834..2b47efc3e 100644 --- a/serialization/python_pickle.ksy +++ b/serialization/python_pickle.ksy @@ -176,20 +176,38 @@ types: - id: len type: u1 - id: val - type: str - encoding: latin1 size: len - doc: Length prefixed string, between 0 and 255 bytes long. + doc: | + Length prefixed string, between 0 and 255 bytes long. Encoding is + unspecified. + + The default Python 2.x string type (`str`) is a sequence of bytes. + These are pickled as `string1` or `string4`, when protocol == 2. + The bytes are written directly, no explicit encoding is performed. + + Python 3.x will not pickle an object as `string1` or `string4`. + Instead, opcodes and types with a known encoding are used. + When unpickling + + - `pickle.Unpickler` objects default to ASCII, which can be overriden + - `pickletools.dis` uses latin1, and cannot be overriden + doc-ref: https://github.com/python/cpython/blob/bb8071a4/Lib/pickle.py#L486-L495 string4: seq: - id: len # Not a typo, the length really is a signed integer type: s4 - id: val - type: str - encoding: latin1 size: len - doc: Length prefixed string, between 0 and 2**31-1 bytes long + doc: | + Length prefixed string, between 0 and 2**31-1 bytes long. Encoding is + unspecified. + + Although the len field is signed, any length < 0 will raise an exception + during unpickling. + + See the documentation for `string1` for further detail about encodings. 
+ doc-ref: https://github.com/python/cpython/blob/bb8071a4/Lib/pickle.py#L486-L495 bytes1: seq: - id: len From 583c4e60bb67f89a38a0ed68b0b79693bf832fc4 Mon Sep 17 00:00:00 2001 From: Alex Willmer Date: Tue, 21 May 2019 21:02:10 +0100 Subject: [PATCH 12/20] Tweak formatting, separate types with whitespace --- serialization/python_pickle.ksy | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/serialization/python_pickle.ksy b/serialization/python_pickle.ksy index 2b47efc3e..b1200f53d 100644 --- a/serialization/python_pickle.ksy +++ b/serialization/python_pickle.ksy @@ -108,6 +108,7 @@ types: encoding: ASCII terminator: 0x0a # "\n" doc: Integer, encoded with the ASCII characters [0-9-]. + decimalnl_long: seq: - id: val @@ -115,6 +116,7 @@ types: encoding: ASCII terminator: 0x0a # "\n" doc: Integer, encoded with the ASCII chracters [0-9-], followed by 'L'. + # TODO Can kaitai express constraint that these are quoted? stringnl: seq: @@ -123,6 +125,7 @@ types: encoding: ASCII terminator: 0x0a # "\n" doc: Quoted string, possibly containing Python string escapes. + stringnl_noescape: seq: - id: val @@ -130,6 +133,7 @@ types: encoding: ASCII terminator: 0x0a # "\n" doc: Unquoted string, does not contain string escapes. + stringnl_noescape_pair: seq: - id: val1 @@ -137,6 +141,7 @@ types: - id: val2 type: stringnl_noescape doc: Pair of unquoted, unescaped strings. + unicodestringnl: seq: - id: val @@ -144,6 +149,7 @@ types: encoding: ASCII terminator: 0x0a # "\n" doc: Unquoted string, containing Python Unicode escapes. + floatnl: seq: - id: val @@ -153,6 +159,7 @@ types: doc: | Double float, encoded with the ASCII characters [0-9.e+-], '-inf', 'inf', or 'nan'. + long1: seq: - id: len @@ -162,6 +169,7 @@ types: doc: | Large signed integer, in the range -2**(8*255-1) to 2**(8*255-1)-1, encoded as two's complement. 
+ long4: seq: - id: len @@ -171,6 +179,7 @@ types: doc: | Large signed integer, in the range -2**(8*2**32-1) to 2**(8*2**32-1)-1, encoded as two's complement. + string1: seq: - id: len @@ -192,6 +201,7 @@ types: - `pickle.Unpickler` objects default to ASCII, which can be overriden - `pickletools.dis` uses latin1, and cannot be overriden doc-ref: https://github.com/python/cpython/blob/bb8071a4/Lib/pickle.py#L486-L495 + string4: seq: - id: len @@ -208,6 +218,7 @@ types: See the documentation for `string1` for further detail about encodings. doc-ref: https://github.com/python/cpython/blob/bb8071a4/Lib/pickle.py#L486-L495 + bytes1: seq: - id: len @@ -215,6 +226,7 @@ types: - id: val size: len doc: Length prefixed byte string, between 0 and 255 bytes long. + bytes4: seq: - id: len From 936ffd125f68ea22f5900b78e72fca0a540a7f34 Mon Sep 17 00:00:00 2001 From: Alex Willmer Date: Tue, 21 May 2019 21:30:19 +0100 Subject: [PATCH 13/20] Remove protocol from title, add brief explanations of protocols --- serialization/python_pickle.ksy | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/serialization/python_pickle.ksy b/serialization/python_pickle.ksy index b1200f53d..0f4669631 100644 --- a/serialization/python_pickle.ksy +++ b/serialization/python_pickle.ksy @@ -1,6 +1,6 @@ meta: id: python_pickle - title: Python pickle serialization format (protocol 3) + title: Python pickle serialization format application: Python file-extension: - pickle @@ -21,6 +21,14 @@ doc: | Other builtin types, and all classes (e.g. `set`, `datetime.datetime`) are serialised by encoding the name of a constructor callable. They are deserialised by importing that constructor, and calling it. + + Pickle format has evolved with Python, later protocols add opcodes & types. + Later Python releases can pickle to or unpickle from any earlier protocol. + + * Protocol 0: ASCII clean, no explicit version, fields are '\n' terminated. 
+ * Protocol 1: Binary, no explicit version, first length prefixed types. + * Protocol 2: Python 2.3+. Explicit versioning, more length prefixed types. + * Protocol 3: Python 3.0+. Dedicated opcodes for `bytes` objects. doc-ref: https://github.com/python/cpython/blob/3.3/Lib/pickletools.py seq: # TODO is there a way to declare PROTO is optional, but only valid at position 0? From 67190fda9ae23da52a8cf14ae872b660b8ddfa95 Mon Sep 17 00:00:00 2001 From: Alex Willmer Date: Tue, 21 May 2019 21:31:33 +0100 Subject: [PATCH 14/20] Mention security risk of unpickling arbitrary pickles --- serialization/python_pickle.ksy | 2 ++ 1 file changed, 2 insertions(+) diff --git a/serialization/python_pickle.ksy b/serialization/python_pickle.ksy index 0f4669631..63e12922f 100644 --- a/serialization/python_pickle.ksy +++ b/serialization/python_pickle.ksy @@ -21,6 +21,8 @@ doc: | Other builtin types, and all classes (e.g. `set`, `datetime.datetime`) are serialised by encoding the name of a constructor callable. They are deserialised by importing that constructor, and calling it. + So, unpickling an arbitrary pickle, using the Python's stdlib pickle module + can cause arbitrary code execution. Pickle format has evolved with Python, later protocols add opcodes & types. Later Python releases can pickle to or unpickle from any earlier protocol. From 65e4636977bc2373d02ba20f5c0bb315cc44d541 Mon Sep 17 00:00:00 2001 From: Alex Willmer Date: Wed, 22 May 2019 21:12:42 +0100 Subject: [PATCH 15/20] Add link to Pickle protocol 2 Python Enhancement Proposal (PEP) --- serialization/python_pickle.ksy | 1 + 1 file changed, 1 insertion(+) diff --git a/serialization/python_pickle.ksy b/serialization/python_pickle.ksy index 63e12922f..ff35be3a2 100644 --- a/serialization/python_pickle.ksy +++ b/serialization/python_pickle.ksy @@ -30,6 +30,7 @@ doc: | * Protocol 0: ASCII clean, no explicit version, fields are '\n' terminated. * Protocol 1: Binary, no explicit version, first length prefixed types.
* Protocol 2: Python 2.3+. Explicit versioning, more length prefixed types. + https://www.python.org/dev/peps/pep-0307/ * Protocol 3: Python 3.0+. Dedicated opcodes for `bytes` objects. doc-ref: https://github.com/python/cpython/blob/3.3/Lib/pickletools.py seq: From fb7264161dcd288f1eef4dd70e94b73be0c1a2cb Mon Sep 17 00:00:00 2001 From: Alex Willmer Date: Wed, 22 May 2019 21:16:02 +0100 Subject: [PATCH 16/20] Correct maximum length in doc of bytes4 type --- serialization/python_pickle.ksy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/serialization/python_pickle.ksy b/serialization/python_pickle.ksy index ff35be3a2..2429c09a9 100644 --- a/serialization/python_pickle.ksy +++ b/serialization/python_pickle.ksy @@ -244,7 +244,7 @@ types: type: u4 - id: val size: len - doc: Length prefixed string, between 0 and 2**31-1 bytes long + doc: Length prefixed string, between 0 and 2**32-1 bytes long unicodestring4: seq: - id: len From eb2eda2bbb25d351fecd1d8e4d1de59ef322cecc Mon Sep 17 00:00:00 2001 From: Alex Willmer Date: Wed, 22 May 2019 21:20:08 +0100 Subject: [PATCH 17/20] Add pickle protocol 4 opcodes and types --- serialization/python_pickle.ksy | 98 ++++++++++++++++++++++++++++++++- 1 file changed, 97 insertions(+), 1 deletion(-) diff --git a/serialization/python_pickle.ksy b/serialization/python_pickle.ksy index 2429c09a9..99f643299 100644 --- a/serialization/python_pickle.ksy +++ b/serialization/python_pickle.ksy @@ -32,7 +32,9 @@ doc: | * Protocol 2: Python 2.3+. Explicit versioning, more length prefixed types. https://www.python.org/dev/peps/pep-0307/ * Protocol 3: Python 3.0+. Dedicated opcodes for `bytes` objects. -doc-ref: https://github.com/python/cpython/blob/3.3/Lib/pickletools.py + * Protocol 4: Python 3.4+. Opcodes for 64 bit strings, framing, `set`. 
+ https://www.python.org/dev/peps/pep-3154/ +doc-ref: https://github.com/python/cpython/blob/v3.7.3/Lib/pickletools.py seq: # TODO is there a way to declare PROTO is optional, but only valid at position 0? - id: ops @@ -65,11 +67,14 @@ types: 'opcode::short_binstring': string1 'opcode::binbytes': bytes4 'opcode::short_binbytes': bytes1 + 'opcode::binbytes8': bytes8 'opcode::none': no_arg 'opcode::newtrue': no_arg 'opcode::newfalse': no_arg 'opcode::unicode': unicodestringnl + 'opcode::short_binunicode': unicodestring1 'opcode::binunicode': unicodestring4 + 'opcode::binunicode8': unicodestring8 'opcode::float': floatnl 'opcode::binfloat': f8 'opcode::empty_list': no_arg @@ -85,6 +90,9 @@ types: 'opcode::dict': no_arg 'opcode::setitem': no_arg 'opcode::setitems': no_arg + 'opcode::empty_set': no_arg + 'opcode::additems': no_arg + 'opcode::frozenset': no_arg 'opcode::pop': no_arg 'opcode::dup': no_arg 'opcode::mark': no_arg @@ -95,17 +103,21 @@ types: 'opcode::put': decimalnl_short 'opcode::binput': u1 'opcode::long_binput': u4 + 'opcode::memoize': no_arg 'opcode::ext1': u1 'opcode::ext2': u2 'opcode::ext4': u4 'opcode::global': stringnl_noescape_pair + 'opcode::stack_global': no_arg 'opcode::reduce': no_arg 'opcode::build': no_arg 'opcode::inst': stringnl_noescape_pair 'opcode::obj': no_arg 'opcode::newobj': no_arg + 'opcode::newobj_ex': no_arg 'opcode::proto': u1 'opcode::stop': no_arg + 'opcode::frame': u8 'opcode::persid': stringnl_noescape 'opcode::binpersid': no_arg doc: | @@ -245,6 +257,31 @@ types: - id: val size: len doc: Length prefixed string, between 0 and 2**32-1 bytes long + + bytes8: + seq: + - id: len + type: u8 + - id: val + size: len + doc: | + Length prefixed string, between 0 and 2**64-1 bytes long. + + Only a 64-bit build of Python would produce a pickle containing strings + large enough to need this type. Such a pickle could not be unpickled on + a 32-bit build of Python, because the string would be larger than + `sys.maxsize`. 
+ + unicodestring1: + seq: + - id: len + type: u1 + - id: val + type: str + encoding: utf8 + size: len + doc: Length prefixed string, between 0 and 255 bytes long + unicodestring4: seq: - id: len @@ -254,6 +291,23 @@ types: encoding: utf8 size: len doc: Length prefixed string, between 0 and 2**32-1 bytes long + + unicodestring8: + seq: + - id: len + type: u8 + - id: val + type: str + encoding: utf8 + size: len + doc: | + Length prefixed string, between 0 and 2**64-1 bytes long. + + Only a 64-bit build of Python would produce a pickle containing strings + large enough to need this type. Such a pickle could not be unpickled on + a 32-bit build of Python, because the string would be larger than + `sys.maxsize`. + no_arg: doc: Some opcodes take no argument, this empty type is used for them. @@ -490,3 +544,45 @@ enums: id: "short_binbytes" -orig-id: SHORT_BINBYTES doc: push bytes; counted binary string argument < 256 bytes + + # Protocol 4 + 0x8c: + id: "short_binunicode" + -orig-id: SHORT_BINUNICODE + doc: push short string; UTF-8 length < 256 bytes + 0x8d: + id: "binunicode8" + -orig-id: BINUNICODE8 + doc: push very long string + 0x8e: + id: "binbytes8" + -orig-id: BINBYTES8 + doc: push very long bytes string + 0x8f: + id: "empty_set" + -orig-id: EMPTY_SET + doc: push empty set on the stack + 0x90: + id: "additems" + -orig-id: ADDITEMS + doc: modify set by adding topmost stack items + 0x91: + id: "frozenset" + -orig-id: FROZENSET + doc: build frozenset from topmost stack items + 0x92: + id: "newobj_ex" + -orig-id: NEWOBJ_EX + doc: like NEWOBJ but work with keyword only arguments + 0x93: + id: "stack_global" + -orig-id: STACK_GLOBAL + doc: same as GLOBAL but using names on the stacks + 0x94: + id: "memoize" + -orig-id: MEMOIZE + doc: store top of the stack in memo + 0x95: + id: "frame" + -orig-id: FRAME + doc: indicate the beginning of a new frame From ce1aab3e974a0a39f616d8dafadbc2223ddcfb65 Mon Sep 17 00:00:00 2001 From: Alex Willmer Date: Thu, 23 May 2019 22:21:20
+0100 Subject: [PATCH 18/20] Align indentation of enums --- serialization/python_pickle.ksy | 330 ++++++++++++++++---------------- 1 file changed, 165 insertions(+), 165 deletions(-) diff --git a/serialization/python_pickle.ksy b/serialization/python_pickle.ksy index 99f643299..95ced4dfe 100644 --- a/serialization/python_pickle.ksy +++ b/serialization/python_pickle.ksy @@ -314,169 +314,169 @@ types: enums: opcode: 0x28: # "(" - id: "mark" - -orig-id: MARK - doc: push special markobject on stack + id: "mark" + -orig-id: MARK + doc: push special markobject on stack 0x2e: # "." - id: "stop" - -orig-id: STOP - doc: every pickle ends with STOP + id: "stop" + -orig-id: STOP + doc: every pickle ends with STOP 0x30: # "0" - id: "pop" - -orig-id: POP - doc: discard topmost stack item + id: "pop" + -orig-id: POP + doc: discard topmost stack item 0x31: # "1" - id: "pop_mark" - -orig-id: POP_MARK - doc: discard stack top through topmost markobject + id: "pop_mark" + -orig-id: POP_MARK + doc: discard stack top through topmost markobject 0x32: # "2" - id: "dup" - -orig-id: DUP - doc: duplicate top stack item + id: "dup" + -orig-id: DUP + doc: duplicate top stack item 0x46: # "F" - id: "float" - -orig-id: FLOAT - doc: push float object; decimal string argument + id: "float" + -orig-id: FLOAT + doc: push float object; decimal string argument 0x49: # "I" - id: "int" - -orig-id: INT - doc: push integer or bool; decimal string argument + id: "int" + -orig-id: INT + doc: push integer or bool; decimal string argument 0x4a: # "J" - id: "binint" - -orig-id: BININT - doc: push four-byte signed int + id: "binint" + -orig-id: BININT + doc: push four-byte signed int 0x4b: # "K" - id: "binint1" - -orig-id: BININT1 - doc: push 1-byte unsigned int + id: "binint1" + -orig-id: BININT1 + doc: push 1-byte unsigned int 0x4c: # "L" - id: "long" - -orig-id: LONG - doc: push long; decimal string argument + id: "long" + -orig-id: LONG + doc: push long; decimal string argument 0x4d: # "M" - id: "binint2" 
- -orig-id: BININT2 - doc: push 2-byte unsigned int + id: "binint2" + -orig-id: BININT2 + doc: push 2-byte unsigned int 0x4e: # "N" - id: "none" - -orig-id: NONE - doc: push None + id: "none" + -orig-id: NONE + doc: push None 0x50: # "P" - id: "persid" - -orig-id: PERSID - doc: push persistent object; id is taken from string arg + id: "persid" + -orig-id: PERSID + doc: push persistent object; id is taken from string arg 0x51: # "Q" - id: "binpersid" - -orig-id: BINPERSID - doc: push persistent object; id is taken from stack + id: "binpersid" + -orig-id: BINPERSID + doc: push persistent object; id is taken from stack 0x52: # "R" - id: "reduce" - -orig-id: REDUCE - doc: apply callable to argtuple, both on stack + id: "reduce" + -orig-id: REDUCE + doc: apply callable to argtuple, both on stack 0x53: # "S" - id: "string" - -orig-id: STRING - doc: push string; NL-terminated string argument + id: "string" + -orig-id: STRING + doc: push string; NL-terminated string argument 0x54: # "T" - id: "binstring" - -orig-id: BINSTRING - doc: push string; counted binary string argument + id: "binstring" + -orig-id: BINSTRING + doc: push string; counted binary string argument 0x55: # "U" - id: "short_binstring" - -orig-id: SHORT_BINSTRING - doc: push string; counted binary string argument 256 bytes + id: "short_binstring" + -orig-id: SHORT_BINSTRING + doc: push string; counted binary string argument 256 bytes 0x56: # "V" - id: "unicode" - -orig-id: UNICODE - doc: push Unicode string; raw-unicode-escaped argument + id: "unicode" + -orig-id: UNICODE + doc: push Unicode string; raw-unicode-escaped argument 0x58: # "X" - id: "binunicode" - -orig-id: BINUNICODE - doc: push Unicode string; counted UTF-8 string argument + id: "binunicode" + -orig-id: BINUNICODE + doc: push Unicode string; counted UTF-8 string argument 0x61: # "a" - id: "append" - -orig-id: APPEND - doc: append stack top to list below it + id: "append" + -orig-id: APPEND + doc: append stack top to list below it 0x62: # "b" - 
id: "build" - -orig-id: BUILD - doc: call __setstate__ or __dict__.update() + id: "build" + -orig-id: BUILD + doc: call __setstate__ or __dict__.update() 0x63: # "c" - id: "global" - -orig-id: GLOBAL - doc: push self.find_class(modname, name); 2 string args + id: "global" + -orig-id: GLOBAL + doc: push self.find_class(modname, name); 2 string args 0x64: # "d" - id: "dict" - -orig-id: DICT - doc: build a dict from stack items + id: "dict" + -orig-id: DICT + doc: build a dict from stack items 0x7d: # "}" - id: "empty_dict" - -orig-id: EMPTY_DICT - doc: push empty dict + id: "empty_dict" + -orig-id: EMPTY_DICT + doc: push empty dict 0x65: # "e" - id: "appends" - -orig-id: APPENDS - doc: extend list on stack by topmost stack slice + id: "appends" + -orig-id: APPENDS + doc: extend list on stack by topmost stack slice 0x67: # "g" - id: "get" - -orig-id: GET - doc: push item from memo on stack; index is string arg + id: "get" + -orig-id: GET + doc: push item from memo on stack; index is string arg 0x68: # "h" - id: "binget" - -orig-id: BINGET - doc: push item from memo on stack; index is 1-byte arg + id: "binget" + -orig-id: BINGET + doc: push item from memo on stack; index is 1-byte arg 0x69: # "i" - id: "inst" - -orig-id: INST - doc: build & push class instance + id: "inst" + -orig-id: INST + doc: build & push class instance 0x6a: # "j" - id: "long_binget" - -orig-id: LONG_BINGET - doc: push item from memo on stack; index is 4-byte arg + id: "long_binget" + -orig-id: LONG_BINGET + doc: push item from memo on stack; index is 4-byte arg 0x6c: # "l" - id: "list" - -orig-id: LIST - doc: build list from topmost stack items + id: "list" + -orig-id: LIST + doc: build list from topmost stack items 0x5d: # "]" - id: "empty_list" - -orig-id: EMPTY_LIST - doc: push empty list + id: "empty_list" + -orig-id: EMPTY_LIST + doc: push empty list 0x6f: # "o" - id: "obj" - -orig-id: OBJ - doc: build & push class instance + id: "obj" + -orig-id: OBJ + doc: build & push class instance 0x70: 
# "p" - id: "put" - -orig-id: PUT - doc: store stack top in memo; index is string arg + id: "put" + -orig-id: PUT + doc: store stack top in memo; index is string arg 0x71: # "q" - id: "binput" - -orig-id: BINPUT - doc: store stack top in memo; index is 1-byte arg + id: "binput" + -orig-id: BINPUT + doc: store stack top in memo; index is 1-byte arg 0x72: # "r" - id: "long_binput" - -orig-id: LONG_BINPUT - doc: store stack top in memo; index is 4-byte arg + id: "long_binput" + -orig-id: LONG_BINPUT + doc: store stack top in memo; index is 4-byte arg 0x73: # "s" - id: "setitem" - -orig-id: SETITEM - doc: add key+value pair to dict + id: "setitem" + -orig-id: SETITEM + doc: add key+value pair to dict 0x74: # "t" - id: "tuple" - -orig-id: TUPLE - doc: build tuple from topmost stack items + id: "tuple" + -orig-id: TUPLE + doc: build tuple from topmost stack items 0x29: # ")" - id: "empty_tuple" - -orig-id: EMPTY_TUPLE - doc: push empty tuple + id: "empty_tuple" + -orig-id: EMPTY_TUPLE + doc: push empty tuple 0x75: # "u" - id: "setitems" - -orig-id: SETITEMS - doc: modify dict by adding topmost key+value pairs + id: "setitems" + -orig-id: SETITEMS + doc: modify dict by adding topmost key+value pairs 0x47: # "G" - id: "binfloat" - -orig-id: BINFLOAT - doc: push float; arg is 8-byte float encoding + id: "binfloat" + -orig-id: BINFLOAT + doc: push float; arg is 8-byte float encoding #'I01\n': # id: "true" @@ -487,63 +487,63 @@ enums: # Protocol 2 0x80: - id: "proto" - -orig-id: PROTO - doc: identify pickle protocol + id: "proto" + -orig-id: PROTO + doc: identify pickle protocol 0x81: - id: "newobj" - -orig-id: NEWOBJ - doc: build object by applying cls.__new__ to argtuple + id: "newobj" + -orig-id: NEWOBJ + doc: build object by applying cls.__new__ to argtuple 0x82: - id: "ext1" - -orig-id: EXT1 - doc: push object from extension registry; 1-byte index + id: "ext1" + -orig-id: EXT1 + doc: push object from extension registry; 1-byte index 0x83: - id: "ext2" - -orig-id: EXT2 - 
doc: ditto, but 2-byte index + id: "ext2" + -orig-id: EXT2 + doc: ditto, but 2-byte index 0x84: - id: "ext4" - -orig-id: EXT4 - doc: ditto, but 4-byte index + id: "ext4" + -orig-id: EXT4 + doc: ditto, but 4-byte index 0x85: - id: "tuple1" - -orig-id: TUPLE1 - doc: build 1-tuple from stack top + id: "tuple1" + -orig-id: TUPLE1 + doc: build 1-tuple from stack top 0x86: - id: "tuple2" - -orig-id: TUPLE2 - doc: build 2-tuple from two topmost stack items + id: "tuple2" + -orig-id: TUPLE2 + doc: build 2-tuple from two topmost stack items 0x87: - id: "tuple3" - -orig-id: TUPLE3 - doc: build 3-tuple from three topmost stack items + id: "tuple3" + -orig-id: TUPLE3 + doc: build 3-tuple from three topmost stack items 0x88: - id: "newtrue" - -orig-id: NEWTRUE - doc: push True + id: "newtrue" + -orig-id: NEWTRUE + doc: push True 0x89: - id: "newfalse" - -orig-id: NEWFALSE - doc: push False + id: "newfalse" + -orig-id: NEWFALSE + doc: push False 0x8a: - id: "long1" - -orig-id: LONG1 - doc: push long from < 256 bytes + id: "long1" + -orig-id: LONG1 + doc: push long from < 256 bytes 0x8b: - id: "long4" - -orig-id: LONG4 - doc: push really big long + id: "long4" + -orig-id: LONG4 + doc: push really big long # Protocol 3 (Python 3.x) 0x42: # "B" - id: "binbytes" - -orig-id: BINBYTES - doc: push bytes; counted binary string argument + id: "binbytes" + -orig-id: BINBYTES + doc: push bytes; counted binary string argument 0x43: # "C" - id: "short_binbytes" - -orig-id: SHORT_BINBYTES - doc: push bytes; counted binary string argument < 256 bytes + id: "short_binbytes" + -orig-id: SHORT_BINBYTES + doc: push bytes; counted binary string argument < 256 bytes # Protocol 4 0x8c: From 50c81b9302eb77f386ef6039b51578b00db8a5f1 Mon Sep 17 00:00:00 2001 From: Alex Willmer Date: Sat, 25 May 2019 21:18:32 +0100 Subject: [PATCH 19/20] Use lowercase for all encoding: declarations --- serialization/python_pickle.ksy | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git
a/serialization/python_pickle.ksy b/serialization/python_pickle.ksy index 95ced4dfe..4c3868e50 100644 --- a/serialization/python_pickle.ksy +++ b/serialization/python_pickle.ksy @@ -128,7 +128,7 @@ types: seq: - id: val type: str - encoding: ASCII + encoding: ascii terminator: 0x0a # "\n" doc: Integer, encoded with the ASCII characters [0-9-]. @@ -136,7 +136,7 @@ types: seq: - id: val type: str - encoding: ASCII + encoding: ascii terminator: 0x0a # "\n" doc: Integer, encoded with the ASCII chracters [0-9-], followed by 'L'. @@ -145,7 +145,7 @@ types: seq: - id: val type: str - encoding: ASCII + encoding: ascii terminator: 0x0a # "\n" doc: Quoted string, possibly containing Python string escapes. @@ -153,7 +153,7 @@ types: seq: - id: val type: str - encoding: ASCII + encoding: ascii terminator: 0x0a # "\n" doc: Unquoted string, does not contain string escapes. @@ -169,7 +169,7 @@ types: seq: - id: val type: str - encoding: ASCII + encoding: ascii terminator: 0x0a # "\n" doc: Unquoted string, containing Python Unicode escapes. @@ -177,7 +177,7 @@ types: seq: - id: val type: str - encoding: ASCII + encoding: ascii terminator: 0x0a # "\n" doc: | Double float, encoded with the ASCII characters [0-9.e+-], '-inf', 'inf', From b10f69313fddeca0f3bfe10201b822d85d71ede4 Mon Sep 17 00:00:00 2001 From: Alex Willmer Date: Sat, 25 May 2019 21:22:27 +0100 Subject: [PATCH 20/20] Document protocol 0 False/True special case --- serialization/python_pickle.ksy | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/serialization/python_pickle.ksy b/serialization/python_pickle.ksy index 4c3868e50..c85f187e7 100644 --- a/serialization/python_pickle.ksy +++ b/serialization/python_pickle.ksy @@ -130,7 +130,11 @@ types: type: str encoding: ascii terminator: 0x0a # "\n" - doc: Integer, encoded with the ASCII characters [0-9-]. + doc: | + Integer or boolean, encoded with the ASCII characters [0-9-]. 
+ + The values '00' and '01' encode the Python values `False` and `True`. + Normally a value would not contain leading '0' characters. decimalnl_long: seq: @@ -478,13 +482,6 @@ enums: -orig-id: BINFLOAT doc: push float; arg is 8-byte float encoding - #'I01\n': - # id: "true" - # doc: not an opcode; see INT docs in pickletools.py - #'I00\n': - # id: "false" - # doc: not an opcode; see INT docs in pickletools.py - # Protocol 2 0x80: id: "proto"