diff --git a/format/all/all.go b/format/all/all.go index 782067193..95ffa72a5 100644 --- a/format/all/all.go +++ b/format/all/all.go @@ -31,6 +31,7 @@ import ( _ "github.com/wader/fq/format/png" _ "github.com/wader/fq/format/protobuf" _ "github.com/wader/fq/format/raw" + _ "github.com/wader/fq/format/sqlite3" _ "github.com/wader/fq/format/tar" _ "github.com/wader/fq/format/tiff" _ "github.com/wader/fq/format/vorbis" diff --git a/format/format.go b/format/format.go index 80babc890..5a9a2e58f 100644 --- a/format/format.go +++ b/format/format.go @@ -79,6 +79,7 @@ const ( RAW = "raw" SLL_PACKET = "sll_packet" SLL2_PACKET = "sll2_packet" + SQLITE3 = "sqlite3" TAR = "tar" TCP_SEGMENT = "tcp_segment" TIFF = "tiff" diff --git a/format/sqlite3/sqlite3.go b/format/sqlite3/sqlite3.go new file mode 100644 index 000000000..14f00ac4a --- /dev/null +++ b/format/sqlite3/sqlite3.go @@ -0,0 +1,423 @@ +package sqlite3 + +// https://www.sqlite.org/fileformat.html +// https://sqlite.org/src/file?name=src/btreeInt.h&ci=trunk +// https://sqlite.org/schematab.html + +// TODO: format version +// TODO: table/column names +// TODO: assert version and schema version? +// TODO: ptrmap +// TODO: wal/journal files? combine? + +// > A table with the name "sqlite_sequence" that is used to keep track of the maximum historical INTEGER PRIMARY KEY for a table using AUTOINCREMENT. +// CREATE TABLE sqlite_sequence(name,seq); +// > Tables with names of the form "sqlite_statN" where N is an integer. Such tables store database statistics gathered by the ANALYZE command and used by the query planner to help determine the best algorithm to use for each query. 
+// CREATE TABLE sqlite_stat1(tbl,idx,stat);
+// Only if compiled with SQLITE_ENABLE_STAT2:
+// CREATE TABLE sqlite_stat2(tbl,idx,sampleno,sample);
+// Only if compiled with SQLITE_ENABLE_STAT3:
+// CREATE TABLE sqlite_stat3(tbl,idx,nEq,nLt,nDLt,sample);
+// Only if compiled with SQLITE_ENABLE_STAT4:
+// CREATE TABLE sqlite_stat4(tbl,idx,nEq,nLt,nDLt,sample);
+// TODO: sqlite_autoindex_TABLE_N index
+
+import (
+	"bytes"
+	"embed"
+
+	"github.com/wader/fq/format"
+	"github.com/wader/fq/format/registry"
+	"github.com/wader/fq/internal/mathextra"
+	"github.com/wader/fq/pkg/bitio"
+	"github.com/wader/fq/pkg/decode"
+	"github.com/wader/fq/pkg/scalar"
+)
+
+// sqlite3FS holds the jq helper files shipped with this format.
+//
+//go:embed *.jq
+var sqlite3FS embed.FS
+
+// init registers the sqlite3 format in the probe group, with sqlite3Decode as
+// decoder and the jq function _sqlite3_torepr as its representation function.
+func init() {
+	registry.MustRegister(decode.Format{
+		Name:        format.SQLITE3,
+		Description: "SQLite v3 database",
+		Groups:      []string{format.PROBE},
+		DecodeFn:    sqlite3Decode,
+		ToRepr:      "_sqlite3_torepr",
+		Files:       sqlite3FS,
+	})
+}
+
+// sqlite3HeaderSize is the size in bytes of the database header that precedes
+// the b-tree page header on the first page.
+const sqlite3HeaderSize = 100
+
+// Serial type codes found in a record header. Codes >= 12 are not listed here:
+// even codes are blobs and odd codes are text, with the length derived from
+// the code itself (see sqlite3DecodeSerialType).
+const (
+	serialTypeNULL       = 0
+	serialTypeS8         = 1
+	serialTypeSBE16      = 2
+	serialTypeSBE24      = 3
+	serialTypeSBE32      = 4
+	serialTypeSBE48      = 5
+	serialTypeSBE64      = 6
+	serialTypeFloatBE64  = 7
+	serialTypeInteger0   = 8
+	serialTypeInteger1   = 9
+	serialTypeInternal10 = 10
+	serialTypeInternal11 = 11
+)
+
+// serialTypeMap gives a description for each fixed serial type code.
+var serialTypeMap = scalar.SToSymStr{
+	serialTypeNULL:       "null",
+	serialTypeS8:         "s8",
+	serialTypeSBE16:      "sbe16",
+	serialTypeSBE24:      "sbe24",
+	serialTypeSBE32:      "sbe32",
+	serialTypeSBE48:      "sbe48",
+	serialTypeSBE64:      "sbe64",
+	serialTypeFloatBE64:  "floatbe64",
+	serialTypeInteger0:   "integer0",
+	serialTypeInteger1:   "integer1",
+	serialTypeInternal10: "internal10",
+	serialTypeInternal11: "internal11",
+}
+
+// Page type values stored in the first byte of a page.
+const (
+	pageTypePtrmap             = 0x00
+	pageTypeBTreeIndexInterior = 0x02
+	pageTypeBTreeTableInterior = 0x05
+	pageTypeBTreeIndexLeaf     = 0x0a
+	pageTypeBTreeTableLeaf     = 0x0d
+)
+
+// pageTypeMap maps b-tree page type values to symbols. Ptrmap is left out
+// because ptrmap pages are not decoded yet.
+var pageTypeMap = scalar.UToSymStr{
+	// pageTypePtrmap:             "ptrmap",
+	pageTypeBTreeIndexInterior: "index_interior",
+	pageTypeBTreeTableInterior: "table_interior",
+	pageTypeBTreeIndexLeaf:     "index_leaf",
+	pageTypeBTreeTableLeaf:     "table_leaf",
+}
+
+// ptrmapTypeMap maps ptrmap entry types to symbols. Currently only referenced
+// from the commented-out ptrmap decoding in sqlite3Decode.
+var ptrmapTypeMap = scalar.UToSymStr{
+	1: "rootpage",
+	2: "freepage",
+	3: "overflow1",
+	4: "overflow2",
+	5: "btree",
+}
+
+// Text encoding values of the "text_encoding" header field.
+const (
+	textEncodingUTF8    = 1
+	textEncodingUTF16LE = 2
+	textEncodingUTF16BE = 3
+)
+
+// textEncodingMap maps text encoding values to symbols.
+var textEncodingMap = scalar.UToSymStr{
+	textEncodingUTF8:    "utf8",
+	textEncodingUTF16LE: "utf16le",
+	textEncodingUTF16BE: "utf16be",
+}
+
+// versionMap maps the read and write file format version values to symbols.
+var versionMap = scalar.UToSymStr{
+	1: "legacy",
+	2: "wal",
+}
+
+// sqlite3Header holds the database header values needed to decode pages.
+type sqlite3Header struct {
+	pageSize          int64 // page size in bytes
+	usableSize        int64 // page size minus the reserved bytes at the end of each page, "U" in the spec
+	databaseSizePages int   // the "in-header database size" in pages
+	textEncoding      int   // one of the textEncoding* constants
+}
+
+// varintDecode reads a SQLite variable-length integer of one to nine bytes and
+// returns it as a 64 bit two's complement value.
+//
+// The first eight bytes contribute their lower seven bits each and a clear
+// high bit ends the integer. If a ninth byte is reached, all eight of its bits
+// are used, which is what makes the full 64 bits reachable.
+func varintDecode(d *decode.D) int64 {
+	var n uint64
+	for i := 0; i < 8; i++ {
+		v := d.U8()
+		n = n<<7 | v&0b0111_1111
+		if v&0b1000_0000 == 0 {
+			return mathextra.TwosComplement(64, n)
+		}
+	}
+	// ninth byte: no continuation bit, all eight bits are payload
+	n = n<<8 | d.U8()
+	return mathextra.TwosComplement(64, n)
+}
+
+// sqlite3DecodeSerialType decodes one record column as field "value" based on
+// its serial type code typ. h supplies the text encoding used for text columns.
+func sqlite3DecodeSerialType(d *decode.D, h sqlite3Header, typ int64) {
+	switch typ {
+	case serialTypeNULL:
+		d.FieldValueNil("value")
+	case serialTypeS8:
+		d.FieldS8("value")
+	case serialTypeSBE16:
+		d.FieldS16("value")
+	case serialTypeSBE24:
+		d.FieldS24("value")
+	case serialTypeSBE32:
+		d.FieldS32("value")
+	case serialTypeSBE48:
+		d.FieldS48("value")
+	case serialTypeSBE64:
+		d.FieldS64("value")
+	case serialTypeFloatBE64:
+		d.FieldF64("value")
+	case serialTypeInteger0:
+		// the constants 0 and 1 occupy no bytes in the record body
+		d.FieldValueU("value", 0)
+	case serialTypeInteger1:
+		d.FieldValueU("value", 1)
+	case serialTypeInternal10, serialTypeInternal11:
+		// internal, should not appear in a well-formed file
+	default:
+		if typ%2 == 0 {
+			// N >= 12 and even: (N-12)/2 bytes blob.
+			d.FieldRawLen("value", (typ-12)/2*8)
+		} else {
+			// N >= 13 and odd: (N-13)/2 bytes text
+			l := int(typ-13) / 2
+			switch h.textEncoding {
+			case textEncodingUTF8:
+				d.FieldUTF8("value", l)
+			case textEncodingUTF16LE:
+				d.FieldUTF16LE("value", l)
+			case textEncodingUTF16BE:
+				d.FieldUTF16BE("value", l)
+			default:
+				// Unknown text encoding in the header: still consume the bytes
+				// so the columns that follow stay aligned.
+				d.FieldRawLen("value", int64(l)*8)
+			}
+		}
+	}
+}
+
+// sqlite3DecodeCellFreeblock decodes one freeblock at the current position and
+// returns the page offset of the next freeblock in the chain, or 0 when the
+// chain ends.
+//
+// The size is decoded for every freeblock, including the last one whose
+// next_offset is zero. A size smaller than the 4-byte freeblock header is
+// malformed and also ends the chain.
+func sqlite3DecodeCellFreeblock(d *decode.D) uint64 {
+	nextOffset := d.FieldU16("next_offset")
+	// "size of the freeblock in bytes, including the 4-byte header"
+	size := d.FieldU16("size")
+	if size < 4 {
+		// would underflow below; treat as end of chain
+		return 0
+	}
+	d.FieldRawLen("space", int64(size-4)*8)
+	return nextOffset
+}
+
+// sqlite3CellPayloadDecode decodes a record: the header length, the array of
+// serial type codes ("serials") and then one value per serial type
+// ("contents").
+func sqlite3CellPayloadDecode(d *decode.D, h sqlite3Header) {
+	lengthStart := d.Pos()
+	length := d.FieldSFn("length", varintDecode)
+	lengthBits := d.Pos() - lengthStart
+	var serialTypes []int64
+	// the header length includes the length varint itself, so subtract what
+	// has already been read
+	d.LenFn((length)*8-lengthBits, func(d *decode.D) {
+		d.FieldArray("serials", func(d *decode.D) {
+			for !d.End() {
+				serialTypes = append(serialTypes, d.FieldSFn("serial", varintDecode, scalar.Fn(func(s scalar.S) (scalar.S, error) {
+					typ := s.ActualS()
+					if st, ok := serialTypeMap[typ]; ok {
+						s.Description = st
+					} else if typ >= 12 && typ%2 == 0 {
+						s.Description = "blob"
+					} else if typ >= 13 && typ%2 != 0 {
+						s.Description = "text"
+					}
+
+					return s, nil
+				})))
+			}
+		})
+	})
+	d.FieldArray("contents", func(d *decode.D) {
+		for _, s := range serialTypes {
+			sqlite3DecodeSerialType(d, h, s)
+		}
+	})
+}
+
+// sqlite3DecodeTreePage decodes the payload of a b-tree cell at the current
+// position.
+//
+// x is the maximum payload size that is stored entirely on the b-tree page
+// ("X" in the spec) and payLoadLen is the total payload size ("P"). When the
+// payload does not fit, the part stored in the cell and all overflow pages are
+// decoded as "overflow_pages" and the reassembled bytes are decoded as
+// "payload".
+func sqlite3DecodeTreePage(d *decode.D, h sqlite3Header, x int64, payLoadLen int64) {
+	// formulas from sqlite format spec, U is the usable page size
+	u := h.usableSize
+	if u <= 4 {
+		// malformed header, the formulas below would divide by zero
+		return
+	}
+	p := payLoadLen
+	m := ((u - 12) * 32 / 255) - 23
+	k := m + ((p - m) % (u - 4))
+
+	var firstPayLoadLen int64
+	if k <= x {
+		firstPayLoadLen = k
+	} else {
+		firstPayLoadLen = m
+	}
+
+	if p <= x {
+		// payload fits in page
+		d.LenFn(firstPayLoadLen*8, func(d *decode.D) {
+			d.FieldStruct("payload", func(d *decode.D) { sqlite3CellPayloadDecode(d, h) })
+		})
+		return
+	}
+
+	// payload overflows, collect payload parts
+	payLoadBB := &bytes.Buffer{}
+
+	d.FieldArray("overflow_pages", func(d *decode.D) {
+		var nextPage int64
+		// first part is stored in the cell itself, followed by the page
+		// number of the first overflow page
+		d.FieldStruct("overflow_page", func(d *decode.D) {
+			bib := d.FieldRawLen("data", firstPayLoadLen*8)
+			nextPage = int64(d.FieldU32("next_page"))
+
+			bb, err := bib.Bytes()
+			if err != nil {
+				d.IOPanic(err, "first overflow page")
+			}
+			payLoadBB.Write(bb)
+		})
+
+		payLoadLenLeft := payLoadLen - firstPayLoadLen
+		// Also stop once the whole payload has been collected, so that a
+		// cyclic or overlong chain in a malformed file cannot loop forever.
+		for nextPage != 0 && payLoadLenLeft > 0 {
+			// page numbers are 1-based
+			d.SeekAbs(((nextPage - 1) * h.pageSize) * 8)
+			d.FieldStruct("overflow_page", func(d *decode.D) {
+				nextPage = int64(d.FieldU32("next_page"))
+				overflowSize := mathextra.MinInt64(u-4, payLoadLenLeft)
+				bib := d.FieldRawLen("data", overflowSize*8)
+				payLoadLenLeft -= overflowSize
+
+				bb, err := bib.Bytes()
+				if err != nil {
+					d.IOPanic(err, "overflow page")
+				}
+				payLoadBB.Write(bb)
+			})
+		}
+	})
+
+	d.FieldStructRootBitBufFn("payload",
+		bitio.NewBufferFromBytes(payLoadBB.Bytes(), -1),
+		func(d *decode.D) { sqlite3CellPayloadDecode(d, h) },
+	)
+}
+
+// sqlite3Decode decodes a SQLite 3 database file: the 100 byte header followed
+// by header.database_size_pages pages. B-tree pages get their page header,
+// cell pointer array, freeblock chain and cells decoded. Any other page type
+// is kept as raw "data". It always returns nil.
+func sqlite3Decode(d *decode.D, in interface{}) interface{} {
+	var h sqlite3Header
+
+	d.FieldStruct("header", func(d *decode.D) {
+		d.FieldUTF8("magic", 16, d.AssertStr("SQLite format 3\x00"))
+		// Page size in bytes. Must be a power of two between 512 and 32768 inclusive, or the value 1 representing a page size of 65536.
+		pageSizeS := d.FieldScalarU16("page_size", scalar.UToSymU{1: 65536})
+		d.FieldU8("write_version", versionMap)            // 1 for legacy; 2 for WAL.
+		d.FieldU8("read_version", versionMap)             // 1 for legacy; 2 for WAL.
+		reservedBytes := int64(d.FieldU8("unused_space")) // Reserved bytes at the end of each page. Usually 0.
+		d.FieldU8("maximum_embedded_payload_fraction")    // Must be 64.
+		d.FieldU8("minimum_embedded_payload_fraction")    // Must be 32.
+		d.FieldU8("leaf_payload_fraction")                // Must be 32.
+		d.FieldU32("file_change_counter")
+		databaseSizePages := int(d.FieldU32("database_size_pages")) // The "in-header database size".
+		d.FieldU32("page_number_freelist")                          // Page number of the first freelist trunk page.
+		d.FieldU32("total_number_freelist")                         // Total number of freelist pages.
+		d.FieldU32("schema_cookie")
+		d.FieldU32("schema_format_number")           // Supported schema formats are 1, 2, 3, and 4.
+		d.FieldU32("default_page_cache_size")
+		d.FieldU32("page_number_largest_root_btree") // Largest root b-tree page when in auto-vacuum or incremental-vacuum modes, or zero otherwise.
+		textEncoding := int(d.FieldU32("text_encoding", textEncodingMap))
+		d.FieldU32("user_version")                       // As read and set by the user_version pragma.
+		d.FieldU32("incremental_vacuum_mode")            // True (non-zero) for incremental-vacuum mode. False (zero) otherwise.
+		d.FieldU32("application_id")                     // Set by PRAGMA application_id.
+		d.FieldRawLen("reserved", 160, d.BitBufIsZero()) // Reserved for expansion. Must be zero.
+		d.FieldU32("version_valid_for")
+		d.FieldU32("sqlite_version_number")
+
+		// TODO: nicer API for fallback?
+		// use the symbolic value when the raw value 1 was mapped to 65536
+		pageSize := int64(pageSizeS.ActualU())
+		if pageSizeS.Sym != nil {
+			pageSize = int64(pageSizeS.SymU())
+		}
+
+		h = sqlite3Header{
+			pageSize:          pageSize,
+			usableSize:        pageSize - reservedBytes,
+			databaseSizePages: databaseSizePages,
+			textEncoding:      textEncoding,
+		}
+	})
+
+	// "X" for index b-tree cells, formula from sqlite format spec
+	indexMaxLocal := ((h.usableSize-12)*64/255) - 23
+	// "X" for table b-tree leaf cells
+	tableLeafMaxLocal := h.usableSize - 35
+
+	d.FieldArray("pages", func(d *decode.D) {
+		for i := 0; i < h.databaseSizePages; i++ {
+			pageOffset := h.pageSize * int64(i)
+			d.SeekAbs(pageOffset * 8)
+			// skip header for first page
+			if i == 0 {
+				d.SeekRel(sqlite3HeaderSize * 8)
+			}
+
+			d.FieldStruct("page", func(d *decode.D) {
+				typ := d.FieldU8("type", pageTypeMap)
+				switch typ {
+				// case pageTypePtrmap:
+				// TODO: how to know if just an overflow page?
+				// log.Printf("ptrmap i: %#+v\n", i)
+				// d.FieldArray("entries", func(d *decode.D) {
+				// 	for j := int64(0); j < h.pageSize/5; j++ {
+				// 		d.FieldStruct("entry", func(d *decode.D) {
+				// 			d.FieldU8("type", ptrmapTypeMap)
+				// 			d.FieldU32("page_number")
+				// 		})
+				// 	}
+				// })
+				case pageTypeBTreeIndexInterior,
+					pageTypeBTreeIndexLeaf,
+					pageTypeBTreeTableInterior,
+					pageTypeBTreeTableLeaf:
+					startFreeblocks := d.FieldU16("start_freeblocks") // The two-byte integer at offset 1 gives the start of the first freeblock on the page, or is zero if there are no freeblocks.
+					pageCells := d.FieldU16("page_cells")             // The two-byte integer at offset 3 gives the number of cells on the page.
+					d.FieldU16("cell_start")                          // The two-byte integer at offset 5 designates the start of the cell content area. A zero value for this integer is interpreted as 65536.
+					d.FieldU8("cell_fragments")                       // The one-byte integer at offset 7 gives the number of fragmented free bytes within the cell content area.
+
+					switch typ {
+					case pageTypeBTreeIndexInterior,
+						pageTypeBTreeTableInterior:
+						d.FieldU32("right_pointer") // The four-byte page number at offset 8 is the right-most pointer. This value appears in the header of interior b-tree pages only and is omitted from all other pages.
+					}
+					var cellPointers []uint64
+					d.FieldArray("cells_pointers", func(d *decode.D) {
+						for j := uint64(0); j < pageCells; j++ {
+							cellPointers = append(cellPointers, d.FieldU16("pointer"))
+						}
+					})
+					if startFreeblocks != 0 {
+						d.FieldArray("freeblocks", func(d *decode.D) {
+							nextOffset := startFreeblocks
+							for nextOffset != 0 {
+								offset := nextOffset
+								// offsets are relative to the start of the page
+								d.SeekAbs((pageOffset + int64(offset)) * 8)
+								d.FieldStruct("freeblock", func(d *decode.D) {
+									nextOffset = sqlite3DecodeCellFreeblock(d)
+								})
+								// Freeblocks are chained in order of increasing
+								// offset. Anything else is malformed and would
+								// risk looping forever.
+								if nextOffset <= offset {
+									break
+								}
+							}
+						})
+					}
+					d.FieldArray("cells", func(d *decode.D) {
+						for _, p := range cellPointers {
+							d.FieldStruct("cell", func(d *decode.D) {
+								// TODO: SeekAbs with fn later?
+								d.SeekAbs((pageOffset + int64(p)) * 8)
+								switch typ {
+								case pageTypeBTreeIndexInterior:
+									d.FieldU32("left_child")
+									payLoadLen := d.FieldSFn("payload_len", varintDecode)
+									sqlite3DecodeTreePage(d, h, indexMaxLocal, payLoadLen)
+								case pageTypeBTreeTableInterior:
+									d.FieldU32("left_child")
+									d.FieldSFn("rowid", varintDecode)
+								case pageTypeBTreeIndexLeaf:
+									payLoadLen := d.FieldSFn("payload_len", varintDecode)
+									sqlite3DecodeTreePage(d, h, indexMaxLocal, payLoadLen)
+								case pageTypeBTreeTableLeaf:
+									payLoadLen := d.FieldSFn("payload_len", varintDecode)
+									d.FieldSFn("rowid", varintDecode)
+									sqlite3DecodeTreePage(d, h, tableLeafMaxLocal, payLoadLen)
+								}
+							})
+						}
+					})
+				default:
+					// NOTE(review): only the type byte has been read at this
+					// point, so pageSize-4 leaves the last 3 bytes of the page
+					// uncovered; confirm whether pageSize-1 was intended.
+					d.FieldRawLen("data", (h.pageSize-4)*8)
+				}
+			})
+		}
+	})
+
+	return nil
+}
diff --git a/format/sqlite3/sqlite3.jq b/format/sqlite3/sqlite3.jq
new file mode 100644
index 000000000..fe0047ee2
--- /dev/null
+++ b/format/sqlite3/sqlite3.jq
@@ -0,0 +1,61 @@
+# Page numbers in the file are 1-based while .pages is a 0-based array, hence the -1 below.
+
+# [
+#   12556,
+#   12557,
+#   12558
+# ]
+
+def sqlite3_btree_walk($page):
+  ( . as $root
+  | ( def _t:
+        if .type == "table_interior" or .type == "index_interior" then
+          ($root.pages[.cells[].left_child-1, .right_pointer-1] | _t)
+        elif .type == "table_leaf" or .type == "index_leaf" then
+          .cells[]
+        else
+          error("unknown page type \(.type)")
+        end;
+      ($page | _t)
+    )
+  );
+
+# CREATE TABLE sqlite_schema(
+#   type text,
+#   name text,
+#   tbl_name text,
+#   rootpage integer,
+#   sql text
+# );
+def sqlite3_schema:
+  ( [ sqlite3_btree_walk(.pages[0])
+    | .payload.contents as [$type, $name, $tbl_name, $rootpage, $sql]
+    | { key: $name,
+        value: {$type, $name, $tbl_name, $rootpage, $sql}
+      }
+    ]
+  | from_entries
+  );
+
+def sqlite3_rows($name):
+  ( sqlite3_schema[$name] as $s
+  | if $s == null then error("could not find name") end
+  | sqlite3_btree_walk(.pages[$s.rootpage-1])
+  | . as {rowid: $rowid, payload: {$contents}}
+  | $contents
+  | tovalue
+  | if .[0] == null then .[0] = $rowid end
+  );
+
+def _sqlite3_torepr:
+  ( . as $root
+  | sqlite3_schema
+  | map(
+      ( select(.type == "table") as $t
+      | { key: $t.name,
+          value: [$root | sqlite3_rows($t.name)]
+        }
+      )
+    )
+  | from_entries
+  );
diff --git a/format/sqlite3/testdata/test.db b/format/sqlite3/testdata/test.db
new file mode 100644
index 000000000..a6bc9ba53
Binary files /dev/null and b/format/sqlite3/testdata/test.db differ
diff --git a/format/sqlite3/testdata/test.sh b/format/sqlite3/testdata/test.sh
new file mode 100755
index 000000000..8663f8e38
--- /dev/null
+++ b/format/sqlite3/testdata/test.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+cat test.sql | sqlite3 test.db
diff --git a/format/sqlite3/testdata/test.sql b/format/sqlite3/testdata/test.sql
new file mode 100644
index 000000000..98cfd8e55
--- /dev/null
+++ b/format/sqlite3/testdata/test.sql
@@ -0,0 +1,18 @@
+CREATE TABLE aaa (
+    cint int primary key,
+    cvarchar varchar(30),
+    ctext text,
+    creal real,
+    cblob blob
+);
+INSERT INTO "aaa" VALUES(0, 'var1', 'text1', 0, "blob1");
+INSERT INTO "aaa" VALUES(1, 'var2', 'test2', 1, "blob2");
+INSERT INTO "aaa" VALUES(128, 'var3', 'test3', 128, "blob3");
+INSERT INTO "aaa" VALUES(-128, 'var3', 'test3', -128, "blob3");
+INSERT INTO "aaa" VALUES(9223372036854775807, 'var4', 'test4', 9223372036854775807, "blob4");
+INSERT INTO "aaa" VALUES(-9223372036854775808, 'var5', 'test5', -9223372036854775808, "blob5");
+
+-- CREATE TABLE aaa (
+--     cint int primary key
+-- );
+-- INSERT INTO "aaa" VALUES(123);