diff --git a/format/all/all.go b/format/all/all.go
index 86d5801c05..670318ac6e 100644
--- a/format/all/all.go
+++ b/format/all/all.go
@@ -26,6 +26,7 @@ import (
 	_ "github.com/wader/fq/format/png"
 	_ "github.com/wader/fq/format/protobuf"
 	_ "github.com/wader/fq/format/raw"
+	_ "github.com/wader/fq/format/sqlite3"
 	_ "github.com/wader/fq/format/tar"
 	_ "github.com/wader/fq/format/tiff"
 	_ "github.com/wader/fq/format/vorbis"
diff --git a/format/format.go b/format/format.go
index 0874fe9ff6..e2be7291c9 100644
--- a/format/format.go
+++ b/format/format.go
@@ -88,6 +88,8 @@ const (
 	WAV = "wav"
 	WEBP = "webp"
 	ZIP = "zip"
+
+	SQLITE3 = "sqlite3"
 )
 
 // below are data types used to communicate between formats In/Out
diff --git a/format/sqlite3/sqlite3.go b/format/sqlite3/sqlite3.go
new file mode 100644
index 0000000000..ed295011c3
--- /dev/null
+++ b/format/sqlite3/sqlite3.go
@@ -0,0 +1,310 @@
+package sqlite3
+
+// https://www.sqlite.org/fileformat.html
+// https://sqlite.org/schematab.html
+
+// TODO: page overflow
+// TODO: format version
+// TODO: text encoding
+// TODO: table/column names
+// TODO: assert version and schema version?
+// TODO: ptrmap
+// TODO: how to represent NULL serials
+
+// CREATE TABLE sqlite_schema(
+//   type text,
+//   name text,
+//   tbl_name text,
+//   rootpage integer,
+//   sql text
+// );
+// > A table with the name "sqlite_sequence" that is used to keep track of the maximum historical INTEGER PRIMARY KEY for a table using AUTOINCREMENT.
+// CREATE TABLE sqlite_sequence(name,seq);
+// > Tables with names of the form "sqlite_statN" where N is an integer. Such tables store database statistics gathered by the ANALYZE command and used by the query planner to help determine the best algorithm to use for each query.
+// CREATE TABLE sqlite_stat1(tbl,idx,stat);
+// Only if compiled with SQLITE_ENABLE_STAT2:
+// CREATE TABLE sqlite_stat2(tbl,idx,sampleno,sample);
+// Only if compiled with SQLITE_ENABLE_STAT3:
+// CREATE TABLE sqlite_stat3(tbl,idx,nEq,nLt,nDLt,sample);
+// Only if compiled with SQLITE_ENABLE_STAT4:
+// CREATE TABLE sqlite_stat4(tbl,idx,nEq,nLt,nDLt,sample);
+// TODO: sqlite_autoindex_TABLE_N index
+
+import (
+	"embed"
+
+	"github.com/wader/fq/format"
+	"github.com/wader/fq/format/registry"
+	"github.com/wader/fq/internal/num"
+	"github.com/wader/fq/pkg/decode"
+	"github.com/wader/fq/pkg/scalar"
+)
+
+//go:embed *.jq
+var sqlite3FS embed.FS
+
+// init registers the sqlite3 format together with the embedded jq files.
+func init() {
+	registry.MustRegister(decode.Format{
+		Name:        format.SQLITE3,
+		Description: "SQLite v3 database",
+		Groups:      []string{format.PROBE},
+		DecodeFn:    sqlite3Decode,
+		Files:       sqlite3FS,
+	})
+}
+
+// B-tree page types, stored in the first byte of a b-tree page header.
+const (
+	bTreeIndexInterior = 0x02
+	bTreeTableInterior = 0x05
+	bTreeIndexLeaf     = 0x0a
+	bTreeTableLeaf     = 0x0d
+)
+
+// bTreeTypeMap maps b-tree page types to symbols and descriptions.
+var bTreeTypeMap = scalar.UToScalar{
+	bTreeIndexInterior: scalar.S{Sym: "index_interior", Description: "Index interior b-tree page"},
+	bTreeTableInterior: scalar.S{Sym: "table_interior", Description: "Table interior b-tree page"},
+	bTreeIndexLeaf:     scalar.S{Sym: "index_leaf", Description: "Index leaf b-tree page"},
+	bTreeTableLeaf:     scalar.S{Sym: "table_leaf", Description: "Table leaf b-tree page"},
+}
+
+// Database text encodings, stored in the text_encoding header field.
+const (
+	textEncodingUTF8    = 1
+	textEncodingUTF16LE = 2
+	textEncodingUTF16BE = 3
+)
+
+// textEncodingMap maps text encodings to symbols.
+var textEncodingMap = scalar.UToSymStr{
+	textEncodingUTF8:    "utf8",
+	textEncodingUTF16LE: "utf16le",
+	textEncodingUTF16BE: "utf16be",
+}
+
+// versionMap maps file format write and read versions to symbols.
+var versionMap = scalar.UToSymStr{
+	1: "legacy",
+	2: "wal",
+}
+
+// varintDecode reads a SQLite variable-length integer of 1 to 9 bytes. Each of
+// the first eight bytes contributes its lower 7 bits and uses the high bit as a
+// continuation flag; a ninth byte, if present, contributes all 8 bits. The
+// accumulated bits are converted to a signed value with num.TwosComplement.
+// TODO: two complement on bit read count
+func varintDecode(d *decode.D) int64 {
+	var n uint64
+	for i := 0; i < 9; i++ {
+		v := d.U8()
+		if i == 8 {
+			// ninth byte has no continuation flag, all 8 bits are value bits
+			n = n<<8 | v
+			break
+		}
+		n = n<<7 | v&0b0111_1111
+		if v&0b1000_0000 == 0 {
+			break
+		}
+	}
+	return num.TwosComplement(64, n)
+}
+
+// sqlite3DecodeSerialType decodes one record column as a "value" field
+// according to its serial type typ. Types 10 and 11 are reserved and decode
+// nothing. Types 12 and above are blobs when even and text when odd, with the
+// byte length derived from the type.
+func sqlite3DecodeSerialType(d *decode.D, typ int64) {
+	switch typ {
+	case 0:
+		d.FieldValueStr("value", "NULL", scalar.Description("null"))
+	case 1:
+		d.FieldS8("value", scalar.Description("8-bit integer"))
+	case 2:
+		d.FieldS16("value", scalar.Description("16-bit integer"))
+	case 3:
+		d.FieldS24("value", scalar.Description("24-bit integer"))
+	case 4:
+		d.FieldS32("value", scalar.Description("32-bit integer"))
+	case 5:
+		d.FieldS48("value", scalar.Description("48-bit integer"))
+	case 6:
+		d.FieldS64("value", scalar.Description("64-bit integer"))
+	case 7:
+		d.FieldF64("value", scalar.Description("64-bit float"))
+	case 8:
+		d.FieldValueU("value", 0, scalar.Description("constant 0"))
+	case 9:
+		d.FieldValueU("value", 1, scalar.Description("constant 1"))
+	case 10, 11:
+		// reserved for internal use, no content
+	default:
+		if typ%2 == 0 {
+			// N >= 12 and even: (N-12)/2 bytes blob.
+			d.FieldRawLen("value", (typ-12)/2*8, scalar.Description("blob"))
+		} else {
+			// N >= 13 and odd: (N-13)/2 bytes text
+			d.FieldUTF8("value", int(typ-13)/2, scalar.Description("text"))
+		}
+	}
+}
+
+// sqlite3CellFreeblockDecode decodes one freeblock: the offset of the next
+// freeblock, its size and the unallocated space. It returns the page offset of
+// the next freeblock in the chain, or zero if this is the last one.
+func sqlite3CellFreeblockDecode(d *decode.D) uint64 {
+	nextOffset := d.FieldU16("next_offset")
+	// "size of the freeblock in bytes, including the 4-byte header". The last
+	// freeblock in a chain (next_offset zero) has a size and space too.
+	size := d.FieldU16("size")
+	// a malformed size below the header size would make size-4 wrap around
+	if size >= 4 {
+		d.FieldRawLen("space", int64(size-4)*8)
+	}
+	return nextOffset
+}
+
+// sqlite3CellPayloadDecode decodes a record: a header consisting of the header
+// length followed by one serial type per column, both as varints, and then the
+// column contents in the same order as the serial types.
+func sqlite3CellPayloadDecode(d *decode.D) {
+	lengthStart := d.Pos()
+	length := d.FieldSFn("length", varintDecode)
+	// the header length includes the length varint itself
+	lengthBits := d.Pos() - lengthStart
+	var serialTypes []int64
+	d.LenFn(length*8-lengthBits, func(d *decode.D) {
+		d.FieldArray("serials", func(d *decode.D) {
+			for !d.End() {
+				serialTypes = append(serialTypes, d.FieldSFn("serial", varintDecode))
+			}
+		})
+	})
+	d.FieldArray("contents", func(d *decode.D) {
+		for _, s := range serialTypes {
+			sqlite3DecodeSerialType(d, s)
+		}
+	})
+}
+
+// sqlite3Decode decodes the 100-byte database header and then every page, as
+// counted by the header's database_size_pages, as a b-tree page with its cell
+// pointers, freeblocks and cells. The in argument is not used.
+func sqlite3Decode(d *decode.D, in interface{}) interface{} {
+	var pageSizeS *scalar.S
+	var databaseSizePages uint64
+
+	d.FieldStruct("header", func(d *decode.D) {
+		d.FieldUTF8("magic", 16, d.AssertStr("SQLite format 3\x00"))
+		pageSizeS = d.FieldScalarU16("page_size", scalar.UToSymU{1: 65536}) // Page size in bytes. Must be a power of two between 512 and 32768 inclusive, or the value 1 representing a page size of 65536.
+		d.FieldU8("write_version", versionMap)                              // File format write version. 1 for legacy; 2 for WAL.
+		d.FieldU8("read_version", versionMap)                               // File format read version. 1 for legacy; 2 for WAL.
+		d.FieldU8("unused_space")                                           // Bytes of unused "reserved" space at the end of each page. Usually 0.
+		d.FieldU8("maximum_embedded_payload_fraction")                      // Must be 64.
+		d.FieldU8("minimum_embedded_payload_fraction")                      // Must be 32.
+		d.FieldU8("leaf_payload_fraction")                                  // Must be 32.
+		d.FieldU32("file_change_counter")                                   // File change counter.
+		databaseSizePages = d.FieldU32("database_size_pages")               // Size of the database file in pages. The "in-header database size".
+		d.FieldU32("page_number_freelist")                                  // Page number of the first freelist trunk page.
+		d.FieldU32("total_number_freelist")                                 // Total number of freelist pages.
+		d.FieldU32("schema_cookie")                                         // The schema cookie.
+		d.FieldU32("schema_format_number")                                  // Supported schema formats are 1, 2, 3, and 4.
+		d.FieldU32("default_page_cache_size")                               // Default page cache size.
+		d.FieldU32("page_number_largest_root_btree")                        // Page number of the largest root b-tree page when in auto-vacuum or incremental-vacuum modes, or zero otherwise.
+		d.FieldU32("text_encoding", textEncodingMap)                        // 1 for UTF-8; 2 for UTF-16le; 3 for UTF-16be.
+		d.FieldU32("user_version")                                          // The "user version" as read and set by the user_version pragma.
+		d.FieldU32("incremental_vacuum_mode")                               // True (non-zero) for incremental-vacuum mode. False (zero) otherwise.
+		d.FieldU32("application_id")                                        // The "Application ID" set by PRAGMA application_id.
+		d.FieldRawLen("reserved", 160, d.BitBufIsZero())                    // Reserved for expansion. Must be zero.
+		d.FieldU32("version_valid_for")                                     // The version-valid-for number.
+		d.FieldU32("sqlite_version_number")                                 // SQLITE_VERSION_NUMBER of the library that most recently modified the file.
+	})
+
+	// TODO: nicer API for fallback?
+	// a stored page size of 1 has the symbolic value 65536, use it if present
+	pageSize := pageSizeS.ActualU()
+	if pageSizeS.Sym != nil {
+		pageSize = pageSizeS.SymU()
+	}
+
+	d.FieldArray("pages", func(d *decode.D) {
+		for i := uint64(0); i < databaseSizePages; i++ {
+			pageOffset := int64(pageSize) * int64(i)
+			d.SeekAbs(pageOffset * 8)
+			// skip header for first page
+			if i == 0 {
+				d.SeekRel(100 * 8)
+			}
+
+			d.FieldStruct("page", func(d *decode.D) {
+				typ := d.FieldU8("type", bTreeTypeMap)
+				startFreeblocks := d.FieldU16("start_freeblocks") // The two-byte integer at offset 1 gives the start of the first freeblock on the page, or is zero if there are no freeblocks.
+				pageCells := d.FieldU16("page_cells")             // The two-byte integer at offset 3 gives the number of cells on the page.
+				d.FieldU16("cell_start")                          // The two-byte integer at offset 5 designates the start of the cell content area. A zero value for this integer is interpreted as 65536.
+				d.FieldU8("cell_fragments")                       // The one-byte integer at offset 7 gives the number of fragmented free bytes within the cell content area.
+				switch typ {
+				case bTreeIndexInterior, bTreeTableInterior:
+					d.FieldU32("right_pointer") // The four-byte page number at offset 8 is the right-most pointer. This value appears in the header of interior b-tree pages only and is omitted from all other pages.
+				}
+				var cellPointers []uint64
+				d.FieldArray("cells_pointers", func(d *decode.D) {
+					for j := uint64(0); j < pageCells; j++ {
+						cellPointers = append(cellPointers, d.FieldU16("pointer"))
+					}
+				})
+				if startFreeblocks != 0 {
+					d.FieldArray("freeblocks", func(d *decode.D) {
+						nextOffset := startFreeblocks
+						for nextOffset != 0 {
+							currentOffset := nextOffset
+							d.SeekAbs((pageOffset + int64(nextOffset)) * 8)
+							d.FieldStruct("freeblock", func(d *decode.D) {
+								nextOffset = sqlite3CellFreeblockDecode(d)
+							})
+							// freeblocks are chained in increasing offset order, stop
+							// on a malformed chain instead of looping forever
+							if nextOffset <= currentOffset {
+								break
+							}
+						}
+					})
+				}
+				d.FieldArray("cells", func(d *decode.D) {
+					for _, p := range cellPointers {
+						d.FieldStruct("cell", func(d *decode.D) {
+							// TODO: SeekAbs with fn later?
+							// cell pointers are offsets from the start of the page
+							d.SeekAbs((pageOffset + int64(p)) * 8)
+							switch typ {
+							case bTreeIndexInterior:
+								d.FieldU32("left_child")
+								payLoadLen := d.FieldSFn("payload_len", varintDecode)
+								d.LenFn(payLoadLen*8, func(d *decode.D) {
+									d.FieldStruct("payload", sqlite3CellPayloadDecode)
+								})
+							case bTreeTableInterior:
+								d.FieldU32("left_child")
+								d.FieldSFn("rowid", varintDecode)
+							case bTreeIndexLeaf:
+								payLoadLen := d.FieldSFn("payload_len", varintDecode)
+								d.LenFn(payLoadLen*8, func(d *decode.D) {
+									d.FieldStruct("payload", sqlite3CellPayloadDecode)
+								})
+							case bTreeTableLeaf:
+								payLoadLen := d.FieldSFn("payload_len", varintDecode)
+								d.FieldSFn("rowid", varintDecode)
+								d.LenFn(payLoadLen*8, func(d *decode.D) {
+									d.FieldStruct("payload", sqlite3CellPayloadDecode)
+								})
+							}
+						})
+					}
+				})
+			})
+		}
+	})
+
+	return nil
+}
diff --git a/format/sqlite3/sqlite3.jq b/format/sqlite3/sqlite3.jq
new file mode 100644
index 0000000000..011c478381
--- /dev/null
+++ b/format/sqlite3/sqlite3.jq
@@ -0,0 +1,46 @@
+
+# TODO: index tables?
+# TODO: verify traverse against chinook.db, [sqlite3_table("Track")] | length
+# was 316 but should be 3503 rows
+
+# Output the cells of all leaf pages in the b-tree starting at $page, following
+# child pointers of interior pages. Page numbers in the file are 1-based
+# while $root.pages is 0-based, so child page numbers are decremented by one.
+def sqlite3_traverse($root; $page):
+  def _t:
+    ( . # debug({TRAVERSE: .})
+    | if .type == "table_interior" or .type == "index_interior" then
+        ( $root.pages[(.cells[].left_child, .right_pointer) - 1]
+        | _t
+        )
+      elif .type == "table_leaf" or .type == "index_leaf" then
+        ( .cells[]
+        )
+      end
+    );
+  ( $page
+  | _t
+  );
+
+# Output the contents of each row in table $name by traversing the first index
+# found for the table in the schema and looking up the matching table row by
+# rowid. Outputs nothing if the table or an index for it is not found.
+def sqlite3_table($name):
+  ( . as $root
+  | ( first(
+        ( sqlite3_traverse($root; $root.pages[0])
+        | select(.payload.contents | .[0] == "table" and .[2] == $name)
+        )
+      )
+    ) as $table_start_cell
+  | ( first(
+        ( sqlite3_traverse($root; $root.pages[0])
+        | select(.payload.contents | .[0] == "index" and .[2] == $name)
+        )
+      )
+    ) as $index_start_cell
+  | sqlite3_traverse($root; $root.pages[$index_start_cell.payload.contents[3]-1]) as $index_row
+  | sqlite3_traverse($root; $root.pages[$table_start_cell.payload.contents[3]-1])
+  | first(select(.rowid == $index_row.payload.contents[1]))
+  | .payload.contents
+  );
diff --git a/format/sqlite3/testdata/test.db b/format/sqlite3/testdata/test.db
new file mode 100644
index 0000000000..aa763c9ce5
Binary files /dev/null and b/format/sqlite3/testdata/test.db differ
diff --git a/format/sqlite3/testdata/test.sh b/format/sqlite3/testdata/test.sh
new file mode 100755
index 0000000000..8663f8e38c
--- /dev/null
+++ b/format/sqlite3/testdata/test.sh
@@ -0,0 +1,6 @@
+#!/bin/sh
+
+# Regenerate test.db from test.sql. Remove any existing database first as
+# CREATE TABLE would fail on it.
+rm -f test.db
+sqlite3 test.db < test.sql
diff --git a/format/sqlite3/testdata/test.sql b/format/sqlite3/testdata/test.sql
new file mode 100644
index 0000000000..98cfd8e551
--- /dev/null
+++ b/format/sqlite3/testdata/test.sql
@@ -0,0 +1,18 @@
+CREATE TABLE aaa (
+    cint int primary key,
+    cvarchar varchar(30),
+    ctext text,
+    creal real,
+    cblob blob
+);
+INSERT INTO "aaa" VALUES(0, 'var1', 'text1', 0, 'blob1');
+INSERT INTO "aaa" VALUES(1, 'var2', 'test2', 1, 'blob2');
+INSERT INTO "aaa" VALUES(128, 'var3', 'test3', 128, 'blob3');
+INSERT INTO "aaa" VALUES(-128, 'var3', 'test3', -128, 'blob3');
+INSERT INTO "aaa" VALUES(9223372036854775807, 'var4', 'test4', 9223372036854775807, 'blob4');
+INSERT INTO "aaa" VALUES(-9223372036854775808, 'var5', 'test5', -9223372036854775808, 'blob5');
+
+-- CREATE TABLE aaa (
+--     cint int primary key
+-- );
+-- INSERT INTO "aaa" VALUES(123);