lancedb · wjones127 · Jul 24, 2023 · Jul 10, 2023 · Jul 10, 2023 · Jul 17, 2023
diff --git a/docs/_static/fragment_structure.png b/docs/_static/fragment_structure.png
diff --git a/docs/format.rst b/docs/format.rst
@@ -13,7 +13,8 @@ A `Lance Dataset` is organized in a directory.
         latest.manifest -- The manifest file for the latest version.
         _versions/*.manifest -- Manifest file for each dataset version.
         _indices/{UUID-*}/index.idx -- Secondary index, each index per directory.
-
+        _deletions/*.{arrow,bin} -- Deletion files, which contain ids of rows
+          that have been deleted.
 
 A ``Manifest`` file includes the metadata to describe a version of the dataset.
 
@@ -23,6 +24,9 @@ A ``Manifest`` file includes the metadata to describe a version of the dataset.
    :start-at: // Manifest is
    :end-at: } // Manifest
 
+Fragments
+~~~~~~~~~
+
 ``DataFragment`` represents a chunk of data in the dataset. Itself includes one or more ``DataFile``,
 where each ``DataFile`` can contain several columns in the chunk of data. It also may include a 
 ``DeletionFile``, which is explained in a later section.
@@ -34,6 +38,17 @@ where each ``DataFile`` can contain several columns in the chunk of data. It als
    :end-at: } // DataFile
 
 
+The overall structure of a fragment is shown below. One or more data files store
+the columns of a fragment. New columns can be added to a fragment by adding new
+data files. The deletion file (if present) stores the rows that have been
+deleted from the fragment.
+
+.. image:: _static/fragment_structure.png
+
+Every row has a unique id, which is a u64 that is composed of two u32s: the
+fragment id and the local row id. The local row id is just the index of the
+row in the data files.
+
 File Structure
 --------------
 
@@ -75,6 +90,60 @@ on whether you are trying to read or write the table. Readers should check the
 should check ``writer_feature_flags``. If either sees a flag they don't know, they
 should return an "unsupported" error on any read or write operation.
 
+Fields
+------
+
+Fields represent the metadata for a column. This includes the name, data type,
+id, nullability, and encoding.
+
+Fields are listed in depth-first order, and can be one of (1) parent (struct),
+(2) repeated (list/array), or (3) leaf (primitive). For example, the schema:
+
+.. code-block::
+
+    a: i32
+    b: struct {
+        c: list<i32>
+        d: i32
+    }
+
+Would be represented as the following field list:
+
+.. list-table::
+   :widths: 20 20 20 20 25
+   :header-rows: 1
+
+   * - name
+     - id
+     - type
+     - parent_id
+     - logical_type
+   * - ``a``
+     - 1
+     - LEAF
+     - 0
+     - ``"int32"``
+   * - ``b``
+     - 2
+     - PARENT
+     - 0
+     - ``"struct"``
+   * - ``b.c``
+     - 3
+     - REPEATED
+     - 2
+     - ``"list"``
+   * - ``b.c``
+     - 4
+     - LEAF
+     - 3
+     - ``"int32"``
+   * - ``b.d``
+     - 5
+     - LEAF
+     - 2
+     - ``"int32"``
+
 Encodings
 ---------
 

diff --git a/protos/format.proto b/protos/format.proto
@@ -140,6 +140,11 @@ message DataFile {
   // Relative path to the root.
   string path = 1;
   // The ids of the fields/columns in this file.
+  //
+  // IDs are assigned based on position in the file, offset by the max existing
+  // field id in the table (if any). So when a fragment is first created
+  // with one file of N columns, the field ids will be 1, 2, ..., N. If a second
+  // fragment is created with M columns, the field ids will be N+1, N+2, ..., N+M.
   repeated int32 fields = 2;
 } // DataFile
 
@@ -182,7 +187,7 @@ message Metadata {
 
   // The file position that page table is stored.
   //
-  // A page table is a matrix of N x N x 2, where N = num_fields, and M =
+  // A page table is a matrix of N x M x 2, where N = num_fields, and M =
   // num_batches. Each cell in the table is a pair of <position:int64,
   // length:int64> of the page. Both position and length are int64 values. The
   // <position, length> of all the pages in the same column are then
@@ -234,11 +239,39 @@ message Field {
   // Fully qualified name.
   string name = 2;
   /// Field Id.
+  ///
+  /// See the comment in `DataFile.fields` for how field ids are assigned.
   int32 id = 3;
   /// Parent Field ID. If not set, this is a top-level column.
   int32 parent_id = 4;
 
   // Logical types, support parameterized Arrow Type.
+  //
+  // PARENT types will always have logical type "struct".
+  //
+  // REPEATED types may have logical types:
+  // * "list"
+  // * "large_list"
+  // * "list.struct"
+  // * "large_list.struct"
+  // The final two are used if the list values are structs, and therefore the
+  // field is both implicitly REPEATED and PARENT.
+  //
+  // LEAF types may have logical types:
+  // * "null"
+  // * "bool"
+  // * "int8" / "uint8"
+  // * "int16" / "uint16"
+  // * "int32" / "uint32"
+  // * "int64" / "uint64"
+  // * "halffloat" / "float" / "double"
+  // * "string" / "large_string"
+  // * "binary" / "large_binary"
+  // * "date32:day"
+  // * "date64:ms"
+  // * "decimal:128:{precision}:{scale}" / "decimal:256:{precision}:{scale}"
+  // * "time:{unit}" / "timestamp:{unit}" / "duration:{unit}", where unit is "s", "ms", "us", "ns"
+  // * "dict:{value_type}:{index_type}:false"
   string logical_type = 5;
   // If this field is nullable.
   bool nullable = 6;