Skip to content

Commit

Permalink
Merge pull request #1664 from mirage/craigfe@user-defined-indexing-st…
Browse files Browse the repository at this point in the history
…rategies

irmin-pack: add support for user-defined indexing strategies
  • Loading branch information
craigfe authored Dec 23, 2021
2 parents d533cbb + 8eca001 commit 551740b
Show file tree
Hide file tree
Showing 11 changed files with 177 additions and 43 deletions.
5 changes: 5 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,11 @@
reduce indexing overhead. This change is fully backwards-compatible with
existing stores using `irmin-pack.2.x` versions, but not
forwards compatible. (#1649 #1655, @CraigFe @Ngoguey42)
- Added support for user-specified indexing strategies. The default strategy
is to index all objects appended to the pack file (as before), but users may
now choose to index fewer objects in order to improve the write performance
of the store, at the cost of introducing potential duplicate values to the
pack file. (#1664, @CraigFe)

- **irmin-unix**
- Clean up command line interface. Allow config file to be specified when
Expand Down
17 changes: 16 additions & 1 deletion src/irmin-pack/conf.ml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ module Default = struct
let readonly = false
let merge_throttle = `Block_writes
let freeze_throttle = `Block_writes
let indexing_strategy = Pack_store.Indexing_strategy.default
end

open Irmin.Backend.Conf
Expand Down Expand Up @@ -67,6 +68,17 @@ module Key = struct
freeze_throttle_t Default.freeze_throttle

let root = root spec

let indexing_strategy =
let serialisable_t = [%typ: [ `Always | `Minimal ]] in
key ~spec ~doc:"Strategy to use for adding objects to the index"
"indexing-strategy"
(Irmin.Type.map serialisable_t
(function
| `Always -> Pack_store.Indexing_strategy.always
| `Minimal -> Pack_store.Indexing_strategy.minimal)
(fun _ -> Fmt.failwith "Can't serialise indexing strategy"))
Default.indexing_strategy
end

let fresh config = get config Key.fresh
Expand All @@ -76,11 +88,13 @@ let index_log_size config = get config Key.index_log_size
let merge_throttle config = get config Key.merge_throttle
let freeze_throttle config = get config Key.freeze_throttle
let root config = get config Key.root
let indexing_strategy config = get config Key.indexing_strategy

let init ?(fresh = Default.fresh) ?(readonly = Default.readonly)
?(lru_size = Default.lru_size) ?(index_log_size = Default.index_log_size)
?(merge_throttle = Default.merge_throttle)
?(freeze_throttle = Default.freeze_throttle) root =
?(freeze_throttle = Default.freeze_throttle)
?(indexing_strategy = Default.indexing_strategy) root =
let config = empty spec in
let config = add config Key.root root in
let config = add config Key.fresh fresh in
Expand All @@ -89,4 +103,5 @@ let init ?(fresh = Default.fresh) ?(readonly = Default.readonly)
let config = add config Key.readonly readonly in
let config = add config Key.merge_throttle merge_throttle in
let config = add config Key.freeze_throttle freeze_throttle in
let config = add config Key.indexing_strategy indexing_strategy in
verify config
9 changes: 8 additions & 1 deletion src/irmin-pack/conf.mli
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,11 @@ module type S = sig
- [Some `Varint]: the length header is a LEB128-encoded integer at the
very beginning of the encoded value.
- [None]: there is no length header, and values have unknown size. *)
- [None]: there is no length header, and values have unknown size. NOTE:
when using [irmin-pack] in this mode, the selected indexing strategy
{i must} index all contents values (as recovering contents values from
the store will require referring to the index for their length
information). *)
end

val spec : Irmin.Backend.Conf.Spec.t
Expand All @@ -43,6 +47,7 @@ module Key : sig
val root : string Irmin.Backend.Conf.key
val merge_throttle : merge_throttle Irmin.Backend.Conf.key
val freeze_throttle : freeze_throttle Irmin.Backend.Conf.key
val indexing_strategy : Pack_store.Indexing_strategy.t Irmin.Backend.Conf.key
end

val fresh : Irmin.Backend.Conf.t -> bool
Expand All @@ -52,6 +57,7 @@ val readonly : Irmin.Backend.Conf.t -> bool
val merge_throttle : Irmin.Backend.Conf.t -> merge_throttle
val freeze_throttle : Irmin.Backend.Conf.t -> freeze_throttle
val root : Irmin.Backend.Conf.t -> string
val indexing_strategy : Irmin.Backend.Conf.t -> Pack_store.Indexing_strategy.t

val init :
?fresh:bool ->
Expand All @@ -60,5 +66,6 @@ val init :
?index_log_size:int ->
?merge_throttle:merge_throttle ->
?freeze_throttle:freeze_throttle ->
?indexing_strategy:Pack_store.Indexing_strategy.t ->
string ->
Irmin.config
25 changes: 16 additions & 9 deletions src/irmin-pack/ext.ml
Original file line number Diff line number Diff line change
Expand Up @@ -136,12 +136,13 @@ module Maker (Config : Conf.S) = struct
f contents node commit)))

let unsafe_v config =
let root = Conf.root config in
let fresh = Conf.fresh config in
let lru_size = Conf.lru_size config in
let readonly = Conf.readonly config in
let log_size = Conf.index_log_size config in
let throttle = Conf.merge_throttle config in
let root = Conf.root config
and fresh = Conf.fresh config
and lru_size = Conf.lru_size config
and readonly = Conf.readonly config
and log_size = Conf.index_log_size config
and throttle = Conf.merge_throttle config
and indexing_strategy = Conf.indexing_strategy config in
let f = ref (fun () -> ()) in
let index =
Index.v
Expand All @@ -150,10 +151,16 @@ module Maker (Config : Conf.S) = struct
~fresh ~readonly ~throttle ~log_size root
in
let* contents =
Contents.CA.v ~fresh ~readonly ~lru_size ~index root
Contents.CA.v ~fresh ~readonly ~lru_size ~index ~indexing_strategy
root
in
let* node =
Node.CA.v ~fresh ~readonly ~lru_size ~index ~indexing_strategy root
in
let* commit =
Commit.CA.v ~fresh ~readonly ~lru_size ~index ~indexing_strategy
root
in
let* node = Node.CA.v ~fresh ~readonly ~lru_size ~index root in
let* commit = Commit.CA.v ~fresh ~readonly ~lru_size ~index root in
let+ branch = Branch.v ~fresh ~readonly root in
(* Stores share instances in memory, one flush is enough. In case of a
system crash, the flush_callback might not make with the disk. In
Expand Down
1 change: 1 addition & 0 deletions src/irmin-pack/inode_intf.ml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ module type Persistent = sig
?readonly:bool ->
?lru_size:int ->
index:index ->
indexing_strategy:Pack_store.Indexing_strategy.t ->
string ->
read t Lwt.t

Expand Down
6 changes: 5 additions & 1 deletion src/irmin-pack/irmin_pack_intf.ml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ module type Sigs = sig
?index_log_size:int ->
?merge_throttle:Conf.merge_throttle ->
?freeze_throttle:Conf.freeze_throttle ->
?indexing_strategy:Pack_store.Indexing_strategy.t ->
string ->
Irmin.config
(** Configuration options for stores.
Expand All @@ -47,7 +48,10 @@ module type Sigs = sig
the strategy to use when the index cache is full and an async
[Index.merge] in already in progress. [Block_writes] (the default)
blocks any new writes until the merge is completed. [Overcommit_memory]
does not block but indefinitely expands the in-memory cache. *)
does not block but indefinitely expands the in-memory cache.
@param indexing_strategy
The {{!Pack_store.Indexing_strategy} indexing strategy} of the backend
store. Defaults to {!Pack_store.Indexing_strategy.default}. *)

exception RO_not_allowed

Expand Down
71 changes: 58 additions & 13 deletions src/irmin-pack/pack_store.ml
Original file line number Diff line number Diff line change
@@ -1,6 +1,38 @@
open! Import
include Pack_store_intf

module Indexing_strategy = struct
type t = value_length:int -> Pack_value.Kind.t -> bool

let always ~value_length:_ _ = true

let minimal : t =
fun ~value_length:_ -> function
| Commit_v2 ->
(* Commits must be indexed as the branch store contains only their
hashes. All {i internal} references to V1 commits are via offset
(from other V1 commit objects). *)
true
| Inode_v2_root ->
(* It's safe not to index V1 root inodes because they are never
referenced by V0 commit objects (only V1 commit objects, which
contain direct pointers rather than hashes).*)
false
| Inode_v2_nonroot -> false
| Contents -> false
| Commit_v1 | Inode_v1_unstable | Inode_v1_stable ->
(* We never append new V0 values, so this choice is irrelevant to the
store implementation, but we do assume that existing V0 objects are
indexed (as they may be referenced via hash by other V0 objects), and
this must be accounted for when reconstructing the index. *)
true

let default = always
end

module type S = S with type indexing_strategy := Indexing_strategy.t
module type Maker = Maker with type indexing_strategy := Indexing_strategy.t

module Table (K : Irmin.Hash.S) = Hashtbl.Make (struct
type t = K.t

Expand Down Expand Up @@ -45,6 +77,7 @@ module Maker (Index : Pack_index.S) (K : Irmin.Hash.S with type t = Index.key) :
type 'a t = {
mutable block : IO.t;
index : Index.t;
indexing_strategy : Indexing_strategy.t;
dict : Dict.t;
mutable open_instances : int;
}
Expand All @@ -55,7 +88,7 @@ module Maker (Index : Pack_index.S) (K : Irmin.Hash.S with type t = Index.key) :
true)
else false

let unsafe_v ~index ~fresh ~readonly file =
let unsafe_v ~index ~indexing_strategy ~fresh ~readonly file =
let root = Filename.dirname file in
let dict = Dict.v ~fresh ~readonly root in
let block =
Expand All @@ -64,12 +97,12 @@ module Maker (Index : Pack_index.S) (K : Irmin.Hash.S with type t = Index.key) :
let version = Some selected_version in
IO.v ~version ~fresh ~readonly file
in
{ block; index; dict; open_instances = 1 }
{ block; index; indexing_strategy; dict; open_instances = 1 }

let IO_cache.{ v } =
IO_cache.memoize ~valid
~clear:(fun t -> IO.truncate t.block)
~v:(fun index -> unsafe_v ~index)
~v:(fun (index, indexing_strategy) -> unsafe_v ~index ~indexing_strategy)
Layout.pack

let close t =
Expand Down Expand Up @@ -132,17 +165,18 @@ module Maker (Index : Pack_index.S) (K : Irmin.Hash.S with type t = Index.key) :
if index_merge then Index.merge t.pack.index;
Dict.flush t.pack.dict;
IO.flush t.pack.block;
if index then Index.flush ~no_callback:() t.pack.index;
if index then Index.flush t.pack.index;
Tbl.clear t.staging

let unsafe_v_no_cache ~fresh ~readonly ~lru_size ~index root =
let pack = v index ~fresh ~readonly root in
let unsafe_v_no_cache ~fresh ~readonly ~lru_size ~index ~indexing_strategy
root =
let pack = v (index, indexing_strategy) ~fresh ~readonly root in
let staging = Tbl.create 127 in
let lru = Lru.create lru_size in
{ staging; lru; pack; open_instances = 1; readonly }

let unsafe_v ?(fresh = false) ?(readonly = false) ?(lru_size = 10_000)
~index root =
~index ~indexing_strategy root =
try
let t = Hashtbl.find roots (root, readonly) in
if valid t then (
Expand All @@ -152,13 +186,18 @@ module Maker (Index : Pack_index.S) (K : Irmin.Hash.S with type t = Index.key) :
Hashtbl.remove roots (root, readonly);
raise Not_found)
with Not_found ->
let t = unsafe_v_no_cache ~fresh ~readonly ~lru_size ~index root in
let t =
unsafe_v_no_cache ~fresh ~readonly ~lru_size ~index ~indexing_strategy
root
in
if fresh then unsafe_clear t;
Hashtbl.add roots (root, readonly) t;
t

let v ?fresh ?readonly ?lru_size ~index root =
let t = unsafe_v ?fresh ?readonly ?lru_size ~index root in
let v ?fresh ?readonly ?lru_size ~index ~indexing_strategy root =
let t =
unsafe_v ?fresh ?readonly ?lru_size ~index ~indexing_strategy root
in
Lwt.return t

let io_read_and_decode_hash ~off t =
Expand Down Expand Up @@ -445,7 +484,12 @@ module Maker (Index : Pack_index.S) (K : Irmin.Hash.S with type t = Index.key) :
Val.encode_bin ~offset_of_key ~dict hash v (IO.append t.pack.block);
let len = Int63.to_int (IO.offset t.pack.block -- off) in
let key = Pack_key.v_direct ~hash ~offset:off ~length:len in
Index.add ~overcommit t.pack.index hash (off, len, kind);
let () =
let kind = Val.kind v in
let should_index = t.pack.indexing_strategy ~value_length:len kind in
if should_index then
Index.add ~overcommit t.pack.index hash (off, len, kind)
in
if Tbl.length t.staging >= auto_flush then flush t
else Tbl.add t.staging hash v;
Lru.add t.lru hash v;
Expand Down Expand Up @@ -500,8 +544,9 @@ module Maker (Index : Pack_index.S) (K : Irmin.Hash.S with type t = Index.key) :
module Inner = Make_without_close_checks (Val)
include Indexable.Closeable (Inner)

let v ?fresh ?readonly ?lru_size ~index path =
Inner.v ?fresh ?readonly ?lru_size ~index path >|= make_closeable
let v ?fresh ?readonly ?lru_size ~index ~indexing_strategy path =
Inner.v ?fresh ?readonly ?lru_size ~index ~indexing_strategy path
>|= make_closeable

let sync t = Inner.sync (get_open_exn t)

Expand Down
44 changes: 38 additions & 6 deletions src/irmin-pack/pack_store_intf.ml
Original file line number Diff line number Diff line change
@@ -1,19 +1,22 @@
open! Import

(** A [Pack_store.S] is a closeable, persistent implementation of
{!Content_addressable.S} that uses an append-only file of variable-length
data blocks. The data file is indexed by hash via {!Pack_index.S}
implementation. *)
(** A [Pack_store.S] is a closeable, persistent implementation of {!Indexable.S}
that uses an append-only file of variable-length data blocks.
Certain values in the data file are indexed by hash via a {!Pack_index.S}
implementation, but not all of them need be. *)
module type S = sig
include Indexable.S

type index
type indexing_strategy

val v :
?fresh:bool ->
?readonly:bool ->
?lru_size:int ->
index:index ->
indexing_strategy:indexing_strategy ->
string ->
read t Lwt.t

Expand All @@ -35,6 +38,7 @@ end
module type Maker = sig
type hash
type index
type indexing_strategy

(** Save multiple kind of values in the same pack file. Values will be
distinguished using [V.magic], so they have to all be different. *)
Expand All @@ -48,11 +52,39 @@ module type Maker = sig
and type hash = hash
and type value = V.t
and type index := index
and type indexing_strategy := indexing_strategy
end

module type Sigs = sig
module type S = S
module type Maker = Maker
module Indexing_strategy : sig
type t = value_length:int -> Pack_value.Kind.t -> bool
(** The type of configurations for [irmin-pack]'s indexing strategy, which
dictates whether or not newly-appended pack entries should also be added
to the index. Strategies are parameterised over:
- the length of the binary encoding of the {i object} inside the pack
entry (i.e. not accounting for the encoded hash and kind character);
- the kind of the pack object having been added.
Indexing more than the {!minimal} strategy only impacts performance and
not correctness: more indexing results in a larger index and a smaller
pack file. *)

val always : t
(** The strategy that indexes all objects. *)

val minimal : t
(** The strategy that indexes as few objects as possible while still
maintaing store integrity. *)

val default : t
(** [default] is the indexing strategy used by [irmin-pack] instances that
do not explicitly set an indexing strategy in {!Irmin_pack.config}.
Currently set to {!always}. *)
end

module type S = S with type indexing_strategy := Indexing_strategy.t
module type Maker = Maker with type indexing_strategy := Indexing_strategy.t

val selected_version : Version.t

Expand Down
Loading

0 comments on commit 551740b

Please sign in to comment.