From 89e30de3d2aa993148b26d567b1769f5eba8746d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Thir=C3=A9?= Date: Wed, 2 Oct 2024 07:48:28 +0200 Subject: [PATCH 1/5] KVS: Fix properly the leak of opened files --- src/lib_stdlib_unix/key_value_store.ml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/lib_stdlib_unix/key_value_store.ml b/src/lib_stdlib_unix/key_value_store.ml index 51c708b992b7..820e2a483bc5 100644 --- a/src/lib_stdlib_unix/key_value_store.ml +++ b/src/lib_stdlib_unix/key_value_store.ml @@ -897,10 +897,7 @@ end = struct if !closed then Lwt.return (Error (Error_monad.TzTrace.make (Closed {action = "remove"}))) else - let on_file_closed ~on_file_opened:_ = - let+ () = may_remove_file layout.filepath in - Table.remove files layout.filepath - in + let on_file_closed ~on_file_opened:_ = may_remove_file layout.filepath in let p = Action.remove_file ~on_file_closed @@ -910,7 +907,9 @@ end = struct layout.filepath in Table.replace last_actions layout.filepath (Remove p) ; - Table.replace files layout.filepath (Closing p) ; + (match Table.find_opt files layout.filepath with + | None -> () + | Some _ -> Table.replace files layout.filepath (Closing p)) ; let* () = p in (* See [close_file] for an explanation of the lines below. *) (match Table.find_opt last_actions layout.filepath with -- GitLab From 727c1205963ca43151887cc0596f8e71c7d56c9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Thir=C3=A9?= Date: Wed, 2 Oct 2024 07:49:02 +0200 Subject: [PATCH 2/5] KVS: Fix the leak of number of actions --- src/lib_stdlib_unix/key_value_store.ml | 35 +++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/src/lib_stdlib_unix/key_value_store.ml b/src/lib_stdlib_unix/key_value_store.ml index 820e2a483bc5..309a3d71b5da 100644 --- a/src/lib_stdlib_unix/key_value_store.ml +++ b/src/lib_stdlib_unix/key_value_store.ml @@ -815,7 +815,16 @@ end = struct in let p = Action.read ~on_file_closed files last_actions layout key in Table.replace last_actions layout.filepath (Read p) ; - let+ _file, value = p in + let+ file, value = p in + (match file with + | None -> ( + match Table.find_opt last_actions layout.filepath with + | Some (Read p) -> ( + match Lwt.state p with + | Lwt.Return _ -> Table.remove last_actions layout.filepath + | _ -> ()) + | _ -> ()) + | Some _ -> ()) ; value (* Very similar to [read] action except we only look at the bitset @@ -839,7 +848,16 @@ end = struct Action.value_exists ~on_file_closed files last_actions layout key in Table.replace last_actions layout.filepath (Value_exists p) ; - let+ _, exists = p in + let+ file, exists = p in + (match file with + | None -> ( + match Table.find_opt last_actions layout.filepath with + | Some (Read p) -> ( + match Lwt.state p with + | Lwt.Return _ -> Table.remove last_actions layout.filepath + | _ -> ()) + | _ -> ()) + | Some _ -> ()) ; exists (* Very similar to [value_exists] action except we look at the @@ -861,7 +879,16 @@ end = struct in let p = Action.count_values ~on_file_closed files last_actions layout in Table.replace last_actions layout.filepath (Count_values p) ; - let+ _, count = p in + let+ file, count = p in + (match file with + | None -> ( + match Table.find_opt last_actions layout.filepath with + | Some (Count_values p) -> ( + match Lwt.state p with + | Lwt.Return _ -> Table.remove last_actions layout.filepath + | _ -> ()) + | _ -> ()) + | Some _ -> ()) ; count let write ?(override = false) {files; last_actions; lru; closed} layout key @@ -913,7 +940,7 @@ end = struct let* () = p in (* See [close_file] for an explanation of the lines below. *) (match Table.find_opt last_actions layout.filepath with - | Some (Close p) -> ( + | Some (Close p) | Some (Remove p) -> ( match Lwt.state p with | Lwt.Return _ -> Table.remove last_actions layout.filepath | _ -> ()) -- GitLab From 5a7ba2d6e6a19b27f0d13cd263bae3c25845504c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Thir=C3=A9?= Date: Wed, 2 Oct 2024 07:49:25 +0200 Subject: [PATCH 3/5] KVS/Test: Support ongoing actions --- src/lib_stdlib_unix/test/test_key_value_store_fuzzy.ml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/lib_stdlib_unix/test/test_key_value_store_fuzzy.ml b/src/lib_stdlib_unix/test/test_key_value_store_fuzzy.ml index 3178bb7b265c..04362f3a9802 100644 --- a/src/lib_stdlib_unix/test/test_key_value_store_fuzzy.ml +++ b/src/lib_stdlib_unix/test/test_key_value_store_fuzzy.ml @@ -141,6 +141,8 @@ module type S = sig module View : sig val opened_files : ('file, 'key, 'value) t -> int + + val ongoing_actions : ('file, 'key, 'value) t -> int end end @@ -191,6 +193,10 @@ module R : S = struct let opened_files table = table |> Stdlib.Hashtbl.to_seq_keys |> Seq.map fst |> List.of_seq |> List.sort_uniq compare |> List.length + + let ongoing_actions table = + table |> Stdlib.Hashtbl.to_seq_keys |> Seq.map fst |> List.of_seq + |> List.sort_uniq compare |> List.length end end -- GitLab From ed8784b54ad179ca1e29f710cfce44f6cc8c3305 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Thir=C3=A9?= Date: Wed, 2 Oct 2024 07:50:10 +0200 Subject: [PATCH 4/5] KVS/Test: Test the property that the tables are bounded --- .../test/test_key_value_store_fuzzy.ml | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/lib_stdlib_unix/test/test_key_value_store_fuzzy.ml b/src/lib_stdlib_unix/test/test_key_value_store_fuzzy.ml index 04362f3a9802..5dfc7ca039ed 100644 --- a/src/lib_stdlib_unix/test/test_key_value_store_fuzzy.ml +++ b/src/lib_stdlib_unix/test/test_key_value_store_fuzzy.ml @@ -566,7 +566,26 @@ let run_scenario (function Ok () -> return_unit | Error err -> fail err) promises_running_seq in - run_actions action next_actions Seq_s.empty + (* After waiting for all the promises to be executed, the + number of opened files in theory should not exceed the + number of files in the LRU. *) + if L.View.opened_files left <= lru_size then + if L.View.ongoing_actions left <= lru_size then + run_actions action next_actions Seq_s.empty + else + failwith + "Expected size of actions table to be at most %d. Got %d \ + (remaining actions: %d)." + lru_size + (L.View.ongoing_actions left) + (List.length next_actions + 1) + else + failwith + "Expected size of files table to be at most %d. Got %d (remaining \ + actions: %d)." + lru_size + (L.View.opened_files left) + (List.length next_actions + 1) | (Parallel, action) :: next_actions -> (* We do not wait for promises to end and append them to the list of promises on-going. *) -- GitLab From 02e65cfeaa2dc48a9b3e63bbc889ebea9775a4c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Thir=C3=A9?= Date: Wed, 2 Oct 2024 17:08:42 +0200 Subject: [PATCH 5/5] KVS: Ensure the number of file descriptors is bounded by the LRU --- src/lib_stdlib_unix/key_value_store.ml | 70 +++++++++++--------------- 1 file changed, 29 insertions(+), 41 deletions(-) diff --git a/src/lib_stdlib_unix/key_value_store.ml b/src/lib_stdlib_unix/key_value_store.ml index 309a3d71b5da..ac9e8f475a3f 100644 --- a/src/lib_stdlib_unix/key_value_store.ml +++ b/src/lib_stdlib_unix/key_value_store.ml @@ -521,9 +521,7 @@ end = struct let remove_with_opened_file files lru filepath opened_file = let open Lwt_syntax in let* () = close_opened_file opened_file in - (* It may happen that the node was already evicted by a concurrent - action. Hence [LRU.remove] can fail. *) - (try LRU.remove lru opened_file.lru_node with _ -> ()) ; + LRU.remove lru opened_file.lru_node ; Table.remove files filepath ; Lwt_unix.unlink filepath @@ -558,9 +556,13 @@ end = struct | None -> on_file_closed ~on_file_opened | Some opened_file -> on_file_opened opened_file - let close_file files last_actions filepath = + let close_file files lru last_actions filepath = let on_file_closed ~on_file_opened:_ = Lwt.return_unit in - let on_file_opened opened_file = close_opened_file opened_file in + let on_file_opened opened_file = + let open Lwt_syntax in + let+ () = close_opened_file opened_file in + LRU.remove lru opened_file.lru_node + in generic_action files last_actions filepath ~on_file_closed ~on_file_opened let read ~on_file_closed files last_actions layout key = @@ -625,7 +627,7 @@ end = struct generic_action files last_actions filepath ~on_file_closed ~on_file_opened end - let close_file files last_actions filepath = + let close_file files lru last_actions filepath = (* Since this function does not aim to be exposed, we do not check whether the store is closed. This would actually be a mistake since it is used while the store is closing. @@ -642,7 +644,7 @@ end = struct (* [p] is a promise that triggers the action of closing the file. It is important to not wait on it so that we can update the store's last_actions atomically to ensure invariant (A). *) - let p = Action.close_file files last_actions filepath in + let p = Action.close_file files lru last_actions filepath in Table.replace files filepath (Closing p) ; Table.replace last_actions filepath (Close p) ; @@ -675,7 +677,7 @@ end = struct closed := true ; let* () = Table.iter_s - (fun filename _ -> close_file files last_actions filename) + (fun filename _ -> close_file files lru last_actions filename) files in LRU.clear lru ; @@ -691,39 +693,25 @@ end = struct let add_lru files last_actions lru filename = let open Lwt_syntax in let rec loop () = - let lru_node, remove = LRU.add_and_return_erased lru filename in - match remove with - | None -> return lru_node - | Some filepath -> - (* The reason why we remove the node is because of a deadlock - that may occur in the PBT test, but has almost 0 chance to - occur in practice. - - A deadlock can arise if the call to [close_file] below - requires to evict the current node. How can this happen? - With an LRU of size 1, this can happen if the [close_file] - function must wait the end of the current actions on going - for this file and those actions require to open the very - same file. How? Well for example if the on going actions - are [Remove] followed by a [Write]. This is stupid, but - possible (so generated by the PBT) and has very few chances - to appear in practice. Now if the LRU is of size [n], this - has even less chance to occur (you would need to generate a - circle of dependencies). *) - LRU.remove lru lru_node ; - (* We want to ensure that the number of file descriptors opened - is bounded by the size of the LRU. This is why we wait first - for the eviction promise to be fulfilled that will close the - file evicted. *) - let* () = close_file files last_actions filepath in - (* We call recursively to be sure that when trying to add the - node again, no node will be evicted. This strategy may - create in theory some starvation since while doing this - other nodes may have been added to the KVS, and - consequently, we may have to wait closing more files. In - practice, it should not be an issue: The starvation can - happen only if the store is busy all the time. *) - loop () + if LRU.capacity lru = LRU.length lru then + let node = LRU.oldest_element lru |> Stdlib.Option.get in + let filepath = LRU.data node in + (* We want to ensure that the number of file descriptors opened + is bounded by the size of the LRU. This is why we wait first + for the eviction promise to be fulfilled that will close the + file evicted. *) + let* () = close_file files lru last_actions filepath in + (* We call recursively to be sure that when trying to add the + node again, no node will be evicted. This strategy may + create in theory some starvation since while doing this + other nodes may have been added to the KVS, and + consequently, we may have to wait closing more files. In + practice, it should not be an issue: The starvation can + happen only if the store is busy all the time. *) + (loop [@ocaml.tailcall]) () + else + let lru_node = LRU.add lru filename in + Lwt.return lru_node in loop () -- GitLab