diff --git a/src/proto_016_PtMumbai/lib_sc_rollup_node/event.ml b/src/proto_016_PtMumbai/lib_sc_rollup_node/event.ml index ea2fa9aaa35ec021ba3f29b197e53dca08917f47..79b27226eef8bf0561b339fc1421099bf9873487 100644 --- a/src/proto_016_PtMumbai/lib_sc_rollup_node/event.ml +++ b/src/proto_016_PtMumbai/lib_sc_rollup_node/event.ml @@ -124,6 +124,14 @@ module Simple = struct "Warning: DAL is enabled in the protocol but no DAL node was provided \ for the rollup node." () + + let acquiring_lock = + declare_0 + ~section + ~name:"acquiring_lock" + ~level:Notice + ~msg:"Acquiring lock on data directory." + () end let starting_node = Simple.(emit starting_node) @@ -157,3 +165,5 @@ let kernel_debug_dont_wait msg = Simple.(emit__dont_wait__use_with_care kernel_debug) msg let warn_dal_enabled_no_node () = Simple.(emit warn_dal_enabled_no_node) () + +let acquiring_lock () = Simple.(emit acquiring_lock) () diff --git a/src/proto_016_PtMumbai/lib_sc_rollup_node/event.mli b/src/proto_016_PtMumbai/lib_sc_rollup_node/event.mli index cad046f4fe71216a78b8e1c9f12c99436a5334e0..188c37b99008a79c9a7fc8fc561b0e57d9223478 100644 --- a/src/proto_016_PtMumbai/lib_sc_rollup_node/event.mli +++ b/src/proto_016_PtMumbai/lib_sc_rollup_node/event.mli @@ -75,3 +75,7 @@ val kernel_debug_dont_wait : string -> unit (** [warn_dal_enabled_no_node ()] emits a warning for when DAL is enabled in the protocol but the rollup node has no DAL node. *) val warn_dal_enabled_no_node : unit -> unit Lwt.t + +(** [acquiring_lock ()] emits an event to indicate that the node is attempting + to acquire a lock on the data directory. 
*) +val acquiring_lock : unit -> unit Lwt.t diff --git a/src/proto_016_PtMumbai/lib_sc_rollup_node/node_context.ml b/src/proto_016_PtMumbai/lib_sc_rollup_node/node_context.ml index 8e679744649d1ed163c3e51d8af2452af6abd943..7232ef3eb7fe8a3a3112ac844bb988977d963cc8 100644 --- a/src/proto_016_PtMumbai/lib_sc_rollup_node/node_context.ml +++ b/src/proto_016_PtMumbai/lib_sc_rollup_node/node_context.ml @@ -46,6 +46,7 @@ type 'a t = { fee_parameters : Configuration.fee_parameters; protocol_constants : Constants.t; loser_mode : Loser_mode.t; + lockfile : Lwt_unix.file_descr; store : 'a store; context : 'a Context.index; lcc : ('a, lcc) Reference.t; @@ -128,6 +129,38 @@ let check_and_set_rollup_address context rollup_address = fail_unless Sc_rollup.Address.(rollup_address = saved_address) @@ Sc_rollup_node_errors.Unexpected_rollup {rollup_address; saved_address} +let lock ~data_dir = (* Opens (creating it if absent) the file [data_dir/lock] and takes a [lockf] lock on it; failures are wrapped with [Could_not_acquire_lock]. *) + let lockfile_path = Filename.concat data_dir "lock" in + let lock_aux ~data_dir = + let open Lwt_result_syntax in + let*! () = Event.acquiring_lock () in + let*! () = Lwt_utils_unix.create_dir data_dir in + let* lockfile = + protect @@ fun () -> + Lwt_unix.openfile + lockfile_path + [Unix.O_CREAT; O_RDWR; O_CLOEXEC; O_SYNC] + 0o644 + |> Lwt_result.ok + in + let* () = + protect ~on_error:(fun err -> + let*! () = Lwt_unix.close lockfile in + fail err) + @@ fun () -> (* F_TLOCK fails at once if the lock is already held (F_LOCK would wait), so a second node gets the error below instead of hanging. *) + let*! 
() = Lwt_unix.lockf lockfile Unix.F_TLOCK 0 in + return_unit + in + return lockfile + in + trace (Sc_rollup_node_errors.Could_not_acquire_lock lockfile_path) + @@ lock_aux ~data_dir + +let unlock {lockfile; _} = (* Releases the lock, then closes the descriptor whether or not unlocking succeeded. *) + Lwt.finalize + (fun () -> Lwt_unix.lockf lockfile Unix.F_ULOCK 0) + (fun () -> Lwt_unix.close lockfile) + let init (cctxt : Protocol_client_context.full) ~data_dir mode Configuration.( { @@ -142,6 +175,7 @@ let init (cctxt : Protocol_client_context.full) ~data_dir mode _; } as configuration) = let open Lwt_result_syntax in + let* lockfile = lock ~data_dir in let dal_cctxt = Option.map Dal_node_client.make_unix_cctxt dal_node_endpoint in @@ -194,11 +228,12 @@ let init (cctxt : Protocol_client_context.full) ~data_dir mode fee_parameters; protocol_constants; loser_mode; + lockfile; store; context; } -let close {cctxt; store; context; l1_ctxt; _} = +let close ({cctxt; store; context; l1_ctxt; _} as node_ctxt) = let open Lwt_result_syntax in let message = cctxt#message in let*! () = message "Shutting down L1@." in @@ -207,6 +242,8 @@ let close {cctxt; store; context; l1_ctxt; _} = let*! () = Context.close context in let*! () = message "Closing store@." in let* () = Store.close store in + let*! () = message "Releasing lock@." in + let*! 
() = unlock node_ctxt in return_unit let checkout_context node_ctxt block_hash = @@ -758,6 +795,7 @@ module Internal_for_tests = struct |> Data_encoding.Binary.to_bytes_exn Constants_repr.encoding |> Data_encoding.Binary.of_bytes_exn Constants.encoding in + let* lockfile = lock ~data_dir in let* store = Store.load Read_write @@ -794,6 +832,7 @@ module Internal_for_tests = struct fee_parameters = Configuration.default_fee_parameters; protocol_constants; loser_mode = Loser_mode.no_failures; + lockfile; store; context; } diff --git a/src/proto_016_PtMumbai/lib_sc_rollup_node/node_context.mli b/src/proto_016_PtMumbai/lib_sc_rollup_node/node_context.mli index 03fb62b860c40eda2747ca3d982327df7a5a6add..65c7fb4bba074f980a3b2d5ab3415e42945f67dd 100644 --- a/src/proto_016_PtMumbai/lib_sc_rollup_node/node_context.mli +++ b/src/proto_016_PtMumbai/lib_sc_rollup_node/node_context.mli @@ -63,6 +63,8 @@ type 'a t = { loser_mode : Loser_mode.t; (** If different from [Loser_mode.no_failures], the rollup node issues wrong commitments (for tests). *) + lockfile : Lwt_unix.file_descr; + (** A lock file acquired when the node starts. *) store : 'a store; (** The store for the persistent storage. *) context : 'a Context.index; (** The persistent context for the rollup node. 
*) diff --git a/src/proto_016_PtMumbai/lib_sc_rollup_node/sc_rollup_node_errors.ml b/src/proto_016_PtMumbai/lib_sc_rollup_node/sc_rollup_node_errors.ml index c0b3197a1bbede41f7a8025b0f0d387f70bba2c2..b00d2cb1e556b250c1d841d041b35057c818e119 100644 --- a/src/proto_016_PtMumbai/lib_sc_rollup_node/sc_rollup_node_errors.ml +++ b/src/proto_016_PtMumbai/lib_sc_rollup_node/sc_rollup_node_errors.ml @@ -57,6 +57,7 @@ type error += | No_publisher | Refutation_player_failed_to_start | No_refutation_coordinator + | Could_not_acquire_lock of string type error += Lost_game of Protocol.Alpha_context.Sc_rollup.Game.game_result @@ -345,4 +346,20 @@ let () = `Permanent Data_encoding.unit (function Refutation_player_failed_to_start -> Some () | _ -> None) - (fun () -> Refutation_player_failed_to_start) + (fun () -> Refutation_player_failed_to_start) ; + + register_error_kind + `Permanent + ~id:"could_not_acquire_lock" + ~title:"Could not acquire lock on data dir" + ~description:"Could not acquire lock on data dir." + ~pp:(fun ppf f -> + Format.fprintf + ppf + "Could not acquire lock on data directory, another rollup node may \ + already be running with this data. If this is not the case, consider \ + removing manually the file %S" + f) + Data_encoding.(obj1 (req "lock_file" string)) + (function Could_not_acquire_lock f -> Some f | _ -> None) + (fun f -> Could_not_acquire_lock f) diff --git a/src/proto_017_PtNairob/lib_sc_rollup_node/event.ml b/src/proto_017_PtNairob/lib_sc_rollup_node/event.ml index ea2fa9aaa35ec021ba3f29b197e53dca08917f47..79b27226eef8bf0561b339fc1421099bf9873487 100644 --- a/src/proto_017_PtNairob/lib_sc_rollup_node/event.ml +++ b/src/proto_017_PtNairob/lib_sc_rollup_node/event.ml @@ -124,6 +124,14 @@ module Simple = struct "Warning: DAL is enabled in the protocol but no DAL node was provided \ for the rollup node." () + + let acquiring_lock = + declare_0 + ~section + ~name:"acquiring_lock" + ~level:Notice + ~msg:"Acquiring lock on data directory." 
+ () end let starting_node = Simple.(emit starting_node) @@ -157,3 +165,5 @@ let kernel_debug_dont_wait msg = Simple.(emit__dont_wait__use_with_care kernel_debug) msg let warn_dal_enabled_no_node () = Simple.(emit warn_dal_enabled_no_node) () + +let acquiring_lock () = Simple.(emit acquiring_lock) () diff --git a/src/proto_017_PtNairob/lib_sc_rollup_node/event.mli b/src/proto_017_PtNairob/lib_sc_rollup_node/event.mli index cad046f4fe71216a78b8e1c9f12c99436a5334e0..188c37b99008a79c9a7fc8fc561b0e57d9223478 100644 --- a/src/proto_017_PtNairob/lib_sc_rollup_node/event.mli +++ b/src/proto_017_PtNairob/lib_sc_rollup_node/event.mli @@ -75,3 +75,7 @@ val kernel_debug_dont_wait : string -> unit (** [warn_dal_enabled_no_node ()] emits a warning for when DAL is enabled in the protocol but the rollup node has no DAL node. *) val warn_dal_enabled_no_node : unit -> unit Lwt.t + +(** [acquiring_lock ()] emits an event to indicate that the node is attempting + to acquire a lock on the data directory. *) +val acquiring_lock : unit -> unit Lwt.t diff --git a/src/proto_017_PtNairob/lib_sc_rollup_node/node_context.ml b/src/proto_017_PtNairob/lib_sc_rollup_node/node_context.ml index e80c753579cc80329cd55aa1496a98c7a525e7c0..62282c4ee3217f2a79e32389eec1446cf5887c50 100644 --- a/src/proto_017_PtNairob/lib_sc_rollup_node/node_context.ml +++ b/src/proto_017_PtNairob/lib_sc_rollup_node/node_context.ml @@ -46,6 +46,7 @@ type 'a t = { fee_parameters : Configuration.fee_parameters; protocol_constants : Constants.t; loser_mode : Loser_mode.t; + lockfile : Lwt_unix.file_descr; store : 'a store; context : 'a Context.index; lcc : ('a, lcc) Reference.t; @@ -128,6 +129,38 @@ let check_and_set_rollup_address context rollup_address = fail_unless Sc_rollup.Address.(rollup_address = saved_address) @@ Sc_rollup_node_errors.Unexpected_rollup {rollup_address; saved_address} +let lock ~data_dir = (* Opens (creating it if absent) the file [data_dir/lock] and takes a [lockf] lock on it; failures are wrapped with [Could_not_acquire_lock]. *) + let lockfile_path = Filename.concat data_dir "lock" in + let lock_aux ~data_dir = + let open Lwt_result_syntax 
in + let*! () = Event.acquiring_lock () in + let*! () = Lwt_utils_unix.create_dir data_dir in + let* lockfile = + protect @@ fun () -> + Lwt_unix.openfile + lockfile_path + [Unix.O_CREAT; O_RDWR; O_CLOEXEC; O_SYNC] + 0o644 + |> Lwt_result.ok + in + let* () = + protect ~on_error:(fun err -> + let*! () = Lwt_unix.close lockfile in + fail err) + @@ fun () -> (* F_TLOCK fails at once if the lock is already held (F_LOCK would wait), so a second node gets the error below instead of hanging. *) + let*! () = Lwt_unix.lockf lockfile Unix.F_TLOCK 0 in + return_unit + in + return lockfile + in + trace (Sc_rollup_node_errors.Could_not_acquire_lock lockfile_path) + @@ lock_aux ~data_dir + +let unlock {lockfile; _} = (* Releases the lock, then closes the descriptor whether or not unlocking succeeded. *) + Lwt.finalize + (fun () -> Lwt_unix.lockf lockfile Unix.F_ULOCK 0) + (fun () -> Lwt_unix.close lockfile) + let init (cctxt : Protocol_client_context.full) ~data_dir mode Configuration.( { @@ -142,6 +175,7 @@ let init (cctxt : Protocol_client_context.full) ~data_dir mode _; } as configuration) = let open Lwt_result_syntax in + let* lockfile = lock ~data_dir in let dal_cctxt = Option.map Dal_node_client.make_unix_cctxt dal_node_endpoint in @@ -194,11 +228,12 @@ let init (cctxt : Protocol_client_context.full) ~data_dir mode fee_parameters; protocol_constants; loser_mode; + lockfile; store; context; } -let close {cctxt; store; context; l1_ctxt; _} = +let close ({cctxt; store; context; l1_ctxt; _} as node_ctxt) = let open Lwt_result_syntax in let message = cctxt#message in let*! () = message "Shutting down L1@." in @@ -207,6 +242,8 @@ let close {cctxt; store; context; l1_ctxt; _} = let*! () = Context.close context in let*! () = message "Closing store@." in let* () = Store.close store in + let*! () = message "Releasing lock@." in + let*! 
() = unlock node_ctxt in return_unit let checkout_context node_ctxt block_hash = @@ -743,6 +780,7 @@ module Internal_for_tests = struct |> Data_encoding.Binary.to_bytes_exn Constants_repr.encoding |> Data_encoding.Binary.of_bytes_exn Constants.encoding in + let* lockfile = lock ~data_dir in let* store = Store.load Read_write @@ -779,6 +817,7 @@ module Internal_for_tests = struct fee_parameters = Configuration.default_fee_parameters; protocol_constants; loser_mode = Loser_mode.no_failures; + lockfile; store; context; } diff --git a/src/proto_017_PtNairob/lib_sc_rollup_node/node_context.mli b/src/proto_017_PtNairob/lib_sc_rollup_node/node_context.mli index 7583b4377195ca3f958505544349d9f68fd34e0f..55897abfc7c6330e6991033a5ab599179a0d31c7 100644 --- a/src/proto_017_PtNairob/lib_sc_rollup_node/node_context.mli +++ b/src/proto_017_PtNairob/lib_sc_rollup_node/node_context.mli @@ -63,6 +63,8 @@ type 'a t = { loser_mode : Loser_mode.t; (** If different from [Loser_mode.no_failures], the rollup node issues wrong commitments (for tests). *) + lockfile : Lwt_unix.file_descr; + (** A lock file acquired when the node starts. *) store : 'a store; (** The store for the persistent storage. *) context : 'a Context.index; (** The persistent context for the rollup node. 
*) diff --git a/src/proto_017_PtNairob/lib_sc_rollup_node/sc_rollup_node_errors.ml b/src/proto_017_PtNairob/lib_sc_rollup_node/sc_rollup_node_errors.ml index c0b3197a1bbede41f7a8025b0f0d387f70bba2c2..b00d2cb1e556b250c1d841d041b35057c818e119 100644 --- a/src/proto_017_PtNairob/lib_sc_rollup_node/sc_rollup_node_errors.ml +++ b/src/proto_017_PtNairob/lib_sc_rollup_node/sc_rollup_node_errors.ml @@ -57,6 +57,7 @@ type error += | No_publisher | Refutation_player_failed_to_start | No_refutation_coordinator + | Could_not_acquire_lock of string type error += Lost_game of Protocol.Alpha_context.Sc_rollup.Game.game_result @@ -345,4 +346,20 @@ let () = `Permanent Data_encoding.unit (function Refutation_player_failed_to_start -> Some () | _ -> None) - (fun () -> Refutation_player_failed_to_start) + (fun () -> Refutation_player_failed_to_start) ; + + register_error_kind + `Permanent + ~id:"could_not_acquire_lock" + ~title:"Could not acquire lock on data dir" + ~description:"Could not acquire lock on data dir." + ~pp:(fun ppf f -> + Format.fprintf + ppf + "Could not acquire lock on data directory, another rollup node may \ + already be running with this data. If this is not the case, consider \ + removing manually the file %S" + f) + Data_encoding.(obj1 (req "lock_file" string)) + (function Could_not_acquire_lock f -> Some f | _ -> None) + (fun f -> Could_not_acquire_lock f) diff --git a/src/proto_alpha/lib_sc_rollup_node/event.ml b/src/proto_alpha/lib_sc_rollup_node/event.ml index ea2fa9aaa35ec021ba3f29b197e53dca08917f47..79b27226eef8bf0561b339fc1421099bf9873487 100644 --- a/src/proto_alpha/lib_sc_rollup_node/event.ml +++ b/src/proto_alpha/lib_sc_rollup_node/event.ml @@ -124,6 +124,14 @@ module Simple = struct "Warning: DAL is enabled in the protocol but no DAL node was provided \ for the rollup node." () + + let acquiring_lock = + declare_0 + ~section + ~name:"acquiring_lock" + ~level:Notice + ~msg:"Acquiring lock on data directory." 
+ () end let starting_node = Simple.(emit starting_node) @@ -157,3 +165,5 @@ let kernel_debug_dont_wait msg = Simple.(emit__dont_wait__use_with_care kernel_debug) msg let warn_dal_enabled_no_node () = Simple.(emit warn_dal_enabled_no_node) () + +let acquiring_lock () = Simple.(emit acquiring_lock) () diff --git a/src/proto_alpha/lib_sc_rollup_node/event.mli b/src/proto_alpha/lib_sc_rollup_node/event.mli index cad046f4fe71216a78b8e1c9f12c99436a5334e0..188c37b99008a79c9a7fc8fc561b0e57d9223478 100644 --- a/src/proto_alpha/lib_sc_rollup_node/event.mli +++ b/src/proto_alpha/lib_sc_rollup_node/event.mli @@ -75,3 +75,7 @@ val kernel_debug_dont_wait : string -> unit (** [warn_dal_enabled_no_node ()] emits a warning for when DAL is enabled in the protocol but the rollup node has no DAL node. *) val warn_dal_enabled_no_node : unit -> unit Lwt.t + +(** [acquiring_lock ()] emits an event to indicate that the node is attempting + to acquire a lock on the data directory. *) +val acquiring_lock : unit -> unit Lwt.t diff --git a/src/proto_alpha/lib_sc_rollup_node/node_context.ml b/src/proto_alpha/lib_sc_rollup_node/node_context.ml index da0689f47a99c0585718aab34e6a39648eda5de3..9579bedbf95879291edb2fa0a3fc034f8ff627ab 100644 --- a/src/proto_alpha/lib_sc_rollup_node/node_context.ml +++ b/src/proto_alpha/lib_sc_rollup_node/node_context.ml @@ -47,6 +47,7 @@ type 'a t = { fee_parameters : Configuration.fee_parameters; protocol_constants : Constants.t; loser_mode : Loser_mode.t; + lockfile : Lwt_unix.file_descr; store : 'a store; context : 'a Context.index; lcc : ('a, lcc) Reference.t; @@ -129,6 +130,38 @@ let check_and_set_rollup_address context rollup_address = fail_unless Sc_rollup.Address.(rollup_address = saved_address) @@ Sc_rollup_node_errors.Unexpected_rollup {rollup_address; saved_address} +let lock ~data_dir = (* Opens (creating it if absent) the file [data_dir/lock] and takes a [lockf] lock on it; failures are wrapped with [Could_not_acquire_lock]. *) + let lockfile_path = Filename.concat data_dir "lock" in + let lock_aux ~data_dir = + let open Lwt_result_syntax in + let*! () = Event.acquiring_lock () in + let*! 
() = Lwt_utils_unix.create_dir data_dir in + let* lockfile = + protect @@ fun () -> + Lwt_unix.openfile + lockfile_path + [Unix.O_CREAT; O_RDWR; O_CLOEXEC; O_SYNC] + 0o644 + |> Lwt_result.ok + in + let* () = + protect ~on_error:(fun err -> + let*! () = Lwt_unix.close lockfile in + fail err) + @@ fun () -> (* F_TLOCK fails at once if the lock is already held (F_LOCK would wait), so a second node gets the error below instead of hanging. *) + let*! () = Lwt_unix.lockf lockfile Unix.F_TLOCK 0 in + return_unit + in + return lockfile + in + trace (Sc_rollup_node_errors.Could_not_acquire_lock lockfile_path) + @@ lock_aux ~data_dir + +let unlock {lockfile; _} = (* Releases the lock, then closes the descriptor whether or not unlocking succeeded. *) + Lwt.finalize + (fun () -> Lwt_unix.lockf lockfile Unix.F_ULOCK 0) + (fun () -> Lwt_unix.close lockfile) + let init (cctxt : Protocol_client_context.full) ~data_dir mode Configuration.( { @@ -143,6 +176,7 @@ let init (cctxt : Protocol_client_context.full) ~data_dir mode _; } as configuration) = let open Lwt_result_syntax in + let* lockfile = lock ~data_dir in let dal_cctxt = Option.map Dal_node_client.make_unix_cctxt dal_node_endpoint in @@ -208,11 +242,12 @@ let init (cctxt : Protocol_client_context.full) ~data_dir mode fee_parameters; protocol_constants; loser_mode; + lockfile; store; context; } -let close {cctxt; store; context; l1_ctxt; _} = +let close ({cctxt; store; context; l1_ctxt; _} as node_ctxt) = let open Lwt_result_syntax in let message = cctxt#message in let*! () = message "Shutting down L1@." in @@ -221,6 +256,8 @@ let close {cctxt; store; context; l1_ctxt; _} = let*! () = Context.close context in let*! () = message "Closing store@." in let* () = Store.close store in + let*! () = message "Releasing lock@." in + let*! 
() = unlock node_ctxt in return_unit let checkout_context node_ctxt block_hash = @@ -757,6 +794,7 @@ module Internal_for_tests = struct |> Data_encoding.Binary.to_bytes_exn Constants_repr.encoding |> Data_encoding.Binary.of_bytes_exn Constants.encoding in + let* lockfile = lock ~data_dir in let* store = Store.load Read_write @@ -794,6 +832,7 @@ module Internal_for_tests = struct fee_parameters = Configuration.default_fee_parameters; protocol_constants; loser_mode = Loser_mode.no_failures; + lockfile; store; context; } diff --git a/src/proto_alpha/lib_sc_rollup_node/node_context.mli b/src/proto_alpha/lib_sc_rollup_node/node_context.mli index 5e1123e56da65c06236523c961de36c4e1351a0a..7fc8e343c7087062160b515df1ab8149a6720946 100644 --- a/src/proto_alpha/lib_sc_rollup_node/node_context.mli +++ b/src/proto_alpha/lib_sc_rollup_node/node_context.mli @@ -65,6 +65,8 @@ type 'a t = { loser_mode : Loser_mode.t; (** If different from [Loser_mode.no_failures], the rollup node issues wrong commitments (for tests). *) + lockfile : Lwt_unix.file_descr; + (** A lock file acquired when the node starts. *) store : 'a store; (** The store for the persistent storage. *) context : 'a Context.index; (** The persistent context for the rollup node. 
*) diff --git a/src/proto_alpha/lib_sc_rollup_node/sc_rollup_node_errors.ml b/src/proto_alpha/lib_sc_rollup_node/sc_rollup_node_errors.ml index c0b3197a1bbede41f7a8025b0f0d387f70bba2c2..b00d2cb1e556b250c1d841d041b35057c818e119 100644 --- a/src/proto_alpha/lib_sc_rollup_node/sc_rollup_node_errors.ml +++ b/src/proto_alpha/lib_sc_rollup_node/sc_rollup_node_errors.ml @@ -57,6 +57,7 @@ type error += | No_publisher | Refutation_player_failed_to_start | No_refutation_coordinator + | Could_not_acquire_lock of string type error += Lost_game of Protocol.Alpha_context.Sc_rollup.Game.game_result @@ -345,4 +346,20 @@ let () = `Permanent Data_encoding.unit (function Refutation_player_failed_to_start -> Some () | _ -> None) - (fun () -> Refutation_player_failed_to_start) + (fun () -> Refutation_player_failed_to_start) ; + + register_error_kind + `Permanent + ~id:"could_not_acquire_lock" + ~title:"Could not acquire lock on data dir" + ~description:"Could not acquire lock on data dir." + ~pp:(fun ppf f -> + Format.fprintf + ppf + "Could not acquire lock on data directory, another rollup node may \ + already be running with this data. If this is not the case, consider \ + removing manually the file %S" + f) + Data_encoding.(obj1 (req "lock_file" string)) + (function Could_not_acquire_lock f -> Some f | _ -> None) + (fun f -> Could_not_acquire_lock f)