diff --git a/doc_src/_examples/disciplines/types/plot_retry_discipline.py b/doc_src/_examples/disciplines/types/plot_retry_discipline.py new file mode 100644 index 0000000000000000000000000000000000000000..d760a5d581e01f353fa2e854d5291718b6ba0447 --- /dev/null +++ b/doc_src/_examples/disciplines/types/plot_retry_discipline.py @@ -0,0 +1,196 @@ +# Copyright 2021 IRT Saint Exupéry, https://www.irt-saintexupery.com +# +# This work is licensed under a BSD 0-Clause License. +# +# Permission to use, copy, modify, and/or distribute this software +# for any purpose with or without fee is hereby granted. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL +# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL +# THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, +# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING +# FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, +# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION +# WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# Contributors: +# INITIAL AUTHORS - initial API and implementation and/or initial +# documentation +# :author: Jean-François Figué +# OTHER AUTHORS - MACROSCOPIC CHANGES +""" +Create a retry discipline +========================= + +Sometimes, +the execution of a discipline can fail and then succeed after several repetitions. +The :class:`.RetryDiscipline` facilitates the management of these failures and repetitions. +This example illustrates this feature.
+""" + +from __future__ import annotations + +import sys +import time +from typing import TYPE_CHECKING + +from numpy import array + +from gemseo import configure_logger +from gemseo import create_discipline +from gemseo.core.discipline import Discipline +from gemseo.disciplines.wrappers.retry_discipline import RetryDiscipline + +if TYPE_CHECKING: + from gemseo.typing import StrKeyMapping + +LOGGER = configure_logger() + +# %% +# Toy discipline +# -------------- +# For this example, +# we create an :class:`.AnalyticDiscipline` to evaluate the expression :math:`y=1/x`: +analytic_disc = create_discipline("AnalyticDiscipline", expressions={"y": "1/x"}) + +# %% +# This discipline will raise a ``ZeroDivisionError`` when :math:`x=0`. +# +# Execution without failure +# ------------------------- +# Let's wrap this toy discipline in a :class:`.RetryDiscipline` +# configured with a maximum of 3 execution attempts: +retry_disc = RetryDiscipline(analytic_disc, n_trials=3) + +# %% +# We can execute this :class:`.RetryDiscipline` at :math:`x=2`: +retry_disc.execute({"x": array([2.0])}) +retry_disc.io.data + +# %% +# and verify that the computation is correctly performed, :math:`y=0.5`, +# with only one execution attempt: +retry_disc.n_executions + +# %% +# Execution with failure +# ---------------------- +# If an exception like a ``ZeroDivisionError`` occurs, +# we may not want to retry the execution but do something else instead. +# To do this, +# we need to define the fatal exceptions for which the execution is not retried. +# It means that if that error is raised, +# then the discipline :class:`.RetryDiscipline` will stop execution +# rather than retrying an attempt.
+retry_disc = RetryDiscipline(
+    analytic_disc, n_trials=3, fatal_exceptions=[ZeroDivisionError]
+)
+
+try:
+    retry_disc.execute({"x": array([0.0])})  # x=0 triggers the division by zero
+except ZeroDivisionError:
+    LOGGER.info("Manage this fatal exception.")
+
+# %%
+# We can verify that the number of attempts is only :math:`1`:
+retry_disc.n_executions
+
+# %%
+# To highlight the use of the ``n_trials`` parameter, let's try another toy discipline,
+# which will crash the first 2 executions and finally succeed at the third attempt.
+
+
+class FictiveDiscipline(Discipline):
+    """A discipline failing at its first two executions.
+
+    - The first 2 times, raise a ``RuntimeError``,
+    - and finally succeed by returning empty output data.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.attempt = 0  # Number of times ``_run`` has been called.
+
+    def _run(self, input_data: StrKeyMapping) -> StrKeyMapping:
+        self.attempt += 1
+        LOGGER.info("attempt: %s", self.attempt)
+        if self.attempt < 3:
+            raise RuntimeError
+        return {}
+
+
+# %%
+# We can then illustrate the use of the ``n_trials`` parameter. Here, we set this
+# value to 4 on purpose, knowing that the discipline will succeed at the third trial:
+
+
+test_n_trials = FictiveDiscipline()
+retry_disc = RetryDiscipline(test_n_trials, n_trials=4)
+
+retry_disc.execute()
+
+# %%
+# and verify that the calculation succeeded after 3 attempts:
+retry_disc.n_executions
+
+# %%
+# Limit the execution time
+# ------------------------
+# If you want to limit the duration of the wrapped discipline,
+# use the ``timeout`` option.
+# Here is an example of a discipline
+# whose execution does nothing except sleep for 5 seconds:
+
+
+class DisciplineLongTimeRunning(Discipline):
+    """A discipline sleeping for 5 seconds, used to illustrate the timeout feature."""
+
+    def _run(self, input_data: StrKeyMapping) -> None:
+        time.sleep(5.0)  # Longer than the 2-second timeout used below.
+
+
+# %%
+# Now we wrap it in :class:`.RetryDiscipline`,
+# set the ``timeout`` argument to 2 seconds
+# and execute this new discipline:
+
+retry_disc = RetryDiscipline(DisciplineLongTimeRunning(), n_trials=1, timeout=2.0)
+
+sys.tracebacklimit = 0
+try:
+    LOGGER.info("Running discipline...")
+    retry_disc.execute({})
+    LOGGER.info("Discipline completed without reaching the time limit.")
+except TimeoutError:
+    LOGGER.info("Discipline stopped, due to a TimeoutError.")
+finally:
+    del sys.tracebacklimit  # Restore the default traceback depth.
+
+# %%
+# In the log,
+# we can see the initial and final times of the discipline execution.
+# We can also read that the timeout is reached.
+#
+# In some cases, this option could be very useful.
+# For example if you wrap an SSH discipline
+# (see the ``gemseo-ssh`` plugin)
+# in :class:`.RetryDiscipline`.
+# In that context,
+# it can be important to limit the duration when an SSH connection is too slow.
+
+# %%
+# .. note::
+#
+#    The user can build their :class:`.RetryDiscipline` with a combination of all the
+#    available parameters.
+#    Some attributes of the discipline are public and can be modified after
+#    instantiation (``fatal_exceptions``, ``n_trials``, ...).
+#
+# .. note::
+#
+#    In the previous example, we set ``sys.tracebacklimit = 0`` to shorten
+#    the messages output by the exception, so that the output focuses only on
+#    what this example aims to demonstrate, and we restored the default afterwards.
+#    Please don't put this statement in normal use, otherwise you could miss some
+#    important messages in the output.
+# diff --git a/src/gemseo/disciplines/wrappers/retry_discipline.py b/src/gemseo/disciplines/wrappers/retry_discipline.py index d7d617713d4411a3b2eb9b24538ade080e443b36..aff19faac1ceedaca2e6789d0d2981358133f37d 100644 --- a/src/gemseo/disciplines/wrappers/retry_discipline.py +++ b/src/gemseo/disciplines/wrappers/retry_discipline.py @@ -27,6 +27,7 @@ from typing import TYPE_CHECKING from typing import ClassVar from gemseo.core.discipline import Discipline +from gemseo.core.execution_status import ExecutionStatus if TYPE_CHECKING: from collections.abc import Iterable @@ -41,7 +42,7 @@ class RetryDiscipline(Discipline): This :class:`.Discipline` wraps another discipline. - It can be executed multiple times (up to a specified number of retries) + It can be executed multiple times (up to a specified number of trials) if the previous attempts fail to produce any result. A timeout in seconds can be specified to prevent executions from becoming stuck. @@ -66,11 +67,11 @@ class RetryDiscipline(Discipline): ) """The possible timeout exceptions that can be raised during execution.""" - n_retry: int - """The number of retry of the discipline.""" + n_trials: int + """The number of trials to execute the discipline.""" wait_time: float - """The time to wait between 2 retries (in seconds).""" + """The time to wait between 2 trials (in seconds).""" timeout: float """The maximum duration, in seconds, that the discipline is allowed to run.""" @@ -82,7 +83,7 @@ class RetryDiscipline(Discipline): def __init__( self, discipline: Discipline, - n_retry: int = 5, + n_trials: int = 5, wait_time: float = 0.0, timeout: float = math.inf, fatal_exceptions: Iterable[type[Exception]] = (), @@ -90,8 +91,8 @@ class RetryDiscipline(Discipline): """ Args: discipline: The discipline to wrap in the retry loop. - n_retry: The number of retry of the discipline. - wait_time: The time to wait between 2 retries (in seconds). + n_trials: The number of trials of the discipline. 
+ wait_time: The time to wait between 2 trials (in seconds). timeout: The maximum duration, in seconds, that the discipline is allowed to run. If this time limit is exceeded, the execution is terminated. If ``math.inf``, the @@ -109,7 +110,7 @@ class RetryDiscipline(Discipline): self.__discipline = discipline self.io.input_grammar = discipline.io.input_grammar self.io.output_grammar = discipline.io.output_grammar - self.n_retry = n_retry + self.n_trials = n_trials self.wait_time = wait_time self.timeout = timeout self.fatal_exceptions = fatal_exceptions @@ -123,11 +124,13 @@ class RetryDiscipline(Discipline): def _run(self, input_data: StrKeyMapping) -> StrKeyMapping | None: self.__n_executions = 0 - for n_try in range(1, self.n_retry + 1): + for n_trial in range(1, self.n_trials + 1): self.__n_executions += 1 LOGGER.debug( - "Trying to execute the discipline: attempt %d/%d", n_try, self.n_retry + "Trying to execute the discipline: attempt %d/%d", + n_trial, + self.n_trials, ) try: @@ -144,7 +147,7 @@ class RetryDiscipline(Discipline): current_error = TimeoutError(msg) except Exception as error: # noqa: BLE001 - if isinstance(error, self.fatal_exceptions): + if isinstance(error, tuple(self.fatal_exceptions)): LOGGER.info( "Failed to execute discipline %s, " "aborting retry because of the exception type %s.", @@ -155,14 +158,16 @@ class RetryDiscipline(Discipline): current_error = error time.sleep(self.wait_time) + self.__discipline.execution_status.value = ExecutionStatus.Status.DONE - plural_suffix = "s" if self.n_retry > 1 else "" + plural_suffix = "s" if self.n_trials > 1 else "" LOGGER.error( "Failed to execute discipline %s after %d attempt%s.", self.__discipline.name, - self.n_retry, + self.n_trials, plural_suffix, ) + raise current_error def _execute_discipline(self, input_data: StrKeyMapping) -> StrKeyMapping: diff --git a/tests/disciplines/wrappers/test_retry_discipline.py b/tests/disciplines/wrappers/test_retry_discipline.py index 
8d2e3ff85dba52a1f04d3753a00ec792ae9604b9..7af07c2e9d2a9e0ee5af87380731586a82818f32 100644 --- a/tests/disciplines/wrappers/test_retry_discipline.py +++ b/tests/disciplines/wrappers/test_retry_discipline.py @@ -85,13 +85,13 @@ def test_retry_discipline(an_analytic_discipline, timeout, caplog) -> None: @pytest.mark.parametrize("wait_time", [0.5, 1.0]) -@pytest.mark.parametrize("n_retry", [1, 3]) +@pytest.mark.parametrize("n_trials", [1, 3]) def test_failure_retry_discipline_with_timeout( - an_analytic_discipline, n_retry, wait_time, caplog + an_analytic_discipline, n_trials, wait_time, caplog ) -> None: """Test failure of the discipline with a too much very short timeout.""" disc_with_timeout = RetryDiscipline( - an_analytic_discipline, timeout=1e-4, n_retry=n_retry, wait_time=wait_time + an_analytic_discipline, timeout=1e-4, n_trials=n_trials, wait_time=wait_time ) with ( @@ -105,16 +105,16 @@ def test_failure_retry_discipline_with_timeout( disc_with_timeout.execute({"x": array([4.0])}) elapsed_time = timer.elapsed_time - assert elapsed_time > 0.05 + (n_retry - 1) * wait_time + assert elapsed_time > 0.05 + (n_trials - 1) * wait_time - assert disc_with_timeout.n_executions == n_retry + assert disc_with_timeout.n_executions == n_trials assert disc_with_timeout.local_data == {"x": array([4.0])} assert "Process stopped as it exceeds timeout" in caplog.text - plural_suffix = "s" if n_retry > 1 else "" + plural_suffix = "s" if n_trials > 1 else "" log_message = ( - f"Failed to execute discipline AnalyticDiscipline after {n_retry}" + f"Failed to execute discipline AnalyticDiscipline after {n_trials}" f" attempt{plural_suffix}." ) assert log_message in caplog.text @@ -123,9 +123,9 @@ def test_failure_retry_discipline_with_timeout( def test_failure_zero_division_error(a_crashing_analytic_discipline, caplog) -> None: """Test failure of the discipline with a bad x entry. 
- In order to catch the ZeroDivisionError, set n_retry=1 + In order to catch the ZeroDivisionError, set n_trials=1 """ - disc = RetryDiscipline(a_crashing_analytic_discipline, n_retry=1) + disc = RetryDiscipline(a_crashing_analytic_discipline, n_trials=1) with pytest.raises(ZeroDivisionError, match="float division by zero"): disc.execute({"x": array([0.0])}) @@ -144,9 +144,9 @@ def test_failure_zero_division_error(a_crashing_analytic_discipline, caplog) -> (OverflowError, FloatingPointError, ZeroDivisionError), ], ) -@pytest.mark.parametrize("n_try", [1, 3]) +@pytest.mark.parametrize("n_trials", [1, 3]) def test_failure_zero_division_error_with_timeout( - n_try: int, + n_trials: int, fatal_exceptions: Iterable[type[Exception]], a_crashing_analytic_discipline, caplog, @@ -154,11 +154,11 @@ def test_failure_zero_division_error_with_timeout( """Test failure of the discipline with timeout and a bad x entry. In order to catch the ZeroDivisionError that arises before timeout (5s), test with - n_retry=1 and 3 to be sure every case is ok. + n_trials=1 and 3 to be sure every case is ok. 
""" disc = RetryDiscipline( a_crashing_analytic_discipline, - n_retry=n_try, + n_trials=n_trials, timeout=10.0, fatal_exceptions=fatal_exceptions, ) @@ -184,7 +184,7 @@ def test_a_not_implemented_error_analytic_discipline( """ retry_discipline = RetryDiscipline( a_crashing_discipline_in_run, - n_retry=5, + n_trials=5, timeout=100.0, fatal_exceptions=( ZeroDivisionError, @@ -212,10 +212,10 @@ def test_retry_discipline_timeout_feature( a_long_time_running_discipline, caplog ) -> None: """Test the timeout feature of discipline with a long computation.""" - n_retry = 1 + n_trials = 1 disc_with_timeout = RetryDiscipline( - a_long_time_running_discipline, timeout=2.0, n_retry=n_retry + a_long_time_running_discipline, timeout=2.0, n_trials=n_trials ) with pytest.raises( TimeoutError, @@ -224,7 +224,7 @@ def test_retry_discipline_timeout_feature( ): disc_with_timeout.execute({"x": array([0.0])}) - assert disc_with_timeout.n_executions == n_retry + assert disc_with_timeout.n_executions == n_trials assert disc_with_timeout.local_data == {} assert "Process stopped as it exceeds timeout" in caplog.text