Add description for randomize_start #194

Merged
merged 3 commits on May 27, 2024
1 change: 1 addition & 0 deletions README.md
@@ -96,6 +96,7 @@ Some options common to all executors:
- `pipeline` a list consisting of the pipeline steps that should be run
- `logging_dir` a datafolder where log files, statistics and more should be saved. Do not reuse folders for different pipelines/jobs as this will overwrite your stats, logs and completions.
- `skip_completed` (_bool_, `True` by default) datatrove keeps track of completed tasks so that when you relaunch a job they can be skipped. Set this to `False` to disable this behaviour
- `randomize_start` (_bool_, `False` by default) randomizes the start time of each task in a job within an approximately 3-minute window, to prevent all tasks from starting simultaneously and potentially overloading the system.

Call an executor's `run` method to execute its pipeline.
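For context, a minimal usage sketch of the new option (the pass-through step and its signature are illustrative assumptions; the executor arguments follow the options listed above):

```python
from datatrove.executor import LocalPipelineExecutor

def noop_step(data, rank: int, world_size: int):
    # Hypothetical pass-through step; real pipelines would use datatrove PipelineSteps.
    yield from data or []

executor = LocalPipelineExecutor(
    pipeline=[noop_step],
    tasks=4,
    logging_dir="logs/randomize_start_demo",
    skip_completed=True,
    randomize_start=True,  # each task sleeps up to ~3 minutes before it starts working
)
executor.run()
```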

8 changes: 8 additions & 0 deletions src/datatrove/executor/base.py
@@ -1,5 +1,7 @@
import dataclasses
import json
import random
import time
from abc import ABC, abstractmethod
from collections import deque
from collections.abc import Sequence
@@ -27,6 +29,7 @@ class PipelineExecutor(ABC):
logging_dir: where to save logs, stats, etc. Should be parsable into a datatrove.io.DataFolder
skip_completed: whether to skip tasks that were completed in
previous runs. default: True
randomize_start: randomize the start time of each task in a job within a ~3 min window
"""

@abstractmethod
@@ -35,10 +38,12 @@ def __init__(
pipeline: list[PipelineStep | Callable],
logging_dir: DataFolderLike = None,
skip_completed: bool = True,
randomize_start: bool = False,
):
self.pipeline: list[PipelineStep | Callable] = pipeline
self.logging_dir = get_datafolder(logging_dir if logging_dir else f"logs/{get_timestamp()}_{get_random_str()}")
self.skip_completed = skip_completed
self.randomize_start = randomize_start

@abstractmethod
def run(self):
@@ -74,6 +79,9 @@ def _run_for_rank(self, rank: int, local_rank: int = 0) -> PipelineStats:
return PipelineStats()
logfile = add_task_logger(self.logging_dir, rank, local_rank)
log_pipeline(self.pipeline)

if self.randomize_start:
time.sleep(random.randint(0, 60 * 3))
try:
# pipe data from one step to the next
pipelined_data = None
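The change itself is small: before running its pipeline, each task now optionally sleeps a random amount of time. A standalone sketch of the same jitter idea using only the standard library (the helper name and default window are illustrative):

```python
import random
import time

def jitter_start(randomize_start: bool, window_seconds: int = 3 * 60) -> None:
    # Sleep a random number of whole seconds in [0, window_seconds] so that tasks
    # launched at the same moment do not all hit storage/logging at once.
    if randomize_start:
        time.sleep(random.randint(0, window_seconds))
```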
4 changes: 3 additions & 1 deletion src/datatrove/executor/local.py
@@ -30,6 +30,7 @@ class LocalPipelineExecutor(PipelineExecutor):
Tasks [local_rank_offset, local_rank_offset + local_tasks] will be run.
depends: another LocalPipelineExecutor that should run
before this one
randomize_start: randomize the start time of each task in a job within a ~3 min window
"""

def __init__(
@@ -43,8 +44,9 @@ def __init__(
start_method: str = "forkserver",
local_tasks: int = -1,
local_rank_offset: int = 0,
randomize_start: bool = False,
):
super().__init__(pipeline, logging_dir, skip_completed)
super().__init__(pipeline, logging_dir, skip_completed, randomize_start)
self.tasks = tasks
self.workers = workers if workers != -1 else tasks
self.start_method = start_method
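Concrete executors only need to accept the new keyword and forward it to the base class, as `LocalPipelineExecutor` does above. A partial sketch of that forwarding pattern (the subclass name is illustrative, and the abstract members a real executor must still implement are omitted):

```python
from datatrove.executor.base import PipelineExecutor

class MyPipelineExecutor(PipelineExecutor):
    def __init__(self, pipeline, logging_dir=None, skip_completed=True, randomize_start=False):
        # Forward the shared options, including the new flag, to PipelineExecutor.
        super().__init__(pipeline, logging_dir, skip_completed, randomize_start)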
5 changes: 1 addition & 4 deletions src/datatrove/executor/slurm.py
@@ -3,7 +3,6 @@
import json
import math
import os
import random
import signal
import subprocess
import sys
@@ -114,7 +113,7 @@ def __init__(
srun_args: dict = None,
tasks_per_job: int = 1,
):
super().__init__(pipeline, logging_dir, skip_completed)
super().__init__(pipeline, logging_dir, skip_completed, randomize_start)
self.tasks = tasks
self.workers = workers
self.partition = partition
@@ -178,8 +177,6 @@ def run(self):
break
rank = all_ranks[rank_to_run]

if self.randomize_start:
time.sleep(random.randint(0, 60 * 3))
self._run_for_rank(rank)
else:
# we still have to launch the job