Commit

Make colorization configurable for both files and console output (huggingface#185)

* add colorize toggles for both console output and log files

* fix colorization of final output messages

* globally change colorization on executor init

* fix duplicated logging sink

* fix duplicated logging sink

* document new colorize options

* change colorization to env variables only

* bugfix

* nit

* nit
guipenedo authored May 17, 2024
1 parent 0ee28ca commit 777352d
Showing 23 changed files with 62 additions and 35 deletions.
6 changes: 6 additions & 0 deletions README.md
@@ -223,6 +223,12 @@ For a pipeline with `logging_dir` **mylogspath/exp1**, the following folder structure
```
</details>

### Colorization
Log messages support colorization. By default, colorization is auto-detected for console messages and disabled for log files (logs/task_XXXXX.log).
To explicitly enable or disable colorization, you may set the following environment variables (see the example below):
- `DATATROVE_COLORIZE_LOGS`: set to "1" to add ANSI colors to console log messages, or to "0" to disable colorization.
- `DATATROVE_COLORIZE_LOG_FILES`: set to "1" to add ANSI colors to log messages saved to logs/task_XXXXX.log, or to "0" to disable colorization.
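For example, you can export these variables in the shell that launches your pipeline, or set them in Python before importing datatrove, since they are read once, when `datatrove.utils.logging` is first imported. A minimal sketch (the `LocalPipelineExecutor` import stands in for whatever datatrove entry point you use):

```python
import os

# Must run before any datatrove import: the toggles are read at import time.
os.environ["DATATROVE_COLORIZE_LOGS"] = "1"  # force ANSI colors in console output
os.environ["DATATROVE_COLORIZE_LOG_FILES"] = "0"  # keep logs/task_XXXXX.log free of ANSI codes

from datatrove.executor import LocalPipelineExecutor  # noqa: E402
```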

## DataFolder / paths
Datatrove supports a wide variety of input/output sources through [fsspec](https://filesystem-spec.readthedocs.io/en/latest/).

11 changes: 8 additions & 3 deletions src/datatrove/executor/base.py
@@ -5,11 +5,16 @@
from collections.abc import Sequence
from typing import Callable

-from loguru import logger

from datatrove.io import DataFolderLike, get_datafolder
from datatrove.pipeline.base import PipelineStep
-from datatrove.utils.logging import add_task_logger, close_task_logger, get_random_str, get_timestamp, log_pipeline
+from datatrove.utils.logging import (
+    add_task_logger,
+    close_task_logger,
+    get_random_str,
+    get_timestamp,
+    log_pipeline,
+    logger,
+)
from datatrove.utils.stats import PipelineStats


2 changes: 1 addition & 1 deletion src/datatrove/executor/local.py
@@ -4,11 +4,11 @@
from typing import Callable

import multiprocess
-from loguru import logger

from datatrove.executor.base import PipelineExecutor
from datatrove.io import DataFolderLike
from datatrove.pipeline.base import PipelineStep
+from datatrove.utils.logging import logger
from datatrove.utils.stats import PipelineStats


3 changes: 1 addition & 2 deletions src/datatrove/executor/slurm.py
@@ -15,12 +15,11 @@

import dill
from dill import CONTENTS_FMODE
-from loguru import logger

from datatrove.executor.base import PipelineExecutor
from datatrove.io import DataFolderLike
from datatrove.pipeline.base import PipelineStep
-from datatrove.utils.logging import get_random_str, get_timestamp
+from datatrove.utils.logging import get_random_str, get_timestamp, logger


def requeue_handler(signum, _frame):
3 changes: 2 additions & 1 deletion src/datatrove/io.py
@@ -9,7 +9,8 @@
from fsspec.implementations.dirfs import DirFileSystem
from fsspec.implementations.local import LocalFileSystem
from huggingface_hub import HfFileSystem, cached_assets_path
-from loguru import logger
+
+from datatrove.utils.logging import logger


class OutputFileManager:
2 changes: 1 addition & 1 deletion src/datatrove/pipeline/decont/n_grams.py
@@ -11,14 +11,14 @@
from typing import Tuple

import numpy as np
-from loguru import logger

from datatrove.data import Document, DocumentsPipeline
from datatrove.io import DataFolderLike, file_exists, get_datafolder, open_file
from datatrove.pipeline.base import PipelineStep
from datatrove.pipeline.filters.base_filter import BaseFilter
from datatrove.pipeline.writers.disk_base import DiskWriter
from datatrove.utils.binaryio import read_np_from_file
+from datatrove.utils.logging import logger
from datatrove.utils.text import TextNormConfig, simplify_text, xxhash64


2 changes: 1 addition & 1 deletion src/datatrove/pipeline/dedup/bloom_filter.py
@@ -2,12 +2,12 @@
import math

import numpy as np
-from loguru import logger

from datatrove.data import Document, DocumentsPipeline
from datatrove.io import DataFolderLike, get_datafolder
from datatrove.pipeline.base import PipelineStep
from datatrove.pipeline.writers.disk_base import DiskWriter
+from datatrove.utils.logging import logger
from datatrove.utils.text import DEF_TEXT_NORM_CONFIG, TextNormConfig, sha1_hash32, simplify_text
from datatrove.utils.typeshelper import StatHints

2 changes: 1 addition & 1 deletion src/datatrove/pipeline/dedup/exact_substrings.py
@@ -18,10 +18,10 @@
from typing import BinaryIO, Generator

import numpy as np
-from loguru import logger

from datatrove.io import DataFolderLike, get_datafolder
from datatrove.pipeline.base import DocumentsPipeline, PipelineStep
+from datatrove.utils.logging import logger

from ...utils.tokenization import PipelineStepWithTokenizer
from ...utils.typeshelper import ExtensionHelperES as EH
2 changes: 1 addition & 1 deletion src/datatrove/pipeline/dedup/minhash.py
@@ -8,13 +8,13 @@

import numpy as np
from fsspec.spec import AbstractBufferedFile
-from loguru import logger

from datatrove.data import DocumentsPipeline
from datatrove.io import DataFolderLike, get_datafolder
from datatrove.pipeline.base import PipelineStep
from datatrove.pipeline.writers.disk_base import DiskWriter
from datatrove.utils.binaryio import read_tuples_from_file, seek_to_start
+from datatrove.utils.logging import logger
from datatrove.utils.text import TextNormConfig, sha1_hash32, sha1_hash64, simplify_text
from datatrove.utils.typeshelper import StatHints

2 changes: 1 addition & 1 deletion src/datatrove/pipeline/dedup/sentence_dedup.py
@@ -17,13 +17,13 @@

import numpy as np
from fsspec.spec import AbstractBufferedFile
-from loguru import logger
from tqdm import tqdm

from datatrove.data import Document, DocumentsPipeline
from datatrove.io import DataFolderLike, get_datafolder
from datatrove.pipeline.base import PipelineStep
from datatrove.utils.binaryio import read_np_from_file, read_tuples_from_file
+from datatrove.utils.logging import logger
from datatrove.utils.text import SPLIT_TEXT_SENTENCES, TextNormConfig, sha1_hash64, simplify_text, split_into_parts
from datatrove.utils.typeshelper import ExtensionHelperSD, StatHints

2 changes: 1 addition & 1 deletion src/datatrove/pipeline/dedup/url_dedup.py
@@ -11,13 +11,13 @@

import numpy as np
from fsspec.spec import AbstractBufferedFile
-from loguru import logger
from tqdm import tqdm

from datatrove.data import Document, DocumentsPipeline
from datatrove.io import DataFolderLike, get_datafolder
from datatrove.pipeline.base import PipelineStep
from datatrove.utils.binaryio import read_np_from_file, read_tuples_from_file
+from datatrove.utils.logging import logger
from datatrove.utils.text import xxhash64
from datatrove.utils.typeshelper import ExtensionHelperSD, StatHints

3 changes: 1 addition & 2 deletions src/datatrove/pipeline/extractors/base.py
@@ -1,10 +1,9 @@
from abc import abstractmethod
from concurrent.futures import ThreadPoolExecutor

-from loguru import logger

from datatrove.data import DocumentsPipeline
from datatrove.pipeline.base import PipelineStep
+from datatrove.utils.logging import logger
from datatrove.utils.typeshelper import StatHints


2 changes: 1 addition & 1 deletion src/datatrove/pipeline/filters/unigram_log_probs.py
@@ -4,11 +4,11 @@

import numpy as np
from huggingface_hub import cached_assets_path
-from loguru import logger

from datatrove.data import Document
from datatrove.pipeline.filters.base_filter import BaseFilter
from datatrove.pipeline.writers.disk_base import DiskWriter
+from datatrove.utils.logging import logger


UNIGRAM_DOWNLOAD = "https://ai2-s2-research-public.s3-us-west-2.amazonaws.com/lucas/google-1T-unigram/unigram_freq.csv"
2 changes: 1 addition & 1 deletion src/datatrove/pipeline/filters/url_filter.py
@@ -4,11 +4,11 @@
from typing import Iterable

from huggingface_hub import cached_assets_path
-from loguru import logger

from datatrove.data import Document
from datatrove.io import safely_create_file
from datatrove.utils._import_utils import ASSETS_PATH
+from datatrove.utils.logging import logger

from ..writers.disk_base import DiskWriter
from .base_filter import BaseFilter
2 changes: 1 addition & 1 deletion src/datatrove/pipeline/readers/base.py
@@ -3,12 +3,12 @@
from types import MethodType
from typing import Callable

-from loguru import logger
from tqdm import tqdm

from datatrove.data import Document, DocumentsPipeline
from datatrove.io import DataFolderLike, get_datafolder
from datatrove.pipeline.base import PipelineStep
+from datatrove.utils.logging import logger


class BaseReader(PipelineStep):
3 changes: 1 addition & 2 deletions src/datatrove/pipeline/readers/jsonl.py
@@ -2,10 +2,9 @@
from json import JSONDecodeError
from typing import Callable, Literal

-from loguru import logger

from datatrove.io import DataFolderLike
from datatrove.pipeline.readers.base import BaseDiskReader
+from datatrove.utils.logging import logger


class JsonlReader(BaseDiskReader):
2 changes: 1 addition & 1 deletion src/datatrove/pipeline/tokens/context_shuffler.py
@@ -1,13 +1,13 @@
import mmap

import numpy as np
-from loguru import logger
from numpy.random import default_rng

from datatrove.data import DocumentsPipeline
from datatrove.io import DataFolderLike, get_datafolder
from datatrove.pipeline.base import PipelineStep
from datatrove.pipeline.tokens.merger import load_doc_ends
+from datatrove.utils.logging import logger


class DocumentTokenizerContextShuffler(PipelineStep):
2 changes: 1 addition & 1 deletion src/datatrove/pipeline/tokens/tokenizer.py
@@ -3,11 +3,11 @@

import humanize
import numpy as np
-from loguru import logger
from numpy.random import default_rng

from datatrove.data import Document, DocumentsPipeline
from datatrove.io import DataFolder, DataFolderLike, get_datafolder
+from datatrove.utils.logging import logger
from datatrove.utils.tokenization import PipelineStepWithTokenizer, batched


2 changes: 1 addition & 1 deletion src/datatrove/pipeline/writers/huggingface.py
@@ -11,10 +11,10 @@
preupload_lfs_files,
)
from huggingface_hub.utils import HfHubHTTPError
-from loguru import logger

from datatrove.io import DataFolderLike, get_datafolder
from datatrove.pipeline.writers import ParquetWriter
+from datatrove.utils.logging import logger


MAX_RETRIES = 12
2 changes: 1 addition & 1 deletion src/datatrove/tools/failed_logs.py
@@ -3,12 +3,12 @@
import os.path
import re

-from loguru import logger
from rich.console import Console
from rich.prompt import Confirm

from datatrove.io import get_datafolder
from datatrove.utils._import_utils import is_rich_available
+from datatrove.utils.logging import logger


if not is_rich_available():
2 changes: 1 addition & 1 deletion src/datatrove/tools/jobs_status.py
@@ -2,11 +2,11 @@
import json
import os.path

-from loguru import logger
from rich.console import Console

from datatrove.io import get_datafolder
from datatrove.utils._import_utils import is_rich_available
+from datatrove.utils.logging import logger


if not is_rich_available():
2 changes: 1 addition & 1 deletion src/datatrove/tools/merge_stats.py
@@ -2,10 +2,10 @@
import json
import os.path

-from loguru import logger
from tqdm import tqdm

from datatrove.io import get_datafolder, open_file
+from datatrove.utils.logging import logger
from datatrove.utils.stats import PipelineStats


36 changes: 27 additions & 9 deletions src/datatrove/utils/logging.py
@@ -1,11 +1,19 @@
+import os
import random
import string
import sys
from datetime import datetime

from loguru import logger

-from datatrove.io import DataFolder
+
+def get_env_bool(name, default=None):
+    env_var = os.environ.get(name, None)
+    return default if env_var is None else (env_var.lower().strip() in ("yes", "true", "t", "1"))
+
+
+DATATROVE_COLORIZE_LOGS = get_env_bool("DATATROVE_COLORIZE_LOGS")
+DATATROVE_COLORIZE_LOG_FILES = get_env_bool("DATATROVE_COLORIZE_LOG_FILES", False)


def get_timestamp() -> str:
@@ -29,21 +37,24 @@ def get_random_str(length=5):
    return "".join(random.choice(string.ascii_lowercase) for _ in range(length))


-def add_task_logger(logging_dir: DataFolder, rank: int, local_rank: int = 0):
+def add_task_logger(
+    logging_dir,
+    rank: int,
+    local_rank: int = 0,
+):
    """
    Sets up logging for a given task
    Args:
-        logging_dir: DataFolder:
+        logging_dir: DataFolder
        rank: int:
        local_rank: int: (Default value = 0)
    Returns:
    """
    logger.remove()
    logfile = logging_dir.open(f"logs/task_{rank:05d}.log", "w")
-    logger.add(sys.stderr, level="INFO" if local_rank == 0 else "ERROR")
-    logger.add(logfile, colorize=True, level="DEBUG")
+    logger.add(sys.stderr, colorize=DATATROVE_COLORIZE_LOGS, level="INFO" if local_rank == 0 else "ERROR")
+    logger.add(logfile, colorize=DATATROVE_COLORIZE_LOG_FILES, level="DEBUG")
    logger.info(f"Launching pipeline for {rank=}")
    return logfile

@@ -53,14 +64,17 @@ def close_task_logger(logfile):
    Close logfile and reset logging setup
    Args:
        logfile:
    Returns:
    """
    logger.complete()
    logger.remove()
+    setup_default_logger()  # re-add default logger
    logfile.close()
-    logger.add(sys.stderr)  # re-add default logger


+def setup_default_logger():
+    logger.remove()
+    logger.add(sys.stderr, colorize=DATATROVE_COLORIZE_LOGS)


def log_pipeline(pipeline):
@@ -74,3 +88,7 @@ def log_pipeline(pipeline):
    """
    steps = "\n".join([pipe.__repr__() if callable(pipe) else "Iterable" for pipe in pipeline])
    logger.info(f"\n--- 🛠️ PIPELINE 🛠\n{steps}")
+
+
+# set colorization based on env vars
+setup_default_logger()
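As a quick illustration of the toggle semantics introduced above (a sketch, not part of the commit): `get_env_bool` returns the given default when a variable is unset, so `DATATROVE_COLORIZE_LOGS` defaults to `None`, which loguru treats as "auto-detect based on the sink", while `DATATROVE_COLORIZE_LOG_FILES` explicitly defaults to `False`:

```python
import os


def get_env_bool(name, default=None):
    # unset -> default; otherwise True iff the value is one of yes/true/t/1 (case-insensitive)
    env_var = os.environ.get(name, None)
    return default if env_var is None else (env_var.lower().strip() in ("yes", "true", "t", "1"))


os.environ["DATATROVE_COLORIZE_LOG_FILES"] = " True "
assert get_env_bool("DATATROVE_COLORIZE_LOG_FILES", False) is True  # casing and whitespace are tolerated
assert get_env_bool("DATATROVE_COLORIZE_LOGS") is None  # assuming the variable is unset: loguru then auto-detects
assert get_env_bool("DATATROVE_COLORIZE_LOGS", False) is False  # still unset, so the explicit default comes back
```

Note also the `logger.remove()` before `logger.add(...)` in `setup_default_logger`: re-adding the stderr sink without removing the previous one would emit every console message twice, which is the duplicated-sink issue mentioned in the commit history.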
