From 673c071a0bb50f41757dc682c65462f38e2bdd98 Mon Sep 17 00:00:00 2001 From: Hugo Perrier Date: Mon, 9 Dec 2024 14:09:25 +0100 Subject: [PATCH] :recycle: Refactor ContentTagger mechanism --- melusine/base.py | 2 +- melusine/conf/pipelines/demo_pipeline.yaml | 3 + melusine/conf/processors/refined_tagger.yaml | 2 +- melusine/detectors.py | 22 +- melusine/{io => io_mixin}/__init__.py | 2 +- melusine/{io => io_mixin}/_classes.py | 4 - melusine/message.py | 56 +- melusine/pipeline.py | 2 +- melusine/processors.py | 87 +- tests/base/test_message.py | 56 +- tests/detectors/test_thanks_detector.py | 30 +- .../detectors/test_vacation_reply_detector.py | 46 +- tests/functional/test_emails_fixtures.py | 109 ++- tests/{io => io_mixin}/__init__.py | 0 tests/{io => io_mixin}/test_io_mixin.py | 4 +- .../processors/test_content_refined_tagger.py | 819 ++++++++++++++++++ tests/processors/test_content_tagger.py | 787 ----------------- tests/processors/test_processors.py | 105 ++- 18 files changed, 1162 insertions(+), 974 deletions(-) rename melusine/{io => io_mixin}/__init__.py (66%) rename melusine/{io => io_mixin}/_classes.py (93%) rename tests/{io => io_mixin}/__init__.py (100%) rename tests/{io => io_mixin}/test_io_mixin.py (93%) create mode 100644 tests/processors/test_content_refined_tagger.py delete mode 100644 tests/processors/test_content_tagger.py diff --git a/melusine/base.py b/melusine/base.py index 36c347a9..b7cd1996 100644 --- a/melusine/base.py +++ b/melusine/base.py @@ -25,7 +25,7 @@ from sklearn.base import BaseEstimator, TransformerMixin from melusine.backend import backend -from melusine.io import IoMixin +from melusine.io_mixin import IoMixin logger = logging.getLogger(__name__) diff --git a/melusine/conf/pipelines/demo_pipeline.yaml b/melusine/conf/pipelines/demo_pipeline.yaml index 0b13b1f2..3da0c72c 100644 --- a/melusine/conf/pipelines/demo_pipeline.yaml +++ b/melusine/conf/pipelines/demo_pipeline.yaml @@ -12,6 +12,9 @@ demo_pipeline: - class_name: 
ContentTagger config_key: content_tagger module: melusine.processors + - class_name: RefinedTagger + config_key: refined_tagger + module: melusine.processors - class_name: TextExtractor config_key: text_extractor module: melusine.processors diff --git a/melusine/conf/processors/refined_tagger.yaml b/melusine/conf/processors/refined_tagger.yaml index 1f3bd7bd..321a9127 100644 --- a/melusine/conf/processors/refined_tagger.yaml +++ b/melusine/conf/processors/refined_tagger.yaml @@ -1,2 +1,2 @@ -content_tagger: +refined_tagger: default_tag: BODY diff --git a/melusine/detectors.py b/melusine/detectors.py index 96bf51a7..905c0c0f 100644 --- a/melusine/detectors.py +++ b/melusine/detectors.py @@ -6,7 +6,7 @@ """ -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List from melusine.base import MelusineDetector, MelusineItem, MelusineRegex from melusine.message import Message @@ -95,19 +95,12 @@ def pre_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineIte target_tags={self.BODY_PART}, stop_at={self.GREETINGS_PART} ) - # Extract the THANKS part in the last message - thanks_parts: List[Tuple[str, str]] = row[self.messages_column][0].extract_parts(target_tags={self.THANKS_PART}) - - # Compute THANKS text - if not thanks_parts: - thanks_text: str = "" - else: - thanks_text = "\n".join(x[1] for x in thanks_parts) + # Extract the THANKS text in the last message + thanks_text = row[self.messages_column][0].extract_text(target_tags={self.THANKS_PART}) # Save debug data if debug_mode: debug_dict = { - self.THANKS_PARTS_COL: thanks_parts, self.THANKS_TEXT_COL: thanks_text, self.HAS_BODY: has_body, } @@ -236,20 +229,13 @@ def pre_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineIte """ # Last message body last_message: Message = row[self.messages_column][0] - body_parts = last_message.extract_last_body() - - if body_parts: - row[self.CONST_TEXT_COL_NAME] = "\n".join(text for tag, text in body_parts) - else: - 
row[self.CONST_TEXT_COL_NAME] = "" + row[self.CONST_TEXT_COL_NAME] = last_message.extract_text(target_tags=("BODY",), stop_at=("GREETINGS",)) # Prepare and save debug data if debug_mode: debug_dict: Dict[str, Any] = { self.CONST_DEBUG_TEXT_KEY: row[self.CONST_TEXT_COL_NAME], } - if self.messages_column: - debug_dict[self.CONST_DEBUG_PARTS_KEY] = body_parts row[self.debug_dict_col].update(debug_dict) return row diff --git a/melusine/io/__init__.py b/melusine/io_mixin/__init__.py similarity index 66% rename from melusine/io/__init__.py rename to melusine/io_mixin/__init__.py index 8c9e89af..1249eeb6 100644 --- a/melusine/io/__init__.py +++ b/melusine/io_mixin/__init__.py @@ -2,6 +2,6 @@ The melusine.io module includes classes for input/output data. """ -from melusine.io._classes import IoMixin +from melusine.io_mixin._classes import IoMixin __all__ = ["IoMixin"] diff --git a/melusine/io/_classes.py b/melusine/io_mixin/_classes.py similarity index 93% rename from melusine/io/_classes.py rename to melusine/io_mixin/_classes.py index e7c6c24f..f5128ba3 100644 --- a/melusine/io/_classes.py +++ b/melusine/io_mixin/_classes.py @@ -27,10 +27,6 @@ class IoMixin: Defines generic load methods. 
""" - def __init__(self, **kwargs: Any): - """Initialize attribute.""" - self.json_exclude_list: list[str] = ["_func", "json_exclude_list"] - @classmethod def from_config( cls: type[T], diff --git a/melusine/message.py b/melusine/message.py index bcac5797..4f4b112d 100644 --- a/melusine/message.py +++ b/melusine/message.py @@ -8,7 +8,7 @@ import re from datetime import datetime -from typing import Any, Dict, Iterable, List, Optional, Tuple +from typing import Any, Dict, Iterable, List, Optional from melusine import config @@ -22,6 +22,7 @@ class Message: DEFAULT_STR_TAG_NAME_LENGTH = 22 MAIN_TAG_TYPE = "refined_tag" FALLBACK_TAG_TYPE = "base_tag" + MAIN_TEXT_TYPE = "base_text" def __init__( self, @@ -65,6 +66,9 @@ def __init__( self.clean_header: str = "" self.clean_text: str = "" + self.effective_tag_key = "base_tag" + self.effective_text_key = "base_text" + @property def str_tag_name_length(self) -> int: """ @@ -89,7 +93,7 @@ def extract_parts( self, target_tags: Optional[Iterable[str]] = None, stop_at: Optional[Iterable[str]] = None, - tag_type: str = MAIN_TAG_TYPE, + tag_type: Optional[str] = None, ) -> List[Dict[str, Any]]: """ Function to extract target tags from the message. 
@@ -110,13 +114,11 @@ def extract_parts( if not self.tags: return [] + if tag_type is None: + tag_type = self.effective_tag_key + # List of tags in the message - try: - tag_name_list: List[str] = [x[tag_type] for x in self.tags] - # If tag_type is not available, fall back on base_tag - except KeyError: - tag_type = self.FALLBACK_TAG_TYPE - tag_name_list: List[str] = [x[tag_type] for x in self.tags] + tag_name_list: List[str] = [x[tag_type] for x in self.tags] if target_tags is None: target_tags = tag_name_list @@ -135,11 +137,42 @@ def extract_parts( return [x for x in effective_tags if x[tag_type] in target_tags] + def extract_text( + self, + target_tags: Optional[Iterable[str]] = None, + stop_at: Optional[Iterable[str]] = None, + tag_type: Optional[str] = None, + text_type: str = MAIN_TEXT_TYPE, + separator: str = "\n", + ) -> str: + """ + Function to extract target tags from the message. + + Parameters + ---------- + target_tags: + Tags to be extracted. + stop_at: + Tags for which extraction should stop. + tag_type: + Type of tags to consider. + text_type: + Type of text to consider + separator: + Separator to join the extracted texts. + + Returns + ------- + _: List of extracted tags. + """ + parts = self.extract_parts(target_tags=target_tags, stop_at=stop_at, tag_type=tag_type) + return separator.join([x[text_type] for x in parts]) + def extract_last_body( self, target_tags: Iterable[str] = ("BODY",), stop_at: Iterable[str] = ("GREETINGS",), - tag_type: str = MAIN_TAG_TYPE, + tag_type: Optional[str] = None, ) -> List[Dict[str, Any]]: """ Extract the BODY parts of the last message in the email. @@ -160,7 +193,7 @@ def has_tags( self, target_tags: Iterable[str] = ("BODY",), stop_at: Optional[Iterable[str]] = None, - tag_type: str = MAIN_TAG_TYPE, + tag_type: Optional[str] = None, ) -> bool: """ Function to check if input tags are present in the message. 
@@ -182,6 +215,9 @@ def has_tags( if self.tags is None: return False + if tag_type is None: + tag_type = self.effective_tag_key + if not stop_at: stop_at = set() diff --git a/melusine/pipeline.py b/melusine/pipeline.py index 7c37dfbf..bead9733 100644 --- a/melusine/pipeline.py +++ b/melusine/pipeline.py @@ -16,7 +16,7 @@ from melusine.backend import backend from melusine.backend.base_backend import Any from melusine.base import MelusineTransformer -from melusine.io import IoMixin +from melusine.io_mixin import IoMixin T = TypeVar("T") diff --git a/melusine/processors.py b/melusine/processors.py index 5bd8f4f3..0e989f40 100644 --- a/melusine/processors.py +++ b/melusine/processors.py @@ -639,16 +639,19 @@ def extract(self, message_list: list[Message]) -> str: # Message has been tagged if message.tags is not None: if self.include_tags: - tags = message.extract_parts(target_tags=self.include_tags, stop_at=self.stop_at) - message_text_list = [x[1] for x in tags] + extracted_text = message.extract_text( + target_tags=self.include_tags, stop_at=self.stop_at, separator=self.sep + ) elif self.exclude_tags: tags = message.extract_parts(target_tags=None, stop_at=self.stop_at) - message_text_list = [part for tag, part in tags if tag not in self.exclude_tags] + message_text_list = [ + tag_data[message.effective_text_key] + for tag_data in tags + if tag_data[message.effective_tag_key] not in self.exclude_tags + ] + extracted_text = self.sep.join(message_text_list) else: - message_text_list = [part for tag, part in message.tags] - - # Join message text list - extracted_text = self.sep.join(message_text_list) + extracted_text = message.extract_text(target_tags=None, stop_at=self.stop_at, separator=self.sep) # Message has not been tagged else: @@ -929,6 +932,8 @@ def compile_tag_regex(self, tag: str) -> re.Pattern: regex = re.compile(regex, flags=self.default_regex_flag) except re.error: raise ValueError(f"Invalid regex for tag {tag}:\n{regex}") + elif isinstance(regex, 
re.Pattern): + pass else: raise ValueError( f"Tag {tag} does not return any of the supported types : " @@ -940,7 +945,7 @@ def compile_tag_regex(self, tag: str) -> re.Pattern: return regex - def tag_text(self, text: str) -> list[tuple[str, str]]: + def tag_text(self, text: str) -> list[dict[str, Any]]: """ Method to apply content tagging on a text. @@ -951,8 +956,7 @@ def tag_text(self, text: str) -> list[tuple[str, str]]: Returns ------- - _: list[tuple[str, str]] - List of tag/text couples (ex: [("HELLO", "bonjour")]) + _: List of tag/text couples """ parts = self.split_text(text) tags = list() @@ -1188,7 +1192,6 @@ def __init__( default_tag: str = "BODY", valid_part_regex: str = r"[a-z0-9?]", default_regex_flag: int = re.IGNORECASE | re.MULTILINE, - post_process: bool = True, text_attribute: str = "text", ): """ @@ -1212,7 +1215,6 @@ def __init__( default_tag=default_tag, valid_part_regex=valid_part_regex, default_regex_flag=default_regex_flag, - post_process=post_process, text_attribute=text_attribute, ) @@ -1517,13 +1519,15 @@ def SIGNATURE(self) -> str | list[str] | re.Pattern: return [ # Phone / Fax - r"(?:^.{,3}(?:T[ée]l(?:[ée]phone)?\.?|mobile|phone|num[ée]ro|ligne).{,20}(?: *(?:\n+|$)))", ( r"^(.{,10}:? ?\(?((?:\+|00)\(?33\)?(?: ?\(0\))?|0)\s*[1-" r"9]([\s.-]*\d{2}){4}.{,10}){,3}" + rf"({email_address_regex}.{{,10}})?" 
"( *(\n+|$))" ), - r"^.{,3}(T[ée]l[ée]?(phone|copie)?|Fax|mobile|phone|num[ée]ro|ligne).{,20}$", + # Make sure there are at least 6 digits + r"^.{,3}(T[ée]l[ée]?(phone|copie)?|Fax|mobile|phone|num[ée]ro|ligne).{,20}\d{2}[ .-]?\d{2}[ .-]?\d{2}.{,20} *(?:\n+|$)", + # Phone number on separate line + r"^.{,3}(T[ée]l[ée]?(phone|copie)?|Fax|mobile|phone|num[ée]ro|ligne).{,3} *(?:\n+|$)", r"^.{,3}Appel non surtax[ée].{,3}$", # Street / Address / Post code street_address_regex, @@ -1549,10 +1553,18 @@ def SIGNATURE(self) -> str | list[str] | re.Pattern: class RefinedTagger(MelusineTransformer): - BASE_TAG_KEY = "base_tag" - REFINED_TAG_KEY = "refined_tag" - - def __init__(self, input_columns: str = "messages", output_columns: str = "messages", default_tag: str = "BODY"): + """ + Post-processing class to refine initial tags. + """ + def __init__( + self, + input_columns: str = "messages", + output_columns: str = "messages", + default_tag: str = "BODY", + tag_key: str = "base_tag", + text_key: str = "base_text", + refined_tag_key: str = "refined_tag", + ): """ Parameters ---------- @@ -1562,8 +1574,14 @@ def __init__(self, input_columns: str = "messages", output_columns: str = "messa Outputs columns for the transform operation default_tag: str Default tag to apply to untagged text + tag_key: input tag jey + text_key: input text key + refined_tag_key: output tag key """ self.default_tag = default_tag + self.base_tag_key = tag_key + self.base_text_key = text_key + self.refined_tag_key = refined_tag_key super().__init__( input_columns=input_columns, @@ -1585,10 +1603,25 @@ def post_process_messages(self, messages: list[Message]) -> list[Message]: """ for message in messages: message.tags = self.post_process_tags(message.tags) + message.effective_tag_key = self.refined_tag_key return messages - def post_process_tags(self, tags: list[dict[str, Any]]) -> list[dict[str, Any]]: + def post_process_tags(self, tags: list[dict[str, Any]] | None) -> list[dict[str, Any]] | None: + """ + 
Method to post-process tags. + + Parameters + ---------- + tags: Initial tags + + Returns + ------- + _: Refined tags + """ + if tags is None: + return None + # Signature lines containing first/last name tags = self.detect_name_signature(tags) @@ -1619,16 +1652,16 @@ def detect_name_signature(self, tags: list[dict[str, Any]]) -> list[dict[str, An forbidden_words: set[str] = {"urgent", "attention"} for tag_data in tags: - tag = tag_data[self.BASE_TAG_KEY] + tag = tag_data[self.base_tag_key] if tag == self.default_tag: - text = tag_data[self.BASE_TAG_KEY] + text = tag_data[self.base_text_key] match = re.match(line_with_name, text) has_forbidden_words: bool = bool(forbidden_words.intersection(text.lower().split())) if match and not has_forbidden_words: tag = "SIGNATURE_NAME" - tag_data[self.REFINED_TAG_KEY] = tag + tag_data[self.refined_tag_key] = tag return tags @@ -1664,7 +1697,6 @@ def __init__( ) self.tags_to_ignore = tuple(tags_to_ignore) - self.json_exclude_list.append("input_columns") @property def email_pattern(self) -> str: @@ -1757,7 +1789,14 @@ def filter_message_list(self, message_list: list[Message]) -> list[Message]: top_message = message_list[0] parts = top_message.extract_parts() - contains_only_tags_to_ignore = all([tag.startswith(self.tags_to_ignore) for tag, _ in parts]) + try: + contains_only_tags_to_ignore = all( + [tag_data[Message.MAIN_TAG_TYPE].startswith(self.tags_to_ignore) for tag_data in parts] + ) + except KeyError: + contains_only_tags_to_ignore = all( + [tag_data[Message.FALLBACK_TAG_TYPE].startswith(self.tags_to_ignore) for tag_data in parts] + ) if contains_only_tags_to_ignore and (len(message_list) > 1): message_list = message_list[1:] diff --git a/tests/base/test_message.py b/tests/base/test_message.py index 3db9d60e..49b1e075 100644 --- a/tests/base/test_message.py +++ b/tests/base/test_message.py @@ -19,9 +19,9 @@ def test_message_repr(): def test_message_has_tags(): message = Message(text="Hello") message.tags = [ - ("HELLO", 
"Bonjour"), - ("BODY", "Pouvez-vous"), - ("GREETINGS", "Cordialement"), + {"base_text": "Bonjour", "base_tag": "HELLO"}, + {"base_text": "Pouvez-vous", "base_tag": "BODY"}, + {"base_text": "Cordialement", "base_tag": "GREETINGS"}, ] assert not message.has_tags(target_tags=["FOOTER"]) @@ -32,9 +32,9 @@ def test_message_has_tags(): def test_message_has_tags_stop_at(): message = Message(text="Hello") message.tags = [ - ("HELLO", "Bonjour"), - ("GREETINGS", "Cordialement"), - ("BODY", "Blah Blah Blah"), + {"base_text": "Bonjour", "base_tag": "HELLO"}, + {"base_text": "Cordialement", "base_tag": "GREETINGS"}, + {"base_text": "Blah Blah Blah", "base_tag": "BODY"}, ] assert not message.has_tags(target_tags=["BODY"], stop_at=["GREETINGS"]) @@ -49,25 +49,25 @@ def test_message_has_tags_no_tags(): def test_message_extract_parts(): message = Message(text="Hello") message.tags = [ - ("HELLO", "Bonjour"), - ("BODY", "Pouvez-vous"), - ("GREETINGS", "Cordialement"), + {"base_text": "Bonjour", "base_tag": "HELLO"}, + {"base_text": "Pouvez-vous", "base_tag": "BODY"}, + {"base_text": "Cordialement", "base_tag": "GREETINGS"}, ] - assert message.extract_parts(target_tags={"BODY"}) == [("BODY", "Pouvez-vous")] + assert message.extract_parts(target_tags={"BODY"}) == [{"base_text": "Pouvez-vous", "base_tag": "BODY"}] assert message.extract_parts(target_tags=["GREETINGS", "HELLO"]) == [ - ("HELLO", "Bonjour"), - ("GREETINGS", "Cordialement"), + {"base_text": "Bonjour", "base_tag": "HELLO"}, + {"base_text": "Cordialement", "base_tag": "GREETINGS"}, ] def test_message_extract_parts_stop(): message = Message(text="Hello") message.tags = [ - ("HELLO", "Bonjour"), - ("FOOTER", "Envoyé depuis mon Iphone"), - ("GREETINGS", "Cordialement"), - ("BODY", "Blah Blah Blah"), + {"base_text": "Bonjour", "base_tag": "HELLO"}, + {"base_text": "Envoyé depuis mon Iphone", "base_tag": "FOOTER"}, + {"base_text": "Cordialement", "base_tag": "GREETINGS"}, + {"base_text": "Blah Blah Blah", "base_tag": "BODY"}, ] 
extracted = message.extract_parts(target_tags=["BODY"], stop_at=["FOOTER", "GREETINGS"]) @@ -84,21 +84,21 @@ def test_message_extract_parts_no_tags(): def test_message_extract_last_body(): message = Message(text="Hello") message.tags = [ - ("HELLO", "Bonjour"), - ("BODY", "Pouvez-vous"), - ("GREETINGS", "Cordialement"), + {"base_text": "Bonjour", "base_tag": "HELLO"}, + {"base_text": "Pouvez-vous", "base_tag": "BODY"}, + {"base_text": "Cordialement", "base_tag": "GREETINGS"}, ] - assert message.extract_last_body() == [("BODY", "Pouvez-vous")] + assert message.extract_last_body() == [{"base_text": "Pouvez-vous", "base_tag": "BODY"}] def test_str(): # Arrange message = Message(meta="Test\nmeta", text="Hello") message.tags = [ - ("TAG", "ABC"), - ("TAAG", "ABCD"), - ("TAAAG", "ABCDE"), + {"base_text": "ABC", "base_tag": "TAG"}, + {"base_text": "ABCD", "base_tag": "TAAG"}, + {"base_text": "ABCDE", "base_tag": "TAAAG"}, ] expected_list = [ @@ -126,9 +126,9 @@ def test_str_no_meta(): # Arrange message = Message(text="Hello") message.tags = [ - ("TAG", "ABC"), - ("TAAG", "ABCD"), - ("TAAAG", "ABCDE"), + {"base_text": "ABC", "base_tag": "TAG"}, + {"base_text": "ABCD", "base_tag": "TAAG"}, + {"base_text": "ABCDE", "base_tag": "TAAAG"}, ] expected_list = [ @@ -175,6 +175,6 @@ def test_str_no_tags(): def test_str_no_conf(reset_melusine_config): config.reset({"Test": "Test"}) - message = Message(text="test", tags=[("TEST TAG", "TEST TEXT")]) + message = Message(text="test", tags=[{"base_text": "TEST TEXT", "base_tag": "TEST TAG"}]) print(message) - assert True + assert message.__str__() diff --git a/tests/detectors/test_thanks_detector.py b/tests/detectors/test_thanks_detector.py index 7f6e41ce..da500738 100644 --- a/tests/detectors/test_thanks_detector.py +++ b/tests/detectors/test_thanks_detector.py @@ -17,8 +17,8 @@ def thanks_detector_df(): m0 = Message("") m0.tags = [ - ("HELLO", "Bonjour"), - ("THANKS", "Merci beaucoup"), + {"base_text": "Bonjour", "base_tag": "HELLO"}, 
+ {"base_text": "Merci beaucoup", "base_tag": "THANKS"}, ] m0_messages = [m0] m0_expected = True @@ -31,8 +31,8 @@ def thanks_detector_df(): m1 = Message("") m1.tags = [ - ("HELLO", "Bonjour"), - ("THANKS", "Merci, j'attends une reponse"), + {"base_text": "Bonjour", "base_tag": "HELLO"}, + {"base_text": "Merci, j'attends une reponse", "base_tag": "THANKS"}, ] m1_messages = [m1] m1_expected = False @@ -57,7 +57,6 @@ def thanks_detector_df(): def test_thanks_detector(thanks_detector_df): """Unit test of the debug mode.""" df = thanks_detector_df - df_copy = df.copy() detector = ThanksDetector( name="thanks", ) @@ -96,23 +95,26 @@ def test_thanks_detector_missing_field(thanks_detector_df): [ ( [ - ("HELLO", "Bonjour madame"), - ("BODY", "Voici le dossier"), - ("THANKS", "Merci a vous"), + {"base_text": "Bonjour madame", "base_tag": "HELLO"}, + {"base_text": "Voici le dossier", "base_tag": "BODY"}, + {"base_text": "Merci a vous", "base_tag": "THANKS"}, ], True, "Merci a vous", - [("THANKS", "Merci a vous")], + [{"base_text": "Merci a vous", "base_tag": "THANKS"}], ), ( [ - ("HELLO", "Bonjour madame"), - ("THANKS", "Merci"), - ("THANKS", "Merci a vous"), + {"base_text": "Bonjour madame", "base_tag": "HELLO"}, + {"base_text": "Merci", "base_tag": "THANKS"}, + {"base_text": "Merci a vous", "base_tag": "THANKS"}, ], False, "Merci\nMerci a vous", - [("THANKS", "Merci"), ("THANKS", "Merci a vous")], + [ + {"base_text": "Merci", "base_tag": "THANKS"}, + {"base_text": "Merci a vous", "base_tag": "THANKS"} + ], ), ], ) @@ -136,8 +138,6 @@ def test_thanks_detector_debug(tags, has_body, thanks_text, thanks_parts): assert "debug_thanks" in data assert "has_body" in data["debug_thanks"] assert "thanks_text" in data["debug_thanks"] - assert "thanks_parts" in data["debug_thanks"] assert data["debug_thanks"]["has_body"] == has_body assert data["debug_thanks"]["thanks_text"] == thanks_text - assert data["debug_thanks"]["thanks_parts"] == thanks_parts diff --git 
a/tests/detectors/test_vacation_reply_detector.py b/tests/detectors/test_vacation_reply_detector.py index 40a625ff..75945224 100644 --- a/tests/detectors/test_vacation_reply_detector.py +++ b/tests/detectors/test_vacation_reply_detector.py @@ -32,12 +32,12 @@ def test_instanciation(): text="Bonjour, je vous confirme l'annulation du rdv du 01/01/2022 " + "à 16h. Bien cordialement, John Smith.", tags=[ - ("HELLO", "Bonjour,"), - ( - "BODY", - "je vous confirme l'annulation du rdv du 01/01/2022 à 16h.", - ), - ("GREETINGS", "Bien cordialement, John Smith."), + {"base_tag": "HELLO", "base_text": "Bonjour,"}, + { + "base_tag": "BODY", + "base_text": "je vous confirme l'annulation du rdv du 01/01/2022 à 16h." + }, + {"base_tag": "GREETINGS", "base_text": "Bien cordialement, John Smith."}, ], ) ] @@ -55,12 +55,12 @@ def test_instanciation(): text="Bonjour, \nActuellement en conge je prendrai connaissance" + " de votre message ulterieurement.\nCordialement,", tags=[ - ("HELLO", "Bonjour,"), - ( - "BODY", - "Actuellement en conge je prendrai connaissance de votre message ulterieurement.", - ), - ("GREETINGS", "Cordialement, "), + {"base_tag": "HELLO", "base_text": "Bonjour,"}, + { + "base_tag": "BODY", + "base_text": "Actuellement en conge je prendrai connaissance de votre message ulterieurement." 
+ }, + {"base_tag": "GREETINGS", "base_text": "Cordialement, "}, ], ) ] @@ -73,8 +73,6 @@ def test_instanciation(): ) def test_transform(df, good_result): """Unit test of the transform() method.""" - df_copy = df.copy() - message_column = "messages" detector = VacationReplyDetector( @@ -100,12 +98,12 @@ def test_transform(df, good_result): text="Bonjour, \nActuellement en conge je prendrai connaissance" + " de votre message ulterieurement.\nCordialement,", tags=[ - ("HELLO", "Bonjour,"), - ( - "BODY", - "Actuellement en conge je prendrai connaissance de votre message ulterieurement.", - ), - ("GREETINGS", "Cordialement, "), + {"base_tag": "HELLO", "base_text": "Bonjour,"}, + { + "base_tag": "BODY", + "base_text": "Actuellement en conge je prendrai connaissance de votre message ulterieurement." + }, + {"base_tag": "GREETINGS", "base_text": "Cordialement, "}, ], ) ] @@ -114,12 +112,6 @@ def test_transform(df, good_result): ), True, { - "parts": [ - ( - "BODY", - "Actuellement en conge je prendrai connaissance de votre message ulterieurement.", - ) - ], "text": "Actuellement en conge je prendrai connaissance de votre message ulterieurement.", "VacationReplyRegex": { "match_result": True, @@ -138,8 +130,6 @@ def test_transform(df, good_result): ) def test_transform_debug_mode(df, good_detection_result, good_debug_info): """Unit test of the debug mode.""" - df_copy = df.copy() - messages_column = "messages" detector = VacationReplyDetector( diff --git a/tests/functional/test_emails_fixtures.py b/tests/functional/test_emails_fixtures.py index 58f719be..934a7301 100644 --- a/tests/functional/test_emails_fixtures.py +++ b/tests/functional/test_emails_fixtures.py @@ -53,11 +53,22 @@ content_tagger_expected={ "messages.tags": [ [ - ("HELLO", "BonJour wORLD"), - ("BODY", "L'orem"), - ("BODY", "Ip-sum"), - ("BODY", "Lo_rem"), - ("BODY", "ip.sum."), + {"base_text": "BonJour wORLD", "base_tag": "HELLO", "base_tag_list": ["HELLO"]}, + {"base_text": "L'orem", "base_tag": "BODY", 
"base_tag_list": ["BODY"]}, + {"base_text": "Ip-sum", "base_tag": "BODY", "base_tag_list": ["BODY"]}, + {"base_text": "Lo_rem", "base_tag": "BODY", "base_tag_list": ["BODY"]}, + {"base_text": "ip.sum.", "base_tag": "BODY", "base_tag_list": ["BODY"]}, + ], + ], + }, + refined_tagger_expected={ + "messages.tags": [ + [ + {"base_text": "BonJour wORLD", "base_tag": "HELLO", "base_tag_list": ["HELLO"], "refined_tag": "HELLO"}, + {"base_text": "L'orem", "base_tag": "BODY", "base_tag_list": ["BODY"], "refined_tag": "BODY"}, + {"base_text": "Ip-sum", "base_tag": "BODY", "base_tag_list": ["BODY"], "refined_tag": "BODY"}, + {"base_text": "Lo_rem", "base_tag": "BODY", "base_tag_list": ["BODY"], "refined_tag": "BODY"}, + {"base_text": "ip.sum.", "base_tag": "BODY", "base_tag_list": ["BODY"], "refined_tag": "BODY"}, ], ], }, @@ -116,19 +127,47 @@ content_tagger_expected={ "messages.tags": [ [ - ("HELLO", "Bonjour,"), - ("BODY", "Vous trouverez ci-joint l'attestation"), - ("BODY", "Merci de me confirmer la bonne réception de ce message."), - ("THANKS", "Vous en remerciant par avance."), - ("GREETINGS", "Cordialement,"), - ("SIGNATURE_NAME", "Jean Dupont"), + {"base_text": "Bonjour,", "base_tag": "HELLO", "base_tag_list": ["HELLO"]}, + {"base_text": "Vous trouverez ci-joint l'attestation", "base_tag": "BODY", "base_tag_list": ["BODY"]}, + {"base_text": "Merci de me confirmer la bonne réception de ce message.", "base_tag": "BODY", "base_tag_list": ["BODY"]}, + {"base_text": "Vous en remerciant par avance.", "base_tag": "THANKS", "base_tag_list": ["THANKS"]}, + {"base_text": "Cordialement,", "base_tag": "GREETINGS", "base_tag_list": ["GREETINGS"]}, + {"base_text": "Jean Dupont", "base_tag": "BODY", "base_tag_list": ["BODY"]}, ], [ - ("HELLO", "Bonjour,"), - ("BODY", "Veuillez trouver ci-jointe la lettre"), - ("FOOTER", "La visualisation des fichiers PDF nécessite Adobe Reader."), - ("GREETINGS", "Sentiments mutualistes."), - ("SIGNATURE_NAME", "La MAIF"), + {"base_text": 
"Bonjour,", "base_tag": "HELLO", "base_tag_list": ["HELLO"]}, + {"base_text": "Veuillez trouver ci-jointe la lettre", "base_tag": "BODY", "base_tag_list": ["BODY"]}, + {"base_text": "La visualisation des fichiers PDF nécessite Adobe Reader.", "base_tag": "FOOTER", "base_tag_list": ["FOOTER"]}, + {"base_text": "Sentiments mutualistes.", "base_tag": "GREETINGS", "base_tag_list": ["GREETINGS"]}, + {"base_text": "La MAIF", "base_tag": "BODY", "base_tag_list": ["BODY"]}, + ], + ], + }, + refined_tagger_expected={ + "messages.tags": [ + [ + {"base_text": "Bonjour,", "base_tag": "HELLO", "base_tag_list": ["HELLO"], "refined_tag": "HELLO"}, + {"base_text": "Vous trouverez ci-joint l'attestation", "base_tag": "BODY", "base_tag_list": ["BODY"], + "refined_tag": "BODY"}, + {"base_text": "Merci de me confirmer la bonne réception de ce message.", "base_tag": "BODY", + "base_tag_list": ["BODY"], "refined_tag": "BODY"}, + {"base_text": "Vous en remerciant par avance.", "base_tag": "THANKS", "base_tag_list": ["THANKS"], + "refined_tag": "THANKS"}, + {"base_text": "Cordialement,", "base_tag": "GREETINGS", "base_tag_list": ["GREETINGS"], + "refined_tag": "GREETINGS"}, + {"base_text": "Jean Dupont", "base_tag": "BODY", "base_tag_list": ["BODY"], + "refined_tag": "SIGNATURE_NAME"}, + ], + [ + {"base_text": "Bonjour,", "base_tag": "HELLO", "base_tag_list": ["HELLO"], "refined_tag": "HELLO"}, + {"base_text": "Veuillez trouver ci-jointe la lettre", "base_tag": "BODY", "base_tag_list": ["BODY"], + "refined_tag": "BODY"}, + {"base_text": "La visualisation des fichiers PDF nécessite Adobe Reader.", "base_tag": "FOOTER", + "base_tag_list": ["FOOTER"], "refined_tag": "FOOTER"}, + {"base_text": "Sentiments mutualistes.", "base_tag": "GREETINGS", "base_tag_list": ["GREETINGS"], + "refined_tag": "GREETINGS"}, + {"base_text": "La MAIF", "base_tag": "BODY", "base_tag_list": ["BODY"], + "refined_tag": "SIGNATURE_NAME"}, ], ], }, @@ -162,16 +201,30 @@ content_tagger_expected={ "messages.tags": [ [ - 
("HELLO", "Bonjour"), - ( - "BODY", - "Pouvez-vous me transmettre deux attestations au nom de mes enfants", - ), - ("BODY", "- Jane Dupond"), - ("BODY", "- Joe Dupond"), - ("THANKS", "Merci par avance"), - ("GREETINGS", "Cordialement"), - ("SIGNATURE_NAME", "Mr Jean Dupond"), + {"base_text": "Bonjour", "base_tag": "HELLO", "base_tag_list": ["HELLO"]}, + {"base_text": "Pouvez-vous me transmettre deux attestations au nom de mes enfants", "base_tag": "BODY", "base_tag_list": ["BODY"]}, + {"base_text": "- Jane Dupond", "base_tag": "BODY", "base_tag_list": ["BODY"]}, + {"base_text": "- Joe Dupond", "base_tag": "BODY", "base_tag_list": ["BODY"]}, + {"base_text": "Merci par avance", "base_tag": "THANKS", "base_tag_list": ["THANKS"]}, + {"base_text": "Cordialement", "base_tag": "GREETINGS", "base_tag_list": ["GREETINGS"]}, + {"base_text": "Mr Jean Dupond", "base_tag": "BODY", "base_tag_list": ["BODY"]}, + ] + ], + }, + refined_tagger_expected={ + "messages.tags": [ + [ + {"base_text": "Bonjour", "base_tag": "HELLO", "base_tag_list": ["HELLO"], "refined_tag": "HELLO"}, + {"base_text": "Pouvez-vous me transmettre deux attestations au nom de mes enfants", "base_tag": "BODY", + "base_tag_list": ["BODY"], "refined_tag": "BODY"}, + {"base_text": "- Jane Dupond", "base_tag": "BODY", "base_tag_list": ["BODY"], "refined_tag": "BODY"}, + {"base_text": "- Joe Dupond", "base_tag": "BODY", "base_tag_list": ["BODY"], "refined_tag": "BODY"}, + {"base_text": "Merci par avance", "base_tag": "THANKS", "base_tag_list": ["THANKS"], + "refined_tag": "THANKS"}, + {"base_text": "Cordialement", "base_tag": "GREETINGS", "base_tag_list": ["GREETINGS"], + "refined_tag": "GREETINGS"}, + {"base_text": "Mr Jean Dupond", "base_tag": "BODY", "base_tag_list": ["BODY"], + "refined_tag": "SIGNATURE_NAME"}, ] ], }, @@ -276,8 +329,8 @@ content_tagger_expected={ "messages.tags": [ [ - ("THANKS", "Bonjour et merci"), - ("GREETINGS", "Cordialement"), + {"base_text": "Bonjour et merci", "base_tag": "THANKS", 
"base_tag_list": ["THANKS", "HELLO"]}, + {"base_text": "Cordialement", "base_tag": "GREETINGS", "base_tag_list": ["GREETINGS"]}, ] ], }, diff --git a/tests/io/__init__.py b/tests/io_mixin/__init__.py similarity index 100% rename from tests/io/__init__.py rename to tests/io_mixin/__init__.py diff --git a/tests/io/test_io_mixin.py b/tests/io_mixin/test_io_mixin.py similarity index 93% rename from tests/io/test_io_mixin.py rename to tests/io_mixin/test_io_mixin.py index e0eb1bfa..070bc0cc 100644 --- a/tests/io/test_io_mixin.py +++ b/tests/io_mixin/test_io_mixin.py @@ -1,8 +1,8 @@ import pytest from melusine import config -from melusine.io import IoMixin -from melusine.io._classes import InitError +from melusine.io_mixin import IoMixin +from melusine.io_mixin._classes import InitError class FakeClass(IoMixin): diff --git a/tests/processors/test_content_refined_tagger.py b/tests/processors/test_content_refined_tagger.py new file mode 100644 index 00000000..00914176 --- /dev/null +++ b/tests/processors/test_content_refined_tagger.py @@ -0,0 +1,819 @@ +import re + +import pytest + +from melusine.message import Message +from melusine.processors import BaseContentTagger, ContentTagger, Tag, RefinedTagger + + +def test_content_tagger(): + # Text segments (= individual messages in an email conversation) + text_segments = [ + "Envoye de mon iphone", + ("Bonjour Mme X,\nSuite a blh blah blah\nBien cordialement\nJane Dupond\n(See attached file: flex.jpg)"), + ( + "Bonjour,\nVeuillez trouver ci-joint blah\n" + "Merci d'avance,\nCordialement,\n" + "Toute modification, edition, utilisation ou diffusion non autorisee est interdite" + ), + ] + + # Expected tags + expected_tags = [ + [ + {"base_text": "Envoye de mon iphone", "base_tag": "FOOTER"}, + ], + [ + {"base_text": "Bonjour Mme X,", "base_tag": "HELLO"}, + {"base_text": "Suite a blh blah blah", "base_tag": "BODY"}, + {"base_text": "Bien cordialement", "base_tag": "GREETINGS"}, + {"base_text": "Jane Dupond", "base_tag": "BODY"}, 
+ {"base_text": "(See attached file: flex.jpg)", "base_tag": "PJ"}, + ], + [ + {"base_text": "Bonjour,", "base_tag": "HELLO"}, + {"base_text": "Veuillez trouver ci-joint blah", "base_tag": "BODY"}, + {"base_text": "Merci d'avance,", "base_tag": "THANKS"}, + {"base_text": "Cordialement,", "base_tag": "GREETINGS"}, + {"base_text": "Toute modification, edition, utilisation ou diffusion non autorisee est interdite", + "base_tag": "FOOTER"}, + ], + ] + + # Mock the output of a Segmenter (List of Message object) + messages = [Message(text=segment) for segment in text_segments] + + # Instantiate and apply the Tagger + tagger = ContentTagger() + output_messages = tagger.tag_email(messages) + + # Default "base_tag_list" to [base_tag] on expected entries that do not set it + for tag_data_list in expected_tags: + for tag_data in tag_data_list: + if "base_tag_list" not in tag_data: + tag_data["base_tag_list"] = [tag_data["base_tag"]] + + for i, message in enumerate(output_messages): + for j, tag_data in enumerate(message.tags): + assert tag_data == expected_tags[i][j] + + +def test_tag_null_message(): + messages = None + + # Instantiate and apply the Tagger + tagger = ContentTagger() + output_messages = tagger.tag_email(messages) + + assert output_messages is None + + +@pytest.mark.parametrize( + "text, expected_parts", + [ + ( + "Bonjour, merci pour votre message!\nComment-allez vous?! 
Je suis satisfait!!!\n" + "Bien cordialement\n\n\n\nJane Dupond\n", + [ + "Bonjour,", + "merci pour votre message!", + "Comment-allez vous?!", + "Je suis satisfait!!!", + "Bien cordialement", + "Jane Dupond", + ], + ), + ], +) +def test_content_tagger_split_text(text, expected_parts): + # Instantiate and apply the Tagger + tagger = ContentTagger() + output_parts = tagger.split_text(text) + + assert output_parts == expected_parts + + +@pytest.mark.parametrize( + "text, expected_tags", + [ + ( + "Bonjour Mme X,\nSuite a blh blah blah.\n" + "Bien cordialement\nJane Dupond\n" + "(See attached file: flex.jpg)", + [ + {"base_text": "Bonjour Mme X,", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "Suite a blh blah blah.", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Bien cordialement", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "Jane Dupond", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, + {"base_text": "(See attached file: flex.jpg)", "base_tag": "PJ", "refined_tag": "PJ"}, + ], + ), + ( + "Bonjour, je confirme le rdv. 
Cordialement, John Smith", + [ + {"base_text": "Bonjour,", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "je confirme le rdv.", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Cordialement, John Smith", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + ], + ), + ( + ( + "Bonjour,\nSuite a notre intervention du 16.02.22 , un taux d'humidité de 50% a été relevé.\n" + "Cordialement.\n177, rue de la fée - 75000 Paris.\n" + "Horaires : du lundi au jeudi de 08h00 à 16h30 et le vendredi de 08h00 à 16h00.\n" + "Tel : 01.45.53.11.33" + ), + [ + {"base_text": "Bonjour,", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "Suite a notre intervention du 16.02.22 , un taux d'humidité de 50% a été relevé.", + "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Cordialement.", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "177, rue de la fée - 75000 Paris.", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "Horaires : du lundi au jeudi de 08h00 à 16h30 et le vendredi de 08h00 à 16h00.", + "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Tel : 01.45.53.11.33", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + ], + ), + ( + ( + "bonjour\n" + "15 jours après les premières réparations, un défaut a été détecté. " + "Bien à vous\n" + "Britney Spears" + ), + [ + {"base_text": "bonjour", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "15 jours après les premières réparations, un défaut a été détecté.", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Bien à vous", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "Britney Spears", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, + ], + ), + ( + ( + "Bonjour monsieur Smith\n" + "merci. 
Bien à vous\n" + "Britney Spears\n" + "22 hollywood boulevard\n" + "79000 Niort\n" + ), + [ + {"base_text": "Bonjour monsieur Smith", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "merci.", "base_tag": "THANKS", "refined_tag": "THANKS"}, + {"base_text": "Bien à vous", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "Britney Spears", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, + {"base_text": "22 hollywood boulevard", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "79000 Niort", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + ], + ), + ( + ( + "Merci de me faire suivre les docs à ma nouvelle adresse qui est 0 rue du parc, 75000 Paris. " + "Merci d'avance. \nAcceptez notre salutation," + ), + [ + { + "base_text": "Merci de me faire suivre les docs à ma nouvelle adresse qui est 0 rue du parc, 75000 Paris.", + "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Merci d'avance.", "base_tag": "THANKS", "refined_tag": "THANKS"}, + {"base_text": "Acceptez notre salutation,", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + ], + ), + ( + ( + "Bonjour\n" + "Je vous relance concernant ma télévision avec le devis en PJ.\n" + "Désolé pour la qualité.\n" + "Je l'ai envoyé à partir de mon ordi.\n" + "Excellente journée à vous,\n" + "Bon we\n" + "Votre bien dévoué\n" + "amicalement votre\n" + "Cordiales salutations.\n" + "Françoise-Bénédicte Dupond\n" + "Envoyé à partir de \nCourrier \npour Windows" + ), + [ + {"base_text": "Bonjour", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "Je vous relance concernant ma télévision avec le devis en PJ.", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Désolé pour la qualité.", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Je l'ai envoyé à partir de mon ordi.", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Excellente journée à vous,", "base_tag": "HELLO", "refined_tag": "HELLO"}, + 
{"base_text": "Bon we", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "Votre bien dévoué", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "amicalement votre", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "Cordiales salutations.", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "Françoise-Bénédicte Dupond", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, + {"base_text": "Envoyé à partir de", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + {"base_text": "Courrier", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + {"base_text": "pour Windows", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + ], + ), + ( + "C'est bien note, merci beaucoup.\nSentiments dévoués.\nTélécharger \nOutlook pour Android", + [ + {"base_text": "C'est bien note, merci beaucoup.", "base_tag": "THANKS", "refined_tag": "THANKS"}, + {"base_text": "Sentiments dévoués.", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "Télécharger", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + {"base_text": "Outlook pour Android", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + ], + ), + ( + "Impeccable, je vous remercie beaucoup pour votre rapidité.\nObtenir\nOutlook pour Android", + [ + {"base_text": "Impeccable, je vous remercie beaucoup pour votre rapidité.", "base_tag": "THANKS", "refined_tag": "THANKS"}, + {"base_text": "Obtenir", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + {"base_text": "Outlook pour Android", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + ], + ), + ( + ( + "Cher Monsieur,\nJe vous confirme la bonne réception de votre précédent email.\n" + "Je vous en remercie.\nBien cordialement,\nJohn Smith" + ), + [ + {"base_text": "Cher Monsieur,", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "Je vous confirme la bonne réception de votre précédent email.", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Je vous en remercie.", "base_tag": "THANKS", 
"refined_tag": "THANKS"}, + {"base_text": "Bien cordialement,", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "John Smith", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, + ], + ), + ( + ( + "chère madame,\n" + "URGENT URGENT\n" + "Merci de me faire suivre les docs à ma nouvelle adresse qui est 0 rue du parc, 75000 Paris. " + "Merci d'avance. \nRecevez nos salutations,\nVous en souhaitant bonne réception" + ), + [ + {"base_text": "chère madame,", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "URGENT URGENT", "base_tag": "BODY", "refined_tag": "BODY"}, + { + "base_text": "Merci de me faire suivre les docs à ma nouvelle adresse qui est 0 rue du parc, 75000 Paris.", + "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Merci d'avance.", "base_tag": "THANKS", "refined_tag": "THANKS"}, + {"base_text": "Recevez nos salutations,", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "Vous en souhaitant bonne réception", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + ], + ), + pytest.param( + "Un témoignage sous X\nEnvoyé depuis mon téléphone Orange", + [ + {"base_text": "Un témoignage sous X", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Envoyé depuis mon téléphone Orange", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + ], + id="Edge case where a line ends with an isolated character", + ), + pytest.param( + " ??\n !??!", + [ + {"base_text": "??!??!", "base_tag": "BODY", "refined_tag": "BODY"}, + ], + id="Edge case where the two first lines are missing word characters", + ), + ( + "Bonjour Mme X,\nSuite a blh blah blah.\n" + "Bien cordialement\nJane Dupond\n" + "(See attached file: flex.jpg)", + [ + {"base_text": "Bonjour Mme X,", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "Suite a blh blah blah.", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Bien cordialement", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "Jane 
Dupond", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, + {"base_text": "(See attached file: flex.jpg)", "base_tag": "PJ", "refined_tag": "PJ"}, + ], + ), + ( + "\nChère Madame\n\nC'est bien noté, merci\nBien reçu\nJ.Smith\n\n", + [ + {"base_text": "Chère Madame", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "C'est bien noté, merci", "base_tag": "THANKS", "refined_tag": "THANKS"}, + {"base_text": "Bien reçu", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "J.Smith", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, + ], + ), + ( + "\nBonjour Monsieur, ceci n'est pas un hello\nBonne fin de journee\nsalutations", + [ + {"base_text": "Bonjour Monsieur, ceci n'est pas un hello", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Bonne fin de journee", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "salutations", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + ], + ), + ( + "\nBonjour Monsieur Stanislas von den hoeggenboord\n\nbien à toi\nJ. Smith\nChargé de clientèle", + [ + {"base_text": "Bonjour Monsieur Stanislas von den hoeggenboord", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "bien à toi", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "J. 
Smith", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, + {"base_text": "Chargé de clientèle", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + ], + ), + ( + ( + "\n1 rdv à 18h\n\n2 ème message laissé à la locataire\n3je m'en vais au bois\n" + "4 allée des iris\n 5bis rue Patrick Sebastien\n6-8 cours mirabeau\n 7 ter place du dahu\n" + "8 de la rue très longue qui ne doit pas être taggée signature" + ), + [ + {"base_text": "1 rdv à 18h", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "2 ème message laissé à la locataire", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "3je m'en vais au bois", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "4 allée des iris", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "5bis rue Patrick Sebastien", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "6-8 cours mirabeau", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "7 ter place du dahu", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "8 de la rue très longue qui ne doit pas être taggée signature", "base_tag": "BODY", "refined_tag": "BODY"}, + ], + ), + ( + ( + "à L'attention de M Bob,\n" + "Bonjour,\n" + "Je vous informe que je vais accepter la proposition de L , à savoir le paiement d'une indemnité forfaitaire de résiliation du CCMI de 4000 € TTC pour clore cette affaire.\n" + "Cordialement.\n" + "Bob Smith" + ), + [ + {"base_text": "à L'attention de M Bob,", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + {"base_text": "Bonjour,", "base_tag": "HELLO", "refined_tag": "HELLO"}, + { + "base_text": "Je vous informe que je vais accepter la proposition de L , à savoir le paiement d'une indemnité forfaitaire de résiliation du CCMI de 4000 € TTC pour clore cette affaire.", + "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Cordialement.", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "Bob Smith", "base_tag": 
"BODY", "refined_tag": "SIGNATURE_NAME"}, + ], + ), + ( + ( + "Monsieur Bob Smith\n" + "Adresse mail : BobSmith90@gmail.com\n" + "Lucy Ange\n\n" + "Bonjour Monsieur,\n" + "Suite à notre entretien téléphonique de ce matin, et au message que vous m'avez envoyé sur ma messagerie, je voudrais effectuer la réparation du véhicule Renault Twingo dans un garage partenaire de la Maif situé, si c'est possible.\n" + "Dans l'attente de votre réponse et en vous remerciant par avance,\n\n\n" + "Monsieur Bob Smith\n\n\n" + "Envoyé à partir de\n" + "Courrier\npour Windows\n\n\n\n" + "Sans virus.\nwww.avast.com" + ), + [ + {"base_text": "Monsieur Bob Smith", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "Adresse mail : BobSmith90@gmail.com", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "Lucy Ange", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, + {"base_text": "Bonjour Monsieur,", "base_tag": "HELLO", "refined_tag": "HELLO"}, + { + "base_text": "Suite à notre entretien téléphonique de ce matin, et au message que vous m'avez envoyé sur ma messagerie, je voudrais effectuer la réparation du véhicule Renault Twingo dans un garage partenaire de la Maif situé, si c'est possible.", + "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Dans l'attente de votre réponse et en vous remerciant par avance,", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Monsieur Bob Smith", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "Envoyé à partir de", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + {"base_text": "Courrier", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + {"base_text": "pour Windows", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + {"base_text": "Sans virus.", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + {"base_text": "www.avast.com", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + ], + ), + ( + ( + "Bob Smith\n\n\n" + "A l’attention de Madame Lucy Ange,\n\n\n\n\n\n" + "Bonjour Madame 
Ange,\n\n\n\n\n\n\n\n\n" + "J’espère que vous allez bien.\n\n\n\n\n\n" + "Pour faire suite à mon mail du 21 février 2023, je me permets de revenir vers vous pour avoir votre avis sur le devis que j’ai demandé auprès d’un enquêteur.\n\n\n\n" + "Voici son retour :\n\n\n\n\n\n" + "Qu’en pensez-vous svp ?\n\n\n\n\n\n" + "Je reste à votre disposition pour tout complément d’information et vous remercie de l’intérêt que vous porterez à ma demande,\n\n\n\n\n\n" + "Bien Cordialement,\n\n\n\n\n\n" + "Bob Smith\n\n\n" + "Tél. 06.83.22.95.94" + ), + [ + {"base_text": "Bob Smith", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, + {"base_text": "A l’attention de Madame Lucy Ange,", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + {"base_text": "Bonjour Madame Ange,", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "J’espère que vous allez bien.", "base_tag": "BODY", "refined_tag": "BODY"}, + { + "base_text": "Pour faire suite à mon mail du 21 février 2023, je me permets de revenir vers vous pour avoir votre avis sur le devis que j’ai demandé auprès d’un enquêteur.", + "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Voici son retour :", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Qu’en pensez-vous svp ?", "base_tag": "BODY", "refined_tag": "BODY"}, + { + "base_text": "Je reste à votre disposition pour tout complément d’information et vous remercie de l’intérêt que vous porterez à ma demande,", + "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Bien Cordialement,", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "Bob Smith", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, + {"base_text": "Tél.", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "06.83.22.95.94", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + ], + ), + pytest.param( + ( + "cordialement\nContact e-mail\n\n\nContact téléphone\n\n01 23 45 67 89 / abcabc@hotmail.fr\n" + "Torroella de Montgri, le 5 
avril 2023\nLes formats de fichiers acceptés sont : PDF, DOC, DOCX, JPEG, " + "JPG, TIFF, TXT, ODT, XLS, XLSX\nTout autre format de fichiers ne sera pas transmis au dossier" + ), + [ + {"base_text": "cordialement", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "Contact e-mail", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "Contact téléphone", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "01 23 45 67 89 / abcabc@hotmail.fr", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "Torroella de Montgri, le 5 avril 2023", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + { + "base_text": "Les formats de fichiers acceptés sont : PDF, DOC, DOCX, JPEG, JPG, TIFF, TXT, ODT, XLS, XLSX", + "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + {"base_text": "Tout autre format de fichiers ne sera pas transmis au dossier", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + ], + id="diverse_signature_patterns", + ), + pytest.param( + ( + "bonjour\nmon body\nJ. Smith\n\n01 23 45 67 89\nSecrétaire en charge des avions\n" + "Business Analyst – Tribu Sinistres – Squad Flux Entrants\n" + "Société nationale des chemins de fer\nConseiller MAIF\nGestionnaire sinistre - C99G\n" + "Service des lettres anonymes\nTechnicienne de gestion - EQUIPE ABC\n" + ), + [ + {"base_text": "bonjour", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "mon body", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "J. 
Smith", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, + {"base_text": "01 23 45 67 89", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "Secrétaire en charge des avions", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "Business Analyst – Tribu Sinistres – Squad Flux Entrants", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "Société nationale des chemins de fer", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "Conseiller MAIF", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "Gestionnaire sinistre - C99G", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "Service des lettres anonymes", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "Technicienne de gestion - EQUIPE ABC", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + ], + id="signature_jobs", + ), + pytest.param( + ( + "bonjour\nmon body\nCordialement\n\n" + "analyste -------------------------------------- test test test test test test test\n" + ), + [ + {"base_text": "bonjour", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "mon body", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Cordialement", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "analyste -------------------------------------- test test test test test test test", + "base_tag": "BODY", "refined_tag": "BODY"}, + ], + id="check_catastrophic_backtracking", + ), + ], +) +def test_tag_text_generic(text, expected_tags): + # Arrange + tagger = ContentTagger() + refined_tagger = RefinedTagger() + + # Act + base_tags = tagger.tag_text(text) + refined_tags = refined_tagger.post_process_tags(base_tags) + + # Assert + for tag_data in expected_tags: + if "base_tag_list" not in tag_data: + tag_data["base_tag_list"] = [tag_data["base_tag"]] + assert refined_tags == expected_tags + + +@pytest.mark.parametrize( + "text, expected_tags", + [ + 
pytest.param( + ( + "Merci\n" + "Je vous remercie\n" + "Merci d'avance\n" + "Je vous remercie par avance\n" + "Vous en remerciant par avance.\n" + ), + [ + {"base_text": "Merci", "base_tag": "THANKS"}, + {"base_text": "Je vous remercie", "base_tag": "THANKS"}, + {"base_text": "Merci d'avance", "base_tag": "THANKS"}, + {"base_text": "Je vous remercie par avance", "base_tag": "THANKS"}, + {"base_text": "Vous en remerciant par avance.", "base_tag": "THANKS"}, + ], + id="french thanks patterns", + ), + ], +) +def test_tag_text_french(text, expected_tags): + # Arrange + tagger = ContentTagger() + + # Act + output_tags = tagger.tag_text(text) + + # Assert + for tag_data in expected_tags: + if "base_tag_list" not in tag_data: + tag_data["base_tag_list"] = [tag_data["base_tag"]] + assert output_tags == expected_tags + + +@pytest.mark.parametrize( + "text, expected_tags", + [ + pytest.param( + ( + "Thank you so much\n" + "thanks\n" + "thx Joanna\n" + "thanks but you forgot bla\n" + "Thx however I still need the document\n" + ), + [ + {"base_text": "Thank you so much", "base_tag": "THANKS"}, + {"base_text": "thanks", "base_tag": "THANKS"}, + {"base_text": "thx Joanna", "base_tag": "THANKS"}, + {"base_text": "thanks but you forgot bla", "base_tag": "BODY"}, + {"base_text": "Thx however I still need the document", "base_tag": "BODY"}, + ], + id="english thanks patterns", + ), + pytest.param( + ( + "Best\n" + "warm Wishes\n" + "regards\n" + "best regards\n" + "cheers\n" + "yours\n" + "yours truly\n" + "Sincerely\n" + "see you soon\n" + "Speak to you soon\n" + "talk soon\n" + "Take care\n" + "Catch you later\n" + "Have a fantastic day\n" + "Looking forward to your reply\n" + "I am looking forward to hearing from you\n" + "Hoping to hear from you\n" + ), + [ + {"base_text": "Best", "base_tag": "GREETINGS"}, + {"base_text": "warm Wishes", "base_tag": "GREETINGS"}, + {"base_text": "regards", "base_tag": "GREETINGS"}, + {"base_text": "best regards", "base_tag": "GREETINGS"}, + 
{"base_text": "cheers", "base_tag": "GREETINGS"}, + {"base_text": "yours", "base_tag": "GREETINGS"}, + {"base_text": "yours truly", "base_tag": "GREETINGS"}, + {"base_text": "Sincerely", "base_tag": "GREETINGS"}, + {"base_text": "see you soon", "base_tag": "GREETINGS"}, + {"base_text": "Speak to you soon", "base_tag": "GREETINGS"}, + {"base_text": "talk soon", "base_tag": "GREETINGS"}, + {"base_text": "Take care", "base_tag": "GREETINGS"}, + {"base_text": "Catch you later", "base_tag": "GREETINGS"}, + {"base_text": "Have a fantastic day", "base_tag": "GREETINGS"}, + {"base_text": "Looking forward to your reply", "base_tag": "GREETINGS"}, + {"base_text": "I am looking forward to hearing from you", "base_tag": "GREETINGS"}, + {"base_text": "Hoping to hear from you", "base_tag": "GREETINGS"}, + ], + id="english greetings", + ), + pytest.param( + ( + "Hello John\n" + "hi\n" + "Hi there\n" + "good to hear from you\n" + "it is good to hear from you\n" + "I hope you are having a great week\n" + "how are you doing\n" + "how are you positioned about the matter\n" + "i hope you are doing well\n" + "Good Morning Joanna\n" + "good Afternoon\n" + "Dear Jacky\n" + "Sir\n" + "Dear Madam\n" + "Dear Mr\n" + "Dear Ms.\n" + "Dear miss\n" + "Dear mrs.\n" + "Dear sir or madam\n" + "To whom it may concern\n" + ), + [ + {"base_text": "Hello John", "base_tag": "HELLO"}, + {"base_text": "hi", "base_tag": "HELLO"}, + {"base_text": "Hi there", "base_tag": "HELLO"}, + {"base_text": "good to hear from you", "base_tag": "HELLO"}, + {"base_text": "it is good to hear from you", "base_tag": "HELLO"}, + {"base_text": "I hope you are having a great week", "base_tag": "HELLO"}, + {"base_text": "how are you doing", "base_tag": "HELLO"}, + {"base_text": "how are you positioned about the matter", "base_tag": "BODY"}, + {"base_text": "i hope you are doing well", "base_tag": "HELLO"}, + {"base_text": "Good Morning Joanna", "base_tag": "HELLO"}, + {"base_text": "good Afternoon", "base_tag": "HELLO"}, + 
{"base_text": "Dear Jacky", "base_tag": "HELLO"}, + {"base_text": "Sir", "base_tag": "HELLO"}, + {"base_text": "Dear Madam", "base_tag": "HELLO"}, + {"base_text": "Dear Mr", "base_tag": "HELLO"}, + {"base_text": "Dear Ms.", "base_tag": "HELLO"}, + {"base_text": "Dear miss", "base_tag": "HELLO"}, + {"base_text": "Dear mrs.", "base_tag": "HELLO"}, + {"base_text": "Dear sir or madam", "base_tag": "HELLO"}, + {"base_text": "To whom it may concern", "base_tag": "HELLO"}, + ], + id="english hello", + ), + pytest.param( + ( + "VP of Data Science\n" + "Chief of staff\n" + "CTO at TestMelusine\n" + "CEOABC test\n" + "Lead business developer\n" + ), + [ + {"base_text": "VP of Data Science", "base_tag": "SIGNATURE"}, + {"base_text": "Chief of staff", "base_tag": "SIGNATURE"}, + {"base_text": "CTO at TestMelusine", "base_tag": "SIGNATURE"}, + {"base_text": "CEOABC test", "base_tag": "BODY"}, + {"base_text": "Lead business developer", "base_tag": "SIGNATURE"}, + ], + id="english job signature patterns", + ), + pytest.param( + ( + "9 downing street\n" + "4-6 Beverly Hill\n" + "4 Abbey road W24RA\n" + "3 Ocean Rd.\n" + "5th avenue\n" + "221b Baker St.\n" + "6bis River ln.\n" + "7 Winter lane\n" + ), + [ + {"base_text": "9 downing street", "base_tag": "SIGNATURE"}, + {"base_text": "4-6 Beverly Hill", "base_tag": "SIGNATURE"}, + {"base_text": "4 Abbey road W24RA", "base_tag": "SIGNATURE"}, + {"base_text": "3 Ocean Rd.", "base_tag": "SIGNATURE"}, + {"base_text": "5th avenue", "base_tag": "SIGNATURE"}, + {"base_text": "221b Baker St.", "base_tag": "SIGNATURE"}, + {"base_text": "6bis River ln.", "base_tag": "SIGNATURE"}, + {"base_text": "7 Winter lane", "base_tag": "SIGNATURE"}, + ], + id="english adsress signature patterns", + ), + ], +) +def test_tag_text_english(text, expected_tags): + # Arrange + tagger = ContentTagger() + + # Act + output_tags = tagger.tag_text(text) + + # Assert + for tag_data in expected_tags: + if "base_tag_list" not in tag_data: + tag_data["base_tag_list"] = 
[tag_data["base_tag"]] + assert output_tags == expected_tags + + +def test_tag_list(): + # Arrange + # Limit tags to "HELLO" and the default tag ("BODY") + tag_list = ["HELLO"] + + # Text segment (= individual message in an email conversation) + text = "bonjour\nblah blah blah\nmerci\ncordialement" + + # Expected tags + expected_tags = [ + {"base_text": "bonjour", "base_tag": "HELLO"}, + {"base_text": "blah blah blah", "base_tag": "BODY"}, + {"base_text": "merci", "base_tag": "BODY"}, + {"base_text": "cordialement", "base_tag": "BODY"}, + ] + + # Instantiate and apply the Tagger + tagger = ContentTagger(tag_list=tag_list) + + # Act + output_tags = tagger.tag_text(text) + + # Assert + for tag_data in expected_tags: + if "base_tag_list" not in tag_data: + tag_data["base_tag_list"] = [tag_data["base_tag"]] + assert output_tags == expected_tags + + +def test_undefined_tag(): + unknown_tag = "UNKNOWN_TAG" + + # Setup an unknown tag + tag_list = [unknown_tag] + + # Instantiate Tagger + with pytest.raises(ValueError, match=rf".*{unknown_tag}.*"): + _ = ContentTagger(tag_list=tag_list) + + +def test_unsupported_type(): + class MyClass(ContentTagger): + """Test class""" + + @Tag + def TEST_TAG(self): + """Test method""" + return 3.3 + + with pytest.raises(ValueError, match="supported types"): + _ = MyClass() + + +def test_compiled_pattern(): + class MyClass(ContentTagger): + """Test class""" + + @Tag + def TEST_TAG(self): + """Test method""" + return re.compile(r"cool_pattern") + + tagger = MyClass() + subtext, tag, match = tagger("cool_pattern is what I am looking for")[0] + + # Check tag result + assert tag == "TEST_TAG" + + +def test_str_pattern(): + class MyClass(ContentTagger): + """Test class""" + + @Tag + def TEST_TAG(self): + """Test method""" + return r"cool_pattern" + + tagger = MyClass() + subtext, tag, match = tagger("cool_pattern is what I am looking for")[0] + + # Check tag result + assert tag == "TEST_TAG" + + +def test_malformed_regex(): + from 
melusine.processors import Tag + + malformed_regex = r"[*." + + # Create a tagger containing an ill-defined Tag (malformed regex) + class CustomTagger(ContentTagger): + """Test class""" + + @Tag + def HELLO(self): + """Test method""" + return malformed_regex + + # Instantiating the tagger must raise a ValueError reporting the invalid regex + with pytest.raises(ValueError, match=r"Invalid regex"): + _ = CustomTagger() + + +def test_direct_tagging(): + tagger = ContentTagger() + match = tagger["HELLO"].match("Bonjour") + + assert bool(match) + + +def test_call_method(): + tagger = ContentTagger() + + match_list = tagger("Bonjour a tous") + subtext, tag, regex = match_list[0] + + assert tag == "HELLO" + + +@pytest.mark.parametrize( + "text, n_words, word_character_only, expected_match", + [ + pytest.param("Hello you", 4, False, True, id="4 words match"), + pytest.param("Hello how are you today", 4, False, False, id="4 words no match"), + pytest.param("Hello! you?", 4, False, True, id="4 words match with special characters"), + pytest.param( + "Hello! 
you?", 4, True, False, id="4 words match with special characters (word character only True)" + ), + ], +) +def test_word_blocks(text, n_words, word_character_only, expected_match): + regex = BaseContentTagger.word_block(n_words, word_character_only=word_character_only) + + search_regex = r"^" + regex + r"$" + match = bool(re.search(search_regex, text)) + assert match == expected_match diff --git a/tests/processors/test_content_tagger.py b/tests/processors/test_content_tagger.py deleted file mode 100644 index 94dc5353..00000000 --- a/tests/processors/test_content_tagger.py +++ /dev/null @@ -1,787 +0,0 @@ -import re - -import pytest - -from melusine.message import Message -from melusine.processors import BaseContentTagger, ContentTagger, Tag - - -def test_content_tagger(): - # Text segments (= individual messages in an email conversation) - text_segments = [ - "Envoye de mon iphone", - ("Bonjour Mme X,\nSuite a blh blah blah\n" "Bien cordialement\nJane Dupond\n" "(See attached file: flex.jpg)"), - ( - "Bonjour,\nVeuillez trouver ci-joint blah\n" - "Merci d'avance,\nCordialement,\n" - "Toute modification, edition, utilisation ou diffusion non autorisee est interdite" - ), - ] - - # Expected tags - expected_tags = [ - [ - ("FOOTER", "Envoye de mon iphone"), - ], - [ - ("HELLO", "Bonjour Mme X,"), - ("BODY", "Suite a blh blah blah"), - ("GREETINGS", "Bien cordialement"), - ("SIGNATURE_NAME", "Jane Dupond"), - ("PJ", "(See attached file: flex.jpg)"), - ], - [ - ("HELLO", "Bonjour,"), - ("BODY", "Veuillez trouver ci-joint blah"), - ("THANKS", "Merci d'avance,"), - ("GREETINGS", "Cordialement,"), - ( - "FOOTER", - "Toute modification, edition, utilisation ou diffusion non autorisee est interdite", - ), - ], - ] - - # Mock the output of a Segmenter (List of Message object) - messages = [Message(text=segment) for segment in text_segments] - - # Instantiate and apply the Tagger - tagger = ContentTagger() - output_messages = tagger.tag_email(messages) - - # Test output tags - 
output_tags = [x.tags for x in output_messages] - assert output_tags == expected_tags - - -def test_tag_null_message(): - messages = None - - # Instantiate and apply the Tagger - tagger = ContentTagger() - output_messages = tagger.tag_email(messages) - - assert output_messages is None - - -@pytest.mark.parametrize( - "text, expected_parts", - [ - ( - "Bonjour, merci pour votre message!\nComment-allez vous?! Je suis satisfait!!!\n" - "Bien cordialement\n\n\n\nJane Dupond\n", - [ - "Bonjour,", - "merci pour votre message!", - "Comment-allez vous?!", - "Je suis satisfait!!!", - "Bien cordialement", - "Jane Dupond", - ], - ), - ], -) -def test_content_tagger_split_text(text, expected_parts): - # Instantiate and apply the Tagger - tagger = ContentTagger() - output_parts = tagger.split_text(text) - - assert output_parts == expected_parts - - -@pytest.mark.parametrize( - "text, expected_tags", - [ - ( - "Bonjour Mme X,\nSuite a blh blah blah.\n" - "Bien cordialement\nJane Dupond\n" - "(See attached file: flex.jpg)", - [ - ("HELLO", "Bonjour Mme X,"), - ("BODY", "Suite a blh blah blah."), - ("GREETINGS", "Bien cordialement"), - ("SIGNATURE_NAME", "Jane Dupond"), - ("PJ", "(See attached file: flex.jpg)"), - ], - ), - ( - "Bonjour, je confirme le rdv. 
Cordialement, John Smith", - [ - ("HELLO", "Bonjour,"), - ("BODY", "je confirme le rdv."), - ("GREETINGS", "Cordialement, John Smith"), - ], - ), - ( - ( - "Bonjour,\nSuite a notre intervention du 16.02.22 , un taux d'humidité de 50% a été relevé.\n" - "Cordialement.\n177, rue de la fée - 75000 Paris.\n" - "Horaires : du lundi au jeudi de 08h00 à 16h30 et le vendredi de 08h00 à 16h00.\n" - "Tel : 01.45.53.11.33" - ), - [ - ("HELLO", "Bonjour,"), - ("BODY", "Suite a notre intervention du 16.02.22 , un taux d'humidité de 50% a été relevé."), - ("GREETINGS", "Cordialement."), - ("SIGNATURE", "177, rue de la fée - 75000 Paris."), - ("BODY", "Horaires : du lundi au jeudi de 08h00 à 16h30 et le vendredi de 08h00 à 16h00."), - ("SIGNATURE", "Tel : 01.45.53.11.33"), - ], - ), - ( - ( - "bonjour\n" - "15 jours après les premières réparations, un défaut a été détecté. " - "Bien à vous\n" - "Britney Spears" - ), - [ - ("HELLO", "bonjour"), - ("BODY", "15 jours après les premières réparations, un défaut a été détecté."), - ("GREETINGS", "Bien à vous"), - ("SIGNATURE_NAME", "Britney Spears"), - ], - ), - ( - ( - "Bonjour monsieur Smith\n" - "merci. Bien à vous\n" - "Britney Spears\n" - "22 hollywood boulevard\n" - "79000 Niort\n" - ), - [ - ("HELLO", "Bonjour monsieur Smith"), - ("THANKS", "merci."), - ("GREETINGS", "Bien à vous"), - ("SIGNATURE_NAME", "Britney Spears"), - ("SIGNATURE", "22 hollywood boulevard"), - ("SIGNATURE", "79000 Niort"), - ], - ), - ( - ( - "Merci de me faire suivre les docs à ma nouvelle adresse qui est 0 rue du parc, 75000 Paris. " - "Merci d'avance. 
\nAcceptez notre salutation," - ), - [ - ("BODY", "Merci de me faire suivre les docs à ma nouvelle adresse qui est 0 rue du parc, 75000 Paris."), - ("THANKS", "Merci d'avance."), - ("GREETINGS", "Acceptez notre salutation,"), - ], - ), - ( - ( - "Bonjour\n" - "Je vous relance concernant ma télévision avec le devis en PJ.\n" - "Désolé pour la qualité.\n" - "Je l'ai envoyé à partir de mon ordi.\n" - "Excellente journée à vous,\n" - "Bon we\n" - "Votre bien dévoué\n" - "amicalement votre\n" - "Cordiales salutations.\n" - "Françoise-Bénédicte Dupond\n" - "Envoyé à partir de \nCourrier \npour Windows" - ), - [ - ("HELLO", "Bonjour"), - ("BODY", "Je vous relance concernant ma télévision avec le devis en PJ."), - ("BODY", "Désolé pour la qualité."), - ("BODY", "Je l'ai envoyé à partir de mon ordi."), - ("HELLO", "Excellente journée à vous,"), - ("HELLO", "Bon we"), - ("GREETINGS", "Votre bien dévoué"), - ("GREETINGS", "amicalement votre"), - ("GREETINGS", "Cordiales salutations."), - ("SIGNATURE_NAME", "Françoise-Bénédicte Dupond"), - ("FOOTER", "Envoyé à partir de"), - ("FOOTER", "Courrier"), - ("FOOTER", "pour Windows"), - ], - ), - ( - "C'est bien note, merci beaucoup.\nSentiments dévoués.\nTélécharger \nOutlook pour Android", - [ - ("THANKS", "C'est bien note, merci beaucoup."), - ("GREETINGS", "Sentiments dévoués."), - ("FOOTER", "Télécharger"), - ("FOOTER", "Outlook pour Android"), - ], - ), - ( - "Impeccable, je vous remercie beaucoup pour votre rapidité.\nObtenir\nOutlook pour Android", - [ - ("THANKS", "Impeccable, je vous remercie beaucoup pour votre rapidité."), - ("FOOTER", "Obtenir"), - ("FOOTER", "Outlook pour Android"), - ], - ), - ( - ( - "Cher Monsieur,\nJe vous confirme la bonne réception de votre précédent email.\n" - "Je vous en remercie.\nBien cordialement,\nJohn Smith" - ), - [ - ("HELLO", "Cher Monsieur,"), - ("BODY", "Je vous confirme la bonne réception de votre précédent email."), - ("THANKS", "Je vous en remercie."), - ("GREETINGS", "Bien 
cordialement,"), - ("SIGNATURE_NAME", "John Smith"), - ], - ), - ( - ( - "chère madame,\n" - "URGENT URGENT\n" - "Merci de me faire suivre les docs à ma nouvelle adresse qui est 0 rue du parc, 75000 Paris. " - "Merci d'avance. \nRecevez nos salutations,\nVous en souhaitant bonne réception" - ), - [ - ("HELLO", "chère madame,"), - ("BODY", "URGENT URGENT"), - ("BODY", "Merci de me faire suivre les docs à ma nouvelle adresse qui est 0 rue du parc, 75000 Paris."), - ("THANKS", "Merci d'avance."), - ("GREETINGS", "Recevez nos salutations,"), - ("GREETINGS", "Vous en souhaitant bonne réception"), - ], - ), - pytest.param( - "Un témoignage sous X\nEnvoyé depuis mon téléphone Orange", - [ - ("BODY", "Un témoignage sous X"), - ("FOOTER", "Envoyé depuis mon téléphone Orange"), - ], - id="Edge case where a line ends with an isolated character", - ), - pytest.param( - " ??\n !??!", - [ - ("BODY", "??!??!"), - ], - id="Edge case where the two first lines are missing word characters", - ), - ( - "Bonjour Mme X,\nSuite a blh blah blah.\n" - "Bien cordialement\nJane Dupond\n" - "(See attached file: flex.jpg)", - [ - ("HELLO", "Bonjour Mme X,"), - ("BODY", "Suite a blh blah blah."), - ("GREETINGS", "Bien cordialement"), - ("SIGNATURE_NAME", "Jane Dupond"), - ("PJ", "(See attached file: flex.jpg)"), - ], - ), - ( - "\nChère Madame\n\nC'est bien noté, merci\nBien reçu\nJ.Smith\n\n", - [ - ("HELLO", "Chère Madame"), - ("THANKS", "C'est bien noté, merci"), - ("BODY", "Bien reçu"), - ("SIGNATURE_NAME", "J.Smith"), - ], - ), - ( - "\nBonjour Monsieur, ceci n'est pas un hello\nBonne fin de journee\nsalutations", - [ - ("BODY", "Bonjour Monsieur, ceci n'est pas un hello"), - ("HELLO", "Bonne fin de journee"), - ("GREETINGS", "salutations"), - ], - ), - ( - "\nBonjour Monsieur Stanislas von den hoeggenboord\n\nbien à toi\nJ. Smith\nChargé de clientèle", - [ - ("HELLO", "Bonjour Monsieur Stanislas von den hoeggenboord"), - ("GREETINGS", "bien à toi"), - ("SIGNATURE_NAME", "J. 
Smith"), - ("SIGNATURE", "Chargé de clientèle"), - ], - ), - ( - ( - "\n1 rdv à 18h\n\n2 ème message laissé à la locataire\n3je m'en vais au bois\n" - "4 allée des iris\n 5bis rue Patrick Sebastien\n6-8 cours mirabeau\n 7 ter place du dahu\n" - "8 de la rue très longue qui ne doit pas être taggée signature" - ), - [ - ("BODY", "1 rdv à 18h"), - ("BODY", "2 ème message laissé à la locataire"), - ("BODY", "3je m'en vais au bois"), - ("SIGNATURE", "4 allée des iris"), - ("SIGNATURE", "5bis rue Patrick Sebastien"), - ("SIGNATURE", "6-8 cours mirabeau"), - ("SIGNATURE", "7 ter place du dahu"), - ("BODY", "8 de la rue très longue qui ne doit pas être taggée signature"), - ], - ), - ( - ( - "à L'attention de M Bob,\n" - "Bonjour,\n" - "Je vous informe que je vais accepter la proposition de L , à savoir le paiement d'une indemnité forfaitaire de résiliation du CCMI de 4000 € TTC pour clore cette affaire.\n" - "Cordialement.\n" - "Bob Smith" - ), - [ - ("FOOTER", "à L'attention de M Bob,"), - ("HELLO", "Bonjour,"), - ( - "BODY", - "Je vous informe que je vais accepter la proposition de L , à savoir le paiement d'une indemnité forfaitaire de résiliation du CCMI de 4000 € TTC pour clore cette affaire.", - ), - ("GREETINGS", "Cordialement."), - ("SIGNATURE_NAME", "Bob Smith"), - ], - ), - ( - ( - "Monsieur Bob Smith\n" - "Adresse mail : BobSmith90@gmail.com\n" - "Lucy Ange\n\n" - "Bonjour Monsieur,\n" - "Suite à notre entretien téléphonique de ce matin, et au message que vous m'avez envoyé sur ma messagerie, je voudrais effectuer la réparation du véhicule Renault Twingo dans un garage partenaire de la Maif situé, si c'est possible.\n" - "Dans l'attente de votre réponse et en vous remerciant par avance,\n\n\n" - "Monsieur Bob Smith\n\n\n" - "Envoyé à partir de\n" - "Courrier\npour Windows\n\n\n\n" - "Sans virus.\nwww.avast.com" - ), - [ - ("HELLO", "Monsieur Bob Smith"), - ("SIGNATURE", "Adresse mail : BobSmith90@gmail.com"), - ("SIGNATURE_NAME", "Lucy Ange"), - ("HELLO", 
"Bonjour Monsieur,"), - ( - "BODY", - "Suite à notre entretien téléphonique de ce matin, et au message que vous m'avez envoyé sur ma messagerie, je voudrais effectuer la réparation du véhicule Renault Twingo dans un garage partenaire de la Maif situé, si c'est possible.", - ), - ("BODY", "Dans l'attente de votre réponse et en vous remerciant par avance,"), - ("HELLO", "Monsieur Bob Smith"), - ("FOOTER", "Envoyé à partir de"), - ("FOOTER", "Courrier"), - ("FOOTER", "pour Windows"), - ("FOOTER", "Sans virus."), - ("FOOTER", "www.avast.com"), - ], - ), - ( - ( - "Bob Smith\n\n\n" - "A l’attention de Madame Lucy Ange,\n\n\n\n\n\n" - "Bonjour Madame Ange,\n\n\n\n\n\n\n\n\n" - "J’espère que vous allez bien.\n\n\n\n\n\n" - "Pour faire suite à mon mail du 21 février 2023, je me permets de revenir vers vous pour avoir votre avis sur le devis que j’ai demandé auprès d’un enquêteur.\n\n\n\n" - "Voici son retour :\n\n\n\n\n\n" - "Qu’en pensez-vous svp ?\n\n\n\n\n\n" - "Je reste à votre disposition pour tout complément d’information et vous remercie de l’intérêt que vous porterez à ma demande,\n\n\n\n\n\n" - "Bien Cordialement,\n\n\n\n\n\n" - "Bob Smith\n\n\n" - "Tél. 
06.83.22.95.94" - ), - [ - ("SIGNATURE_NAME", "Bob Smith"), - ("FOOTER", "A l’attention de Madame Lucy Ange,"), - ("HELLO", "Bonjour Madame Ange,"), - ("BODY", "J’espère que vous allez bien."), - ( - "BODY", - "Pour faire suite à mon mail du 21 février 2023, je me permets de revenir vers vous pour avoir votre avis sur le devis que j’ai demandé auprès d’un enquêteur.", - ), - ("BODY", "Voici son retour :"), - ("BODY", "Qu’en pensez-vous svp ?"), - ( - "BODY", - "Je reste à votre disposition pour tout complément d’information et vous remercie de l’intérêt que vous porterez à ma demande,", - ), - ("GREETINGS", "Bien Cordialement,"), - ("SIGNATURE_NAME", "Bob Smith"), - ("SIGNATURE", "Tél."), - ("SIGNATURE", "06.83.22.95.94"), - ], - ), - pytest.param( - ( - "cordialement\nContact e-mail\n\n\nContact téléphone\n\n01 23 45 67 89 / abcabc@hotmail.fr\n" - "Torroella de Montgri, le 5 avril 2023\nLes formats de fichiers acceptés sont : PDF, DOC, DOCX, JPEG, " - "JPG, TIFF, TXT, ODT, XLS, XLSX\nTout autre format de fichiers ne sera pas transmis au dossier" - ), - [ - ("GREETINGS", "cordialement"), - ("SIGNATURE", "Contact e-mail"), - ("SIGNATURE", "Contact téléphone"), - ("SIGNATURE", "01 23 45 67 89 / abcabc@hotmail.fr"), - ("SIGNATURE", "Torroella de Montgri, le 5 avril 2023"), - ( - "FOOTER", - "Les formats de fichiers acceptés sont : PDF, DOC, DOCX, JPEG, JPG, TIFF, TXT, ODT, XLS, XLSX", - ), - ("FOOTER", "Tout autre format de fichiers ne sera pas transmis au dossier"), - ], - id="diverse_signature_patterns", - ), - pytest.param( - ( - "bonjour\nmon body\nJ. Smith\n\n01 23 45 67 89\nSecrétaire en charge des avions\n" - "Business Analyst – Tribu Sinistres – Squad Flux Entrants\n" - "Société nationale des chemins de fer\nConseiller MAIF\nGestionnaire sinistre - C99G\n" - "Service des lettres anonymes\nTechnicienne de gestion - EQUIPE ABC\n" - ), - [ - ("HELLO", "bonjour"), - ("BODY", "mon body"), - ("SIGNATURE_NAME", "J. 
Smith"), - ("SIGNATURE", "01 23 45 67 89"), - ("SIGNATURE", "Secrétaire en charge des avions"), - ("SIGNATURE", "Business Analyst – Tribu Sinistres – Squad Flux Entrants"), - ("SIGNATURE", "Société nationale des chemins de fer"), - ("SIGNATURE", "Conseiller MAIF"), - ("SIGNATURE", "Gestionnaire sinistre - C99G"), - ("SIGNATURE", "Service des lettres anonymes"), - ("SIGNATURE", "Technicienne de gestion - EQUIPE ABC"), - ], - id="signature_jobs", - ), - pytest.param( - ( - "bonjour\nmon body\nCordialement\n\n" - "analyste -------------------------------------- test test test test test test test\n" - ), - [ - ("HELLO", "bonjour"), - ("BODY", "mon body"), - ("GREETINGS", "Cordialement"), - ("BODY", "analyste -------------------------------------- test test test test test test test"), - ], - id="check_catastrophic_backtracking", - ), - ], -) -def test_tag_text_generic(text, expected_tags): - # Instantiate and apply the Tagger - tagger = ContentTagger() - output_tags = tagger.tag_text(text) - # Test output tags - assert output_tags == expected_tags - - -@pytest.mark.parametrize( - "text, expected_tags", - [ - pytest.param( - ( - "Merci\n" - "Je vous remercie\n" - "Merci d'avance\n" - "Je vous remercie par avance\n" - "Vous en remerciant par avance.\n" - ), - [ - ("THANKS", "Merci"), - ("THANKS", "Je vous remercie"), - ("THANKS", "Merci d'avance"), - ("THANKS", "Je vous remercie par avance"), - ("THANKS", "Vous en remerciant par avance."), - ], - id="french thanks patterns", - ), - ], -) -def test_tag_text_french(text, expected_tags): - # Instantiate and apply the Tagger - tagger = ContentTagger() - output_tags = tagger.tag_text(text) - # Test output tags - assert output_tags == expected_tags - - -@pytest.mark.parametrize( - "text, expected_tags", - [ - pytest.param( - ( - "Thank you so much\n" - "thanks\n" - "thx Joanna\n" - "thanks but you forgot bla\n" - "Thx however I still need the document\n" - ), - [ - ("THANKS", "Thank you so much"), - ("THANKS", "thanks"), - 
("THANKS", "thx Joanna"), - ("BODY", "thanks but you forgot bla"), - ("BODY", "Thx however I still need the document"), - ], - id="english thanks patterns", - ), - pytest.param( - ( - "Best\n" - "warm Wishes\n" - "regards\n" - "best regards\n" - "cheers\n" - "yours\n" - "yours truly\n" - "Sincerely\n" - "see you soon\n" - "Speak to you soon\n" - "talk soon\n" - "Take care\n" - "Catch you later\n" - "Have a fantastic day\n" - "Looking forward to your reply\n" - "I am looking forward to hearing from you\n" - "Hoping to hear from you\n" - ), - [ - ("GREETINGS", "Best"), - ("GREETINGS", "warm Wishes"), - ("GREETINGS", "regards"), - ("GREETINGS", "best regards"), - ("GREETINGS", "cheers"), - ("GREETINGS", "yours"), - ("GREETINGS", "yours truly"), - ("GREETINGS", "Sincerely"), - ("GREETINGS", "see you soon"), - ("GREETINGS", "Speak to you soon"), - ("GREETINGS", "talk soon"), - ("GREETINGS", "Take care"), - ("GREETINGS", "Catch you later"), - ("GREETINGS", "Have a fantastic day"), - ("GREETINGS", "Looking forward to your reply"), - ("GREETINGS", "I am looking forward to hearing from you"), - ("GREETINGS", "Hoping to hear from you"), - ], - id="english greetings", - ), - pytest.param( - ( - "Hello John\n" - "hi\n" - "Hi there\n" - "good to hear from you\n" - "it is good to hear from you\n" - "I hope you are having a great week\n" - "how are you doing\n" - "how are you positioned about the matter\n" - "i hope you are doing well\n" - "Good Morning Joanna\n" - "good Afternoon\n" - "Dear Jacky\n" - "Sir\n" - "Dear Madam\n" - "Dear Mr\n" - "Dear Ms.\n" - "Dear miss\n" - "Dear mrs.\n" - "Dear sir or madam\n" - "To whom it may concern\n" - ), - [ - ("HELLO", "Hello John"), - ("HELLO", "hi"), - ("HELLO", "Hi there"), - ("HELLO", "good to hear from you"), - ("HELLO", "it is good to hear from you"), - ("HELLO", "I hope you are having a great week"), - ("HELLO", "how are you doing"), - ("BODY", "how are you positioned about the matter"), - ("HELLO", "i hope you are doing well"), - 
("HELLO", "Good Morning Joanna"), - ("HELLO", "good Afternoon"), - ("HELLO", "Dear Jacky"), - ("HELLO", "Sir"), - ("HELLO", "Dear Madam"), - ("HELLO", "Dear Mr"), - ("HELLO", "Dear Ms."), - ("HELLO", "Dear miss"), - ("HELLO", "Dear mrs."), - ("HELLO", "Dear sir or madam"), - ("HELLO", "To whom it may concern"), - ], - id="english hello", - ), - pytest.param( - ( - "VP of Data Science\n" - "Chief of staff\n" - "CTO at TestMelusine\n" - "CEOABC test\n" - "Lead business developer\n" - ), - [ - ("SIGNATURE", "VP of Data Science"), - ("SIGNATURE", "Chief of staff"), - ("SIGNATURE", "CTO at TestMelusine"), - ("BODY", "CEOABC test"), - ("SIGNATURE", "Lead business developer"), - ], - id="english job signature patterns", - ), - pytest.param( - ( - "9 downing street\n" - "4-6 Beverly Hill\n" - "4 Abbey road W24RA\n" - "3 Ocean Rd.\n" - "5th avenue\n" - "221b Baker St.\n" - "6bis River ln.\n" - "7 Winter lane\n" - ), - [ - ("SIGNATURE", "9 downing street"), - ("SIGNATURE", "4-6 Beverly Hill"), - ("SIGNATURE", "4 Abbey road W24RA"), - ("SIGNATURE", "3 Ocean Rd."), - ("SIGNATURE", "5th avenue"), - ("SIGNATURE", "221b Baker St."), - ("SIGNATURE", "6bis River ln."), - ("SIGNATURE", "7 Winter lane"), - ], - id="english adsress signature patterns", - ), - ], -) -def test_tag_text_english(text, expected_tags): - # Instantiate and apply the Tagger - tagger = ContentTagger() - output_tags = tagger.tag_text(text) - # Test output tags - assert output_tags == expected_tags - - -def test_tag_list(): - # Limit tags to "HELLO" and the default tag ("BODY") - tag_list = ["HELLO"] - - # Text segment (= individual message in an email conversation) - text = "bonjour\nblah blah blah\nmerci\ncordialement" - - # Expected tags - expected_tags = [ - ("HELLO", "bonjour"), - ("BODY", "blah blah blah"), - ("BODY", "merci"), - ("BODY", "cordialement"), - ] - - # Instantiate and apply the Tagger - tagger = ContentTagger(tag_list=tag_list) - output_tags = tagger.tag_text(text) - - # Test output tags - 
assert expected_tags == output_tags - - -def test_undefined_tag(): - unknown_tag = "UNKNOWN_TAG" - - # Setup an unknown tag - tag_list = [unknown_tag] - - # Instantiate Tagger - with pytest.raises(ValueError, match=rf".*{unknown_tag}.*"): - _ = ContentTagger(tag_list=tag_list) - - -def test_unsupported_type(): - class MyClass(ContentTagger): - """Test class""" - - @Tag - def TEST_TAG(self): - """Test method""" - return 3.3 - - with pytest.raises(ValueError, match="supported types"): - _ = MyClass() - - -def test_compiled_pattern(): - class MyClass(ContentTagger): - """Test class""" - - @Tag - def TEST_TAG(self): - """Test method""" - return re.compile(r"cool_pattern") - - tagger = MyClass() - subtext, tag, match = tagger("cool_pattern is what I am looking for")[0] - - # Check tag result - assert tag == "TEST_TAG" - - -def test_str_pattern(): - class MyClass(ContentTagger): - """Test class""" - - @Tag - def TEST_TAG(self): - """Test method""" - return r"cool_pattern" - - tagger = MyClass() - subtext, tag, match = tagger("cool_pattern is what I am looking for")[0] - - # Check tag result - assert tag == "TEST_TAG" - - -def test_malformed_regex(): - from melusine.processors import Tag - - malformed_regex = r"[*." 
- - # Create a tagger containing an ill defined Tag (malformed regex) - class CustomTagger(ContentTagger): - """Test class""" - - @Tag - def HELLO(self): - """Test method""" - return malformed_regex - - # Instantiate Tagger - with pytest.raises(ValueError, match=rf"Invalid regex"): - _ = CustomTagger() - - -def test_direct_tagging(): - tagger = ContentTagger() - match = tagger["HELLO"].match("Bonjour") - - assert bool(match) - - -def test_call_method(): - tagger = ContentTagger() - - match_list = tagger("Bonjour a tous") - subtext, tag, regex = match_list[0] - - assert tag == "HELLO" - - -@pytest.mark.parametrize( - "text, n_words, word_character_only, expected_match", - [ - pytest.param("Hello you", 4, False, True, id="4 words match"), - pytest.param("Hello how are you today", 4, False, False, id="4 words no match"), - pytest.param("Hello! you?", 4, False, True, id="4 words match with special characters"), - pytest.param( - "Hello! you?", 4, True, False, id="4 words match with special characters (word character only True)" - ), - ], -) -def test_word_blocks(text, n_words, word_character_only, expected_match): - regex = BaseContentTagger.word_block(n_words, word_character_only=word_character_only) - - search_regex = r"^" + regex + r"$" - match = bool(re.search(search_regex, text)) - assert match == expected_match diff --git a/tests/processors/test_processors.py b/tests/processors/test_processors.py index c6aaae5f..03cac2ab 100644 --- a/tests/processors/test_processors.py +++ b/tests/processors/test_processors.py @@ -161,7 +161,7 @@ def test_segmenter(input_text, expected_messages): ), ( [ - Message(meta="", text="Merci", tags=[("THANKS", "Merci")]), + Message(meta="", text="Merci", tags=[{"base_tag": "THANKS", "base_text": "Merci"}]), ], "Merci", ), @@ -184,9 +184,21 @@ def test_text_extractor_error(): def test_text_extractor_multiple_messages(): """Unit test""" message_list = [ - Message(meta="", text="", tags=[("BODY", "A"), ("GREETINGS", "G"), ("BODY", "A")]), - 
Message(meta="", text="", tags=[("BODY", "B"), ("BODY", "B"), ("BODY", "B")]), - Message(meta="", text="", tags=[("GREETINGS", "G"), ("BODY", "C"), ("BODY", "C")]), + Message(meta="", text="", tags=[ + {"base_text": "A", "base_tag": "BODY"}, + {"base_text": "G", "base_tag": "GREETINGS"}, + {"base_text": "A", "base_tag": "BODY"}, + ]), + Message(meta="", text="", tags=[ + {"base_text": "B", "base_tag": "BODY"}, + {"base_text": "B", "base_tag": "BODY"}, + {"base_text": "B", "base_tag": "BODY"}, + ]), + Message(meta="", text="", tags=[ + {"base_text": "G", "base_tag": "GREETINGS"}, + {"base_text": "C", "base_tag": "BODY"}, + {"base_text": "C", "base_tag": "BODY"}, + ]), ] expected_output = "A\nB\nB\nB" @@ -206,8 +218,15 @@ def test_text_extractor_with_tags(): Message(meta="", text="Bonjour\nblahblah\nMerci"), Message(meta="", text="Bonjour2\nMerci2"), ] - input_message_list[0].tags = [("HELLO", "Bonjour"), ("CUSTOM_TAG", "blahblah"), ("THANKS", "Merci")] - input_message_list[1].tags = [("HELLO", "Bonjour2"), ("THANKS", "Merci2")] + input_message_list[0].tags = [ + {"base_text": "Bonjour", "base_tag": "HELLO"}, + {"base_text": "blahblah", "base_tag": "CUSTOM_TAG"}, + {"base_text": "Merci", "base_tag": "THANKS"}, + ] + input_message_list[1].tags = [ + {"base_text": "Bonjour2", "base_tag": "HELLO"}, + {"base_text": "Merci2", "base_tag": "THANKS"}, + ] extractor = TextExtractor( output_columns="text", @@ -331,7 +350,9 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: "A: avocats@test.fr; BOB Morane \n" "Objet: dossier Test ,\n", text="bla bla bla", - tags=[("BODY", "bla bla bla")], + tags=[ + {"base_text": "bla bla bla", "base_tag": "BODY"}, + ], ) ], "('FOOTER', 'SIGNATURE')", @@ -345,7 +366,9 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: Message( meta="", text="Envoyé depuis mon Iphone", - tags=[("FOOTER", "Envoyé depuis mon Iphone")], + tags=[ + {"base_text": "Envoyé depuis mon Iphone", "base_tag": "FOOTER"}, + 
] ), Message( meta="De: test.test@test.fr \n" @@ -353,7 +376,9 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: "A: avocats@test.fr; BOB Morane \n" "Objet: dossier Test ,\n", text="bla bla bla", - tags=[("BODY", "bla bla bla")], + tags=[ + {"base_text": "bla bla bla", "base_tag": "BODY"}, + ], ), ], "('FOOTER', 'SIGNATURE')", @@ -368,14 +393,16 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: meta="", text="Jane Doe\n4 rue des oliviers 75001 Ville", tags=[ - ("SIGNATURE", "4 rue des oliviers 75001 Ville"), + {"base_text": "4 rue des oliviers 75001 Ville", "base_tag": "SIGNATURE"}, ], ), Message( meta="De :\ntest.test42@test.fr\nEnvoyé :\nvendredi 03 mars 2023 14:28\nÀ :" "\nana@test.fr\nObjet :\nTEST", text="bla bla bla", - tags=[("BODY", "bla bla bla")], + tags=[ + {"base_text": "bla bla bla", "base_tag": "BODY"}, + ], ), ], "('FOOTER', 'SIGNATURE')", @@ -390,15 +417,17 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: meta="", text="Jane Doe\n4 rue des oliviers 75001 Ville", tags=[ - ("SIGNATURE_NAME", "Jane Doe"), - ("SIGNATURE", "4 rue des oliviers 75001 Ville"), + {"base_text": "Jane Doe", "base_tag": "SIGNATURE_NAME"}, + {"base_text": "4 rue des oliviers 75001 Ville", "base_tag": "SIGNATURE"}, ], ), Message( meta="De :\ntest.test42@test.fr\nEnvoyé :\nvendredi 03 mars 2023 14:28\nÀ :" "\nana@test.fr\nObjet :\nTEST", text="bla bla bla", - tags=[("BODY", "bla bla bla")], + tags=[ + {"base_text": "bla bla bla", "base_tag": "BODY"}, + ], ), ], "('FOOTER', 'SIGNATURE')", @@ -415,7 +444,9 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: "À:\nbob@test.fr\nObjet:\nTR : 1223456" ), text="bla bla bla", - tags=[("BODY", "bla bla bla")], + tags=[ + {"base_text": "bla bla bla", "base_tag": "BODY"}, + ], ), ], "('FOOTER', 'SIGNATURE')", @@ -432,7 +463,9 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: "À:\nbob@test.fr\nObjet:\nTR : 
1223456" ), text="bla bla bla", - tags=[("BODY", "bla bla bla")], + tags=[ + {"base_text": "bla bla bla", "base_tag": "BODY"}, + ], ), ], "('FOOTER', 'SIGNATURE')", @@ -446,7 +479,9 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: Message( meta=("Le 2 mars 2023 à 18:18, Bob a écrit :"), text="bla bla bla", - tags=[("BODY", "bla bla bla")], + tags=[ + {"base_text": "bla bla bla", "base_tag": "BODY"}, + ], ), ], "('FOOTER', 'SIGNATURE')", @@ -460,7 +495,9 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: Message( meta=("Le 01/01/2001 11:14, test.test.test@test.fr a écrit :"), text="bla bla bla", - tags=[("BODY", "bla bla bla")], + tags=[ + {"base_text": "bla bla bla", "base_tag": "BODY"}, + ], ), ], "('FOOTER', 'SIGNATURE')", @@ -474,7 +511,9 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: Message( meta=("Le 01/01/2001 11:14, test.test.test@test.fr a écrit :"), text="bla bla bla", - tags=[("BODY", "bla bla bla")], + tags=[ + {"base_text": "bla bla bla", "base_tag": "BODY"}, + ], ), ], "('FOOTER', 'SIGNATURE')", @@ -488,7 +527,9 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: Message( meta="", text="Jane Doe\n4 rue des oliviers 75001 Ville", - tags=[("SIGNATURE", "Jane Doe\n4 rue des oliviers 75001 Ville")], + tags=[ + {"base_text": "Jane Doe\n4 rue des oliviers 75001 Ville", "base_tag": "SIGNATURE"}, + ], ), Message( meta="De: test.test@test.fr \n" @@ -496,7 +537,9 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: "A: avocats@test.fr; BOB Morane \n" "Objet: dossier Test ,\n", text="bla bla bla", - tags=[("BODY", "bla bla bla")], + tags=[ + {"base_text": "bla bla bla", "base_tag": "BODY"}, + ], ), ], "('FOOTER',)", @@ -510,7 +553,9 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: Message( meta="", text="J'entends le loup, le renard et la belette", - tags=[("BODY", "J'entends le loup, le renard et la 
belette")], + tags=[ + {"base_text": "J'entends le loup, le renard et la belette", "base_tag": "BODY"}, + ], ), Message( meta="De: test.test@test.fr \n" @@ -518,7 +563,9 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: "A: avocats@test.fr; BOB Morane \n" "Objet: dossier Test ,\n", text="bla bla bla", - tags=[("BODY", "bla bla bla")], + tags=[ + {"base_text": "bla bla bla", "base_tag": "BODY"}, + ], ), ], "('FOOTER', 'SIGNATURE')", @@ -532,7 +579,9 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: Message( meta="", text="", - tags=[("BODY", "")], + tags=[ + {"base_text": "", "base_tag": "BODY"}, + ], ), ], "('FOOTER', 'SIGNATURE')", @@ -546,12 +595,16 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: Message( meta="", text="Envoyé de mon iPhone", - tags=[("FOOTER", "bla 1")], + tags=[ + {"base_text": "bla 1", "base_tag": "FOOTER"}, + ], ), Message( meta="Nothing useful", text="bla 2", - tags=[("BODY", "bla 2")], + tags=[ + {"base_text": "bla 2", "base_tag": "BODY"}, + ], ), ], "('FOOTER', 'SIGNATURE')",