Skip to content

Commit

Permalink
Operate on unicode data exclusively
Browse files Browse the repository at this point in the history
  • Loading branch information
horkhe committed Feb 4, 2022
1 parent a8c7e6a commit 14f106e
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 266 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def finalize_options(self):


setup(name='talon',
version='1.5.0',
version='1.6.0',
description=("Mailgun library "
"to extract message quotations and signatures."),
long_description=open("README.rst").read(),
Expand Down
50 changes: 20 additions & 30 deletions talon/quotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,17 @@
"""

from __future__ import absolute_import
import regex as re

import logging
from copy import deepcopy

from lxml import html, etree

from talon.utils import (get_delimiter, html_tree_to_text,
html_document_fromstring)
from talon import html_quotations
import regex as re
from lxml import etree, html
from six.moves import range
import six

from talon import html_quotations
from talon.utils import (get_delimiter, html_document_fromstring,
html_tree_to_text)

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -94,7 +93,7 @@
)

RE_QUOTATION = re.compile(
r'''
r"""
(
# quotation border: splitter line or a number of quotation marker lines
(?:
Expand All @@ -112,10 +111,10 @@
# after quotations should be text only or nothing at all
[te]*$
''', re.VERBOSE)
""", re.VERBOSE)

RE_EMPTY_QUOTATION = re.compile(
r'''
r"""
(
# quotation border: splitter line or a number of quotation marker lines
(?:
Expand All @@ -125,7 +124,7 @@
)
)
e*
''', re.VERBOSE)
""", re.VERBOSE)

# ------Original Message------ or ---- Reply Message ----
# With variations in other languages.
Expand Down Expand Up @@ -343,9 +342,6 @@ def _replace_link_brackets(msg_body):
Converts msg_body into a unicode
"""
if isinstance(msg_body, bytes):
msg_body = msg_body.decode('utf8')

def link_wrapper(link):
newline_index = msg_body[:link.start()].rfind("\n")
if msg_body[newline_index + 1] == ">":
Expand Down Expand Up @@ -385,8 +381,6 @@ def postprocess(msg_body):

def extract_from_plain(msg_body):
"""Extracts a non quoted message from provided plain text."""
stripped_text = msg_body

delimiter = get_delimiter(msg_body)
msg_body = preprocess(msg_body, delimiter)
# don't process too long messages
Expand Down Expand Up @@ -418,17 +412,13 @@ def extract_from_html(msg_body):
Returns a unicode string.
"""
msg_body_bytes = msg_body
if isinstance(msg_body, six.text_type):
msg_body_bytes = msg_body.encode('utf8')

if msg_body_bytes.strip() == b'':
if msg_body.strip() == "":
return msg_body

msg_body_bytes = msg_body_bytes.replace(b'\r\n', b'\n')
msg_body = msg_body.replace("\r\n", "\n")
# Cut out xml and doctype tags to avoid conflict with unicode decoding.
msg_body_bytes = re.sub(br"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", b"", msg_body_bytes)
html_tree = html_document_fromstring(msg_body_bytes)
msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
html_tree = html_document_fromstring(msg_body)
if html_tree is None:
return msg_body

Expand Down Expand Up @@ -531,11 +521,11 @@ def extract_from_html_tree(html_tree):
# of replacing data outside the <tag> which might be essential to
# the customer.
remove_namespaces(html_tree_copy)
s = html.tostring(html_tree_copy)
s = html.tostring(html_tree_copy, encoding="ascii")
if not s:
return None

return s.decode('utf-8')
return s.decode("ascii")


def remove_namespaces(root):
Expand Down Expand Up @@ -654,23 +644,23 @@ def _readable_text_empty(html_tree):


def is_splitter(line):
    """Return the match object if ``line`` is a splitter, ``None`` otherwise.

    Every pattern in ``SPLITTER_PATTERNS`` is tried in order with
    ``re.match`` (i.e. anchored at the start of ``line``); the first
    successful match wins.
    """
    for pattern in SPLITTER_PATTERNS:
        matcher = re.match(pattern, line)
        if matcher:
            return matcher
    # No pattern matched: make the "not a splitter" result explicit.
    return None


def text_content(context):
    """XPath extension function returning the context node's text content.

    Evaluates ``string()`` on ``context.context_node`` and returns the
    result with leading and trailing whitespace stripped.
    """
    return context.context_node.xpath("string()").strip()


def tail(context):
    """XPath extension function returning the context node's tail text.

    Returns an empty string when the node has no tail (``None`` or empty).
    """
    return context.context_node.tail or ''


Expand Down
29 changes: 11 additions & 18 deletions talon/signature/learning/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,17 @@
* regexp's constants used when evaluating signature's features
"""

from __future__ import absolute_import
import unicodedata
import regex as re

from talon.utils import to_unicode
import regex as re

from talon.signature.constants import SIGNATURE_MAX_LINES


rc = re.compile

# Raw string literals keep regex escapes such as \S, \d and \( away from
# Python's own string-escape processing; the compiled patterns are the same
# as with plain literals, but no invalid-escape warnings are raised.
RE_EMAIL = rc(r'\S@\S')
RE_RELAX_PHONE = rc(r'(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}')
RE_URL = rc(r"""https?://|www\.[\S]+\.[\S]""")

# Taken from:
# http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
Expand Down Expand Up @@ -55,7 +51,7 @@


def binary_regex_search(prog):
    """Return a function that returns 1 or 0 depending on regex search result.

    If the regular expression compiled into ``prog`` is present anywhere in
    a string, calling the returned function with that string yields 1;
    otherwise it yields 0.

    >>> binary_regex_search(re.compile("12"))("3 12")
    1
    >>> binary_regex_search(re.compile("12"))("34")
    0
    """
    return lambda s: 1 if prog.search(s) else 0


def binary_regex_match(prog):
    """Return a function that returns 1 or 0 depending on regex match result.

    If a string matches the regular expression compiled into ``prog`` at its
    very beginning (``prog.match`` semantics), calling the returned function
    with that string yields 1; otherwise it yields 0.

    >>> binary_regex_match(re.compile("12"))("12 3")
    1
    >>> binary_regex_match(re.compile("12"))("3 12")
    0
    """
    return lambda s: 1 if prog.match(s) else 0


Expand Down Expand Up @@ -135,7 +131,6 @@ def extract_names(sender):
>>> extract_names('')
[]
"""
sender = to_unicode(sender, precise=True)
# Remove non-alphabetical characters
sender = "".join([char if char.isalpha() else ' ' for char in sender])
# Remove too short words and words from "black" list i.e.
Expand All @@ -154,7 +149,7 @@ def extract_names(sender):


def categories_percent(s, categories):
    """Return the percentage of characters of ``s`` in the given categories.

    ``s`` is expected to be text; each character's Unicode general category
    (as reported by ``unicodedata.category``) is checked for membership in
    ``categories``. An empty ``s`` yields 0.

    >>> categories_percent("qqq ggg hhh", ["Po"])
    0.0
    >>> categories_percent("q,w.", ["Po"])
    50.0
    >>> categories_percent("s.s,5s", ["Po", "Nd"])
    50.0
    """
    # Guard the empty case up front to avoid dividing by zero.
    if not len(s):
        return 0
    count = sum(1 for c in s if unicodedata.category(c) in categories)
    return 100 * float(count) / len(s)


def punctuation_percent(s):
    """Return the percentage of characters of ``s`` in category ``Po``.

    Delegates to ``categories_percent`` with the single Unicode general
    category ``'Po'``.

    >>> punctuation_percent("qqq ggg hhh")
    0.0
    >>> punctuation_percent("q,w.")
    50.0
    """
    return categories_percent(s, ['Po'])


def capitalized_words_percent(s):
'''Returns capitalized words percent.'''
s = to_unicode(s, precise=True)
"""Returns capitalized words percent."""
words = re.split('\s', s)
words = [w for w in words if w.strip()]
words = [w for w in words if len(w) > 2]
Expand Down
Loading

0 comments on commit 14f106e

Please sign in to comment.