From aae1c82806fa6f5067d1dea4578648a0b5fd7567 Mon Sep 17 00:00:00 2001 From: Leo Hemsted Date: Fri, 15 Nov 2019 16:28:06 +0000 Subject: [PATCH] don't encode sms when calculating content length MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit we were previously encoding the utf-8 content, and measuring the number of bytes. We've been doing this since the beginning of time, and for most of time, it wasn't an issue. However, when we started accepting unicode characters (notably some welsh-only accents eg Ŵ or ŷ), they are encoded as two bytes rather than 1. So a message of 70 ŷ characters is 140 bytes long, but a message of 69 a characters and one ŷ is 71 characters long. However, in reality, when we send a message, if _any_ character in it is non-gsm (similar but not the same as ascii), then the entire thing is encoded in UTF-16, where every single character (including basic latin text) is two bytes long. So both ŷŷŷŷ... and aaaaa....aŷ are actually 70 utf-16 characters - we already take account of the double code point width when determining billable fragments. This means in a small amount of cases (where someone has a message that is almost at a message boundary in size, which contains a lot of welsh characters), we've over-billed that service. But this is probably an incredibly low amount of messages, and impossible to account for after 1 wk, so probably not worth worrying about. --- notifications_utils/template.py | 11 +++++++++-- notifications_utils/version.py | 2 +- tests/test_base_template.py | 4 +++- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/notifications_utils/template.py b/notifications_utils/template.py index b9fe03f3..9f65fc6b 100644 --- a/notifications_utils/template.py +++ b/notifications_utils/template.py @@ -166,12 +166,19 @@ def prefix(self, value): @property def content_count(self): - return len(( + """ + Return the number of characters in the message. Note that we don't distinguish between GSM and non-GSM + characters at this point, as `get_sms_fragment_count` handles that separately. + + Also note that if values aren't provided, will calculate the raw length of the unsubstituted placeholders, + as in the message `foo ((placeholder))` has a length of 19. + """ + return len( # we always want to call SMSMessageTemplate.__str__ regardless of subclass, to avoid any html formatting SMSMessageTemplate.__str__(self) if self._values else sms_encode(add_prefix(self.content.strip(), self.prefix)) - ).encode(self.encoding)) + ) @property def fragment_count(self): diff --git a/notifications_utils/version.py b/notifications_utils/version.py index 2b082b63..4e315cf0 100644 --- a/notifications_utils/version.py +++ b/notifications_utils/version.py @@ -1 +1 @@ -__version__ = '36.1.1' +__version__ = '36.1.2' diff --git a/tests/test_base_template.py b/tests/test_base_template.py index 29bb988c..9509d352 100644 --- a/tests/test_base_template.py +++ b/tests/test_base_template.py @@ -119,8 +119,10 @@ def test_extracting_placeholders(template_content, template_subject, expected): "content,prefix, expected_length, expected_replaced_length", [ ("The quick brown fox jumped over the lazy dog", None, 44, 44), - # should be replaced with a ? + # is an unsupported unicode character so should be replaced with a ? ("深", None, 1, 1), + # is a supported unicode character so should be kept as is + ("Ŵ", None, 1, 1), ("'First line.\n", None, 12, 12), ("\t\n\r", None, 0, 0), ("((placeholder))", None, 15, 3),