From aae1c82806fa6f5067d1dea4578648a0b5fd7567 Mon Sep 17 00:00:00 2001
From: Leo Hemsted <leo.hemsted@digital.cabinet-office.gov.uk>
Date: Fri, 15 Nov 2019 16:28:06 +0000
Subject: [PATCH] don't encode sms when calculating content length
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

we were previously encoding the utf-8 content, and measuring the number
of bytes. We've been doing this since the beginning of time, and for
most of time, it wasn't an issue. However, when we started accepting
unicode characters (notably some welsh-only accents eg Ŵ or ŷ), they are
encoded as two bytes rather than 1. So a message of 70 ŷ characters is
140 bytes long, but a message of 69 a characters and one ŷ is 71
characters long.

However, in reality, when we send a message, if _any_ character in it is
non-gsm (similar but not the same as ascii), then the entire thing is
encoded in UTF-16, where every single character (including basic latin
text) is two bytes long. So both ŷŷŷŷ... and aaaaa....aŷ are actually 70
utf-16 characters - we already take account of the double code point
width when determining billable fragments.

This means in a small amount of cases (where someone has a message that
is almost at a message boundary in size, which contains a lot of welsh
characters), we've over-billed that service. But this is probably an
incredibly low amount of messages, and impossible to account for after 1
wk, so probably not worth worrying about.
---
 notifications_utils/template.py | 11 +++++++++--
 notifications_utils/version.py  |  2 +-
 tests/test_base_template.py     |  4 +++-
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/notifications_utils/template.py b/notifications_utils/template.py
index b9fe03f3..9f65fc6b 100644
--- a/notifications_utils/template.py
+++ b/notifications_utils/template.py
@@ -166,12 +166,19 @@ def prefix(self, value):
 
     @property
     def content_count(self):
-        return len((
+        """
+        Return the number of characters in the message. Note that we don't distinguish between GSM and non-GSM
+        characters at this point, as `get_sms_fragment_count` handles that separately.
+
+        Also note that if values aren't provided, will calculate the raw length of the unsubstituted placeholders,
+        as in the message `foo ((placeholder))` has a length of 19.
+        """
+        return len(
             # we always want to call SMSMessageTemplate.__str__ regardless of subclass, to avoid any html formatting
             SMSMessageTemplate.__str__(self)
             if self._values
             else sms_encode(add_prefix(self.content.strip(), self.prefix))
-        ).encode(self.encoding))
+        )
 
     @property
     def fragment_count(self):
diff --git a/notifications_utils/version.py b/notifications_utils/version.py
index 2b082b63..4e315cf0 100644
--- a/notifications_utils/version.py
+++ b/notifications_utils/version.py
@@ -1 +1 @@
-__version__ = '36.1.1'
+__version__ = '36.1.2'
diff --git a/tests/test_base_template.py b/tests/test_base_template.py
index 29bb988c..9509d352 100644
--- a/tests/test_base_template.py
+++ b/tests/test_base_template.py
@@ -119,8 +119,10 @@ def test_extracting_placeholders(template_content, template_subject, expected):
     "content,prefix, expected_length, expected_replaced_length",
     [
         ("The quick brown fox jumped over the lazy dog", None, 44, 44),
-        # should be replaced with a ?
+        # is an unsupported unicode character so should be replaced with a ?
         ("深", None, 1, 1),
+        # is a supported unicode character so should be kept as is
+        ("Ŵ", None, 1, 1),
         ("'First line.\n", None, 12, 12),
         ("\t\n\r", None, 0, 0),
         ("((placeholder))", None, 15, 3),