Skip to content

Commit

Permalink
Fixed dual Python 2 / Python 3 compatibility
Browse files Browse the repository at this point in the history
  • Loading branch information
firecat53 committed Jun 19, 2014
1 parent 67e5038 commit df45287
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 18 deletions.
41 changes: 29 additions & 12 deletions bin/urlscan
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@
# the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.


from __future__ import unicode_literals
import os
import sys
import re
import optparse

from urlscan import urlchoose, urlscan
try:
from email.Parser import Parser as parser
except ImportError:
Expand All @@ -41,33 +41,50 @@ optparser.add_option('-b', '--background', action='store_true',

options, args = optparser.parse_args()

#homedir = os.path.dirname(sys.argv[0])
#moduledir = os.path.join(homedir, 'modules')
#if os.path.isdir(moduledir):
# sys.path = [moduledir] + sys.path

from urlscan import urlchoose
from urlscan import urlscan

# Written as a generator so I can easily choose only
# one subpart in the future (e.g., for
# multipart/alternative). Actually, I might even add
# a browser for the message structure?
def msgurls(msg, urlidx = 1):
def msgurls(msg, urlidx=1):
enc = urlscan.get_charset(msg)
if msg.is_multipart():
for part in msg.get_payload():
for chunk in msgurls(part, urlidx):
urlidx += 1
yield chunk
elif msg.get_content_type() == 'text/plain':
for chunk in urlscan.extracturls(msg.get_payload(decode = True)):
msg = decode_bytes(msg.get_payload(decode=True), enc)
for chunk in urlscan.extracturls(msg):
urlidx += 1
yield chunk
elif msg.get_content_type() == 'text/html':
for chunk in urlscan.extracthtmlurls(msg.get_payload(decode = True)):
msg = decode_bytes(msg.get_payload(decode=True), enc)
for chunk in urlscan.extracthtmlurls(msg):
urlidx += 1
yield chunk

def decode_bytes(b, enc='utf-8'):
    """Given a string or bytes input, return a string.

    Tries ``enc`` first; on failure falls back to 'latin-1', which maps
    every byte to a character and so cannot raise UnicodeDecodeError
    (the final except branch is kept as a belt-and-braces guard).

    Args:
        b: bytes, or an already-decoded string (returned unchanged).
        enc: encoding to use for decoding the byte string.

    Returns:
        The decoded string, or an explanatory error message if decoding
        fails entirely.
    """
    try:
        s = b.decode(enc)
    except AttributeError:
        # b has no .decode, so it is already a string; return as-is.
        return b
    except (UnicodeDecodeError, LookupError):
        # UnicodeDecodeError: the bytes are not valid for `enc`.
        # LookupError: the message declared a charset Python does not
        # recognize (e.g. a typo in the Content-Type header) -- without
        # this the program would crash on such messages.
        try:
            s = b.decode('latin-1')
        except UnicodeDecodeError as e:
            s = "Unable to decode message:\n{}\n{}".format(str(b), e)
    return s

def main(msg):
global options

Expand Down
8 changes: 4 additions & 4 deletions urlscan/urlchoose.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,7 @@ def __init__(self, extractedurls, compact_mode=False, background=True):
if first:
first = False
elif not self.compact_mode:
self.items.append(urwid.Divider(div_char='-',
top=1,
bottom=1))

self.items.append(urwid.Divider(div_char='-', top=1, bottom=1))
groupurls = []
markup = []
if self.compact_mode:
Expand Down Expand Up @@ -100,6 +97,9 @@ def __init__(self, extractedurls, compact_mode=False, background=True):
mkbrowseto(url, background),
user_data=url))

if not self.items:
self.items.append(urwid.Text("No URLs found"))
firstbutton = 1
self.listbox = urwid.ListBox(self.items)
self.listbox.set_focus(firstbutton)
if len(self.urls) == 1:
Expand Down
13 changes: 11 additions & 2 deletions urlscan/urlscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,23 @@

"""Contains the backend logic that scans messages for URLs and context."""

from __future__ import unicode_literals
import re
try:
from HTMLParser import HTMLParser
except ImportError:
from html.parser import HTMLParser


def get_charset(message, default="ascii"):
    """Return the charset of *message*, falling back to *default*.

    The Content-Type charset parameter is preferred; if absent, the
    charset set on the Message object itself is used.
    """
    charset = message.get_content_charset() or message.get_charset()
    return charset or default


class Chunk:
'''Represents a chunk of (marked-up) text that
may or may not be linked to a URL.
Expand Down Expand Up @@ -338,8 +348,7 @@ def extract_with_context(lst, pred, before_context, after_context):
def extracturls(s):
"""Given a text message, extract all the URLs found in the message, along
with their surrounding context. The output is a list of sequences of Chunk
objects, corresponding to the contextual regions extracted from the
string.
objects, corresponding to the contextual regions extracted from the string.
"""
lines = nlre.split(s)
Expand Down

0 comments on commit df45287

Please sign in to comment.