diff --git a/bin/urlscan b/bin/urlscan
index 5edb71a..b8e1008 100755
--- a/bin/urlscan
+++ b/bin/urlscan
@@ -20,12 +20,12 @@
 # the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 # Boston, MA 02111-1307, USA.
 
-
+from __future__ import unicode_literals
 import os
 import sys
 import re
 import optparse
-
+from urlscan import urlchoose, urlscan
 try:
     from email.Parser import Parser as parser
 except ImportError:
@@ -41,33 +41,49 @@ optparser.add_option('-b', '--background', action='store_true',
 
 options, args = optparser.parse_args()
 
-#homedir = os.path.dirname(sys.argv[0])
-#moduledir = os.path.join(homedir, 'modules')
-#if os.path.isdir(moduledir):
-#    sys.path = [moduledir] + sys.path
-
-from urlscan import urlchoose
-from urlscan import urlscan
 
 # Written as a generator so I can easily choose only
 # one subpart in the future (e.g., for
 # multipart/alternative). Actually, I might even add
 # a browser for the message structure?
-def msgurls(msg, urlidx = 1):
+def msgurls(msg, urlidx=1):
+    enc = urlscan.get_charset(msg)
     if msg.is_multipart():
         for part in msg.get_payload():
             for chunk in msgurls(part, urlidx):
                 urlidx += 1
                 yield chunk
     elif msg.get_content_type() == 'text/plain':
-        for chunk in urlscan.extracturls(msg.get_payload(decode = True)):
+        text = decode_bytes(msg.get_payload(decode=True), enc)
+        for chunk in urlscan.extracturls(text):
             urlidx += 1
             yield chunk
     elif msg.get_content_type() == 'text/html':
-        for chunk in urlscan.extracthtmlurls(msg.get_payload(decode = True)):
+        text = decode_bytes(msg.get_payload(decode=True), enc)
+        for chunk in urlscan.extracthtmlurls(text):
             urlidx += 1
             yield chunk
 
+def decode_bytes(b, enc='utf-8'):
+    """Given a string or bytes input, return a string.
+
+    If the given encoding is unknown or does not fit the data, fall back to
+    'latin-1', which maps every byte value and therefore cannot fail.
+
+    Args: b - bytes or string
+          enc - encoding to use for decoding the byte string.
+
+    """
+    if not isinstance(b, bytes):
+        # If b is already a string, just return it
+        return b
+    try:
+        return b.decode(str(enc))
+    except (LookupError, ValueError):
+        # Unknown charset name (LookupError) or bytes that don't fit it
+        # (UnicodeDecodeError is a ValueError): latin-1 accepts any bytes.
+        return b.decode('latin-1')
+
 def main(msg):
     global options
 
diff --git a/urlscan/urlchoose.py b/urlscan/urlchoose.py
index 5b70d19..5df0c1a 100644
--- a/urlscan/urlchoose.py
+++ b/urlscan/urlchoose.py
@@ -39,10 +39,7 @@ def __init__(self, extractedurls, compact_mode=False, background=True):
             if first:
                 first = False
             elif not self.compact_mode:
-                self.items.append(urwid.Divider(div_char='-',
-                                                top=1,
-                                                bottom=1))
-
+                self.items.append(urwid.Divider(div_char='-', top=1, bottom=1))
             groupurls = []
             markup = []
             if self.compact_mode:
@@ -100,6 +97,9 @@ def __init__(self, extractedurls, compact_mode=False, background=True):
                                                  mkbrowseto(url, background),
                                                  user_data=url))
 
+        if not self.items:
+            self.items.append(urwid.Text("No URLs found"))
+            firstbutton = 1
         self.listbox = urwid.ListBox(self.items)
         self.listbox.set_focus(firstbutton)
         if len(self.urls) == 1:
diff --git a/urlscan/urlscan.py b/urlscan/urlscan.py
index bc5ce03..66e7d28 100644
--- a/urlscan/urlscan.py
+++ b/urlscan/urlscan.py
@@ -17,6 +17,7 @@
 
 """Contains the backend logic that scans messages for URLs and context."""
 
+from __future__ import unicode_literals
 import re
 try:
     from HTMLParser import HTMLParser
@@ -24,6 +25,16 @@
     from html.parser import HTMLParser
 
 
+def get_charset(message, default="ascii"):
+    """Get the message charset as a string, or `default` if none is set."""
+    charset = message.get_content_charset()
+    if not charset:
+        # get_charset() returns an email.charset.Charset object (or None);
+        # str() turns it into the plain codec name that bytes.decode() needs.
+        charset = message.get_charset()
+    return str(charset) if charset else default
+
+
 class Chunk:
     '''Represents a chunk of (marked-up) text that may or may not be linked
     to a URL.
@@ -338,8 +349,7 @@ def extract_with_context(lst, pred, before_context, after_context):
 def extracturls(s):
     """Given a text message, extract all the URLs found in the message, along
     with their surrounding context. The output is a list of sequences of Chunk
-    objects, corresponding to the contextual regions extracted from the
-    string.
+    objects, corresponding to the contextual regions extracted from the string.
 
     """
     lines = nlre.split(s)