Skip to content

Commit

Permalink
Fixed dual Python 2 / Python 3 compatibility
Browse files Browse the repository at this point in the history
  • Loading branch information
firecat53 committed Jun 19, 2014
1 parent 67e5038 commit df45287
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 18 deletions.
41 changes: 29 additions & 12 deletions bin/urlscan
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@
# the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.


from __future__ import unicode_literals
import os
import sys
import re
import optparse

from urlscan import urlchoose, urlscan
try:
from email.Parser import Parser as parser
except ImportError:
Expand All @@ -41,33 +41,50 @@ optparser.add_option('-b', '--background', action='store_true',

options, args = optparser.parse_args()

#homedir = os.path.dirname(sys.argv[0])
#moduledir = os.path.join(homedir, 'modules')
#if os.path.isdir(moduledir):
# sys.path = [moduledir] + sys.path

from urlscan import urlchoose
from urlscan import urlscan

# Written as a generator so I can easily choose only
# one subpart in the future (e.g., for
# multipart/alternative). Actually, I might even add
# a browser for the message structure?
def msgurls(msg, urlidx = 1):
def msgurls(msg, urlidx=1):
enc = urlscan.get_charset(msg)
if msg.is_multipart():
for part in msg.get_payload():
for chunk in msgurls(part, urlidx):
urlidx += 1
yield chunk
elif msg.get_content_type() == 'text/plain':
for chunk in urlscan.extracturls(msg.get_payload(decode = True)):
msg = decode_bytes(msg.get_payload(decode=True), enc)
for chunk in urlscan.extracturls(msg):
urlidx += 1
yield chunk
elif msg.get_content_type() == 'text/html':
for chunk in urlscan.extracthtmlurls(msg.get_payload(decode = True)):
msg = decode_bytes(msg.get_payload(decode=True), enc)
for chunk in urlscan.extracthtmlurls(msg):
urlidx += 1
yield chunk

def decode_bytes(b, enc='utf-8'):
    """Given a string or bytes input, return a string.

    Tries ``enc`` first; on failure falls back to 'latin-1', which maps
    every byte to a character and so cannot raise UnicodeDecodeError
    (the final except branch is kept as a belt-and-braces guard).

    Args:
        b: bytes, or an already-decoded string (returned unchanged).
        enc: encoding to use for decoding the byte string.

    Returns:
        The decoded string, or an explanatory error message if decoding
        fails entirely.
    """
    try:
        s = b.decode(enc)
    except AttributeError:
        # b has no .decode, so it is already a string; return as-is.
        return b
    except (UnicodeDecodeError, LookupError):
        # UnicodeDecodeError: the bytes are not valid for `enc`.
        # LookupError: the message declared a charset Python does not
        # recognize (e.g. a typo in the Content-Type header) -- without
        # this the program would crash on such messages.
        try:
            s = b.decode('latin-1')
        except UnicodeDecodeError as e:
            s = "Unable to decode message:\n{}\n{}".format(str(b), e)
    return s

def main(msg):
global options

Expand Down
8 changes: 4 additions & 4 deletions urlscan/urlchoose.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,7 @@ def __init__(self, extractedurls, compact_mode=False, background=True):
if first:
first = False
elif not self.compact_mode:
self.items.append(urwid.Divider(div_char='-',
top=1,
bottom=1))

self.items.append(urwid.Divider(div_char='-', top=1, bottom=1))
groupurls = []
markup = []
if self.compact_mode:
Expand Down Expand Up @@ -100,6 +97,9 @@ def __init__(self, extractedurls, compact_mode=False, background=True):
mkbrowseto(url, background),
user_data=url))

if not self.items:
self.items.append(urwid.Text("No URLs found"))
firstbutton = 1
self.listbox = urwid.ListBox(self.items)
self.listbox.set_focus(firstbutton)
if len(self.urls) == 1:
Expand Down
13 changes: 11 additions & 2 deletions urlscan/urlscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,23 @@

"""Contains the backend logic that scans messages for URLs and context."""

from __future__ import unicode_literals
import re
try:
from HTMLParser import HTMLParser
except ImportError:
from html.parser import HTMLParser


def get_charset(message, default="ascii"):
    """Return the charset of *message*, falling back to *default*.

    The Content-Type charset parameter is preferred; if absent, the
    charset set on the Message object itself is used.
    """
    charset = message.get_content_charset() or message.get_charset()
    return charset or default


class Chunk:
'''Represents a chunk of (marked-up) text that
may or may not be linked to a URL.
Expand Down Expand Up @@ -338,8 +348,7 @@ def extract_with_context(lst, pred, before_context, after_context):
def extracturls(s):
"""Given a text message, extract all the URLs found in the message, along
with their surrounding context. The output is a list of sequences of Chunk
objects, corresponding to the contextual regions extracted from the
string.
objects, corresponding to the contextual regions extracted from the string.
"""
lines = nlre.split(s)
Expand Down

0 comments on commit df45287

Please sign in to comment.