From c92bdb2682a9e1452b2abb22278c42801d5a2efc Mon Sep 17 00:00:00 2001 From: Fran Boon Date: Sat, 21 Dec 2019 12:21:50 +0000 Subject: [PATCH] S3Msg: Py 3.x fix (inc upgrade of Feedparser to 6.0.0b1 for 3.x) --- VERSION | 2 +- modules/feedparser/__init__.py | 50 + modules/feedparser/api.py | 296 ++++++ modules/feedparser/datetimes/__init__.py | 72 ++ modules/feedparser/datetimes/asctime.py | 74 ++ modules/feedparser/datetimes/greek.py | 89 ++ modules/feedparser/datetimes/hungarian.py | 75 ++ modules/feedparser/datetimes/iso8601.py | 161 ++++ modules/feedparser/datetimes/korean.py | 86 ++ modules/feedparser/datetimes/perforce.py | 53 ++ modules/feedparser/datetimes/rfc822.py | 149 +++ modules/feedparser/datetimes/w3dtf.py | 117 +++ modules/feedparser/encodings.py | 292 ++++++ modules/feedparser/exceptions.py | 58 ++ modules/feedparser/html.py | 364 ++++++++ modules/feedparser/http.py | 256 ++++++ modules/feedparser/mixin.py | 813 +++++++++++++++++ modules/feedparser/namespaces/__init__.py | 0 modules/feedparser/namespaces/_base.py | 506 +++++++++++ modules/feedparser/namespaces/admin.py | 56 ++ modules/feedparser/namespaces/cc.py | 72 ++ modules/feedparser/namespaces/dc.py | 137 +++ modules/feedparser/namespaces/georss.py | 276 ++++++ modules/feedparser/namespaces/itunes.py | 112 +++ modules/feedparser/namespaces/mediarss.py | 144 +++ modules/feedparser/namespaces/psc.py | 77 ++ modules/feedparser/parsers/__init__.py | 0 modules/feedparser/parsers/loose.py | 81 ++ modules/feedparser/parsers/strict.py | 137 +++ modules/feedparser/sanitizer.py | 955 ++++++++++++++++++++ modules/feedparser/sgml.py | 136 +++ modules/feedparser/urls.py | 162 ++++ modules/feedparser/util.py | 166 ++++ modules/{feedparser.py => feedparser521.py} | 0 modules/s3/s3msg.py | 29 +- modules/templates/SAMBRO/Demo/tasks.cfg | 2 +- optional_requirements.txt | 2 + 37 files changed, 6044 insertions(+), 13 deletions(-) create mode 100644 modules/feedparser/__init__.py create mode 100644 modules/feedparser/api.py create mode 100644 modules/feedparser/datetimes/__init__.py create mode 100644 modules/feedparser/datetimes/asctime.py create mode 100644 modules/feedparser/datetimes/greek.py create mode 100644 modules/feedparser/datetimes/hungarian.py create mode 100644 modules/feedparser/datetimes/iso8601.py create mode 100644 modules/feedparser/datetimes/korean.py create mode 100644 modules/feedparser/datetimes/perforce.py create mode 100644 modules/feedparser/datetimes/rfc822.py create mode 100644 modules/feedparser/datetimes/w3dtf.py create mode 100644 modules/feedparser/encodings.py create mode 100644 modules/feedparser/exceptions.py create mode 100644 modules/feedparser/html.py create mode 100644 modules/feedparser/http.py create mode 100644 modules/feedparser/mixin.py create mode 100644 modules/feedparser/namespaces/__init__.py create mode 100644 modules/feedparser/namespaces/_base.py create mode 100644 modules/feedparser/namespaces/admin.py create mode 100644 modules/feedparser/namespaces/cc.py create mode 100644 modules/feedparser/namespaces/dc.py create mode 100644 modules/feedparser/namespaces/georss.py create mode 100644 modules/feedparser/namespaces/itunes.py create mode 100644 modules/feedparser/namespaces/mediarss.py create mode 100644 modules/feedparser/namespaces/psc.py create mode 100644 modules/feedparser/parsers/__init__.py create mode 100644 modules/feedparser/parsers/loose.py create mode 100644 modules/feedparser/parsers/strict.py create mode 100644 modules/feedparser/sanitizer.py create mode 100644 
modules/feedparser/sgml.py create mode 100644 modules/feedparser/urls.py create mode 100644 modules/feedparser/util.py rename modules/{feedparser.py => feedparser521.py} (100%) diff --git a/VERSION b/VERSION index 319be64e4..c5659c29c 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -nursix-dev-3404-gd982b86 (2019-12-19 15:45:35) +b'8d7bba62b' (2019-12-21 12:21:50) \ No newline at end of file diff --git a/modules/feedparser/__init__.py b/modules/feedparser/__init__.py new file mode 100644 index 000000000..25e97fd04 --- /dev/null +++ b/modules/feedparser/__init__.py @@ -0,0 +1,50 @@ +# Copyright 2010-2019 Kurt McKee +# Copyright 2002-2008 Mark Pilgrim +# All rights reserved. +# +# This file is part of feedparser. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from __future__ import absolute_import +from __future__ import unicode_literals + +from .api import parse +from .datetimes import registerDateHandler +from .exceptions import * + +__author__ = 'Kurt McKee ' +__license__ = 'BSD 2-clause' +__version__ = '5.2.1' + +# HTTP "User-Agent" header to send to servers when downloading feeds. +# If you are embedding feedparser in a larger application, you should +# change this to your application name and URL. +USER_AGENT = "feedparser/%s +https://github.com/kurtmckee/feedparser/" % __version__ + +# If you want feedparser to automatically resolve all relative URIs, set this +# to 1. +RESOLVE_RELATIVE_URIS = 1 + +# If you want feedparser to automatically sanitize all potentially unsafe +# HTML content, set this to 1. +SANITIZE_HTML = 1 diff --git a/modules/feedparser/api.py b/modules/feedparser/api.py new file mode 100644 index 000000000..7e552f279 --- /dev/null +++ b/modules/feedparser/api.py @@ -0,0 +1,296 @@ +# The public API for feedparser +# Copyright 2010-2019 Kurt McKee +# Copyright 2002-2008 Mark Pilgrim +# All rights reserved. +# +# This file is a part of feedparser. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from __future__ import absolute_import +from __future__ import unicode_literals + +import xml.sax + +try: + from io import BytesIO as _StringIO +except ImportError: + # Python 2.7 + try: + from cStringIO import StringIO as _StringIO + except ImportError: + from StringIO import StringIO as _StringIO + +try: + import urllib.parse +except ImportError: + from urlparse import urlparse + + class urllib(object): + class parse(object): + urlparse = staticmethod(urlparse) + +from .datetimes import registerDateHandler, _parse_date +from .encodings import convert_to_utf8 +from .exceptions import * +from .html import _BaseHTMLProcessor +from . import http +from . import mixin +from .mixin import _FeedParserMixin +from .parsers.loose import _LooseFeedParser +from .parsers.strict import _StrictFeedParser +from .sanitizer import replace_doctype +from .sgml import * +from .urls import convert_to_idn, make_safe_absolute_uri +from .util import FeedParserDict + +bytes_ = type(b'') +unicode_ = type('') +try: + unichr + basestring +except NameError: + unichr = chr + basestring = str + +# List of preferred XML parsers, by SAX driver name. These will be tried first, +# but if they're not installed, Python will keep searching through its own list +# of pre-installed parsers until it finds one that supports everything we need. +PREFERRED_XML_PARSERS = ["drv_libxml2"] + +_XML_AVAILABLE = True + +SUPPORTED_VERSIONS = { + '': 'unknown', + 'rss090': 'RSS 0.90', + 'rss091n': 'RSS 0.91 (Netscape)', + 'rss091u': 'RSS 0.91 (Userland)', + 'rss092': 'RSS 0.92', + 'rss093': 'RSS 0.93', + 'rss094': 'RSS 0.94', + 'rss20': 'RSS 2.0', + 'rss10': 'RSS 1.0', + 'rss': 'RSS (unknown version)', + 'atom01': 'Atom 0.1', + 'atom02': 'Atom 0.2', + 'atom03': 'Atom 0.3', + 'atom10': 'Atom 1.0', + 'atom': 'Atom (unknown version)', + 'cdf': 'CDF', +} + + +def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result): + """URL, filename, or string --> bytes + + This function lets you define parsers that take any input source + (URL, pathname to local or network file, or actual data as a string) + and deal with it in a uniform manner. The resource is read in full + and its content is returned as raw bytes, ready for encoding detection. + + If the etag argument is supplied, it will be used as the value of an + If-None-Match request header.
+ + If the modified argument is supplied, it can be a tuple of 9 integers + (as returned by gmtime() in the standard Python time module) or a date + string in any format supported by feedparser. Regardless, it MUST + be in GMT (Greenwich Mean Time). It will be reformatted into an + RFC 1123-compliant date and used as the value of an If-Modified-Since + request header. + + If the agent argument is supplied, it will be used as the value of a + User-Agent request header. + + If the referrer argument is supplied, it will be used as the value of a + Referer[sic] request header. + + If handlers is supplied, it is a list of handlers used to build a + urllib2 opener. + + If request_headers is supplied, it is a dictionary of HTTP request headers + that will override the values generated by FeedParser. + + :return: The bytes of the document. + """ + + if hasattr(url_file_stream_or_string, 'read'): + return url_file_stream_or_string.read() + + if isinstance(url_file_stream_or_string, basestring) \ + and urllib.parse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'): + return http.get(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result) + + # try to open with native open function (if url_file_stream_or_string is a filename) + try: + with open(url_file_stream_or_string, 'rb') as f: + data = f.read() + except (IOError, UnicodeEncodeError, TypeError, ValueError): + # if url_file_stream_or_string is a unicode object that + # cannot be converted to the encoding returned by + # sys.getfilesystemencoding(), a UnicodeEncodeError + # will be thrown + # If url_file_stream_or_string is a string that contains NULL + # (such as an XML document encoded in UTF-32), TypeError will + # be thrown. + pass + else: + return data + + # treat url_file_stream_or_string as string + if not isinstance(url_file_stream_or_string, bytes_): + return url_file_stream_or_string.encode('utf-8') + return url_file_stream_or_string + + +LooseFeedParser = type( + str('LooseFeedParser'), # `str()` call required for Python 2.7 + (_LooseFeedParser, _FeedParserMixin, _BaseHTMLProcessor, object), + {}, +) + +StrictFeedParser = type( + str('StrictFeedParser'), # `str()` call required for Python 2.7 + (_StrictFeedParser, _FeedParserMixin, xml.sax.handler.ContentHandler, object), + {}, +) + + +def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None, resolve_relative_uris=None, sanitize_html=None): + """Parse a feed from a URL, file, stream, or string. + + :param url_file_stream_or_string: + File-like object, URL, file path, or string. Both byte and text strings + are accepted. If necessary, encoding will be derived from the response + headers or automatically detected. + + Note that strings may trigger network I/O or filesystem access + depending on the value. Wrap an untrusted string in + a :class:`io.StringIO` or :class:`io.BytesIO` to avoid this. Do not + pass untrusted strings to this function. + + When a URL is not passed the feed location to use in relative URL + resolution should be passed in the ``Content-Location`` response header + (see ``response_headers`` below). + + :param str etag: HTTP ``ETag`` request header. + :param modified: HTTP ``Last-Modified`` request header.
+ :type modified: :class:`str`, :class:`time.struct_time` 9-tuple, or + :class:`datetime.datetime` + :param str agent: HTTP ``User-Agent`` request header, which defaults to + the value of :data:`feedparser.USER_AGENT`. + :param referrer: HTTP ``Referer`` [sic] request header. + :param request_headers: + A mapping of HTTP header name to HTTP header value to add to the + request, overriding internally generated values. + :type request_headers: :class:`dict` mapping :class:`str` to :class:`str` + :param response_headers: + A mapping of HTTP header name to HTTP header value. Multiple values may + be joined with a comma. If an HTTP request was made, these headers + override any matching headers in the response. Otherwise this specifies + the entirety of the response headers. + :type response_headers: :class:`dict` mapping :class:`str` to :class:`str` + + :param bool resolve_relative_uris: + Should feedparser attempt to resolve relative URIs to absolute ones within + HTML content? Defaults to the value of + :data:`feedparser.RESOLVE_RELATIVE_URIS`, which is ``True``. + :param bool sanitize_html: + Should feedparser sanitize potentially unsafe HTML content? Only disable + this if you know what you are doing! Defaults to the value of + :data:`feedparser.SANITIZE_HTML`, which is ``True``. + + :return: A :class:`FeedParserDict`. + """ + + if not agent or sanitize_html is None or resolve_relative_uris is None: + import feedparser + if not agent: + agent = feedparser.USER_AGENT + if sanitize_html is None: + sanitize_html = feedparser.SANITIZE_HTML + if resolve_relative_uris is None: + resolve_relative_uris = feedparser.RESOLVE_RELATIVE_URIS + + result = FeedParserDict( + bozo=False, + entries=[], + feed=FeedParserDict(), + headers={}, + ) + + data = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result) + + if not data: + return result + + # overwrite existing headers using response_headers + result['headers'].update(response_headers or {}) + + data = convert_to_utf8(result['headers'], data, result) + use_strict_parser = result['encoding'] and True or False + + result['version'], data, entities = replace_doctype(data) + + # Ensure that baseuri is an absolute URI using an acceptable URI scheme.
+ contentloc = result['headers'].get('content-location', '') + href = result.get('href', '') + baseuri = make_safe_absolute_uri(href, contentloc) or make_safe_absolute_uri(contentloc) or href + + baselang = result['headers'].get('content-language', None) + if isinstance(baselang, bytes_) and baselang is not None: + baselang = baselang.decode('utf-8', 'ignore') + + if not _XML_AVAILABLE: + use_strict_parser = 0 + if use_strict_parser: + # initialize the SAX parser + feedparser = StrictFeedParser(baseuri, baselang, 'utf-8') + feedparser.resolve_relative_uris = resolve_relative_uris + feedparser.sanitize_html = sanitize_html + saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS) + saxparser.setFeature(xml.sax.handler.feature_namespaces, 1) + try: + # disable downloading external doctype references, if possible + saxparser.setFeature(xml.sax.handler.feature_external_ges, 0) + except xml.sax.SAXNotSupportedException: + pass + saxparser.setContentHandler(feedparser) + saxparser.setErrorHandler(feedparser) + source = xml.sax.xmlreader.InputSource() + source.setByteStream(_StringIO(data)) + try: + saxparser.parse(source) + except xml.sax.SAXException as e: + result['bozo'] = 1 + result['bozo_exception'] = feedparser.exc or e + use_strict_parser = 0 + if not use_strict_parser and _SGML_AVAILABLE: + feedparser = LooseFeedParser(baseuri, baselang, 'utf-8', entities) + feedparser.resolve_relative_uris = resolve_relative_uris + feedparser.sanitize_html = sanitize_html + feedparser.feed(data.decode('utf-8', 'replace')) + result['feed'] = feedparser.feeddata + result['entries'] = feedparser.entries + result['version'] = result['version'] or feedparser.version + result['namespaces'] = feedparser.namespaces_in_use + return result diff --git a/modules/feedparser/datetimes/__init__.py b/modules/feedparser/datetimes/__init__.py new file mode 100644 index 000000000..fc1b27d89 --- /dev/null +++ b/modules/feedparser/datetimes/__init__.py @@ -0,0 +1,72 @@ +# Copyright 2010-2019 Kurt McKee +# Copyright 2002-2008 Mark Pilgrim +# All rights reserved. +# +# This file is a part of feedparser. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
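The parse() entry point above is the whole public surface of the vendored package. A minimal usage sketch (the feed URL is hypothetical, and the package is assumed to be importable as feedparser):

    import feedparser

    d = feedparser.parse('http://example.org/feed.xml')  # hypothetical URL
    print(d.feed.get('title'))               # feed-level metadata
    for entry in d.entries:                  # normalized per-item data
        print(entry.get('title'), entry.get('published'))
    if d.bozo:                               # set when the feed was ill-formed
        print('warning:', d.bozo_exception)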
+ +from __future__ import absolute_import + +from .asctime import _parse_date_asctime +from .greek import _parse_date_greek +from .hungarian import _parse_date_hungarian +from .iso8601 import _parse_date_iso8601 +from .korean import _parse_date_onblog, _parse_date_nate +from .perforce import _parse_date_perforce +from .rfc822 import _parse_date_rfc822 +from .w3dtf import _parse_date_w3dtf + +_date_handlers = [] + + +def registerDateHandler(func): + """Register a date handler function (takes string, returns 9-tuple date in GMT)""" + _date_handlers.insert(0, func) + + +def _parse_date(date_string): + """Parses a variety of date formats into a 9-tuple in GMT""" + if not date_string: + return None + for handler in _date_handlers: + try: + date9tuple = handler(date_string) + except (KeyError, OverflowError, ValueError, AttributeError): + continue + if not date9tuple: + continue + if len(date9tuple) != 9: + continue + return date9tuple + return None + + +registerDateHandler(_parse_date_onblog) +registerDateHandler(_parse_date_nate) +registerDateHandler(_parse_date_greek) +registerDateHandler(_parse_date_hungarian) +registerDateHandler(_parse_date_perforce) +registerDateHandler(_parse_date_asctime) +registerDateHandler(_parse_date_iso8601) +registerDateHandler(_parse_date_rfc822) +registerDateHandler(_parse_date_w3dtf) diff --git a/modules/feedparser/datetimes/asctime.py b/modules/feedparser/datetimes/asctime.py new file mode 100644 index 000000000..b8ac3ab17 --- /dev/null +++ b/modules/feedparser/datetimes/asctime.py @@ -0,0 +1,74 @@ +# Copyright 2010-2019 Kurt McKee +# Copyright 2002-2008 Mark Pilgrim +# All rights reserved. +# +# This file is a part of feedparser. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from __future__ import absolute_import +from __future__ import unicode_literals + +from .rfc822 import _parse_date_rfc822 + +_months = [ + 'jan', + 'feb', + 'mar', + 'apr', + 'may', + 'jun', + 'jul', + 'aug', + 'sep', + 'oct', + 'nov', + 'dec', +] + + +def _parse_date_asctime(dt): + """Parse asctime-style dates. + + Converts asctime to RFC822-compatible dates and uses the RFC822 parser + to do the actual parsing. 
+ + Supported formats (format is standardized to the first one listed): + + * {weekday name} {month name} dd hh:mm:ss {+-tz} yyyy + * {weekday name} {month name} dd hh:mm:ss yyyy + """ + + parts = dt.split() + + # Insert a GMT timezone, if needed. + if len(parts) == 5: + parts.insert(4, '+0000') + + # Exit if there are not six parts. + if len(parts) != 6: + return None + + # Reassemble the parts in an RFC822-compatible order and parse them. + return _parse_date_rfc822(' '.join([ + parts[0], parts[2], parts[1], parts[5], parts[3], parts[4], + ])) diff --git a/modules/feedparser/datetimes/greek.py b/modules/feedparser/datetimes/greek.py new file mode 100644 index 000000000..77894c9ae --- /dev/null +++ b/modules/feedparser/datetimes/greek.py @@ -0,0 +1,89 @@ +# Copyright 2010-2019 Kurt McKee +# Copyright 2002-2008 Mark Pilgrim +# All rights reserved. +# +# This file is a part of feedparser. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
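A worked example of the normalization above (module path follows the vendored layout; the timestamp is made up):

    from feedparser.datetimes.asctime import _parse_date_asctime

    # 'Sun Jan 4 16:29:06 PST 2004' already has six parts, so no '+0000' is
    # inserted; it is reordered to 'Sun 4 Jan 2004 16:29:06 PST' and handed to
    # the RFC 822 parser, which returns a UTC 9-tuple.
    tm = _parse_date_asctime('Sun Jan 4 16:29:06 PST 2004')
    assert tm[:6] == (2004, 1, 5, 0, 29, 6)  # 16:29 PST is 00:29 UTC, next day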
+ +from __future__ import absolute_import +from __future__ import unicode_literals + +import re + +from .rfc822 import _parse_date_rfc822 + +# Unicode strings for Greek date strings +_greek_months = { + '\u0399\u03b1\u03bd': 'Jan', # c9e1ed in iso-8859-7 + '\u03a6\u03b5\u03b2': 'Feb', # d6e5e2 in iso-8859-7 + '\u039c\u03ac\u03ce': 'Mar', # ccdcfe in iso-8859-7 + '\u039c\u03b1\u03ce': 'Mar', # cce1fe in iso-8859-7 + '\u0391\u03c0\u03c1': 'Apr', # c1f0f1 in iso-8859-7 + '\u039c\u03ac\u03b9': 'May', # ccdce9 in iso-8859-7 + '\u039c\u03b1\u03ca': 'May', # cce1fa in iso-8859-7 + '\u039c\u03b1\u03b9': 'May', # cce1e9 in iso-8859-7 + '\u0399\u03bf\u03cd\u03bd': 'Jun', # c9effded in iso-8859-7 + '\u0399\u03bf\u03bd': 'Jun', # c9efed in iso-8859-7 + '\u0399\u03bf\u03cd\u03bb': 'Jul', # c9effdeb in iso-8859-7 + '\u0399\u03bf\u03bb': 'Jul', # c9f9eb in iso-8859-7 + '\u0391\u03cd\u03b3': 'Aug', # c1fde3 in iso-8859-7 + '\u0391\u03c5\u03b3': 'Aug', # c1f5e3 in iso-8859-7 + '\u03a3\u03b5\u03c0': 'Sep', # d3e5f0 in iso-8859-7 + '\u039f\u03ba\u03c4': 'Oct', # cfeaf4 in iso-8859-7 + '\u039d\u03bf\u03ad': 'Nov', # cdefdd in iso-8859-7 + '\u039d\u03bf\u03b5': 'Nov', # cdefe5 in iso-8859-7 + '\u0394\u03b5\u03ba': 'Dec', # c4e5ea in iso-8859-7 +} + +_greek_wdays = { + '\u039a\u03c5\u03c1': 'Sun', # caf5f1 in iso-8859-7 + '\u0394\u03b5\u03c5': 'Mon', # c4e5f5 in iso-8859-7 + '\u03a4\u03c1\u03b9': 'Tue', # d4f1e9 in iso-8859-7 + '\u03a4\u03b5\u03c4': 'Wed', # d4e5f4 in iso-8859-7 + '\u03a0\u03b5\u03bc': 'Thu', # d0e5ec in iso-8859-7 + '\u03a0\u03b1\u03c1': 'Fri', # d0e1f1 in iso-8859-7 + '\u03a3\u03b1\u03b2': 'Sat', # d3e1e2 in iso-8859-7 +} + +_greek_date_format_re = re.compile(r'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)') + + +def _parse_date_greek(date_string): + """Parse a string according to a Greek 8-bit date format.""" + m = _greek_date_format_re.match(date_string) + if not m: + return + wday = _greek_wdays[m.group(1)] + month = _greek_months[m.group(3)] + rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \ + { + 'wday': wday, + 'day': m.group(2), + 'month': month, + 'year': m.group(4), + 'hour': m.group(5), + 'minute': m.group(6), + 'second': m.group(7), + 'zonediff': m.group(8), + } + return _parse_date_rfc822(rfc822date) diff --git a/modules/feedparser/datetimes/hungarian.py b/modules/feedparser/datetimes/hungarian.py new file mode 100644 index 000000000..497cb0bc0 --- /dev/null +++ b/modules/feedparser/datetimes/hungarian.py @@ -0,0 +1,75 @@ +# Copyright 2010-2019 Kurt McKee +# Copyright 2002-2008 Mark Pilgrim +# All rights reserved. +# +# This file is a part of feedparser. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from __future__ import absolute_import +from __future__ import unicode_literals + +import re + +from .w3dtf import _parse_date_w3dtf + +# Unicode strings for Hungarian date strings +_hungarian_months = { + 'janu\u00e1r': '01', # e1 in iso-8859-2 + 'febru\u00e1ri': '02', # e1 in iso-8859-2 + 'm\u00e1rcius': '03', # e1 in iso-8859-2 + '\u00e1prilis': '04', # e1 in iso-8859-2 + 'm\u00e1ujus': '05', # e1 in iso-8859-2 + 'j\u00fanius': '06', # fa in iso-8859-2 + 'j\u00falius': '07', # fa in iso-8859-2 + 'augusztus': '08', + 'szeptember': '09', + 'okt\u00f3ber': '10', # f3 in iso-8859-2 + 'november': '11', + 'december': '12', +} + +_hungarian_date_format_re = re.compile(r'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))') + + +def _parse_date_hungarian(date_string): + """Parse a string according to a Hungarian 8-bit date format.""" + m = _hungarian_date_format_re.match(date_string) + if not m or m.group(2) not in _hungarian_months: + return None + month = _hungarian_months[m.group(2)] + day = m.group(3) + if len(day) == 1: + day = '0' + day + hour = m.group(4) + if len(hour) == 1: + hour = '0' + hour + w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \ + { + 'year': m.group(1), + 'month': month, + 'day': day, + 'hour': hour, + 'minute': m.group(5), + 'zonediff': m.group(6), + } + return _parse_date_w3dtf(w3dtfdate) diff --git a/modules/feedparser/datetimes/iso8601.py b/modules/feedparser/datetimes/iso8601.py new file mode 100644 index 000000000..bdf7652bd --- /dev/null +++ b/modules/feedparser/datetimes/iso8601.py @@ -0,0 +1,161 @@ +# Copyright 2010-2019 Kurt McKee +# Copyright 2002-2008 Mark Pilgrim +# All rights reserved. +# +# This file is a part of feedparser. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from __future__ import absolute_import +from __future__ import unicode_literals + +import re +import time + +# ISO-8601 date parsing routines written by Fazal Majid. +# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601 +# parser is beyond the scope of feedparser and would be a worthwhile addition +# to the Python library. +# A single regular expression cannot parse ISO 8601 date formats into groups +# as the standard is highly irregular (for instance is 030104 2003-01-04 or +# 0301-04-01), so we use templates instead. +# Please note the order in templates is significant because we need a +# greedy match. +_iso8601_tmpl = [ + 'YYYY-?MM-?DD', + 'YYYY-0MM?-?DD', + 'YYYY-MM', + 'YYYY-?OOO', + 'YY-?MM-?DD', + 'YY-?OOO', + 'YYYY', + '-YY-?MM', + '-OOO', + '-YY', + '--MM-?DD', + '--MM', + '---DD', + 'CC', + '', +] + +_iso8601_re = [ + tmpl.replace( + 'YYYY', r'(?P\d{4})').replace( + 'YY', r'(?P\d\d)').replace( + 'MM', r'(?P[01]\d)').replace( + 'DD', r'(?P[0123]\d)').replace( + 'OOO', r'(?P[0123]\d\d)').replace( + 'CC', r'(?P\d\d$)') + + r'(T?(?P\d{2}):(?P\d{2})' + + r'(:(?P\d{2}))?' + + r'(\.(?P\d+))?' + + r'(?P[+-](?P\d{2})(:(?P\d{2}))?|Z)?)?' + for tmpl in _iso8601_tmpl] +try: + del tmpl +except NameError: + pass +_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re] +try: + del regex +except NameError: + pass + + +def _parse_date_iso8601(date_string): + """Parse a variety of ISO-8601-compatible formats like 20040105""" + m = None + for _iso8601_match in _iso8601_matches: + m = _iso8601_match(date_string) + if m: + break + if not m: + return + if m.span() == (0, 0): + return + params = m.groupdict() + ordinal = params.get('ordinal', 0) + if ordinal: + ordinal = int(ordinal) + else: + ordinal = 0 + year = params.get('year', '--') + if not year or year == '--': + year = time.gmtime()[0] + elif len(year) == 2: + # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993 + year = 100 * int(time.gmtime()[0] / 100) + int(year) + else: + year = int(year) + month = params.get('month', '-') + if not month or month == '-': + # ordinals are NOT normalized by mktime, we simulate them + # by setting month=1, day=ordinal + if ordinal: + month = 1 + else: + month = time.gmtime()[1] + month = int(month) + day = params.get('day', 0) + if not day: + # see above + if ordinal: + day = ordinal + elif params.get('century', 0) or \ + params.get('year', 0) or params.get('month', 0): + day = 1 + else: + day = time.gmtime()[2] + else: + day = int(day) + # special case of the century - is the first year of the 21st century + # 2000 or 2001 ? The debate goes on... 
+ if 'century' in params: + year = (int(params['century']) - 1) * 100 + 1 + # in ISO 8601 most fields are optional + for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']: + if not params.get(field, None): + params[field] = 0 + hour = int(params.get('hour', 0)) + minute = int(params.get('minute', 0)) + second = int(float(params.get('second', 0))) + # weekday is normalized by mktime(), we can ignore it + weekday = 0 + daylight_savings_flag = -1 + tm = [year, month, day, hour, minute, second, weekday, + ordinal, daylight_savings_flag] + # ISO 8601 time zone adjustments + tz = params.get('tz') + if tz and tz != 'Z': + if tz[0] == '-': + tm[3] += int(params.get('tzhour', 0)) + tm[4] += int(params.get('tzmin', 0)) + elif tz[0] == '+': + tm[3] -= int(params.get('tzhour', 0)) + tm[4] -= int(params.get('tzmin', 0)) + else: + return None + # Python's time.mktime() is a wrapper around the ANSI C mktime(3c) + # which is guaranteed to normalize d/m/y/h/m/s. + # Many implementations have bugs, but we'll pretend they don't. + return time.localtime(time.mktime(tuple(tm))) diff --git a/modules/feedparser/datetimes/korean.py b/modules/feedparser/datetimes/korean.py new file mode 100644 index 000000000..c0bcb56ff --- /dev/null +++ b/modules/feedparser/datetimes/korean.py @@ -0,0 +1,86 @@ +# Copyright 2010-2019 Kurt McKee +# Copyright 2002-2008 Mark Pilgrim +# All rights reserved. +# +# This file is a part of feedparser. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from __future__ import absolute_import +from __future__ import unicode_literals + +import re + +from .w3dtf import _parse_date_w3dtf + +# 8-bit date handling routines written by ytrewq1. 
+_korean_year = '\ub144' # b3e2 in euc-kr +_korean_month = '\uc6d4' # bff9 in euc-kr +_korean_day = '\uc77c' # c0cf in euc-kr +_korean_am = '\uc624\uc804' # bfc0 c0fc in euc-kr +_korean_pm = '\uc624\ud6c4' # bfc0 c8c4 in euc-kr + +_korean_onblog_date_re = re.compile( + r'(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' + % (_korean_year, _korean_month, _korean_day) +) + +_korean_nate_date_re = re.compile( + r'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' + % (_korean_am, _korean_pm)) + + +def _parse_date_onblog(dateString): + """Parse a string according to the OnBlog 8-bit date format""" + m = _korean_onblog_date_re.match(dateString) + if not m: + return + w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ + {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ + 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ + 'zonediff': '+09:00'} + return _parse_date_w3dtf(w3dtfdate) + + +def _parse_date_nate(dateString): + """Parse a string according to the Nate 8-bit date format""" + m = _korean_nate_date_re.match(dateString) + if not m: + return + hour = int(m.group(5)) + ampm = m.group(4) + if ampm == _korean_pm: + hour += 12 + hour = str(hour) + if len(hour) == 1: + hour = '0' + hour + w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ + { + 'year': m.group(1), + 'month': m.group(2), + 'day': m.group(3), + 'hour': hour, + 'minute': m.group(6), + 'second': m.group(7), + 'zonediff': '+09:00', + } + return _parse_date_w3dtf(w3dtfdate) diff --git a/modules/feedparser/datetimes/perforce.py b/modules/feedparser/datetimes/perforce.py new file mode 100644 index 000000000..6e9b0361f --- /dev/null +++ b/modules/feedparser/datetimes/perforce.py @@ -0,0 +1,53 @@ +# Copyright 2010-2019 Kurt McKee +# Copyright 2002-2008 Mark Pilgrim +# All rights reserved. +# +# This file is a part of feedparser. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
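To make the Nate conversion above concrete, a short sketch (vendored module path; the timestamp is invented):

    from feedparser.datetimes.korean import _parse_date_nate

    # The Korean PM marker shifts the hour to 24-hour form, and the string is
    # rewritten as W3DTF with the fixed +09:00 (KST) offset before parsing.
    tm = _parse_date_nate('2004-05-25 \uc624\ud6c4 11:23:17')
    assert tm[:6] == (2004, 5, 25, 14, 23, 17)  # 23:23:17 KST -> 14:23:17 UTC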
+ +from __future__ import absolute_import +from __future__ import unicode_literals + +try: + import rfc822 +except ImportError: + from email import _parseaddr as rfc822 + +import re +import time + + +def _parse_date_perforce(date_string): + """parse a date in yyyy/mm/dd hh:mm:ss TTT format""" + # Fri, 2006/09/15 08:19:53 EDT + _my_date_pattern = re.compile(r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})') + + m = _my_date_pattern.search(date_string) + if m is None: + return None + dow, year, month, day, hour, minute, second, tz = m.groups() + months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] + new_date_string = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz) + tm = rfc822.parsedate_tz(new_date_string) + if tm: + return time.gmtime(rfc822.mktime_tz(tm)) diff --git a/modules/feedparser/datetimes/rfc822.py b/modules/feedparser/datetimes/rfc822.py new file mode 100644 index 000000000..5cbd8f839 --- /dev/null +++ b/modules/feedparser/datetimes/rfc822.py @@ -0,0 +1,149 @@ +# Copyright 2010-2019 Kurt McKee +# Copyright 2002-2008 Mark Pilgrim +# All rights reserved. +# +# This file is a part of feedparser. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from __future__ import absolute_import +from __future__ import unicode_literals + +import datetime + +timezone_names = { + 'ut': 0, 'gmt': 0, 'z': 0, + 'adt': -3, 'ast': -4, 'at': -4, + 'edt': -4, 'est': -5, 'et': -5, + 'cdt': -5, 'cst': -6, 'ct': -6, + 'mdt': -6, 'mst': -7, 'mt': -7, + 'pdt': -7, 'pst': -8, 'pt': -8, + 'a': -1, 'n': 1, + 'm': -12, 'y': 12, + 'met': 1, 'mest': 2, +} +day_names = {'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'} +months = { + 'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, + 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12, +} + + +def _parse_date_rfc822(date): + """Parse RFC 822 dates and times + http://tools.ietf.org/html/rfc822#section-5 + + There are some formatting differences that are accounted for: + 1. Years may be two or four digits. + 2. The month and day can be swapped. + 3. Additional timezone names are supported. + 4. A default time and timezone are assumed if only a date is present. 
+ + :param str date: a date/time string that will be converted to a time tuple + :returns: a UTC time tuple, or None + :rtype: time.struct_time | None + """ + + parts = date.lower().split() + if len(parts) < 5: + # Assume that the time and timezone are missing + parts.extend(('00:00:00', '0000')) + # Remove the day name + if parts[0][:3] in day_names: + parts = parts[1:] + if len(parts) < 5: + # If there are still fewer than five parts, there's not enough + # information to interpret this. + return None + + # Handle the day and month name. + month = months.get(parts[1][:3]) + try: + day = int(parts[0]) + except ValueError: + # Check if the day and month are swapped. + if months.get(parts[0][:3]): + try: + day = int(parts[1]) + except ValueError: + return None + month = months.get(parts[0][:3]) + else: + return None + if not month: + return None + + # Handle the year. + try: + year = int(parts[2]) + except ValueError: + return None + # Normalize two-digit years: + # Anything in the 90's is interpreted as 1990 and on. + # Anything 89 or less is interpreted as 2089 or before. + if len(parts[2]) <= 2: + year += (1900, 2000)[year < 90] + + # Handle the time (default to 00:00:00). + time_parts = parts[3].split(':') + time_parts.extend(('0',) * (3 - len(time_parts))) + try: + (hour, minute, second) = [int(i) for i in time_parts] + except ValueError: + return None + + # Handle the timezone information, if any (default to +0000). + # Strip 'Etc/' from the timezone. + if parts[4].startswith('etc/'): + parts[4] = parts[4][4:] + # Normalize timezones that start with 'gmt': + # GMT-05:00 => -0500 + # GMT => GMT + if parts[4].startswith('gmt'): + parts[4] = ''.join(parts[4][3:].split(':')) or 'gmt' + # Handle timezones like '-0500', '+0500', and 'EST' + if parts[4] and parts[4][0] in ('-', '+'): + try: + timezone_hours = int(parts[4][1:3]) + timezone_minutes = int(parts[4][3:]) + except ValueError: + return None + if parts[4].startswith('-'): + timezone_hours *= -1 + timezone_minutes *= -1 + else: + timezone_hours = timezone_names.get(parts[4], 0) + timezone_minutes = 0 + + # Create the datetime object and timezone delta objects + try: + stamp = datetime.datetime(year, month, day, hour, minute, second) + except ValueError: + return None + delta = datetime.timedelta(0, 0, 0, 0, timezone_minutes, timezone_hours) + + # Return the date and timestamp in a UTC 9-tuple + try: + return (stamp - delta).utctimetuple() + except (OverflowError, ValueError): + # IronPython throws ValueErrors instead of OverflowErrors + return None diff --git a/modules/feedparser/datetimes/w3dtf.py b/modules/feedparser/datetimes/w3dtf.py new file mode 100644 index 000000000..e4ad4517c --- /dev/null +++ b/modules/feedparser/datetimes/w3dtf.py @@ -0,0 +1,117 @@ +# Copyright 2010-2019 Kurt McKee +# Copyright 2002-2008 Mark Pilgrim +# All rights reserved. +# +# This file is a part of feedparser. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from __future__ import absolute_import +from __future__ import unicode_literals + +import datetime + +timezonenames = { + 'ut': 0, 'gmt': 0, 'z': 0, + 'adt': -3, 'ast': -4, 'at': -4, + 'edt': -4, 'est': -5, 'et': -5, + 'cdt': -5, 'cst': -6, 'ct': -6, + 'mdt': -6, 'mst': -7, 'mt': -7, + 'pdt': -7, 'pst': -8, 'pt': -8, + 'a': -1, 'n': 1, + 'm': -12, 'y': 12, +} +# W3 date and time format parser +# http://www.w3.org/TR/NOTE-datetime +# Also supports MSSQL-style datetimes as defined at: +# http://msdn.microsoft.com/en-us/library/ms186724.aspx +# (basically, allow a space as a date/time/timezone separator) + + +def _parse_date_w3dtf(datestr): + if not datestr.strip(): + return None + parts = datestr.lower().split('t') + if len(parts) == 1: + # This may be a date only, or may be an MSSQL-style date + parts = parts[0].split() + if len(parts) == 1: + # Treat this as a date only + parts.append('00:00:00z') + elif len(parts) > 2: + return None + date = parts[0].split('-', 2) + if not date or len(date[0]) != 4: + return None + # Ensure that `date` has 3 elements. Using '1' sets the default + # month to January and the default day to the 1st of the month. + date.extend(['1'] * (3 - len(date))) + try: + year, month, day = [int(i) for i in date] + except ValueError: + # `date` may have more than 3 elements or may contain + # non-integer strings. + return None + if parts[1].endswith('z'): + parts[1] = parts[1][:-1] + parts.append('z') + # Append the numeric timezone offset, if any, to parts. + # If this is an MSSQL-style date then parts[2] already contains + # the timezone information, so `append()` will not affect it. + # Add 1 to each value so that if `find()` returns -1 it will be + # treated as False. + loc = parts[1].find('-') + 1 or parts[1].find('+') + 1 or len(parts[1]) + 1 + loc = loc - 1 + parts.append(parts[1][loc:]) + parts[1] = parts[1][:loc] + time = parts[1].split(':', 2) + # Ensure that time has 3 elements. Using '0' means that the + # minutes and seconds, if missing, will default to 0. 
+ time.extend(['0'] * (3 - len(time))) + if parts[2][:1] in ('-', '+'): + try: + tzhour = int(parts[2][1:3]) + tzmin = int(parts[2][4:]) + except ValueError: + return None + if parts[2].startswith('-'): + tzhour = tzhour * -1 + tzmin = tzmin * -1 + else: + tzhour = timezonenames.get(parts[2], 0) + tzmin = 0 + try: + hour, minute, second = [int(float(i)) for i in time] + except ValueError: + return None + # Create the datetime object and timezone delta objects + try: + stamp = datetime.datetime(year, month, day, hour, minute, second) + except ValueError: + return None + delta = datetime.timedelta(0, 0, 0, 0, tzmin, tzhour) + # Return the date and timestamp in a UTC 9-tuple + try: + return (stamp - delta).utctimetuple() + except (OverflowError, ValueError): + # IronPython throws ValueErrors instead of OverflowErrors + return None diff --git a/modules/feedparser/encodings.py b/modules/feedparser/encodings.py new file mode 100644 index 000000000..9889f5ec0 --- /dev/null +++ b/modules/feedparser/encodings.py @@ -0,0 +1,292 @@ +# Character encoding routines +# Copyright 2010-2019 Kurt McKee +# Copyright 2002-2008 Mark Pilgrim +# All rights reserved. +# +# This file is a part of feedparser. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from __future__ import absolute_import +from __future__ import unicode_literals + +import cgi +import codecs +import re + +try: + try: + import cchardet as chardet + except ImportError: + import chardet +except ImportError: + chardet = None + lazy_chardet_encoding = None +else: + def lazy_chardet_encoding(data): + chardet_encoding = chardet.detect(data)['encoding'] + if not chardet_encoding: + chardet_encoding = '' + if isinstance(chardet_encoding, bytes_): + chardet_encoding = chardet_encoding.encode('ascii', 'ignore') + return chardet_encoding + +from .exceptions import ( + CharacterEncodingOverride, + CharacterEncodingUnknown, + NonXMLContentType, +) + +bytes_ = type(b'') +unicode_ = type('') + +# Each marker represents some of the characters of the opening XML +# processing instruction ('<?xm') in the specified encoding. +EBCDIC_MARKER = b'\x4C\x6F\xA7\x94' +UTF16BE_MARKER = b'\x00\x3C\x00\x3F' +UTF16LE_MARKER = b'\x3C\x00\x3F\x00' +UTF32BE_MARKER = b'\x00\x00\x00\x3C' +UTF32LE_MARKER = b'\x3C\x00\x00\x00' + +ZERO_BYTES = b'\x00\x00' + +# Match the opening XML declaration. +# Example: <?xml version="1.0" encoding="utf-8"?> +RE_XML_DECLARATION = re.compile(r'^<\?xml[^>]*?>') + +# Capture the value of the XML processing instruction's encoding attribute.
+# Example: <?xml version="1.0" encoding="utf-8"?> +RE_XML_PI_ENCODING = re.compile(br'^<\?.*encoding=[\'"](.*?)[\'"].*\?>') + + +def convert_to_utf8(http_headers, data, result): + """Detect and convert the character encoding to UTF-8. + + http_headers is a dictionary + data is a raw string (not Unicode)""" + + # This is so much trickier than it sounds, it's not even funny. + # According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type + # is application/xml, application/*+xml, + # application/xml-external-parsed-entity, or application/xml-dtd, + # the encoding given in the charset parameter of the HTTP Content-Type + # takes precedence over the encoding given in the XML prefix within the + # document, and defaults to 'utf-8' if neither are specified. But, if + # the HTTP Content-Type is text/xml, text/*+xml, or + # text/xml-external-parsed-entity, the encoding given in the XML prefix + # within the document is ALWAYS IGNORED and only the encoding given in + # the charset parameter of the HTTP Content-Type header should be + # respected, and it defaults to 'us-ascii' if not specified. + + # Furthermore, discussion on the atom-syntax mailing list with the + # author of RFC 3023 leads me to the conclusion that any document + # served with a Content-Type of text/* and no charset parameter + # must be treated as us-ascii. (We now do this.) And also that it + # must always be flagged as non-well-formed. (We now do this too.) + + # If Content-Type is unspecified (input was local file or non-HTTP source) + # or unrecognized (server just got it totally wrong), then go by the + # encoding given in the XML prefix of the document and default to + # 'iso-8859-1' as per the HTTP specification (RFC 2616). + + # Then, assuming we didn't find a character encoding in the HTTP headers + # (and the HTTP Content-type allowed us to look in the body), we need + # to sniff the first few bytes of the XML data and try to determine + # whether the encoding is ASCII-compatible. Section F of the XML + # specification shows the way here: + # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info + + # If the sniffed encoding is not ASCII-compatible, we need to make it + # ASCII compatible so that we can sniff further into the XML declaration + # to find the encoding attribute, which will tell us the true encoding. + + # Of course, none of this guarantees that we will be able to parse the + # feed in the declared character encoding (assuming it was declared + # correctly, which many are not). iconv_codec can help a lot; + # you should definitely install it if you can. + # http://cjkpython.i18n.org/ + + bom_encoding = '' + xml_encoding = '' + + # Look at the first few bytes of the document to guess what + # its encoding may be. We only need to decode enough of the + # document that we can use an ASCII-compatible regular + # expression to search for an XML encoding declaration. + # The heuristic follows the XML specification, section F: + # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info + # Check for BOMs first.
+    # Check for BOMs first.
+    if data[:4] == codecs.BOM_UTF32_BE:
+        bom_encoding = 'utf-32be'
+        data = data[4:]
+    elif data[:4] == codecs.BOM_UTF32_LE:
+        bom_encoding = 'utf-32le'
+        data = data[4:]
+    elif data[:2] == codecs.BOM_UTF16_BE and data[2:4] != ZERO_BYTES:
+        bom_encoding = 'utf-16be'
+        data = data[2:]
+    elif data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES:
+        bom_encoding = 'utf-16le'
+        data = data[2:]
+    elif data[:3] == codecs.BOM_UTF8:
+        bom_encoding = 'utf-8'
+        data = data[3:]
+    # Check for the characters '<?xm' in several encodings.
+    elif data[:4] == EBCDIC_MARKER:
+        bom_encoding = 'cp037'
+    elif data[:4] == UTF16BE_MARKER:
+        bom_encoding = 'utf-16be'
+    elif data[:4] == UTF16LE_MARKER:
+        bom_encoding = 'utf-16le'
+    elif data[:4] == UTF32BE_MARKER:
+        bom_encoding = 'utf-32be'
+    elif data[:4] == UTF32LE_MARKER:
+        bom_encoding = 'utf-32le'
+
+    tempdata = data
+    try:
+        if bom_encoding:
+            tempdata = data.decode(bom_encoding).encode('utf-8')
+    except (UnicodeDecodeError, LookupError):
+        # feedparser recognizes UTF-32 encodings that aren't
+        # available in Python 2.4 and 2.5, so it's possible to
+        # encounter a LookupError during decoding.
+        xml_encoding_match = None
+    else:
+        xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata)
+
+    if xml_encoding_match:
+        xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
+        # Normalize the xml_encoding if necessary.
+        if bom_encoding and (xml_encoding in (
+                'u16', 'utf-16', 'utf16', 'utf_16',
+                'u32', 'utf-32', 'utf32', 'utf_32',
+                'iso-10646-ucs-2', 'iso-10646-ucs-4',
+                'csucs4', 'csunicode', 'ucs-2', 'ucs-4',
+        )):
+            xml_encoding = bom_encoding
+
+    # Find the HTTP Content-Type and, hopefully, a character
+    # encoding provided by the server. The Content-Type is used
+    # to choose the "correct" encoding among the BOM encoding,
+    # XML declaration encoding, and HTTP encoding, following the
+    # heuristic defined in RFC 3023.
+    http_content_type = http_headers.get('content-type') or ''
+    http_content_type, params = cgi.parse_header(http_content_type)
+    http_encoding = params.get('charset', '').replace("'", '')
+    if isinstance(http_encoding, bytes_):
+        http_encoding = http_encoding.decode('utf-8', 'ignore')
+
+    acceptable_content_type = 0
+    application_content_types = ('application/xml', 'application/xml-dtd',
+                                 'application/xml-external-parsed-entity')
+    text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
+    if (
+            http_content_type in application_content_types
+            or (http_content_type.startswith('application/')
+                and http_content_type.endswith('+xml'))
+    ):
+        acceptable_content_type = 1
+        rfc3023_encoding = http_encoding or xml_encoding or 'utf-8'
+    elif (
+            http_content_type in text_content_types
+            or (http_content_type.startswith('text/')
+                and http_content_type.endswith('+xml'))
+    ):
+        acceptable_content_type = 1
+        rfc3023_encoding = http_encoding or 'us-ascii'
+    elif http_content_type.startswith('text/'):
+        rfc3023_encoding = http_encoding or 'us-ascii'
+    elif http_headers and 'content-type' not in http_headers:
+        rfc3023_encoding = xml_encoding or 'iso-8859-1'
+    else:
+        rfc3023_encoding = xml_encoding or 'utf-8'
+
+    # gb18030 is a superset of gb2312, so always replace gb2312
+    # with gb18030 for greater compatibility.
+    if rfc3023_encoding.lower() == 'gb2312':
+        rfc3023_encoding = 'gb18030'
+    if xml_encoding.lower() == 'gb2312':
+        xml_encoding = 'gb18030'
+
+    # there are four encodings to keep track of:
+    # - http_encoding is the encoding declared in the Content-Type HTTP header
+    # - xml_encoding is the encoding declared in the <?xml declaration
+    # - bom_encoding is the encoding sniffed from the first 4 bytes of the XML data
+    # - rfc3023_encoding is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
+    error = None
+
+    if http_headers and (not acceptable_content_type):
+        if 'content-type' in http_headers:
+            msg = '%s is not an XML media type' % http_headers['content-type']
+        else:
+            msg = 'no Content-type specified'
+        error = NonXMLContentType(msg)
+
+    # determine character encoding
+    known_encoding = 0
+    tried_encodings = []
+    # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
+    for proposed_encoding in (rfc3023_encoding, xml_encoding, bom_encoding,
+                              lazy_chardet_encoding, 'utf-8', 'windows-1252', 'iso-8859-2'):
+        if callable(proposed_encoding):
+            proposed_encoding = proposed_encoding(data)
+        if not proposed_encoding:
+            continue
+        if proposed_encoding in tried_encodings:
+            continue
+        tried_encodings.append(proposed_encoding)
+        try:
+            data = data.decode(proposed_encoding)
+        except (UnicodeDecodeError, LookupError):
+            pass
+        else:
+            known_encoding = 1
+            # Update the encoding in the opening XML processing instruction.
+            new_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
+            if RE_XML_DECLARATION.search(data):
+                data = RE_XML_DECLARATION.sub(new_declaration, data)
+            else:
+                data = new_declaration + '\n' + data
+            data = data.encode('utf-8')
+            break
+    # if still no luck, give up
+    if not known_encoding:
+        error = CharacterEncodingUnknown(
+            'document encoding unknown, I tried ' +
+            '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' %
+            (rfc3023_encoding, xml_encoding))
+        rfc3023_encoding = ''
+    elif proposed_encoding != rfc3023_encoding:
+        error = CharacterEncodingOverride(
+            'document declared as %s, but parsed as %s' %
+            (rfc3023_encoding, proposed_encoding))
+        rfc3023_encoding = proposed_encoding
+
+    result['encoding'] = rfc3023_encoding
+    if error:
+        result['bozo'] = True
+        result['bozo_exception'] = error
+    return data
diff --git a/modules/feedparser/exceptions.py b/modules/feedparser/exceptions.py
new file mode 100644
index 000000000..15f0e1c3d
--- /dev/null
+++ b/modules/feedparser/exceptions.py
@@ -0,0 +1,58 @@
+# Exceptions used throughout feedparser
+# Copyright 2010-2019 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2002-2008 Mark Pilgrim
+# All rights reserved.
+#
+# This file is a part of feedparser.
+#
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
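
Taken together, the cascade in encodings.py above means a feed whose declared encoding is wrong still parses, with the mismatch surfaced through the bozo fields set at the end of convert_to_utf8(). A minimal sketch, assuming the vendored package imports as feedparser and chardet is not installed (with chardet the guessed codec may differ):

    import feedparser

    # UTF-8 bytes that falsely claim us-ascii: the first decode attempt
    # fails, the cascade falls through to utf-8, and the document is
    # flagged bozo with a CharacterEncodingOverride.
    data = ("<?xml version='1.0' encoding='us-ascii'?>"
            "<rss version='2.0'><channel><title>Caf\u00e9</title>"
            "</channel></rss>").encode('utf-8')
    d = feedparser.parse(data)
    print(d.encoding)                       # utf-8
    print(d.bozo)                           # True
    print(type(d.bozo_exception).__name__)  # CharacterEncodingOverride
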
+ +from __future__ import absolute_import +from __future__ import unicode_literals + +__all__ = [ + 'ThingsNobodyCaresAboutButMe', + 'CharacterEncodingOverride', + 'CharacterEncodingUnknown', + 'NonXMLContentType', + 'UndeclaredNamespace', +] + + +class ThingsNobodyCaresAboutButMe(Exception): + pass + + +class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): + pass + + +class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): + pass + + +class NonXMLContentType(ThingsNobodyCaresAboutButMe): + pass + + +class UndeclaredNamespace(Exception): + pass diff --git a/modules/feedparser/html.py b/modules/feedparser/html.py new file mode 100644 index 000000000..d2da8b593 --- /dev/null +++ b/modules/feedparser/html.py @@ -0,0 +1,364 @@ +# Copyright 2010-2019 Kurt McKee +# Copyright 2002-2008 Mark Pilgrim +# All rights reserved. +# +# This file is a part of feedparser. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
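
The exception taxonomy defined in exceptions.py above is consumed through the bozo protocol rather than raised: because the encoding-related classes share the ThingsNobodyCaresAboutButMe base, callers can distinguish recoverable quirks from genuinely unusable input. A hedged sketch of the intended consumption pattern:

    import feedparser

    d = feedparser.parse(b'<rss version="2.0"><channel><title>t</title></channel></rss>')
    if d.bozo:
        exc = d.bozo_exception
        # Overrides and content-type complaints still come with usable data;
        # an unknown encoding generally does not.
        fatal = isinstance(exc, feedparser.CharacterEncodingUnknown)
        print(type(exc).__name__, 'fatal:', fatal)
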
+ +from __future__ import absolute_import +from __future__ import unicode_literals + +import re + +try: + from html.entities import name2codepoint +except ImportError: + # Python 2 + # noinspection PyUnresolvedReferences + from htmlentitydefs import name2codepoint + +from .sgml import * + +_cp1252 = { + 128: '\u20ac', # euro sign + 130: '\u201a', # single low-9 quotation mark + 131: '\u0192', # latin small letter f with hook + 132: '\u201e', # double low-9 quotation mark + 133: '\u2026', # horizontal ellipsis + 134: '\u2020', # dagger + 135: '\u2021', # double dagger + 136: '\u02c6', # modifier letter circumflex accent + 137: '\u2030', # per mille sign + 138: '\u0160', # latin capital letter s with caron + 139: '\u2039', # single left-pointing angle quotation mark + 140: '\u0152', # latin capital ligature oe + 142: '\u017d', # latin capital letter z with caron + 145: '\u2018', # left single quotation mark + 146: '\u2019', # right single quotation mark + 147: '\u201c', # left double quotation mark + 148: '\u201d', # right double quotation mark + 149: '\u2022', # bullet + 150: '\u2013', # en dash + 151: '\u2014', # em dash + 152: '\u02dc', # small tilde + 153: '\u2122', # trade mark sign + 154: '\u0161', # latin small letter s with caron + 155: '\u203a', # single right-pointing angle quotation mark + 156: '\u0153', # latin small ligature oe + 158: '\u017e', # latin small letter z with caron + 159: '\u0178', # latin capital letter y with diaeresis +} + + +class _BaseHTMLProcessor(sgmllib.SGMLParser, object): + special = re.compile("""[<>'"]""") + bare_ampersand = re.compile(r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)") + elements_no_end_tag = { + 'area', + 'base', + 'basefont', + 'br', + 'col', + 'command', + 'embed', + 'frame', + 'hr', + 'img', + 'input', + 'isindex', + 'keygen', + 'link', + 'meta', + 'param', + 'source', + 'track', + 'wbr', + } + + def __init__(self, encoding=None, _type='application/xhtml+xml'): + if encoding: + self.encoding = encoding + self._type = _type + self.pieces = [] + super(_BaseHTMLProcessor, self).__init__() + + def reset(self): + self.pieces = [] + super(_BaseHTMLProcessor, self).reset() + + def _shorttag_replace(self, match): + """ + :type match: Match[str] + :rtype: str + """ + + tag = match.group(1) + if tag in self.elements_no_end_tag: + return '<' + tag + ' />' + else: + return '<' + tag + '>' + + # By declaring these methods and overriding their compiled code + # with the code from sgmllib, the original code will execute in + # feedparser's scope instead of sgmllib's. This means that the + # `tagfind` and `charref` regular expressions will be found as + # they're declared above, not as they're declared in sgmllib. + def goahead(self, i): + raise NotImplementedError + + # Replace goahead with SGMLParser's goahead() code object. + try: + goahead.__code__ = sgmllib.SGMLParser.goahead.__code__ + except AttributeError: + # Python 2 + # noinspection PyUnresolvedReferences + goahead.func_code = sgmllib.SGMLParser.goahead.func_code + + def __parse_starttag(self, i): + raise NotImplementedError + + # Replace __parse_starttag with SGMLParser's parse_starttag() code object. 
+    try:
+        __parse_starttag.__code__ = sgmllib.SGMLParser.parse_starttag.__code__
+    except AttributeError:
+        # Python 2
+        # noinspection PyUnresolvedReferences
+        __parse_starttag.func_code = sgmllib.SGMLParser.parse_starttag.func_code
+
+    def parse_starttag(self, i):
+        j = self.__parse_starttag(i)
+        if self._type == 'application/xhtml+xml':
+            if j > 2 and self.rawdata[j-2:j] == '/>':
+                self.unknown_endtag(self.lasttag)
+        return j
+
+    def feed(self, data):
+        """
+        :type data: str
+        :rtype: None
+        """
+
+        data = re.sub(r'<!((?!DOCTYPE|--|\[))', r'&lt;!\1', data)
+        data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
+        data = data.replace('&#39;', "'")
+        data = data.replace('&#34;', '"')
+        super(_BaseHTMLProcessor, self).feed(data)
+        super(_BaseHTMLProcessor, self).close()
+
+    @staticmethod
+    def normalize_attrs(attrs):
+        """
+        :type attrs: List[Tuple[str, str]]
+        :rtype: List[Tuple[str, str]]
+        """
+
+        if not attrs:
+            return attrs
+        # utility method to be called by descendants
+        # Collapse any duplicate attribute names and values by converting
+        # *attrs* into a dictionary, then convert it back to a list.
+        attrs_d = {k.lower(): v for k, v in attrs}
+        attrs = [
+            (k, k in ('rel', 'type') and v.lower() or v)
+            for k, v in attrs_d.items()
+        ]
+        attrs.sort()
+        return attrs
+
+    def unknown_starttag(self, tag, attrs):
+        """
+        :type tag: str
+        :type attrs: List[Tuple[str, str]]
+        :rtype: None
+        """
+
+        # Called for each start tag
+        # attrs is a list of (attr, value) tuples
+        # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
+        uattrs = []
+        strattrs = ''
+        if attrs:
+            for key, value in attrs:
+                value = value.replace('>', '&gt;')
+                value = value.replace('<', '&lt;')
+                value = value.replace('"', '&quot;')
+                value = self.bare_ampersand.sub("&amp;", value)
+                uattrs.append((key, value))
+            strattrs = ''.join(
+                ' %s="%s"' % (key, value)
+                for key, value in uattrs
+            )
+        if tag in self.elements_no_end_tag:
+            self.pieces.append('<%s%s />' % (tag, strattrs))
+        else:
+            self.pieces.append('<%s%s>' % (tag, strattrs))
+
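
unknown_starttag above re-escapes attribute values that the SGML parser hands back fully decoded; in isolation the round trip looks like this (a standalone sketch using the same bare_ampersand regular expression):

    import re

    bare_ampersand = re.compile(r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
    value = '<b>"fish" & chips</b>'
    value = value.replace('>', '&gt;')
    value = value.replace('<', '&lt;')
    value = value.replace('"', '&quot;')
    value = bare_ampersand.sub("&amp;", value)
    print(value)  # &lt;b&gt;&quot;fish&quot; &amp; chips&lt;/b&gt;
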
+    def unknown_endtag(self, tag):
+        """
+        :type tag: str
+        :rtype: None
+        """
+
+        # Called for each end tag, e.g. for </pre>, tag will be 'pre'
+        # Reconstruct the original end tag.
+        if tag not in self.elements_no_end_tag:
+            self.pieces.append("</%s>" % tag)
+
+    def handle_charref(self, ref):
+        """
+        :type ref: str
+        :rtype: None
+        """
+
+        # Called for each character reference, e.g. '&#160;' will extract '160'
+        # Reconstruct the original character reference.
+        ref = ref.lower()
+        if ref.startswith('x'):
+            value = int(ref[1:], 16)
+        else:
+            value = int(ref)
+
+        if value in _cp1252:
+            self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
+        else:
+            self.pieces.append('&#%s;' % ref)
+
+    def handle_entityref(self, ref):
+        """
+        :type ref: str
+        :rtype: None
+        """
+
+        # Called for each entity reference, e.g. '&copy;' will extract 'copy'
+        # Reconstruct the original entity reference.
+        if ref in name2codepoint or ref == 'apos':
+            self.pieces.append('&%s;' % ref)
+        else:
+            self.pieces.append('&%s' % ref)
+
+    def handle_data(self, text):
+        """
+        :type text: str
+        :rtype: None
+        """
+
+        # called for each block of plain text, i.e. outside of any tag and
+        # not containing any character or entity references
+        # Store the original text verbatim.
+        self.pieces.append(text)
+
+    def handle_comment(self, text):
+        """
+        :type text: str
+        :rtype: None
+        """
+
+        # Called for HTML comments, e.g. <!--insert Python code here-->
+        # Reconstruct the original comment.
+        self.pieces.append('<!--%s-->' % text)
+
+    def handle_pi(self, text):
+        """
+        :type text: str
+        :rtype: None
+        """
+
+        # Called for each processing instruction, e.g. <?instruction>
+        # Reconstruct original processing instruction.
+        self.pieces.append('<?%s>' % text)
+
+    def handle_decl(self, text):
+        """
+        :type text: str
+        :rtype: None
+        """
+
+        # called for the DOCTYPE, if present, e.g.
+        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
+        #     "http://www.w3.org/TR/html4/loose.dtd">
+        # Reconstruct original DOCTYPE
+        self.pieces.append('<!%s>' % text)
+
+    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
+
+    def _scan_name(self, i, declstartpos):
+        """
+        :type i: int
+        :type declstartpos: int
+        :rtype: Tuple[Optional[str], int]
+        """
+
+        rawdata = self.rawdata
+        n = len(rawdata)
+        if i == n:
+            return None, -1
+        m = self._new_declname_match(rawdata, i)
+        if m:
+            s = m.group()
+            name = s.strip()
+            if (i + len(s)) == n:
+                return None, -1  # end of buffer
+            return name.lower(), m.end()
+        else:
+            self.handle_data(rawdata)
+            # self.updatepos(declstartpos, i)
+            return None, -1
+
+    @staticmethod
+    def convert_charref(name):
+        """
+        :type name: str
+        :rtype: str
+        """
+
+        return '&#%s;' % name
+
+    @staticmethod
+    def convert_entityref(name):
+        """
+        :type name: str
+        :rtype: str
+        """
+
+        return '&%s;' % name
+
+    def output(self):
+        """Return processed HTML as a single string.
+
+        :rtype: str
+        """
+
+        return ''.join(self.pieces)
+
+    def parse_declaration(self, i):
+        """
+        :type i: int
+        :rtype: int
+        """
+
+        try:
+            return sgmllib.SGMLParser.parse_declaration(self, i)
+        except sgmllib.SGMLParseError:
+            # Escape the doctype declaration and continue parsing.
+            self.handle_data('&lt;')
+            return i+1
diff --git a/modules/feedparser/http.py b/modules/feedparser/http.py
new file mode 100644
index 000000000..272faad60
--- /dev/null
+++ b/modules/feedparser/http.py
@@ -0,0 +1,256 @@
+# Copyright 2010-2019 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2002-2008 Mark Pilgrim
+# All rights reserved.
+#
+# This file is a part of feedparser.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from __future__ import absolute_import +from __future__ import unicode_literals + +import datetime +import gzip +import re +import struct +import zlib + +try: + import urllib.parse + import urllib.request +except ImportError: + # Mock urllib sufficiently to work on Python 2.7 + from urllib import splithost, splittype, splituser + from urllib2 import build_opener, HTTPDigestAuthHandler, HTTPRedirectHandler, HTTPDefaultErrorHandler, Request + from urlparse import urlparse + + class urllib(object): + class parse(object): + splithost = staticmethod(splithost) + splittype = staticmethod(splittype) + splituser = staticmethod(splituser) + urlparse = staticmethod(urlparse) + + class request(object): + build_opener = staticmethod(build_opener) + HTTPDigestAuthHandler = HTTPDigestAuthHandler + HTTPRedirectHandler = HTTPRedirectHandler + HTTPDefaultErrorHandler = HTTPDefaultErrorHandler + Request = Request + +try: + from io import BytesIO as _StringIO +except ImportError: + # Python 2.7 + try: + from cStringIO import StringIO as _StringIO + except ImportError: + from StringIO import StringIO as _StringIO + +import base64 + +from .datetimes import _parse_date +from .urls import convert_to_idn + +# Python 3.1 deprecated decodestring in favor of decodebytes. +# This can be removed after Python 2.7 support is dropped. +_base64decode = getattr(base64, 'decodebytes', base64.decodestring) + +try: + basestring +except NameError: + basestring = str + +bytes_ = type(b'') + +# HTTP "Accept" header to send to servers when downloading feeds. If you don't +# want to send an Accept header, set this to None. +ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1" + + +class _FeedURLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPRedirectHandler, urllib.request.HTTPDefaultErrorHandler): + def http_error_default(self, req, fp, code, msg, headers): + # The default implementation just raises HTTPError. + # Forget that. 
+ fp.status = code + return fp + + def http_error_301(self, req, fp, code, msg, hdrs): + result = urllib.request.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, hdrs) + result.status = code + result.newurl = result.geturl() + return result + + # The default implementations in urllib.request.HTTPRedirectHandler + # are identical, so hardcoding a http_error_301 call above + # won't affect anything + http_error_300 = http_error_301 + http_error_302 = http_error_301 + http_error_303 = http_error_301 + http_error_307 = http_error_301 + + def http_error_401(self, req, fp, code, msg, headers): + # Check if + # - server requires digest auth, AND + # - we tried (unsuccessfully) with basic auth, AND + # If all conditions hold, parse authentication information + # out of the Authorization header we sent the first time + # (for the username and password) and the WWW-Authenticate + # header the server sent back (for the realm) and retry + # the request with the appropriate digest auth headers instead. + # This evil genius hack has been brought to you by Aaron Swartz. + host = urllib.parse.urlparse(req.get_full_url())[1] + if 'Authorization' not in req.headers or 'WWW-Authenticate' not in headers: + return self.http_error_default(req, fp, code, msg, headers) + auth = _base64decode(req.headers['Authorization'].split(' ')[1]) + user, passw = auth.split(':') + realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0] + self.add_password(realm, host, user, passw) + retry = self.http_error_auth_reqed('www-authenticate', host, req, headers) + self.reset_retry_count() + return retry + + +def _build_urllib2_request(url, agent, accept_header, etag, modified, referrer, auth, request_headers): + request = urllib.request.Request(url) + request.add_header('User-Agent', agent) + if etag: + request.add_header('If-None-Match', etag) + if isinstance(modified, basestring): + modified = _parse_date(modified) + elif isinstance(modified, datetime.datetime): + modified = modified.utctimetuple() + if modified: + # format into an RFC 1123-compliant timestamp. We can't use + # time.strftime() since the %a and %b directives can be affected + # by the current locale, but RFC 2616 states that dates must be + # in English. 
+ short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] + months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] + request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5])) + if referrer: + request.add_header('Referer', referrer) + request.add_header('Accept-encoding', 'gzip, deflate') + if auth: + request.add_header('Authorization', 'Basic %s' % auth) + if accept_header: + request.add_header('Accept', accept_header) + # use this for whatever -- cookies, special headers, etc + # [('Cookie','Something'),('x-special-header','Another Value')] + for header_name, header_value in request_headers.items(): + request.add_header(header_name, header_value) + request.add_header('A-IM', 'feed') # RFC 3229 support + return request + + +def get(url, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, result=None): + if handlers is None: + handlers = [] + elif not isinstance(handlers, list): + handlers = [handlers] + if request_headers is None: + request_headers = {} + + # Deal with the feed URI scheme + if url.startswith('feed:http'): + url = url[5:] + elif url.startswith('feed:'): + url = 'http:' + url[5:] + if not agent: + from . import USER_AGENT + agent = USER_AGENT + # Test for inline user:password credentials for HTTP basic auth + auth = None + if not url.startswith('ftp:'): + urltype, rest = urllib.parse.splittype(url) + realhost, rest = urllib.parse.splithost(rest) + if realhost: + user_passwd, realhost = urllib.parse.splituser(realhost) + if user_passwd: + url = '%s://%s%s' % (urltype, realhost, rest) + auth = base64.standard_b64encode(user_passwd).strip() + + # iri support + if not isinstance(url, bytes_): + url = convert_to_idn(url) + + # try to open with urllib2 (to use optional headers) + request = _build_urllib2_request(url, agent, ACCEPT_HEADER, etag, modified, referrer, auth, request_headers) + opener = urllib.request.build_opener(*tuple(handlers + [_FeedURLHandler()])) + opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent + f = opener.open(request) + data = f.read() + f.close() + + # lowercase all of the HTTP headers for comparisons per RFC 2616 + result['headers'] = {k.lower(): v for k, v in f.headers.items()} + + # if feed is gzip-compressed, decompress it + if data and 'gzip' in result['headers'].get('content-encoding', ''): + try: + data = gzip.GzipFile(fileobj=_StringIO(data)).read() + except (EOFError, IOError, struct.error) as e: + # IOError can occur if the gzip header is bad. + # struct.error can occur if the data is damaged. + result['bozo'] = True + result['bozo_exception'] = e + if isinstance(e, struct.error): + # A gzip header was found but the data is corrupt. + # Ideally, we should re-request the feed without the + # 'Accept-encoding: gzip' header, but we don't. + data = None + elif data and 'deflate' in result['headers'].get('content-encoding', ''): + try: + data = zlib.decompress(data) + except zlib.error: + try: + # The data may have no headers and no checksum. 
+ data = zlib.decompress(data, -15) + except zlib.error as e: + result['bozo'] = True + result['bozo_exception'] = e + + # save HTTP headers + if 'etag' in result['headers']: + etag = result['headers'].get('etag', '') + if isinstance(etag, bytes_): + etag = etag.decode('utf-8', 'ignore') + if etag: + result['etag'] = etag + if 'last-modified' in result['headers']: + modified = result['headers'].get('last-modified', '') + if modified: + result['modified'] = modified + result['modified_parsed'] = _parse_date(modified) + if isinstance(f.url, bytes_): + result['href'] = f.url.decode('utf-8', 'ignore') + else: + result['href'] = f.url + result['status'] = getattr(f, 'status', 200) + + # Stop processing if the server sent HTTP 304 Not Modified. + if getattr(f, 'code', 0) == 304: + result['version'] = '' + result['debug_message'] = 'The feed has not changed since you last checked, ' + \ + 'so the server sent no data. This is a feature, not a bug!' + + return data diff --git a/modules/feedparser/mixin.py b/modules/feedparser/mixin.py new file mode 100644 index 000000000..1b0dc1aee --- /dev/null +++ b/modules/feedparser/mixin.py @@ -0,0 +1,813 @@ +# Copyright 2010-2019 Kurt McKee +# Copyright 2002-2008 Mark Pilgrim +# All rights reserved. +# +# This file is a part of feedparser. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from __future__ import absolute_import +from __future__ import unicode_literals + +import base64 +import binascii +import copy +import re +from xml.sax.saxutils import escape as _xmlescape + +try: + from html.entities import name2codepoint, entitydefs +except ImportError: + # Python 2 + # noinspection PyUnresolvedReferences + from htmlentitydefs import name2codepoint, entitydefs + +from .html import _cp1252 +from .namespaces import _base, cc, dc, georss, itunes, mediarss, psc +from .sanitizer import _sanitize_html, _HTMLSanitizer +from .util import FeedParserDict +from .urls import _urljoin, make_safe_absolute_uri, resolve_relative_uris + + +# Python 2.7 only offers "decodestring()". +# This name substitution can be removed when Python 2.7 support is dropped. 
+_base64decode = getattr(base64, 'decodebytes', base64.decodestring) + + +bytes_ = type(b'') +try: + # Python 2 + # noinspection PyUnresolvedReferences,PyShadowingBuiltins + chr = unichr +except NameError: + pass + + +class _FeedParserMixin( + _base.Namespace, + cc.Namespace, + dc.Namespace, + georss.Namespace, + itunes.Namespace, + mediarss.Namespace, + psc.Namespace, +): + namespaces = { + '': '', + 'http://backend.userland.com/rss': '', + 'http://blogs.law.harvard.edu/tech/rss': '', + 'http://purl.org/rss/1.0/': '', + 'http://my.netscape.com/rdf/simple/0.9/': '', + 'http://example.com/newformat#': '', + 'http://example.com/necho': '', + 'http://purl.org/echo/': '', + 'uri/of/echo/namespace#': '', + 'http://purl.org/pie/': '', + 'http://purl.org/atom/ns#': '', + 'http://www.w3.org/2005/Atom': '', + 'http://purl.org/rss/1.0/modules/rss091#': '', + + 'http://webns.net/mvcb/': 'admin', + 'http://purl.org/rss/1.0/modules/aggregation/': 'ag', + 'http://purl.org/rss/1.0/modules/annotate/': 'annotate', + 'http://media.tangent.org/rss/1.0/': 'audio', + 'http://backend.userland.com/blogChannelModule': 'blogChannel', + 'http://creativecommons.org/ns#license': 'cc', + 'http://web.resource.org/cc/': 'cc', + 'http://cyber.law.harvard.edu/rss/creativeCommonsRssModule.html': 'creativeCommons', + 'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons', + 'http://purl.org/rss/1.0/modules/company': 'co', + 'http://purl.org/rss/1.0/modules/content/': 'content', + 'http://my.theinfo.org/changed/1.0/rss/': 'cp', + 'http://purl.org/dc/elements/1.1/': 'dc', + 'http://purl.org/dc/terms/': 'dcterms', + 'http://purl.org/rss/1.0/modules/email/': 'email', + 'http://purl.org/rss/1.0/modules/event/': 'ev', + 'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner', + 'http://freshmeat.net/rss/fm/': 'fm', + 'http://xmlns.com/foaf/0.1/': 'foaf', + 'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo', + 'http://www.georss.org/georss': 'georss', + 'http://www.opengis.net/gml': 'gml', + 'http://postneo.com/icbm/': 'icbm', + 'http://purl.org/rss/1.0/modules/image/': 'image', + 'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes', + 'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes', + 'http://purl.org/rss/1.0/modules/link/': 'l', + 'http://search.yahoo.com/mrss': 'media', + # Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace + 'http://search.yahoo.com/mrss/': 'media', + 'http://madskills.com/public/xml/rss/module/pingback/': 'pingback', + 'http://prismstandard.org/namespaces/1.2/basic/': 'prism', + 'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf', + 'http://www.w3.org/2000/01/rdf-schema#': 'rdfs', + 'http://purl.org/rss/1.0/modules/reference/': 'ref', + 'http://purl.org/rss/1.0/modules/richequiv/': 'reqv', + 'http://purl.org/rss/1.0/modules/search/': 'search', + 'http://purl.org/rss/1.0/modules/slash/': 'slash', + 'http://schemas.xmlsoap.org/soap/envelope/': 'soap', + 'http://purl.org/rss/1.0/modules/servicestatus/': 'ss', + 'http://hacks.benhammersley.com/rss/streaming/': 'str', + 'http://purl.org/rss/1.0/modules/subscription/': 'sub', + 'http://purl.org/rss/1.0/modules/syndication/': 'sy', + 'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf', + 'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo', + 'http://purl.org/rss/1.0/modules/threading/': 'thr', + 'http://purl.org/rss/1.0/modules/textinput/': 'ti', + 'http://madskills.com/public/xml/rss/module/trackback/': 'trackback', + 'http://wellformedweb.org/commentAPI/': 'wfw', + 'http://purl.org/rss/1.0/modules/wiki/': 
'wiki', + 'http://www.w3.org/1999/xhtml': 'xhtml', + 'http://www.w3.org/1999/xlink': 'xlink', + 'http://www.w3.org/XML/1998/namespace': 'xml', + 'http://podlove.org/simple-chapters': 'psc', + } + _matchnamespaces = {} + + can_be_relative_uri = { + 'comments', + 'docs', + 'href', + 'icon', + 'id', + 'link', + 'logo', + 'url', + 'wfw_comment', + 'wfw_commentrss', + } + + can_contain_relative_uris = { + 'content', + 'copyright', + 'description', + 'info', + 'rights', + 'subtitle', + 'summary', + 'tagline', + 'title', + } + + can_contain_dangerous_markup = { + 'content', + 'copyright', + 'description', + 'info', + 'rights', + 'subtitle', + 'summary', + 'tagline', + 'title', + } + + html_types = { + 'application/xhtml+xml', + 'text/html', + } + + def __init__(self): + if not self._matchnamespaces: + for k, v in self.namespaces.items(): + self._matchnamespaces[k.lower()] = v + self.feeddata = FeedParserDict() # feed-level data + self.entries = [] # list of entry-level data + self.version = '' # feed type/version, see SUPPORTED_VERSIONS + self.namespaces_in_use = {} # dictionary of namespaces defined by the feed + + # the following are used internally to track state; + # this is really out of control and should be refactored + self.infeed = 0 + self.inentry = 0 + self.incontent = 0 + self.intextinput = 0 + self.inimage = 0 + self.inauthor = 0 + self.incontributor = 0 + self.inpublisher = 0 + self.insource = 0 + + self.sourcedata = FeedParserDict() + self.contentparams = FeedParserDict() + self._summaryKey = None + self.namespacemap = {} + self.elementstack = [] + self.basestack = [] + self.langstack = [] + self.svgOK = 0 + self.title_depth = -1 + self.depth = 0 + if self.lang: + self.feeddata['language'] = self.lang.replace('_', '-') + + # A map of the following form: + # { + # object_that_value_is_set_on: { + # property_name: depth_of_node_property_was_extracted_from, + # other_property: depth_of_node_property_was_extracted_from, + # }, + # } + self.property_depth_map = {} + super(_FeedParserMixin, self).__init__() + + def _normalize_attributes(self, kv): + raise NotImplementedError + + def unknown_starttag(self, tag, attrs): + # increment depth counter + self.depth += 1 + + # normalize attrs + attrs = [self._normalize_attributes(attr) for attr in attrs] + + # track xml:base and xml:lang + attrs_d = dict(attrs) + baseuri = attrs_d.get('xml:base', attrs_d.get('base')) or self.baseuri + if isinstance(baseuri, bytes_): + baseuri = baseuri.decode(self.encoding, 'ignore') + # ensure that self.baseuri is always an absolute URI that + # uses a whitelisted URI scheme (e.g. 
not `javscript:`) + if self.baseuri: + self.baseuri = make_safe_absolute_uri(self.baseuri, baseuri) or self.baseuri + else: + self.baseuri = _urljoin(self.baseuri, baseuri) + lang = attrs_d.get('xml:lang', attrs_d.get('lang')) + if lang == '': + # xml:lang could be explicitly set to '', we need to capture that + lang = None + elif lang is None: + # if no xml:lang is specified, use parent lang + lang = self.lang + if lang: + if tag in ('feed', 'rss', 'rdf:RDF'): + self.feeddata['language'] = lang.replace('_', '-') + self.lang = lang + self.basestack.append(self.baseuri) + self.langstack.append(lang) + + # track namespaces + for prefix, uri in attrs: + if prefix.startswith('xmlns:'): + self.track_namespace(prefix[6:], uri) + elif prefix == 'xmlns': + self.track_namespace(None, uri) + + # track inline content + if self.incontent and not self.contentparams.get('type', 'xml').endswith('xml'): + if tag in ('xhtml:div', 'div'): + return # typepad does this 10/2007 + # element declared itself as escaped markup, but it isn't really + self.contentparams['type'] = 'application/xhtml+xml' + if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml': + if tag.find(':') != -1: + prefix, tag = tag.split(':', 1) + namespace = self.namespaces_in_use.get(prefix, '') + if tag == 'math' and namespace == 'http://www.w3.org/1998/Math/MathML': + attrs.append(('xmlns', namespace)) + if tag == 'svg' and namespace == 'http://www.w3.org/2000/svg': + attrs.append(('xmlns', namespace)) + if tag == 'svg': + self.svgOK += 1 + return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0) + + # match namespaces + if tag.find(':') != -1: + prefix, suffix = tag.split(':', 1) + else: + prefix, suffix = '', tag + prefix = self.namespacemap.get(prefix, prefix) + if prefix: + prefix = prefix + '_' + + # Special hack for better tracking of empty textinput/image elements in + # illformed feeds. + if (not prefix) and tag not in ('title', 'link', 'description', 'name'): + self.intextinput = 0 + if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'): + self.inimage = 0 + + # call special handler (if defined) or default handler + methodname = '_start_' + prefix + suffix + try: + method = getattr(self, methodname) + return method(attrs_d) + except AttributeError: + # Since there's no handler or something has gone wrong we + # explicitly add the element and its attributes. 
+            unknown_tag = prefix + suffix
+            if len(attrs_d) == 0:
+                # No attributes so merge it into the enclosing dictionary
+                return self.push(unknown_tag, 1)
+            else:
+                # Has attributes so create it in its own dictionary
+                context = self._get_context()
+                context[unknown_tag] = attrs_d
+
+    def unknown_endtag(self, tag):
+        # match namespaces
+        if tag.find(':') != -1:
+            prefix, suffix = tag.split(':', 1)
+        else:
+            prefix, suffix = '', tag
+        prefix = self.namespacemap.get(prefix, prefix)
+        if prefix:
+            prefix = prefix + '_'
+        if suffix == 'svg' and self.svgOK:
+            self.svgOK -= 1
+
+        # call special handler (if defined) or default handler
+        methodname = '_end_' + prefix + suffix
+        try:
+            if self.svgOK:
+                raise AttributeError()
+            method = getattr(self, methodname)
+            method()
+        except AttributeError:
+            self.pop(prefix + suffix)
+
+        # track inline content
+        if self.incontent and not self.contentparams.get('type', 'xml').endswith('xml'):
+            # element declared itself as escaped markup, but it isn't really
+            if tag in ('xhtml:div', 'div'):
+                return  # typepad does this 10/2007
+            self.contentparams['type'] = 'application/xhtml+xml'
+        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
+            tag = tag.split(':')[-1]
+            self.handle_data('</%s>' % tag, escape=0)
+
+        # track xml:base and xml:lang going out of scope
+        if self.basestack:
+            self.basestack.pop()
+            if self.basestack and self.basestack[-1]:
+                self.baseuri = self.basestack[-1]
+        if self.langstack:
+            self.langstack.pop()
+            if self.langstack:  # and (self.langstack[-1] is not None):
+                self.lang = self.langstack[-1]
+
+        self.depth -= 1
+
+    def handle_charref(self, ref):
+        # Called for each character reference, e.g. for '&#160;', ref is '160'
+        if not self.elementstack:
+            return
+        ref = ref.lower()
+        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
+            text = '&#%s;' % ref
+        else:
+            if ref[0] == 'x':
+                c = int(ref[1:], 16)
+            else:
+                c = int(ref)
+            text = chr(c).encode('utf-8')
+        self.elementstack[-1][2].append(text)
+
+    def handle_entityref(self, ref):
+        # Called for each entity reference, e.g. for '&copy;', ref is 'copy'
+        if not self.elementstack:
+            return
+        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
+            text = '&%s;' % ref
+        elif ref in self.entities:
+            text = self.entities[ref]
+            if text.startswith('&#') and text.endswith(';'):
+                return self.handle_entityref(text)
+        else:
+            try:
+                name2codepoint[ref]
+            except KeyError:
+                text = '&%s;' % ref
+            else:
+                text = chr(name2codepoint[ref]).encode('utf-8')
+        self.elementstack[-1][2].append(text)
+
+    def handle_data(self, text, escape=1):
+        # Called for each block of plain text, i.e. outside of any tag and
+        # not containing any character or entity references
+        if not self.elementstack:
+            return
+        if escape and self.contentparams.get('type') == 'application/xhtml+xml':
+            text = _xmlescape(text)
+        self.elementstack[-1][2].append(text)
+
+    def handle_comment(self, text):
+        # Called for each comment, e.g. <!-- insert message here -->
+        pass
+
+    def handle_pi(self, text):
+        # Called for each processing instruction, e.g. <?instruction>
+        pass
+
+    def handle_decl(self, text):
+        pass
+
+    def parse_declaration(self, i):
+        # Override internal declaration handler to handle CDATA blocks.
+        if self.rawdata[i:i+9] == '<![CDATA[':
+            k = self.rawdata.find(']]>', i)
+            if k == -1:
+                # CDATA block began but didn't finish
+                k = len(self.rawdata)
+                return k
+            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
+            return k+3
+        else:
+            k = self.rawdata.find('>', i)
+            if k >= 0:
+                return k+1
+            else:
+                # We have an incomplete CDATA block.
+                return k
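
The CDATA branch in parse_declaration() above re-emits the block's payload as escaped character data rather than markup; stripped of parser state, the transformation is just:

    from xml.sax.saxutils import escape

    raw = '<![CDATA[<b>AT&T</b>]]>'
    payload = raw[9:-3]      # '<b>AT&T</b>' (strip '<![CDATA[' and ']]>')
    print(escape(payload))   # &lt;b&gt;AT&amp;T&lt;/b&gt;
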
+
+    @staticmethod
+    def map_content_type(content_type):
+        content_type = content_type.lower()
+        if content_type == 'text' or content_type == 'plain':
+            content_type = 'text/plain'
+        elif content_type == 'html':
+            content_type = 'text/html'
+        elif content_type == 'xhtml':
+            content_type = 'application/xhtml+xml'
+        return content_type
+
+    def track_namespace(self, prefix, uri):
+        loweruri = uri.lower()
+        if not self.version:
+            if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/'):
+                self.version = 'rss090'
+            elif loweruri == 'http://purl.org/rss/1.0/':
+                self.version = 'rss10'
+            elif loweruri == 'http://www.w3.org/2005/atom':
+                self.version = 'atom10'
+        if loweruri.find('backend.userland.com/rss') != -1:
+            # match any backend.userland.com namespace
+            uri = 'http://backend.userland.com/rss'
+            loweruri = uri
+        if loweruri in self._matchnamespaces:
+            self.namespacemap[prefix] = self._matchnamespaces[loweruri]
+            self.namespaces_in_use[self._matchnamespaces[loweruri]] = uri
+        else:
+            self.namespaces_in_use[prefix or ''] = uri
+
+    def resolve_uri(self, uri):
+        return _urljoin(self.baseuri or '', uri)
+
+    @staticmethod
+    def decode_entities(element, data):
+        return data
+
+    @staticmethod
+    def strattrs(attrs):
+        return ''.join(
+            ' %s="%s"' % (t[0], _xmlescape(t[1], {'"': '&quot;'}))
+            for t in attrs
+        )
+
+    def push(self, element, expecting_text):
+        self.elementstack.append([element, expecting_text, []])
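
push() and pop() implement a small buffer stack: each open element gets a [name, expecting_text, pieces] frame, handle_data() appends to the top frame, and pop() (below) joins and post-processes it. A toy model of the flow, using hypothetical literal values:

    stack = []
    stack.append(['title', 1, []])    # push('title', expecting_text=1)
    stack[-1][2].append('Hello ')     # handle_data buffers text pieces
    stack[-1][2].append('world')
    element, expecting_text, pieces = stack.pop()
    print(''.join(pieces).strip())    # Hello world
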
+
+    def pop(self, element, strip_whitespace=1):
+        if not self.elementstack:
+            return
+        if self.elementstack[-1][0] != element:
+            return
+
+        element, expecting_text, pieces = self.elementstack.pop()
+
+        # Ensure each piece is a str for Python 3
+        for (i, v) in enumerate(pieces):
+            if isinstance(v, bytes_):
+                pieces[i] = v.decode('utf-8')
+
+        if self.version == 'atom10' and self.contentparams.get('type', 'text') == 'application/xhtml+xml':
+            # remove enclosing child element, but only if it is a <div> and
+            # only if all the remaining content is nested underneath it.
+            # This means that the divs would be retained in the following:
+            #    <div>foo</div><div>bar</div>
+            while pieces and len(pieces) > 1 and not pieces[-1].strip():
+                del pieces[-1]
+            while pieces and len(pieces) > 1 and not pieces[0].strip():
+                del pieces[0]
+            if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1] == '</div>':
+                depth = 0
+                for piece in pieces[:-1]:
+                    if piece.startswith('</'):
+                        depth -= 1
+                        if depth == 0:
+                            break
+                    elif piece.startswith('<') and not piece.endswith('/>'):
+                        depth += 1
+                else:
+                    pieces = pieces[1:-1]
+
+        output = ''.join(pieces)
+        if strip_whitespace:
+            output = output.strip()
+        if not expecting_text:
+            return output
+
+        # decode base64 content
+        if base64 and self.contentparams.get('base64', 0):
+            try:
+                output = _base64decode(output)
+            except binascii.Error:
+                pass
+            except binascii.Incomplete:
+                pass
+            except TypeError:
+                # In Python 3, base64 takes and outputs bytes, not str
+                # This may not be the most correct way to accomplish this
+                output = _base64decode(output.encode('utf-8')).decode('utf-8')
+
+        # resolve relative URIs
+        if (element in self.can_be_relative_uri) and output:
+            # do not resolve guid elements with isPermalink="false"
+            if not element == 'id' or self.guidislink:
+                output = self.resolve_uri(output)
+
+        # decode entities within embedded markup
+        if not self.contentparams.get('base64', 0):
+            output = self.decode_entities(element, output)
+
+        # some feed formats require consumers to guess
+        # whether the content is html or plain text
+        if not self.version.startswith('atom') and self.contentparams.get('type') == 'text/plain':
+            if self.looks_like_html(output):
+                self.contentparams['type'] = 'text/html'
+
+        # remove temporary cruft from contentparams
+        try:
+            del self.contentparams['mode']
+        except KeyError:
+            pass
+        try:
+            del self.contentparams['base64']
+        except KeyError:
+            pass
+
+        is_htmlish = self.map_content_type(self.contentparams.get('type', 'text/html')) in self.html_types
+        # resolve relative URIs within embedded markup
+        if is_htmlish and self.resolve_relative_uris:
+            if element in self.can_contain_relative_uris:
+                output = resolve_relative_uris(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
+
+        # sanitize embedded markup
+        if is_htmlish and self.sanitize_html:
+            if element in self.can_contain_dangerous_markup:
+                output = _sanitize_html(output, self.encoding, self.contentparams.get('type', 'text/html'))
+
+        if self.encoding and isinstance(output, bytes_):
+            output = output.decode(self.encoding, 'ignore')
+
+        # address common error where people take data that is already
+        # utf-8, presume that it is iso-8859-1, and re-encode it.
+        if self.encoding in ('utf-8', 'utf-8_INVALID_PYTHON_3') and not isinstance(output, bytes_):
+            try:
+                output = output.encode('iso-8859-1').decode('utf-8')
+            except (UnicodeEncodeError, UnicodeDecodeError):
+                pass
+
+        # map win-1252 extensions to the proper code points
+        if not isinstance(output, bytes_):
+            output = output.translate(_cp1252)
+
+        # categories/tags/keywords/whatever are handled in _end_category or
+        # _end_tags or _end_itunes_keywords
+        if element in ('category', 'tags', 'itunes_keywords'):
+            return output
+
+        if element == 'title' and -1 < self.title_depth <= self.depth:
+            return output
+
+        # store output in appropriate place(s)
+        if self.inentry and not self.insource:
+            if element == 'content':
+                self.entries[-1].setdefault(element, [])
+                contentparams = copy.deepcopy(self.contentparams)
+                contentparams['value'] = output
+                self.entries[-1][element].append(contentparams)
+            elif element == 'link':
+                if not self.inimage:
+                    # query variables in urls in link elements are improperly
+                    # converted from `?a=1&b=2` to `?a=1&b;=2` as if they're
+                    # unhandled character references. fix this special case.
+                    output = output.replace('&amp;', '&')
+                    output = re.sub("&([A-Za-z0-9_]+);", r"&\g<1>", output)
+                    self.entries[-1][element] = output
+                    if output:
+                        self.entries[-1]['links'][-1]['href'] = output
+            else:
+                if element == 'description':
+                    element = 'summary'
+                old_value_depth = self.property_depth_map.setdefault(self.entries[-1], {}).get(element)
+                if old_value_depth is None or self.depth <= old_value_depth:
+                    self.property_depth_map[self.entries[-1]][element] = self.depth
+                    self.entries[-1][element] = output
+            if self.incontent:
+                contentparams = copy.deepcopy(self.contentparams)
+                contentparams['value'] = output
+                self.entries[-1][element + '_detail'] = contentparams
+        elif self.infeed or self.insource:  # and (not self.intextinput) and (not self.inimage):
+            context = self._get_context()
+            if element == 'description':
+                element = 'subtitle'
+            context[element] = output
+            if element == 'link':
+                # fix query variables; see above for the explanation
+                output = re.sub("&([A-Za-z0-9_]+);", r"&\g<1>", output)
+                context[element] = output
+                context['links'][-1]['href'] = output
+            elif self.incontent:
+                contentparams = copy.deepcopy(self.contentparams)
+                contentparams['value'] = output
+                context[element + '_detail'] = contentparams
+        return output
+
+    def push_content(self, tag, attrs_d, default_content_type, expecting_text):
+        self.incontent += 1
+        if self.lang:
+            self.lang = self.lang.replace('_', '-')
+        self.contentparams = FeedParserDict({
+            'type': self.map_content_type(attrs_d.get('type', default_content_type)),
+            'language': self.lang,
+            'base': self.baseuri})
+        self.contentparams['base64'] = self._is_base64(attrs_d, self.contentparams)
+        self.push(tag, expecting_text)
+
+    def pop_content(self, tag):
+        value = self.pop(tag)
+        self.incontent -= 1
+        self.contentparams.clear()
+        return value
+
+    # a number of elements in a number of RSS variants are nominally plain
+    # text, but this is routinely ignored.  This is an attempt to detect
+    # the most common cases.  As false positives often result in silent
+    # data loss, this function errs on the conservative side.
+    @staticmethod
+    def looks_like_html(s):
+        """
+        :type s: str
+        :rtype: bool
+        """
+
+        # must have a close tag or an entity reference to qualify
+        if not (re.search(r'</(\w+)>', s) or re.search(r'&#?\w+;', s)):
+            return False
+
+        # all tags must be in a restricted subset of valid HTML tags
+        if any((t for t in re.findall(r'</?(\w+)', s)
+                if t.lower() not in _HTMLSanitizer.acceptable_elements)):
+            return False
+
+        # all entities must have been defined as valid HTML entities
+        if any((e for e in re.findall(r'&(\w+);', s) if e not in entitydefs)):
+            return False
+
+        return True
+
+    def _map_to_standard_prefix(self, name):
+        colonpos = name.find(':')
+        if colonpos != -1:
+            prefix = name[:colonpos]
+            suffix = name[colonpos+1:]
+            prefix = self.namespacemap.get(prefix, prefix)
+            name = prefix + ':' + suffix
+        return name
+
+    def _get_attribute(self, attrs_d, name):
+        return attrs_d.get(self._map_to_standard_prefix(name))
+
+    def _is_base64(self, attrs_d, contentparams):
+        if attrs_d.get('mode', '') == 'base64':
+            return 1
+        if self.contentparams['type'].startswith('text/'):
+            return 0
+        if self.contentparams['type'].endswith('+xml'):
+            return 0
+        if self.contentparams['type'].endswith('/xml'):
+            return 0
+        return 1
+
+    @staticmethod
+    def _enforce_href(attrs_d):
+        href = attrs_d.get('url', attrs_d.get('uri', attrs_d.get('href', None)))
+        if href:
+            try:
+                del attrs_d['url']
+            except KeyError:
+                pass
+            try:
+                del attrs_d['uri']
+            except KeyError:
+                pass
+            attrs_d['href'] = href
+        return attrs_d
+
+    def _save(self, key, value, overwrite=False):
+        context = self._get_context()
+        if overwrite:
+            context[key] = value
+        else:
+            context.setdefault(key, value)
+
+    def _get_context(self):
+        if self.insource:
+            context = self.sourcedata
+        elif self.inimage and 'image' in self.feeddata:
+            context = self.feeddata['image']
+        elif self.intextinput:
+            context = self.feeddata['textinput']
+        elif self.inentry:
+            context = self.entries[-1]
+        else:
+            context = self.feeddata
+        return context
+
+    def _save_author(self, key, value, prefix='author'):
+        context = self._get_context()
+        context.setdefault(prefix + '_detail', FeedParserDict())
+        context[prefix + '_detail'][key] = value
+        self._sync_author_detail()
+        context.setdefault('authors', [FeedParserDict()])
+        context['authors'][-1][key] = value
+
+    def _save_contributor(self, key, value):
+        context = self._get_context()
+        context.setdefault('contributors', [FeedParserDict()])
+        context['contributors'][-1][key] = value
+
+    def _sync_author_detail(self, key='author'):
+        context = self._get_context()
+        detail = context.get('%s_detail' % key)
+        if detail:
+            name = detail.get('name')
+            email = detail.get('email')
+            if name and email:
+                context[key] = '%s (%s)' % (name, email)
+            elif name:
+                context[key] = name
+            elif email:
+                context[key] = email
+        else:
+            author, email = context.get(key), None
+            if not author:
+                return
+            detail = FeedParserDict()
+            emailmatch = re.search(
+                r'(([a-zA-Z0-9_.+-]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?',
+                author)
+            if emailmatch:
+                email = emailmatch.group(0)
+                # probably a better way to do the following, but it passes
+                # all the tests
+                author = author.replace(email, '')
+                author = author.replace('()', '')
+                author = author.replace('<>', '')
+                author = author.replace('&lt;&gt;', '')
+                author = author.strip()
+                if author and (author[0] == '('):
+                    author = author[1:]
+                if author and (author[-1] == ')'):
+                    author = author[:-1]
+            author = author.strip()
+            if author or email:
+                context.setdefault('%s_detail' % key, detail)
+            if author:
+                detail['name'] = author
+            if email:
+                detail['email'] = email
+
+    def _add_tag(self, term, scheme, label):
+        context = self._get_context()
+        tags = context.setdefault('tags', [])
+        if (not term) and (not scheme) and (not label):
+            return
+        value = FeedParserDict(term=term, scheme=scheme, label=label)
+        if value not in tags:
+            tags.append(value)
+
+    def _start_tags(self, attrs_d):
+        # This is a completely-made up element. Its semantics are determined
+        # only by a single feed that precipitated bug report 392 on Google Code.
+        # In short, this is junk code.
+ self.push('tags', 1) + + def _end_tags(self): + for term in self.pop('tags').split(','): + self._add_tag(term.strip(), None, None) diff --git a/modules/feedparser/namespaces/__init__.py b/modules/feedparser/namespaces/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modules/feedparser/namespaces/_base.py b/modules/feedparser/namespaces/_base.py new file mode 100644 index 000000000..3b3b21c32 --- /dev/null +++ b/modules/feedparser/namespaces/_base.py @@ -0,0 +1,506 @@ +# Support for the Atom, RSS, RDF, and CDF feed formats +# Copyright 2010-2019 Kurt McKee +# Copyright 2002-2008 Mark Pilgrim +# All rights reserved. +# +# This file is a part of feedparser. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from __future__ import absolute_import +from __future__ import unicode_literals + +import copy + +from ..datetimes import _parse_date +from ..urls import make_safe_absolute_uri +from ..util import FeedParserDict + + +class Namespace(object): + """Support for the Atom, RSS, RDF, and CDF feed formats. + + The feed formats all share common elements, some of which have conflicting + interpretations. For simplicity, all of the base feed format support is + collected here. + """ + + supported_namespaces = { + '': '', + 'http://backend.userland.com/rss': '', + 'http://blogs.law.harvard.edu/tech/rss': '', + 'http://purl.org/rss/1.0/': '', + 'http://my.netscape.com/rdf/simple/0.9/': '', + 'http://example.com/newformat#': '', + 'http://example.com/necho': '', + 'http://purl.org/echo/': '', + 'uri/of/echo/namespace#': '', + 'http://purl.org/pie/': '', + 'http://purl.org/atom/ns#': '', + 'http://www.w3.org/2005/Atom': '', + 'http://purl.org/rss/1.0/modules/rss091#': '', + } + + def _start_rss(self, attrs_d): + versionmap = { + '0.91': 'rss091u', + '0.92': 'rss092', + '0.93': 'rss093', + '0.94': 'rss094', + } + + # If we're here then this is an RSS feed. + # If we don't have a version or have a version that starts with something + # other than RSS then there's been a mistake. Correct it. 
+ if not self.version or not self.version.startswith('rss'): + attr_version = attrs_d.get('version', '') + version = versionmap.get(attr_version) + if version: + self.version = version + elif attr_version.startswith('2.'): + self.version = 'rss20' + else: + self.version = 'rss' + + def _start_channel(self, attrs_d): + self.infeed = 1 + self._cdf_common(attrs_d) + + def _cdf_common(self, attrs_d): + if 'lastmod' in attrs_d: + self._start_modified({}) + self.elementstack[-1][-1] = attrs_d['lastmod'] + self._end_modified() + if 'href' in attrs_d: + self._start_link({}) + self.elementstack[-1][-1] = attrs_d['href'] + self._end_link() + + def _start_feed(self, attrs_d): + self.infeed = 1 + versionmap = {'0.1': 'atom01', + '0.2': 'atom02', + '0.3': 'atom03'} + if not self.version: + attr_version = attrs_d.get('version') + version = versionmap.get(attr_version) + if version: + self.version = version + else: + self.version = 'atom' + + def _end_channel(self): + self.infeed = 0 + _end_feed = _end_channel + + def _start_image(self, attrs_d): + context = self._get_context() + if not self.inentry: + context.setdefault('image', FeedParserDict()) + self.inimage = 1 + self.title_depth = -1 + self.push('image', 0) + + def _end_image(self): + self.pop('image') + self.inimage = 0 + + def _start_textinput(self, attrs_d): + context = self._get_context() + context.setdefault('textinput', FeedParserDict()) + self.intextinput = 1 + self.title_depth = -1 + self.push('textinput', 0) + _start_textInput = _start_textinput + + def _end_textinput(self): + self.pop('textinput') + self.intextinput = 0 + _end_textInput = _end_textinput + + def _start_author(self, attrs_d): + self.inauthor = 1 + self.push('author', 1) + # Append a new FeedParserDict when expecting an author + context = self._get_context() + context.setdefault('authors', []) + context['authors'].append(FeedParserDict()) + _start_managingeditor = _start_author + + def _end_author(self): + self.pop('author') + self.inauthor = 0 + self._sync_author_detail() + _end_managingeditor = _end_author + + def _start_contributor(self, attrs_d): + self.incontributor = 1 + context = self._get_context() + context.setdefault('contributors', []) + context['contributors'].append(FeedParserDict()) + self.push('contributor', 0) + + def _end_contributor(self): + self.pop('contributor') + self.incontributor = 0 + + def _start_name(self, attrs_d): + self.push('name', 0) + + def _end_name(self): + value = self.pop('name') + if self.inpublisher: + self._save_author('name', value, 'publisher') + elif self.inauthor: + self._save_author('name', value) + elif self.incontributor: + self._save_contributor('name', value) + elif self.intextinput: + context = self._get_context() + context['name'] = value + + def _start_width(self, attrs_d): + self.push('width', 0) + + def _end_width(self): + value = self.pop('width') + try: + value = int(value) + except ValueError: + value = 0 + if self.inimage: + context = self._get_context() + context['width'] = value + + def _start_height(self, attrs_d): + self.push('height', 0) + + def _end_height(self): + value = self.pop('height') + try: + value = int(value) + except ValueError: + value = 0 + if self.inimage: + context = self._get_context() + context['height'] = value + + def _start_url(self, attrs_d): + self.push('href', 1) + _start_homepage = _start_url + _start_uri = _start_url + + def _end_url(self): + value = self.pop('href') + if self.inauthor: + self._save_author('href', value) + elif self.incontributor: + self._save_contributor('href', value) 
+ _end_homepage = _end_url + _end_uri = _end_url + + def _start_email(self, attrs_d): + self.push('email', 0) + + def _end_email(self): + value = self.pop('email') + if self.inpublisher: + self._save_author('email', value, 'publisher') + elif self.inauthor: + self._save_author('email', value) + elif self.incontributor: + self._save_contributor('email', value) + + def _start_subtitle(self, attrs_d): + self.push_content('subtitle', attrs_d, 'text/plain', 1) + _start_tagline = _start_subtitle + + def _end_subtitle(self): + self.pop_content('subtitle') + _end_tagline = _end_subtitle + + def _start_rights(self, attrs_d): + self.push_content('rights', attrs_d, 'text/plain', 1) + _start_copyright = _start_rights + + def _end_rights(self): + self.pop_content('rights') + _end_copyright = _end_rights + + def _start_item(self, attrs_d): + self.entries.append(FeedParserDict()) + self.push('item', 0) + self.inentry = 1 + self.guidislink = 0 + self.title_depth = -1 + id = self._get_attribute(attrs_d, 'rdf:about') + if id: + context = self._get_context() + context['id'] = id + self._cdf_common(attrs_d) + _start_entry = _start_item + + def _end_item(self): + self.pop('item') + self.inentry = 0 + _end_entry = _end_item + + def _start_language(self, attrs_d): + self.push('language', 1) + + def _end_language(self): + self.lang = self.pop('language') + + def _start_webmaster(self, attrs_d): + self.push('publisher', 1) + + def _end_webmaster(self): + self.pop('publisher') + self._sync_author_detail('publisher') + + def _start_published(self, attrs_d): + self.push('published', 1) + _start_issued = _start_published + _start_pubdate = _start_published + + def _end_published(self): + value = self.pop('published') + self._save('published_parsed', _parse_date(value), overwrite=True) + _end_issued = _end_published + _end_pubdate = _end_published + + def _start_updated(self, attrs_d): + self.push('updated', 1) + _start_modified = _start_updated + _start_lastbuilddate = _start_updated + + def _end_updated(self): + value = self.pop('updated') + parsed_value = _parse_date(value) + self._save('updated_parsed', parsed_value, overwrite=True) + _end_modified = _end_updated + _end_lastbuilddate = _end_updated + + def _start_created(self, attrs_d): + self.push('created', 1) + + def _end_created(self): + value = self.pop('created') + self._save('created_parsed', _parse_date(value), overwrite=True) + + def _start_expirationdate(self, attrs_d): + self.push('expired', 1) + + def _end_expirationdate(self): + self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True) + + def _start_category(self, attrs_d): + term = attrs_d.get('term') + scheme = attrs_d.get('scheme', attrs_d.get('domain')) + label = attrs_d.get('label') + self._add_tag(term, scheme, label) + self.push('category', 1) + _start_keywords = _start_category + + def _end_category(self): + value = self.pop('category') + if not value: + return + context = self._get_context() + tags = context['tags'] + if value and len(tags) and not tags[-1]['term']: + tags[-1]['term'] = value + else: + self._add_tag(value, None, None) + _end_keywords = _end_category + + def _start_cloud(self, attrs_d): + self._get_context()['cloud'] = FeedParserDict(attrs_d) + + def _start_link(self, attrs_d): + attrs_d.setdefault('rel', 'alternate') + if attrs_d['rel'] == 'self': + attrs_d.setdefault('type', 'application/atom+xml') + else: + attrs_d.setdefault('type', 'text/html') + context = self._get_context() + attrs_d = self._enforce_href(attrs_d) + if 'href' in attrs_d: + 
attrs_d['href'] = self.resolve_uri(attrs_d['href']) + expecting_text = self.infeed or self.inentry or self.insource + context.setdefault('links', []) + if not (self.inentry and self.inimage): + context['links'].append(FeedParserDict(attrs_d)) + if 'href' in attrs_d: + if ( + attrs_d.get('rel') == 'alternate' + and self.map_content_type(attrs_d.get('type')) in self.html_types + ): + context['link'] = attrs_d['href'] + else: + self.push('link', expecting_text) + + def _end_link(self): + self.pop('link') + + def _start_guid(self, attrs_d): + self.guidislink = (attrs_d.get('ispermalink', 'true') == 'true') + self.push('id', 1) + _start_id = _start_guid + + def _end_guid(self): + value = self.pop('id') + self._save('guidislink', self.guidislink and 'link' not in self._get_context()) + if self.guidislink: + # guid acts as link, but only if 'ispermalink' is not present or is 'true', + # and only if the item doesn't already have a link element + self._save('link', value) + _end_id = _end_guid + + def _start_title(self, attrs_d): + if self.svgOK: + return self.unknown_starttag('title', list(attrs_d.items())) + self.push_content('title', attrs_d, 'text/plain', self.infeed or self.inentry or self.insource) + + def _end_title(self): + if self.svgOK: + return + value = self.pop_content('title') + if not value: + return + self.title_depth = self.depth + + def _start_description(self, attrs_d): + context = self._get_context() + if 'summary' in context: + self._summaryKey = 'content' + self._start_content(attrs_d) + else: + self.push_content('description', attrs_d, 'text/html', self.infeed or self.inentry or self.insource) + + def _start_abstract(self, attrs_d): + self.push_content('description', attrs_d, 'text/plain', self.infeed or self.inentry or self.insource) + + def _end_description(self): + if self._summaryKey == 'content': + self._end_content() + else: + self.pop_content('description') + self._summaryKey = None + _end_abstract = _end_description + + def _start_info(self, attrs_d): + self.push_content('info', attrs_d, 'text/plain', 1) + _start_feedburner_browserfriendly = _start_info + + def _end_info(self): + self.pop_content('info') + _end_feedburner_browserfriendly = _end_info + + def _start_generator(self, attrs_d): + if attrs_d: + attrs_d = self._enforce_href(attrs_d) + if 'href' in attrs_d: + attrs_d['href'] = self.resolve_uri(attrs_d['href']) + self._get_context()['generator_detail'] = FeedParserDict(attrs_d) + self.push('generator', 1) + + def _end_generator(self): + value = self.pop('generator') + context = self._get_context() + if 'generator_detail' in context: + context['generator_detail']['name'] = value + + def _start_summary(self, attrs_d): + context = self._get_context() + if 'summary' in context: + self._summaryKey = 'content' + self._start_content(attrs_d) + else: + self._summaryKey = 'summary' + self.push_content(self._summaryKey, attrs_d, 'text/plain', 1) + + def _end_summary(self): + if self._summaryKey == 'content': + self._end_content() + else: + self.pop_content(self._summaryKey or 'summary') + self._summaryKey = None + + def _start_enclosure(self, attrs_d): + attrs_d = self._enforce_href(attrs_d) + context = self._get_context() + attrs_d['rel'] = 'enclosure' + context.setdefault('links', []).append(FeedParserDict(attrs_d)) + + def _start_source(self, attrs_d): + if 'url' in attrs_d: + # This means that we're processing a source element from an RSS 2.0 feed + self.sourcedata['href'] = attrs_d['url'] + self.push('source', 1) + self.insource = 1 + self.title_depth = -1 + + def 
_end_source(self): + self.insource = 0 + value = self.pop('source') + if value: + self.sourcedata['title'] = value + self._get_context()['source'] = copy.deepcopy(self.sourcedata) + self.sourcedata.clear() + + def _start_content(self, attrs_d): + self.push_content('content', attrs_d, 'text/plain', 1) + src = attrs_d.get('src') + if src: + self.contentparams['src'] = src + self.push('content', 1) + + def _start_body(self, attrs_d): + self.push_content('content', attrs_d, 'application/xhtml+xml', 1) + _start_xhtml_body = _start_body + + def _start_content_encoded(self, attrs_d): + self.push_content('content', attrs_d, 'text/html', 1) + _start_fullitem = _start_content_encoded + + def _end_content(self): + copyToSummary = self.map_content_type(self.contentparams.get('type')) in ({'text/plain'} | self.html_types) + value = self.pop_content('content') + if copyToSummary: + self._save('summary', value) + + _end_body = _end_content + _end_xhtml_body = _end_content + _end_content_encoded = _end_content + _end_fullitem = _end_content + + def _start_newlocation(self, attrs_d): + self.push('newlocation', 1) + + def _end_newlocation(self): + url = self.pop('newlocation') + context = self._get_context() + # don't set newlocation if the context isn't right + if context is not self.feeddata: + return + context['newlocation'] = make_safe_absolute_uri(self.baseuri, url.strip()) diff --git a/modules/feedparser/namespaces/admin.py b/modules/feedparser/namespaces/admin.py new file mode 100644 index 000000000..c870cae03 --- /dev/null +++ b/modules/feedparser/namespaces/admin.py @@ -0,0 +1,56 @@ +# Support for the administrative elements extension +# Copyright 2010-2019 Kurt McKee +# Copyright 2002-2008 Mark Pilgrim +# All rights reserved. +# +# This file is a part of feedparser. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
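+
+# Illustrative note: given a hypothetical element such as
+# <admin:generatorAgent rdf:resource="http://example.com/generator"/>,
+# the handler below records the rdf:resource value as
+# generator_detail = FeedParserDict({'href': 'http://example.com/generator'}).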
+ +from __future__ import absolute_import +from __future__ import unicode_literals + +from ..util import FeedParserDict + + +class Namespace(object): + # RDF Site Summary 1.0 Modules: Administrative + # http://web.resource.org/rss/1.0/modules/admin/ + + supported_namespaces = { + 'http://webns.net/mvcb/': 'admin', + } + + def _start_admin_generatoragent(self, attrs_d): + self.push('generator', 1) + value = self._get_attribute(attrs_d, 'rdf:resource') + if value: + self.elementstack[-1][2].append(value) + self.pop('generator') + self._get_context()['generator_detail'] = FeedParserDict({'href': value}) + + def _start_admin_errorreportsto(self, attrs_d): + self.push('errorreportsto', 1) + value = self._get_attribute(attrs_d, 'rdf:resource') + if value: + self.elementstack[-1][2].append(value) + self.pop('errorreportsto') diff --git a/modules/feedparser/namespaces/cc.py b/modules/feedparser/namespaces/cc.py new file mode 100644 index 000000000..e9e4240e1 --- /dev/null +++ b/modules/feedparser/namespaces/cc.py @@ -0,0 +1,72 @@ +# Support for the Creative Commons licensing extensions +# Copyright 2010-2019 Kurt McKee +# Copyright 2002-2008 Mark Pilgrim +# All rights reserved. +# +# This file is a part of feedparser. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
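+
+# Illustrative note: a hypothetical element such as
+# <cc:license rdf:resource="http://creativecommons.org/licenses/by/4.0/"/>
+# surfaces in context['links'] as {'rel': 'license', 'href': '...'} via the
+# handlers below.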
+ +from __future__ import absolute_import +from __future__ import unicode_literals + +from ..util import FeedParserDict + + +class Namespace(object): + supported_namespaces = { + # RDF-based namespace + 'http://creativecommons.org/ns#license': 'cc', + + # Old RDF-based namespace + 'http://web.resource.org/cc/': 'cc', + + # RSS-based namespace + 'http://cyber.law.harvard.edu/rss/creativeCommonsRssModule.html': 'creativecommons', + + # Old RSS-based namespace + 'http://backend.userland.com/creativeCommonsRssModule': 'creativecommons', + } + + def _start_cc_license(self, attrs_d): + context = self._get_context() + value = self._get_attribute(attrs_d, 'rdf:resource') + attrs_d = FeedParserDict() + attrs_d['rel'] = 'license' + if value: + attrs_d['href'] = value + context.setdefault('links', []).append(attrs_d) + + def _start_creativecommons_license(self, attrs_d): + self.push('license', 1) + _start_creativeCommons_license = _start_creativecommons_license + + def _end_creativecommons_license(self): + value = self.pop('license') + context = self._get_context() + attrs_d = FeedParserDict() + attrs_d['rel'] = 'license' + if value: + attrs_d['href'] = value + context.setdefault('links', []).append(attrs_d) + del context['license'] + _end_creativeCommons_license = _end_creativecommons_license diff --git a/modules/feedparser/namespaces/dc.py b/modules/feedparser/namespaces/dc.py new file mode 100644 index 000000000..f31f97534 --- /dev/null +++ b/modules/feedparser/namespaces/dc.py @@ -0,0 +1,137 @@ +# Support for the Dublin Core metadata extensions +# Copyright 2010-2019 Kurt McKee +# Copyright 2002-2008 Mark Pilgrim +# All rights reserved. +# +# This file is a part of feedparser. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
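+
+# Illustrative note: the handlers below delegate to the core element handlers,
+# so a hypothetical <dc:creator>Jane Doe</dc:creator> is processed exactly like
+# an Atom author element, and <dcterms:modified> like an updated timestamp.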
+ +from __future__ import absolute_import +from __future__ import unicode_literals + +from ..datetimes import _parse_date +from ..util import FeedParserDict + + +class Namespace(object): + supported_namespaces = { + 'http://purl.org/dc/elements/1.1/': 'dc', + 'http://purl.org/dc/terms/': 'dcterms', + } + + def _end_dc_author(self): + self._end_author() + + def _end_dc_creator(self): + self._end_author() + + def _end_dc_date(self): + self._end_updated() + + def _end_dc_description(self): + self._end_description() + + def _end_dc_language(self): + self._end_language() + + def _end_dc_publisher(self): + self._end_webmaster() + + def _end_dc_rights(self): + self._end_rights() + + def _end_dc_subject(self): + self._end_category() + + def _end_dc_title(self): + self._end_title() + + def _end_dcterms_created(self): + self._end_created() + + def _end_dcterms_issued(self): + self._end_published() + + def _end_dcterms_modified(self): + self._end_updated() + + def _start_dc_author(self, attrs_d): + self._start_author(attrs_d) + + def _start_dc_creator(self, attrs_d): + self._start_author(attrs_d) + + def _start_dc_date(self, attrs_d): + self._start_updated(attrs_d) + + def _start_dc_description(self, attrs_d): + self._start_description(attrs_d) + + def _start_dc_language(self, attrs_d): + self._start_language(attrs_d) + + def _start_dc_publisher(self, attrs_d): + self._start_webmaster(attrs_d) + + def _start_dc_rights(self, attrs_d): + self._start_rights(attrs_d) + + def _start_dc_subject(self, attrs_d): + self._start_category(attrs_d) + + def _start_dc_title(self, attrs_d): + self._start_title(attrs_d) + + def _start_dcterms_created(self, attrs_d): + self._start_created(attrs_d) + + def _start_dcterms_issued(self, attrs_d): + self._start_published(attrs_d) + + def _start_dcterms_modified(self, attrs_d): + self._start_updated(attrs_d) + + def _start_dcterms_valid(self, attrs_d): + self.push('validity', 1) + + def _end_dcterms_valid(self): + for validity_detail in self.pop('validity').split(';'): + if '=' in validity_detail: + key, value = validity_detail.split('=', 1) + if key == 'start': + self._save('validity_start', value, overwrite=True) + self._save('validity_start_parsed', _parse_date(value), overwrite=True) + elif key == 'end': + self._save('validity_end', value, overwrite=True) + self._save('validity_end_parsed', _parse_date(value), overwrite=True) + + def _start_dc_contributor(self, attrs_d): + self.incontributor = 1 + context = self._get_context() + context.setdefault('contributors', []) + context['contributors'].append(FeedParserDict()) + self.push('name', 0) + + def _end_dc_contributor(self): + self._end_name() + self.incontributor = 0 diff --git a/modules/feedparser/namespaces/georss.py b/modules/feedparser/namespaces/georss.py new file mode 100644 index 000000000..0a6d91feb --- /dev/null +++ b/modules/feedparser/namespaces/georss.py @@ -0,0 +1,276 @@ +# Support for the GeoRSS format +# Copyright 2010-2019 Kurt McKee +# Copyright 2002-2008 Mark Pilgrim +# All rights reserved. +# +# This file is a part of feedparser. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from __future__ import absolute_import +from __future__ import unicode_literals + +from ..util import FeedParserDict + + +class Namespace(object): + supported_namespaces = { + 'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo', + 'http://www.georss.org/georss': 'georss', + 'http://www.opengis.net/gml': 'gml', + } + + def __init__(self): + self.ingeometry = 0 + super(Namespace, self).__init__() + + def _start_georssgeom(self, attrs_d): + self.push('geometry', 0) + context = self._get_context() + context['where'] = FeedParserDict() + + _start_georss_point = _start_georssgeom + _start_georss_line = _start_georssgeom + _start_georss_polygon = _start_georssgeom + _start_georss_box = _start_georssgeom + + def _save_where(self, geometry): + context = self._get_context() + context['where'].update(geometry) + + def _end_georss_point(self): + geometry = _parse_georss_point(self.pop('geometry')) + if geometry: + self._save_where(geometry) + + def _end_georss_line(self): + geometry = _parse_georss_line(self.pop('geometry')) + if geometry: + self._save_where(geometry) + + def _end_georss_polygon(self): + this = self.pop('geometry') + geometry = _parse_georss_polygon(this) + if geometry: + self._save_where(geometry) + + def _end_georss_box(self): + geometry = _parse_georss_box(self.pop('geometry')) + if geometry: + self._save_where(geometry) + + def _start_where(self, attrs_d): + self.push('where', 0) + context = self._get_context() + context['where'] = FeedParserDict() + _start_georss_where = _start_where + + def _parse_srs_attrs(self, attrs_d): + srs_name = attrs_d.get('srsname') + try: + srs_dimension = int(attrs_d.get('srsdimension', '2')) + except ValueError: + srs_dimension = 2 + context = self._get_context() + context['where']['srsName'] = srs_name + context['where']['srsDimension'] = srs_dimension + + def _start_gml_point(self, attrs_d): + self._parse_srs_attrs(attrs_d) + self.ingeometry = 1 + self.push('geometry', 0) + + def _start_gml_linestring(self, attrs_d): + self._parse_srs_attrs(attrs_d) + self.ingeometry = 'linestring' + self.push('geometry', 0) + + def _start_gml_polygon(self, attrs_d): + self._parse_srs_attrs(attrs_d) + self.push('geometry', 0) + + def _start_gml_exterior(self, attrs_d): + self.push('geometry', 0) + + def _start_gml_linearring(self, attrs_d): + self.ingeometry = 'polygon' + self.push('geometry', 0) + + def _start_gml_pos(self, attrs_d): + self.push('pos', 0) + + def _end_gml_pos(self): + this = self.pop('pos') + context = self._get_context() + srs_name = 
context['where'].get('srsName') + srs_dimension = context['where'].get('srsDimension', 2) + swap = True + if srs_name and "EPSG" in srs_name: + epsg = int(srs_name.split(":")[-1]) + swap = bool(epsg in _geogCS) + geometry = _parse_georss_point(this, swap=swap, dims=srs_dimension) + if geometry: + self._save_where(geometry) + + def _start_gml_poslist(self, attrs_d): + self.push('pos', 0) + + def _end_gml_poslist(self): + this = self.pop('pos') + context = self._get_context() + srs_name = context['where'].get('srsName') + srs_dimension = context['where'].get('srsDimension', 2) + swap = True + if srs_name and "EPSG" in srs_name: + epsg = int(srs_name.split(":")[-1]) + swap = bool(epsg in _geogCS) + geometry = _parse_poslist( + this, self.ingeometry, swap=swap, dims=srs_dimension) + if geometry: + self._save_where(geometry) + + def _end_geom(self): + self.ingeometry = 0 + self.pop('geometry') + _end_gml_point = _end_geom + _end_gml_linestring = _end_geom + _end_gml_linearring = _end_geom + _end_gml_exterior = _end_geom + _end_gml_polygon = _end_geom + + def _end_where(self): + self.pop('where') + _end_georss_where = _end_where + + +# GeoRSS geometry parsers. Each return a dict with 'type' and 'coordinates' +# items, or None in the case of a parsing error. + +def _parse_poslist(value, geom_type, swap=True, dims=2): + if geom_type == 'linestring': + return _parse_georss_line(value, swap, dims) + elif geom_type == 'polygon': + ring = _parse_georss_line(value, swap, dims) + return {'type': 'Polygon', 'coordinates': (ring['coordinates'],)} + else: + return None + + +def _gen_georss_coords(value, swap=True, dims=2): + # A generator of (lon, lat) pairs from a string of encoded GeoRSS + # coordinates. Converts to floats and swaps order. + latlons = (float(ll) for ll in value.replace(',', ' ').split()) + while True: + try: + t = [next(latlons), next(latlons)][::swap and -1 or 1] + if dims == 3: + t.append(next(latlons)) + yield tuple(t) + except StopIteration: + return + + +def _parse_georss_point(value, swap=True, dims=2): + # A point contains a single latitude-longitude pair, separated by + # whitespace. We'll also handle comma separators. + try: + coords = list(_gen_georss_coords(value, swap, dims)) + return {'type': 'Point', 'coordinates': coords[0]} + except (IndexError, ValueError): + return None + + +def _parse_georss_line(value, swap=True, dims=2): + # A line contains a space separated list of latitude-longitude pairs in + # WGS84 coordinate reference system, with each pair separated by + # whitespace. There must be at least two pairs. + try: + coords = list(_gen_georss_coords(value, swap, dims)) + return {'type': 'LineString', 'coordinates': coords} + except (IndexError, ValueError): + return None + + +def _parse_georss_polygon(value, swap=True, dims=2): + # A polygon contains a space separated list of latitude-longitude pairs, + # with each pair separated by whitespace. There must be at least four + # pairs, with the last being identical to the first (so a polygon has a + # minimum of three actual points). + try: + ring = list(_gen_georss_coords(value, swap, dims)) + except (IndexError, ValueError): + return None + if len(ring) < 4: + return None + return {'type': 'Polygon', 'coordinates': (ring,)} + + +def _parse_georss_box(value, swap=True, dims=2): + # A bounding box is a rectangular region, often used to define the extents + # of a map or a rough area of interest. A box contains two space separate + # latitude-longitude pairs, with each pair separated by whitespace. 
The + # first pair is the lower corner, the second is the upper corner. + try: + coords = list(_gen_georss_coords(value, swap, dims)) + return {'type': 'Box', 'coordinates': tuple(coords)} + except (IndexError, ValueError): + return None + + +# The list of EPSG codes for geographic (latitude/longitude) coordinate +# systems to support decoding of GeoRSS GML profiles. +_geogCS = [ + 3819, 3821, 3824, 3889, 3906, 4001, 4002, 4003, 4004, 4005, 4006, 4007, 4008, + 4009, 4010, 4011, 4012, 4013, 4014, 4015, 4016, 4018, 4019, 4020, 4021, 4022, + 4023, 4024, 4025, 4027, 4028, 4029, 4030, 4031, 4032, 4033, 4034, 4035, 4036, + 4041, 4042, 4043, 4044, 4045, 4046, 4047, 4052, 4053, 4054, 4055, 4075, 4081, + 4120, 4121, 4122, 4123, 4124, 4125, 4126, 4127, 4128, 4129, 4130, 4131, 4132, + 4133, 4134, 4135, 4136, 4137, 4138, 4139, 4140, 4141, 4142, 4143, 4144, 4145, + 4146, 4147, 4148, 4149, 4150, 4151, 4152, 4153, 4154, 4155, 4156, 4157, 4158, + 4159, 4160, 4161, 4162, 4163, 4164, 4165, 4166, 4167, 4168, 4169, 4170, 4171, + 4172, 4173, 4174, 4175, 4176, 4178, 4179, 4180, 4181, 4182, 4183, 4184, 4185, + 4188, 4189, 4190, 4191, 4192, 4193, 4194, 4195, 4196, 4197, 4198, 4199, 4200, + 4201, 4202, 4203, 4204, 4205, 4206, 4207, 4208, 4209, 4210, 4211, 4212, 4213, + 4214, 4215, 4216, 4218, 4219, 4220, 4221, 4222, 4223, 4224, 4225, 4226, 4227, + 4228, 4229, 4230, 4231, 4232, 4233, 4234, 4235, 4236, 4237, 4238, 4239, 4240, + 4241, 4242, 4243, 4244, 4245, 4246, 4247, 4248, 4249, 4250, 4251, 4252, 4253, + 4254, 4255, 4256, 4257, 4258, 4259, 4260, 4261, 4262, 4263, 4264, 4265, 4266, + 4267, 4268, 4269, 4270, 4271, 4272, 4273, 4274, 4275, 4276, 4277, 4278, 4279, + 4280, 4281, 4282, 4283, 4284, 4285, 4286, 4287, 4288, 4289, 4291, 4292, 4293, + 4294, 4295, 4296, 4297, 4298, 4299, 4300, 4301, 4302, 4303, 4304, 4306, 4307, + 4308, 4309, 4310, 4311, 4312, 4313, 4314, 4315, 4316, 4317, 4318, 4319, 4322, + 4324, 4326, 4463, 4470, 4475, 4483, 4490, 4555, 4558, 4600, 4601, 4602, 4603, + 4604, 4605, 4606, 4607, 4608, 4609, 4610, 4611, 4612, 4613, 4614, 4615, 4616, + 4617, 4618, 4619, 4620, 4621, 4622, 4623, 4624, 4625, 4626, 4627, 4628, 4629, + 4630, 4631, 4632, 4633, 4634, 4635, 4636, 4637, 4638, 4639, 4640, 4641, 4642, + 4643, 4644, 4645, 4646, 4657, 4658, 4659, 4660, 4661, 4662, 4663, 4664, 4665, + 4666, 4667, 4668, 4669, 4670, 4671, 4672, 4673, 4674, 4675, 4676, 4677, 4678, + 4679, 4680, 4681, 4682, 4683, 4684, 4685, 4686, 4687, 4688, 4689, 4690, 4691, + 4692, 4693, 4694, 4695, 4696, 4697, 4698, 4699, 4700, 4701, 4702, 4703, 4704, + 4705, 4706, 4707, 4708, 4709, 4710, 4711, 4712, 4713, 4714, 4715, 4716, 4717, + 4718, 4719, 4720, 4721, 4722, 4723, 4724, 4725, 4726, 4727, 4728, 4729, 4730, + 4731, 4732, 4733, 4734, 4735, 4736, 4737, 4738, 4739, 4740, 4741, 4742, 4743, + 4744, 4745, 4746, 4747, 4748, 4749, 4750, 4751, 4752, 4753, 4754, 4755, 4756, + 4757, 4758, 4759, 4760, 4761, 4762, 4763, 4764, 4765, 4801, 4802, 4803, 4804, + 4805, 4806, 4807, 4808, 4809, 4810, 4811, 4813, 4814, 4815, 4816, 4817, 4818, + 4819, 4820, 4821, 4823, 4824, 4901, 4902, 4903, 4904, 4979, +] diff --git a/modules/feedparser/namespaces/itunes.py b/modules/feedparser/namespaces/itunes.py new file mode 100644 index 000000000..e106c52fe --- /dev/null +++ b/modules/feedparser/namespaces/itunes.py @@ -0,0 +1,112 @@ +# Support for the iTunes format +# Copyright 2010-2019 Kurt McKee +# Copyright 2002-2008 Mark Pilgrim +# All rights reserved. +# +# This file is a part of feedparser. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from __future__ import absolute_import +from __future__ import unicode_literals + +from ..util import FeedParserDict + + +class Namespace(object): + supported_namespaces = { + # Canonical namespace + 'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes', + + # Extra namespace + 'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes', + } + + def _start_itunes_author(self, attrs_d): + self._start_author(attrs_d) + + def _end_itunes_author(self): + self._end_author() + + def _end_itunes_category(self): + self._end_category() + + def _start_itunes_name(self, attrs_d): + self._start_name(attrs_d) + + def _end_itunes_name(self): + self._end_name() + + def _start_itunes_email(self, attrs_d): + self._start_email(attrs_d) + + def _end_itunes_email(self): + self._end_email() + + def _start_itunes_subtitle(self, attrs_d): + self._start_subtitle(attrs_d) + + def _end_itunes_subtitle(self): + self._end_subtitle() + + def _start_itunes_summary(self, attrs_d): + self._start_summary(attrs_d) + + def _end_itunes_summary(self): + self._end_summary() + + def _start_itunes_owner(self, attrs_d): + self.inpublisher = 1 + self.push('publisher', 0) + + def _end_itunes_owner(self): + self.pop('publisher') + self.inpublisher = 0 + self._sync_author_detail('publisher') + + def _end_itunes_keywords(self): + for term in self.pop('itunes_keywords').split(','): + if term.strip(): + self._add_tag(term.strip(), 'http://www.itunes.com/', None) + + def _start_itunes_category(self, attrs_d): + self._add_tag(attrs_d.get('text'), 'http://www.itunes.com/', None) + self.push('category', 1) + + def _start_itunes_image(self, attrs_d): + self.push('itunes_image', 0) + if attrs_d.get('href'): + self._get_context()['image'] = FeedParserDict({'href': attrs_d.get('href')}) + elif attrs_d.get('url'): + self._get_context()['image'] = FeedParserDict({'href': attrs_d.get('url')}) + _start_itunes_link = _start_itunes_image + + def _end_itunes_block(self): + value = self.pop('itunes_block', 0) + self._get_context()['itunes_block'] = (value == 'yes') and 1 or 0 + + def _end_itunes_explicit(self): + value = self.pop('itunes_explicit', 0) + # Convert 'yes' -> True, 'clean' to False, and any other value to None + # False and None both evaluate as False, so the difference can 
be ignored + # by applications that only need to know if the content is explicit. + self._get_context()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0] diff --git a/modules/feedparser/namespaces/mediarss.py b/modules/feedparser/namespaces/mediarss.py new file mode 100644 index 000000000..f4823712f --- /dev/null +++ b/modules/feedparser/namespaces/mediarss.py @@ -0,0 +1,144 @@ +# Support for the Media RSS format +# Copyright 2010-2019 Kurt McKee +# Copyright 2002-2008 Mark Pilgrim +# All rights reserved. +# +# This file is a part of feedparser. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
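+
+# Illustrative note: a hypothetical element such as
+# <media:thumbnail url="http://example.com/t.jpg" width="75"/> appends its
+# attributes to the entry's media_thumbnail list; repeated thumbnails
+# accumulate in document order.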
+ +from __future__ import absolute_import +from __future__ import unicode_literals + +from ..util import FeedParserDict + + +class Namespace(object): + supported_namespaces = { + # Canonical namespace + 'http://search.yahoo.com/mrss/': 'media', + + # Old namespace (no trailing slash) + 'http://search.yahoo.com/mrss': 'media', + } + + def _start_media_category(self, attrs_d): + attrs_d.setdefault('scheme', 'http://search.yahoo.com/mrss/category_schema') + self._start_category(attrs_d) + + def _end_media_category(self): + self._end_category() + + def _end_media_keywords(self): + for term in self.pop('media_keywords').split(','): + if term.strip(): + self._add_tag(term.strip(), None, None) + + def _start_media_title(self, attrs_d): + self._start_title(attrs_d) + + def _end_media_title(self): + title_depth = self.title_depth + self._end_title() + self.title_depth = title_depth + + def _start_media_group(self, attrs_d): + # don't do anything, but don't break the enclosed tags either + pass + + def _start_media_rating(self, attrs_d): + context = self._get_context() + context.setdefault('media_rating', attrs_d) + self.push('rating', 1) + + def _end_media_rating(self): + rating = self.pop('rating') + if rating is not None and rating.strip(): + context = self._get_context() + context['media_rating']['content'] = rating + + def _start_media_credit(self, attrs_d): + context = self._get_context() + context.setdefault('media_credit', []) + context['media_credit'].append(attrs_d) + self.push('credit', 1) + + def _end_media_credit(self): + credit = self.pop('credit') + if credit is not None and credit.strip(): + context = self._get_context() + context['media_credit'][-1]['content'] = credit + + def _start_media_description(self, attrs_d): + self._start_description(attrs_d) + + def _end_media_description(self): + self._end_description() + + def _start_media_restriction(self, attrs_d): + context = self._get_context() + context.setdefault('media_restriction', attrs_d) + self.push('restriction', 1) + + def _end_media_restriction(self): + restriction = self.pop('restriction') + if restriction is not None and restriction.strip(): + context = self._get_context() + context['media_restriction']['content'] = [cc.strip().lower() for cc in restriction.split(' ')] + + def _start_media_license(self, attrs_d): + context = self._get_context() + context.setdefault('media_license', attrs_d) + self.push('license', 1) + + def _end_media_license(self): + license_ = self.pop('license') + if license_ is not None and license_.strip(): + context = self._get_context() + context['media_license']['content'] = license_ + + def _start_media_content(self, attrs_d): + context = self._get_context() + context.setdefault('media_content', []) + context['media_content'].append(attrs_d) + + def _start_media_thumbnail(self, attrs_d): + context = self._get_context() + context.setdefault('media_thumbnail', []) + self.push('url', 1) # new + context['media_thumbnail'].append(attrs_d) + + def _end_media_thumbnail(self): + url = self.pop('url') + context = self._get_context() + if url is not None and url.strip(): + if 'url' not in context['media_thumbnail'][-1]: + context['media_thumbnail'][-1]['url'] = url + + def _start_media_player(self, attrs_d): + self.push('media_player', 0) + self._get_context()['media_player'] = FeedParserDict(attrs_d) + + def _end_media_player(self): + value = self.pop('media_player') + context = self._get_context() + context['media_player']['content'] = value diff --git a/modules/feedparser/namespaces/psc.py 
b/modules/feedparser/namespaces/psc.py new file mode 100644 index 000000000..2f4d5c7a3 --- /dev/null +++ b/modules/feedparser/namespaces/psc.py @@ -0,0 +1,77 @@ +# Support for the Podlove Simple Chapters format +# Copyright 2010-2019 Kurt McKee +# Copyright 2002-2008 Mark Pilgrim +# All rights reserved. +# +# This file is a part of feedparser. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from __future__ import absolute_import +from __future__ import unicode_literals + +import datetime +import re + +from .. import util + + +class Namespace(object): + supported_namespaces = { + 'http://podlove.org/simple-chapters': 'psc', + } + + def __init__(self): + # chapters will only be captured while psc_chapters_flag is True. + self.psc_chapters_flag = False + super(Namespace, self).__init__() + + def _start_psc_chapters(self, attrs_d): + context = self._get_context() + if 'psc_chapters' not in context: + self.psc_chapters_flag = True + attrs_d['chapters'] = [] + context['psc_chapters'] = util.FeedParserDict(attrs_d) + + def _end_psc_chapters(self): + self.psc_chapters_flag = False + + def _start_psc_chapter(self, attrs_d): + if self.psc_chapters_flag: + start = self._get_attribute(attrs_d, 'start') + attrs_d['start_parsed'] = _parse_psc_chapter_start(start) + + context = self._get_context()['psc_chapters'] + context['chapters'].append(util.FeedParserDict(attrs_d)) + + +format_ = re.compile(r'^((\d{2}):)?(\d{2}):(\d{2})(\.(\d{3}))?$') + + +def _parse_psc_chapter_start(start): + m = format_.match(start) + if m is None: + return None + + _, h, m, s, _, ms = m.groups() + h, m, s, ms = (int(h or 0), int(m), int(s), int(ms or 0)) + return datetime.timedelta(0, h*60*60 + m*60 + s, ms*1000) diff --git a/modules/feedparser/parsers/__init__.py b/modules/feedparser/parsers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modules/feedparser/parsers/loose.py b/modules/feedparser/parsers/loose.py new file mode 100644 index 000000000..1ded7a920 --- /dev/null +++ b/modules/feedparser/parsers/loose.py @@ -0,0 +1,81 @@ +# The loose feed parser that interfaces with an SGML parsing library +# Copyright 2010-2019 Kurt McKee +# Copyright 2002-2008 Mark Pilgrim +# All rights reserved. +# +# This file is a part of feedparser. 
+#
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+from __future__ import absolute_import
+from __future__ import unicode_literals
+
+
+class _LooseFeedParser(object):
+    contentparams = None
+
+    def __init__(self, baseuri=None, baselang=None, encoding=None, entities=None):
+        self.baseuri = baseuri or ''
+        self.lang = baselang or None
+        self.encoding = encoding or 'utf-8'  # character encoding
+        self.entities = entities or {}
+        super(_LooseFeedParser, self).__init__()
+
+    @staticmethod
+    def _normalize_attributes(kv):
+        k = kv[0].lower()
+        v = k in ('rel', 'type') and kv[1].lower() or kv[1]
+        # the sgml parser doesn't handle entities in attributes, nor
+        # does it pass the attribute values through as unicode, while
+        # strict xml parsers do -- account for this difference
+        v = v.replace('&amp;', '&')
+        return k, v
+
+    def decode_entities(self, element, data):
+        data = data.replace('&#60;', '&lt;')
+        data = data.replace('&#x3c;', '&lt;')
+        data = data.replace('&#x3C;', '&lt;')
+        data = data.replace('&#62;', '&gt;')
+        data = data.replace('&#x3e;', '&gt;')
+        data = data.replace('&#x3E;', '&gt;')
+        data = data.replace('&#38;', '&amp;')
+        data = data.replace('&#x26;', '&amp;')
+        data = data.replace('&#34;', '&quot;')
+        data = data.replace('&#x22;', '&quot;')
+        data = data.replace('&#39;', '&apos;')
+        data = data.replace('&#x27;', '&apos;')
+        if not self.contentparams.get('type', 'xml').endswith('xml'):
+            data = data.replace('&lt;', '<')
+            data = data.replace('&gt;', '>')
+            data = data.replace('&amp;', '&')
+            data = data.replace('&quot;', '"')
+            data = data.replace('&apos;', "'")
+            data = data.replace('&#x2f;', '/')
+            data = data.replace('&#x2F;', '/')
+        return data
+
+    @staticmethod
+    def strattrs(attrs):
+        return ''.join(
+            ' %s="%s"' % (n, v.replace('"', '&quot;'))
+            for n, v in attrs
+        )
diff --git a/modules/feedparser/parsers/strict.py b/modules/feedparser/parsers/strict.py
new file mode 100644
index 000000000..911dd121d
--- /dev/null
+++ b/modules/feedparser/parsers/strict.py
@@ -0,0 +1,137 @@
+# The strict feed parser that interfaces with an XML parsing library
+# Copyright 2010-2019 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2002-2008 Mark Pilgrim
+# All rights reserved.
+#
+# This file is a part of feedparser.
+# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from __future__ import absolute_import, unicode_literals + +from ..exceptions import UndeclaredNamespace + + +class _StrictFeedParser(object): + def __init__(self, baseuri, baselang, encoding): + self.bozo = 0 + self.exc = None + self.decls = {} + self.baseuri = baseuri or '' + self.lang = baselang + self.encoding = encoding + super(_StrictFeedParser, self).__init__() + + @staticmethod + def _normalize_attributes(kv): + k = kv[0].lower() + v = k in ('rel', 'type') and kv[1].lower() or kv[1] + return k, v + + def startPrefixMapping(self, prefix, uri): + if not uri: + return + # Jython uses '' instead of None; standardize on None + prefix = prefix or None + self.track_namespace(prefix, uri) + if prefix and uri == 'http://www.w3.org/1999/xlink': + self.decls['xmlns:' + prefix] = uri + + def startElementNS(self, name, qname, attrs): + namespace, localname = name + lowernamespace = str(namespace or '').lower() + if lowernamespace.find('backend.userland.com/rss') != -1: + # match any backend.userland.com namespace + namespace = 'http://backend.userland.com/rss' + lowernamespace = namespace + if qname and qname.find(':') > 0: + givenprefix = qname.split(':')[0] + else: + givenprefix = None + prefix = self._matchnamespaces.get(lowernamespace, givenprefix) + if givenprefix and (prefix is None or (prefix == '' and lowernamespace == '')) and givenprefix not in self.namespaces_in_use: + raise UndeclaredNamespace("'%s' is not associated with a namespace" % givenprefix) + localname = str(localname).lower() + + # qname implementation is horribly broken in Python 2.1 (it + # doesn't report any), and slightly broken in Python 2.2 (it + # doesn't report the xml: namespace). So we match up namespaces + # with a known list first, and then possibly override them with + # the qnames the SAX parser gives us (if indeed it gives us any + # at all). Thanks to MatejC for helping me test this and + # tirelessly telling me that it didn't work yet. 
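+        # Illustrative example: an element in the core Atom namespace matches
+        # the empty prefix '', so its localname stays e.g. 'feed' and is
+        # dispatched to _start_feed; a Dublin Core element becomes
+        # 'dc:creator' and is dispatched to _start_dc_creator.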
+ attrsD, self.decls = self.decls, {} + if localname == 'math' and namespace == 'http://www.w3.org/1998/Math/MathML': + attrsD['xmlns'] = namespace + if localname == 'svg' and namespace == 'http://www.w3.org/2000/svg': + attrsD['xmlns'] = namespace + + if prefix: + localname = prefix.lower() + ':' + localname + elif namespace and not qname: # Expat + for name, value in self.namespaces_in_use.items(): + if name and value == namespace: + localname = name + ':' + localname + break + + for (namespace, attrlocalname), attrvalue in attrs.items(): + lowernamespace = (namespace or '').lower() + prefix = self._matchnamespaces.get(lowernamespace, '') + if prefix: + attrlocalname = prefix + ':' + attrlocalname + attrsD[str(attrlocalname).lower()] = attrvalue + for qname in attrs.getQNames(): + attrsD[str(qname).lower()] = attrs.getValueByQName(qname) + localname = str(localname).lower() + self.unknown_starttag(localname, list(attrsD.items())) + + def characters(self, text): + self.handle_data(text) + + def endElementNS(self, name, qname): + namespace, localname = name + lowernamespace = str(namespace or '').lower() + if qname and qname.find(':') > 0: + givenprefix = qname.split(':')[0] + else: + givenprefix = '' + prefix = self._matchnamespaces.get(lowernamespace, givenprefix) + if prefix: + localname = prefix + ':' + localname + elif namespace and not qname: # Expat + for name, value in self.namespaces_in_use.items(): + if name and value == namespace: + localname = name + ':' + localname + break + localname = str(localname).lower() + self.unknown_endtag(localname) + + def error(self, exc): + self.bozo = 1 + self.exc = exc + + # drv_libxml2 calls warning() in some cases + warning = error + + def fatalError(self, exc): + self.error(exc) + raise exc diff --git a/modules/feedparser/sanitizer.py b/modules/feedparser/sanitizer.py new file mode 100644 index 000000000..37c7d39ca --- /dev/null +++ b/modules/feedparser/sanitizer.py @@ -0,0 +1,955 @@ +# Copyright 2010-2019 Kurt McKee +# Copyright 2002-2008 Mark Pilgrim +# All rights reserved. +# +# This file is a part of feedparser. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
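+
+# Illustrative note: for a hypothetical fragment such as
+# '<p onclick="evil()">hi</p><script>x()</script>', _sanitize_html() returns
+# roughly '<p>hi</p>': the script element (with its content) and the
+# non-whitelisted onclick attribute are both discarded.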
+ +from __future__ import absolute_import +from __future__ import unicode_literals + +import re + +from .html import _BaseHTMLProcessor +from .sgml import _SGML_AVAILABLE +from .urls import make_safe_absolute_uri + + +class _HTMLSanitizer(_BaseHTMLProcessor): + acceptable_elements = { + 'a', + 'abbr', + 'acronym', + 'address', + 'area', + 'article', + 'aside', + 'audio', + 'b', + 'big', + 'blockquote', + 'br', + 'button', + 'canvas', + 'caption', + 'center', + 'cite', + 'code', + 'col', + 'colgroup', + 'command', + 'datagrid', + 'datalist', + 'dd', + 'del', + 'details', + 'dfn', + 'dialog', + 'dir', + 'div', + 'dl', + 'dt', + 'em', + 'event-source', + 'fieldset', + 'figcaption', + 'figure', + 'font', + 'footer', + 'form', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'header', + 'hr', + 'i', + 'img', + 'input', + 'ins', + 'kbd', + 'keygen', + 'label', + 'legend', + 'li', + 'm', + 'map', + 'menu', + 'meter', + 'multicol', + 'nav', + 'nextid', + 'noscript', + 'ol', + 'optgroup', + 'option', + 'output', + 'p', + 'pre', + 'progress', + 'q', + 's', + 'samp', + 'section', + 'select', + 'small', + 'sound', + 'source', + 'spacer', + 'span', + 'strike', + 'strong', + 'sub', + 'sup', + 'table', + 'tbody', + 'td', + 'textarea', + 'tfoot', + 'th', + 'thead', + 'time', + 'tr', + 'tt', + 'u', + 'ul', + 'var', + 'video', + } + + acceptable_attributes = { + 'abbr', + 'accept', + 'accept-charset', + 'accesskey', + 'action', + 'align', + 'alt', + 'autocomplete', + 'autofocus', + 'axis', + 'background', + 'balance', + 'bgcolor', + 'bgproperties', + 'border', + 'bordercolor', + 'bordercolordark', + 'bordercolorlight', + 'bottompadding', + 'cellpadding', + 'cellspacing', + 'ch', + 'challenge', + 'char', + 'charoff', + 'charset', + 'checked', + 'choff', + 'cite', + 'class', + 'clear', + 'color', + 'cols', + 'colspan', + 'compact', + 'contenteditable', + 'controls', + 'coords', + 'data', + 'datafld', + 'datapagesize', + 'datasrc', + 'datetime', + 'default', + 'delay', + 'dir', + 'disabled', + 'draggable', + 'dynsrc', + 'enctype', + 'end', + 'face', + 'for', + 'form', + 'frame', + 'galleryimg', + 'gutter', + 'headers', + 'height', + 'hidden', + 'hidefocus', + 'high', + 'href', + 'hreflang', + 'hspace', + 'icon', + 'id', + 'inputmode', + 'ismap', + 'keytype', + 'label', + 'lang', + 'leftspacing', + 'list', + 'longdesc', + 'loop', + 'loopcount', + 'loopend', + 'loopstart', + 'low', + 'lowsrc', + 'max', + 'maxlength', + 'media', + 'method', + 'min', + 'multiple', + 'name', + 'nohref', + 'noshade', + 'nowrap', + 'open', + 'optimum', + 'pattern', + 'ping', + 'point-size', + 'poster', + 'pqg', + 'preload', + 'prompt', + 'radiogroup', + 'readonly', + 'rel', + 'repeat-max', + 'repeat-min', + 'replace', + 'required', + 'rev', + 'rightspacing', + 'rows', + 'rowspan', + 'rules', + 'scope', + 'selected', + 'shape', + 'size', + 'span', + 'src', + 'start', + 'step', + 'summary', + 'suppress', + 'tabindex', + 'target', + 'template', + 'title', + 'toppadding', + 'type', + 'unselectable', + 'urn', + 'usemap', + 'valign', + 'value', + 'variable', + 'volume', + 'vrml', + 'vspace', + 'width', + 'wrap', + 'xml:lang', + } + + unacceptable_elements_with_end_tag = { + 'applet', + 'script', + 'style', + } + + acceptable_css_properties = { + 'azimuth', + 'background-color', + 'border-bottom-color', + 'border-collapse', + 'border-color', + 'border-left-color', + 'border-right-color', + 'border-top-color', + 'clear', + 'color', + 'cursor', + 'direction', + 'display', + 'elevation', + 'float', + 'font', + 'font-family', + 'font-size', + 
'font-style', + 'font-variant', + 'font-weight', + 'height', + 'letter-spacing', + 'line-height', + 'overflow', + 'pause', + 'pause-after', + 'pause-before', + 'pitch', + 'pitch-range', + 'richness', + 'speak', + 'speak-header', + 'speak-numeral', + 'speak-punctuation', + 'speech-rate', + 'stress', + 'text-align', + 'text-decoration', + 'text-indent', + 'unicode-bidi', + 'vertical-align', + 'voice-family', + 'volume', + 'white-space', + 'width', + } + + # survey of common keywords found in feeds + acceptable_css_keywords = { + '!important', + 'aqua', + 'auto', + 'black', + 'block', + 'blue', + 'bold', + 'both', + 'bottom', + 'brown', + 'center', + 'collapse', + 'dashed', + 'dotted', + 'fuchsia', + 'gray', + 'green', + 'italic', + 'left', + 'lime', + 'maroon', + 'medium', + 'navy', + 'none', + 'normal', + 'nowrap', + 'olive', + 'pointer', + 'purple', + 'red', + 'right', + 'silver', + 'solid', + 'teal', + 'top', + 'transparent', + 'underline', + 'white', + 'yellow', + } + + valid_css_values = re.compile( + r'^(' + r'#[0-9a-f]+' # Hex values + r'|rgb\(\d+%?,\d*%?,?\d*%?\)?' # RGB values + r'|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?' # Sizes/widths + r')$' + ) + + mathml_elements = { + 'annotation', + 'annotation-xml', + 'maction', + 'maligngroup', + 'malignmark', + 'math', + 'menclose', + 'merror', + 'mfenced', + 'mfrac', + 'mglyph', + 'mi', + 'mlabeledtr', + 'mlongdiv', + 'mmultiscripts', + 'mn', + 'mo', + 'mover', + 'mpadded', + 'mphantom', + 'mprescripts', + 'mroot', + 'mrow', + 'ms', + 'mscarries', + 'mscarry', + 'msgroup', + 'msline', + 'mspace', + 'msqrt', + 'msrow', + 'mstack', + 'mstyle', + 'msub', + 'msubsup', + 'msup', + 'mtable', + 'mtd', + 'mtext', + 'mtr', + 'munder', + 'munderover', + 'none', + 'semantics', + } + + mathml_attributes = { + 'accent', + 'accentunder', + 'actiontype', + 'align', + 'alignmentscope', + 'altimg', + 'altimg-height', + 'altimg-valign', + 'altimg-width', + 'alttext', + 'bevelled', + 'charalign', + 'close', + 'columnalign', + 'columnlines', + 'columnspacing', + 'columnspan', + 'columnwidth', + 'crossout', + 'decimalpoint', + 'denomalign', + 'depth', + 'dir', + 'display', + 'displaystyle', + 'edge', + 'encoding', + 'equalcolumns', + 'equalrows', + 'fence', + 'fontstyle', + 'fontweight', + 'form', + 'frame', + 'framespacing', + 'groupalign', + 'height', + 'href', + 'id', + 'indentalign', + 'indentalignfirst', + 'indentalignlast', + 'indentshift', + 'indentshiftfirst', + 'indentshiftlast', + 'indenttarget', + 'infixlinebreakstyle', + 'largeop', + 'length', + 'linebreak', + 'linebreakmultchar', + 'linebreakstyle', + 'lineleading', + 'linethickness', + 'location', + 'longdivstyle', + 'lquote', + 'lspace', + 'mathbackground', + 'mathcolor', + 'mathsize', + 'mathvariant', + 'maxsize', + 'minlabelspacing', + 'minsize', + 'movablelimits', + 'notation', + 'numalign', + 'open', + 'other', + 'overflow', + 'position', + 'rowalign', + 'rowlines', + 'rowspacing', + 'rowspan', + 'rquote', + 'rspace', + 'scriptlevel', + 'scriptminsize', + 'scriptsizemultiplier', + 'selection', + 'separator', + 'separators', + 'shift', + 'side', + 'src', + 'stackalign', + 'stretchy', + 'subscriptshift', + 'superscriptshift', + 'symmetric', + 'voffset', + 'width', + 'xlink:href', + 'xlink:show', + 'xlink:type', + 'xmlns', + 'xmlns:xlink', + } + + # svgtiny - foreignObject + linearGradient + radialGradient + stop + svg_elements = { + 'a', + 'animate', + 'animateColor', + 'animateMotion', + 'animateTransform', + 'circle', + 'defs', + 'desc', + 'ellipse', + 'font-face', + 
'font-face-name', + 'font-face-src', + 'foreignObject', + 'g', + 'glyph', + 'hkern', + 'line', + 'linearGradient', + 'marker', + 'metadata', + 'missing-glyph', + 'mpath', + 'path', + 'polygon', + 'polyline', + 'radialGradient', + 'rect', + 'set', + 'stop', + 'svg', + 'switch', + 'text', + 'title', + 'tspan', + 'use', + } + + # svgtiny + class + opacity + offset + xmlns + xmlns:xlink + svg_attributes = { + 'accent-height', + 'accumulate', + 'additive', + 'alphabetic', + 'arabic-form', + 'ascent', + 'attributeName', + 'attributeType', + 'baseProfile', + 'bbox', + 'begin', + 'by', + 'calcMode', + 'cap-height', + 'class', + 'color', + 'color-rendering', + 'content', + 'cx', + 'cy', + 'd', + 'descent', + 'display', + 'dur', + 'dx', + 'dy', + 'end', + 'fill', + 'fill-opacity', + 'fill-rule', + 'font-family', + 'font-size', + 'font-stretch', + 'font-style', + 'font-variant', + 'font-weight', + 'from', + 'fx', + 'fy', + 'g1', + 'g2', + 'glyph-name', + 'gradientUnits', + 'hanging', + 'height', + 'horiz-adv-x', + 'horiz-origin-x', + 'id', + 'ideographic', + 'k', + 'keyPoints', + 'keySplines', + 'keyTimes', + 'lang', + 'marker-end', + 'marker-mid', + 'marker-start', + 'markerHeight', + 'markerUnits', + 'markerWidth', + 'mathematical', + 'max', + 'min', + 'name', + 'offset', + 'opacity', + 'orient', + 'origin', + 'overline-position', + 'overline-thickness', + 'panose-1', + 'path', + 'pathLength', + 'points', + 'preserveAspectRatio', + 'r', + 'refX', + 'refY', + 'repeatCount', + 'repeatDur', + 'requiredExtensions', + 'requiredFeatures', + 'restart', + 'rotate', + 'rx', + 'ry', + 'slope', + 'stemh', + 'stemv', + 'stop-color', + 'stop-opacity', + 'strikethrough-position', + 'strikethrough-thickness', + 'stroke', + 'stroke-dasharray', + 'stroke-dashoffset', + 'stroke-linecap', + 'stroke-linejoin', + 'stroke-miterlimit', + 'stroke-opacity', + 'stroke-width', + 'systemLanguage', + 'target', + 'text-anchor', + 'to', + 'transform', + 'type', + 'u1', + 'u2', + 'underline-position', + 'underline-thickness', + 'unicode', + 'unicode-range', + 'units-per-em', + 'values', + 'version', + 'viewBox', + 'visibility', + 'width', + 'widths', + 'x', + 'x-height', + 'x1', + 'x2', + 'xlink:actuate', + 'xlink:arcrole', + 'xlink:href', + 'xlink:role', + 'xlink:show', + 'xlink:title', + 'xlink:type', + 'xml:base', + 'xml:lang', + 'xml:space', + 'xmlns', + 'xmlns:xlink', + 'y', + 'y1', + 'y2', + 'zoomAndPan', + } + + svg_attr_map = None + svg_elem_map = None + + acceptable_svg_properties = { + 'fill', + 'fill-opacity', + 'fill-rule', + 'stroke', + 'stroke-linecap', + 'stroke-linejoin', + 'stroke-opacity', + 'stroke-width', + } + + def __init__(self, encoding=None, _type='application/xhtml+xml'): + super(_HTMLSanitizer, self).__init__(encoding, _type) + + self.unacceptablestack = 0 + self.mathmlOK = 0 + self.svgOK = 0 + + def reset(self): + super(_HTMLSanitizer, self).reset() + self.unacceptablestack = 0 + self.mathmlOK = 0 + self.svgOK = 0 + + def unknown_starttag(self, tag, attrs): + acceptable_attributes = self.acceptable_attributes + keymap = {} + if tag not in self.acceptable_elements or self.svgOK: + if tag in self.unacceptable_elements_with_end_tag: + self.unacceptablestack += 1 + + # add implicit namespaces to html5 inline svg/mathml + if self._type.endswith('html'): + if not dict(attrs).get('xmlns'): + if tag == 'svg': + attrs.append(('xmlns', 'http://www.w3.org/2000/svg')) + if tag == 'math': + attrs.append(('xmlns', 'http://www.w3.org/1998/Math/MathML')) + + # not otherwise acceptable, perhaps it is MathML or SVG? 
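+        # Illustrative example: a hypothetical inline
+        # <svg xmlns="http://www.w3.org/2000/svg"> embedded in entry content
+        # matches below and increments svgOK, so its children are kept.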
+ if tag == 'math' and ('xmlns', 'http://www.w3.org/1998/Math/MathML') in attrs: + self.mathmlOK += 1 + if tag == 'svg' and ('xmlns', 'http://www.w3.org/2000/svg') in attrs: + self.svgOK += 1 + + # choose acceptable attributes based on tag class, else bail + if self.mathmlOK and tag in self.mathml_elements: + acceptable_attributes = self.mathml_attributes + elif self.svgOK and tag in self.svg_elements: + # For most vocabularies, lowercasing is a good idea. Many + # svg elements, however, are camel case. + if not self.svg_attr_map: + lower = [attr.lower() for attr in self.svg_attributes] + mix = [a for a in self.svg_attributes if a not in lower] + self.svg_attributes = lower + self.svg_attr_map = {a.lower(): a for a in mix} + + lower = [attr.lower() for attr in self.svg_elements] + mix = [a for a in self.svg_elements if a not in lower] + self.svg_elements = lower + self.svg_elem_map = {a.lower(): a for a in mix} + acceptable_attributes = self.svg_attributes + tag = self.svg_elem_map.get(tag, tag) + keymap = self.svg_attr_map + elif tag not in self.acceptable_elements: + return + + # declare xlink namespace, if needed + if self.mathmlOK or self.svgOK: + if any((a for a in attrs if a[0].startswith('xlink:'))): + if ('xmlns:xlink', 'http://www.w3.org/1999/xlink') not in attrs: + attrs.append(('xmlns:xlink', 'http://www.w3.org/1999/xlink')) + + clean_attrs = [] + for key, value in self.normalize_attrs(attrs): + if key in acceptable_attributes: + key = keymap.get(key, key) + # make sure the uri uses an acceptable uri scheme + if key == 'href': + value = make_safe_absolute_uri(value) + clean_attrs.append((key, value)) + elif key == 'style': + clean_value = self.sanitize_style(value) + if clean_value: + clean_attrs.append((key, clean_value)) + super(_HTMLSanitizer, self).unknown_starttag(tag, clean_attrs) + + def unknown_endtag(self, tag): + if tag not in self.acceptable_elements: + if tag in self.unacceptable_elements_with_end_tag: + self.unacceptablestack -= 1 + if self.mathmlOK and tag in self.mathml_elements: + if tag == 'math' and self.mathmlOK: + self.mathmlOK -= 1 + elif self.svgOK and tag in self.svg_elements: + tag = self.svg_elem_map.get(tag, tag) + if tag == 'svg' and self.svgOK: + self.svgOK -= 1 + else: + return + super(_HTMLSanitizer, self).unknown_endtag(tag) + + def handle_pi(self, text): + pass + + def handle_decl(self, text): + pass + + def handle_data(self, text): + if not self.unacceptablestack: + super(_HTMLSanitizer, self).handle_data(text) + + def sanitize_style(self, style): + # disallow urls + style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style) + + # gauntlet + if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): + return '' + # This replaced a regexp that used re.match and was prone to + # pathological back-tracking.
+ if re.sub(r"\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip(): + return '' + + clean = [] + for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style): + if not value: + continue + if prop.lower() in self.acceptable_css_properties: + clean.append(prop + ': ' + value + ';') + elif prop.split('-')[0].lower() in ['background', 'border', 'margin', 'padding']: + for keyword in value.split(): + if ( + keyword not in self.acceptable_css_keywords + and not self.valid_css_values.match(keyword) + ): + break + else: + clean.append(prop + ': ' + value + ';') + elif self.svgOK and prop.lower() in self.acceptable_svg_properties: + clean.append(prop + ': ' + value + ';') + + return ' '.join(clean) + + def parse_comment(self, i, report=1): + ret = super(_HTMLSanitizer, self).parse_comment(i, report) + if ret >= 0: + return ret + # if ret == -1, this may be a malicious attempt to circumvent + # sanitization, or a page-destroying unclosed comment + match = re.compile(r'--[^>]*>').search(self.rawdata, i+4) + if match: + return match.end() + # unclosed comment; deliberately fail to handle_data() + return len(self.rawdata) + + +def _sanitize_html(html_source, encoding, _type): + if not _SGML_AVAILABLE: + return html_source + p = _HTMLSanitizer(encoding, _type) + html_source = html_source.replace(' +RE_ENTITY_PATTERN = re.compile(br'^\s*]*?)>', re.MULTILINE) + +# Match XML DOCTYPE declarations. +# Example: +RE_DOCTYPE_PATTERN = re.compile(br'^\s*]*?)>', re.MULTILINE) + +# Match safe entity declarations. +# This will allow hexadecimal character references through, +# as well as text, but not arbitrary nested entities. +# Example: cubed "³" +# Example: copyright "(C)" +# Forbidden: explode1 "&explode2;&explode2;" +RE_SAFE_ENTITY_PATTERN = re.compile(br'\s+(\w+)\s+"(&#\w+;|[^&"]*)"') + + +def replace_doctype(data): + """Strips and replaces the DOCTYPE, returns (rss_version, stripped_data) + + rss_version may be 'rss091n' or None + stripped_data is the same XML document with a replaced DOCTYPE + """ + + # Divide the document into two groups by finding the location + # of the first element that doesn't begin with '\n\n]>' + data = RE_DOCTYPE_PATTERN.sub(replacement, head) + data + + # Precompute the safe entities for the loose parser. + safe_entities = { + k.decode('utf-8'): v.decode('utf-8') + for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement) + } + return version, data, safe_entities diff --git a/modules/feedparser/sgml.py b/modules/feedparser/sgml.py new file mode 100644 index 000000000..b6bcf2e5b --- /dev/null +++ b/modules/feedparser/sgml.py @@ -0,0 +1,136 @@ +# Copyright 2010-2019 Kurt McKee +# Copyright 2002-2008 Mark Pilgrim +# All rights reserved. +# +# This file is a part of feedparser. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from __future__ import absolute_import + +import re + +__all__ = [ + '_SGML_AVAILABLE', + 'sgmllib', + 'charref', + 'tagfind', + 'attrfind', + 'entityref', + 'incomplete', + 'interesting', + 'shorttag', + 'shorttagopen', + 'starttagopen', + 'endbracket', +] + +# sgmllib is not available by default in Python 3; if the end user doesn't have +# it available then we'll lose illformed XML parsing and content sanitizing +try: + import sgmllib +except ImportError: + # This is probably Python 3, which doesn't include sgmllib anymore + _SGML_AVAILABLE = 0 + + # Mock sgmllib enough to allow subclassing later on + class sgmllib(object): + SGMLParseError = EnvironmentError + + class SGMLParser(object): + lasttag = None + rawdata = None + + def close(self): + pass + + def feed(self, data): + pass + + def goahead(self, i): + pass + + def parse_declaration(self, i): + pass + + def parse_starttag(self, i): + pass + + def reset(self): + pass + +else: + _SGML_AVAILABLE = 1 + + # sgmllib defines a number of module-level regular expressions that are + # insufficient for the XML parsing feedparser needs. Rather than modify + # the variables directly in sgmllib, they're defined here using the same + # names, and the compiled code objects of several sgmllib.SGMLParser + # methods are copied into _BaseHTMLProcessor so that they execute in + # feedparser's scope instead of sgmllib's scope. + charref = re.compile(r'&#(\d+|[xX][0-9a-fA-F]+);') + tagfind = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*') + attrfind = re.compile( + r"""\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)[$]?(\s*=\s*""" + r"""('[^']*'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$()_#=~'"@]*))?""" + ) + + # Unfortunately, these must be copied over to prevent NameError exceptions + entityref = sgmllib.entityref + incomplete = sgmllib.incomplete + interesting = sgmllib.interesting + shorttag = sgmllib.shorttag + shorttagopen = sgmllib.shorttagopen + starttagopen = sgmllib.starttagopen + + + class _EndBracketRegEx: + def __init__(self): + # Overriding the built-in sgmllib.endbracket regex allows the + # parser to find angle brackets embedded in element attributes. + self.endbracket = re.compile( + r'(' + r"""[^'"<>]""" + r"""|"[^"]*"(?=>|/|\s|\w+=)""" + r"""|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])""" + r"""|.*?(?=[<>]""" + r')' + ) + + def search(self, target, index=0): + match = self.endbracket.match(target, index) + if match is not None: + # Returning a new object in the calling thread's context + # resolves a thread-safety issue. + return EndBracketMatch(match) + return None + + + class EndBracketMatch: + def __init__(self, match): + self.match = match + + def start(self, n): + return self.match.end(n) + + + endbracket = _EndBracketRegEx() diff --git a/modules/feedparser/urls.py b/modules/feedparser/urls.py new file mode 100644 index 000000000..1d1b6c3a1 --- /dev/null +++ b/modules/feedparser/urls.py @@ -0,0 +1,162 @@ +# Copyright 2010-2019 Kurt McKee +# Copyright 2002-2008 Mark Pilgrim +# All rights reserved. +# +# This file is a part of feedparser.
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from __future__ import absolute_import +from __future__ import unicode_literals + +import re + +try: + import urllib.parse as urlparse +except ImportError: + import urlparse as urlparse + +from .html import _BaseHTMLProcessor + +# If you want feedparser to allow all URL schemes, set this to () +# List culled from Python's urlparse documentation at: +# http://docs.python.org/library/urlparse.html +# as well as from "URI scheme" at Wikipedia: +# https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme +# Many more will likely need to be added! 
+ACCEPTABLE_URI_SCHEMES = ( + 'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'magnet', + 'mailto', 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu', + 'sftp', 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet', + 'wais', + # Additional common-but-unofficial schemes + 'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs', + 'irc6', 'itms', 'mms', 'msnim', 'skype', 'ssh', 'smb', 'svn', 'ymsg', +) + +_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)') + + +def _urljoin(base, uri): + uri = _urifixer.sub(r'\1\3', uri) + try: + uri = urlparse.urljoin(base, uri) + except ValueError: + uri = '' + return uri + + +def convert_to_idn(url): + """Convert a URL to IDN notation""" + # this function should only be called with a unicode string + # strategy: if the host cannot be encoded in ascii, then + # it'll be necessary to encode it in idn form + parts = list(urlparse.urlsplit(url)) + try: + parts[1].encode('ascii') + except UnicodeEncodeError: + # the url needs to be converted to idn notation + host = parts[1].rsplit(':', 1) + newhost = [] + port = '' + if len(host) == 2: + port = host.pop() + for h in host[0].split('.'): + newhost.append(h.encode('idna').decode('utf-8')) + parts[1] = '.'.join(newhost) + if port: + parts[1] += ':' + port + return urlparse.urlunsplit(parts) + else: + return url + + +def make_safe_absolute_uri(base, rel=None): + # bail if ACCEPTABLE_URI_SCHEMES is empty + if not ACCEPTABLE_URI_SCHEMES: + return _urljoin(base, rel or '') + if not base: + return rel or '' + if not rel: + try: + scheme = urlparse.urlparse(base)[0] + except ValueError: + return '' + if not scheme or scheme in ACCEPTABLE_URI_SCHEMES: + return base + return '' + uri = _urljoin(base, rel) + if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES: + return '' + return uri + + +class RelativeURIResolver(_BaseHTMLProcessor): + relative_uris = { + ('a', 'href'), + ('applet', 'codebase'), + ('area', 'href'), + ('audio', 'src'), + ('blockquote', 'cite'), + ('body', 'background'), + ('del', 'cite'), + ('form', 'action'), + ('frame', 'longdesc'), + ('frame', 'src'), + ('iframe', 'longdesc'), + ('iframe', 'src'), + ('head', 'profile'), + ('img', 'longdesc'), + ('img', 'src'), + ('img', 'usemap'), + ('input', 'src'), + ('input', 'usemap'), + ('ins', 'cite'), + ('link', 'href'), + ('object', 'classid'), + ('object', 'codebase'), + ('object', 'data'), + ('object', 'usemap'), + ('q', 'cite'), + ('script', 'src'), + ('source', 'src'), + ('video', 'poster'), + ('video', 'src'), + } + + def __init__(self, baseuri, encoding, _type): + _BaseHTMLProcessor.__init__(self, encoding, _type) + self.baseuri = baseuri + + def resolve_uri(self, uri): + return make_safe_absolute_uri(self.baseuri, uri.strip()) + + def unknown_starttag(self, tag, attrs): + attrs = self.normalize_attrs(attrs) + attrs = [(key, ((tag, key) in self.relative_uris) and self.resolve_uri(value) or value) for key, value in attrs] + super(RelativeURIResolver, self).unknown_starttag(tag, attrs) + + +def resolve_relative_uris(html_source, base_uri, encoding, type_): + p = RelativeURIResolver(base_uri, encoding, type_) + p.feed(html_source) + return p.output() diff --git a/modules/feedparser/util.py b/modules/feedparser/util.py new file mode 100644 index 000000000..260d3c5ce --- /dev/null +++ b/modules/feedparser/util.py @@ -0,0 +1,166 @@ +# Copyright 2010-2019 Kurt McKee +# Copyright 2002-2008 Mark Pilgrim +# All rights reserved. +# +# This file is a part of feedparser. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from __future__ import absolute_import +from __future__ import unicode_literals + +import warnings + + +class FeedParserDict(dict): + keymap = { + 'channel': 'feed', + 'items': 'entries', + 'guid': 'id', + 'date': 'updated', + 'date_parsed': 'updated_parsed', + 'description': ['summary', 'subtitle'], + 'description_detail': ['summary_detail', 'subtitle_detail'], + 'url': ['href'], + 'modified': 'updated', + 'modified_parsed': 'updated_parsed', + 'issued': 'published', + 'issued_parsed': 'published_parsed', + 'copyright': 'rights', + 'copyright_detail': 'rights_detail', + 'tagline': 'subtitle', + 'tagline_detail': 'subtitle_detail', + } + + def __getitem__(self, key): + """ + :return: A :class:`FeedParserDict`. + """ + + if key == 'category': + try: + return dict.__getitem__(self, 'tags')[0]['term'] + except IndexError: + raise KeyError("object doesn't have key 'category'") + elif key == 'enclosures': + norel = lambda link: FeedParserDict([(name, value) for (name, value) in link.items() if name != 'rel']) + return [ + norel(link) + for link in dict.__getitem__(self, 'links') + if link['rel'] == 'enclosure' + ] + elif key == 'license': + for link in dict.__getitem__(self, 'links'): + if link['rel'] == 'license' and 'href' in link: + return link['href'] + elif key == 'updated': + # Temporarily help developers out by keeping the old + # broken behavior that was reported in issue 310. + # This fix was proposed in issue 328. + if ( + not dict.__contains__(self, 'updated') + and dict.__contains__(self, 'published') + ): + warnings.warn( + "To avoid breaking existing software while " + "fixing issue 310, a temporary mapping has been created " + "from `updated` to `published` if `updated` doesn't " + "exist. 
This fallback will be removed in a future version " + "of feedparser.", + DeprecationWarning, + ) + return dict.__getitem__(self, 'published') + return dict.__getitem__(self, 'updated') + elif key == 'updated_parsed': + if ( + not dict.__contains__(self, 'updated_parsed') + and dict.__contains__(self, 'published_parsed') + ): + warnings.warn( + "To avoid breaking existing software while " + "fixing issue 310, a temporary mapping has been created " + "from `updated_parsed` to `published_parsed` if " + "`updated_parsed` doesn't exist. This fallback will be " + "removed in a future version of feedparser.", + DeprecationWarning, + ) + return dict.__getitem__(self, 'published_parsed') + return dict.__getitem__(self, 'updated_parsed') + else: + realkey = self.keymap.get(key, key) + if isinstance(realkey, list): + for k in realkey: + if dict.__contains__(self, k): + return dict.__getitem__(self, k) + elif dict.__contains__(self, realkey): + return dict.__getitem__(self, realkey) + return dict.__getitem__(self, key) + + def __contains__(self, key): + if key in ('updated', 'updated_parsed'): + # Temporarily help developers out by keeping the old + # broken behavior that was reported in issue 310. + # This fix was proposed in issue 328. + return dict.__contains__(self, key) + try: + self.__getitem__(key) + except KeyError: + return False + else: + return True + + has_key = __contains__ + + def get(self, key, default=None): + """ + :return: A :class:`FeedParserDict`. + """ + + try: + return self.__getitem__(key) + except KeyError: + return default + + def __setitem__(self, key, value): + key = self.keymap.get(key, key) + if isinstance(key, list): + key = key[0] + return dict.__setitem__(self, key, value) + + def setdefault(self, k, default): + if k not in self: + self[k] = default + return default + return self[k] + + def __getattr__(self, key): + # __getattribute__() is called first; this will be called + # only if an attribute was not already found + try: + return self.__getitem__(key) + except KeyError: + raise AttributeError("object has no attribute '%s'" % key) + + def __hash__(self): + # This is incorrect behavior -- dictionaries shouldn't be hashable. + # Note to self: remove this behavior in the future. + return id(self) diff --git a/modules/feedparser.py b/modules/feedparser521.py similarity index 100% rename from modules/feedparser.py rename to modules/feedparser521.py diff --git a/modules/s3/s3msg.py b/modules/s3/s3msg.py index cd5657cb7..51b8a27ef 100644 --- a/modules/s3/s3msg.py +++ b/modules/s3/s3msg.py @@ -2060,7 +2060,13 @@ def poll_rss(channel_id): return "No Such RSS Channel: %s" % channel_id # http://pythonhosted.org/feedparser - import feedparser + if PY2: + # Use Stable v5.2.1 + # - current known reason is to prevent SSL: CERTIFICATE_VERIFY_FAILED + import feedparser521 as feedparser + else: + # Use 6.0.0b1 which is required for Python 3.x + import feedparser # Basic Authentication username = channel.username password = channel.password @@ -2083,33 +2089,34 @@ def poll_rss(channel_id): # http://pythonhosted.org/feedparser/http-etag.html # NB This won't help for a server like Drupal 7 set to not allow caching & hence generating a new ETag/Last Modified each request! 
d = feedparser.parse(channel.url, - etag=channel.etag, - request_headers=request_headers, - response_headers=response_headers, + etag = channel.etag, + request_headers = request_headers, + response_headers = response_headers, ) elif channel.date: d = feedparser.parse(channel.url, - modified=channel.date.utctimetuple(), - request_headers=request_headers, - response_headers=response_headers, + modified = channel.date.utctimetuple(), + request_headers = request_headers, + response_headers = response_headers, ) else: # We've not polled this feed before d = feedparser.parse(channel.url, - request_headers=request_headers, - response_headers=response_headers, + request_headers = request_headers, + response_headers = response_headers, ) if d.bozo: # Something doesn't seem right S3Msg.update_channel_status(channel_id, - status = "ERROR: %s" % d.bozo_exception.message, + #status = "ERROR: %s" % d.bozo_exception.message, + status = "ERROR: %s" % d.bozo_exception, period = (300, 3600), ) return # Update ETag/Last-polled now = current.request.utcnow - data = dict(date=now) + data = {"date": now} etag = d.get("etag", None) if etag: data["etag"] = etag diff --git a/modules/templates/SAMBRO/Demo/tasks.cfg b/modules/templates/SAMBRO/Demo/tasks.cfg index cc2099816..054962b17 100644 --- a/modules/templates/SAMBRO/Demo/tasks.cfg +++ b/modules/templates/SAMBRO/Demo/tasks.cfg @@ -72,7 +72,7 @@ cap,area,cap_area.csv,area.xsl *,import_xml,TropicalCyclone20150808.xml,cap,alert,cap *,import_xml,tsunami.xml,cap,alert,cap *,import_xml,Fiji_Water_Shortage_Suva.xml,cap,alert,cap -*,import_xml,Fiji_Tsunami_Level_0_Earthquake_Bulletin.xml,cap,alert,cap +#*,import_xml,Fiji_Tsunami_Level_0_Earthquake_Bulletin.xml,cap,alert,cap # # --- MSG --- # diff --git a/optional_requirements.txt b/optional_requirements.txt index 3ab97c848..4e0e643d1 100644 --- a/optional_requirements.txt +++ b/optional_requirements.txt @@ -15,6 +15,8 @@ pyserial>=2.6 tweepy>=1.9 # Warning: S3XLS unresolved dependency: xlrd required for XLS export xlrd>=0.7.1 +# Warning: S3MSG unresolved dependency: sgmllib3k required for Feed import on Python 3.x +sgmllib3k>=1.0.0 # Warning: Vulnerability unresolved dependency: numpy required for Vulnerability module support numpy>=1.6.2 # Warning: S3GIS unresolved dependency: selenium required for Map printing support
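
The conditional-GET flow that poll_rss() follows above can be exercised on its own. A minimal sketch, assuming the vendored package imports as `feedparser` (Eden's modules/ directory on the path) and using a hypothetical feed URL:

import feedparser

url = "https://example.org/feed.xml"  # hypothetical feed

# First poll: no cache validators yet
d = feedparser.parse(url)
etag = d.get("etag")          # ETag returned by the server, if any
modified = d.get("modified")  # Last-Modified returned by the server, if any

# Later polls: hand whichever validator we have back to the server
if etag:
    d = feedparser.parse(url, etag=etag)
elif modified:
    d = feedparser.parse(url, modified=modified)

# A server that honours the validators replies 304 with no entries, so
# there is nothing to re-import (unless, as noted in the hunk above, the
# server generates a fresh ETag/Last-Modified on every request)
if getattr(d, "status", None) == 304:
    print("Feed unchanged since last poll")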
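The FeedParserDict in modules/feedparser/util.py keeps feedparser's legacy key aliases (channel maps to feed, items to entries, and so on), so code written against older feedparser keys keeps working. A minimal sketch of the aliasing, with a hand-built dict rather than a parsed feed:

from feedparser.util import FeedParserDict

d = FeedParserDict()
d["feed"] = FeedParserDict(title="Example")
d["entries"] = []

assert d["channel"] is d["feed"]     # 'channel' is mapped to 'feed'
assert d["items"] is d["entries"]    # 'items' is mapped to 'entries'
assert d.channel.title == "Example"  # attribute access goes via __getattr__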
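Similarly, the sanitizer's href handling runs through make_safe_absolute_uri() in modules/feedparser/urls.py, which drops any URI whose scheme is not in ACCEPTABLE_URI_SCHEMES. A small sketch of that behaviour, with hypothetical URLs:

from feedparser.urls import make_safe_absolute_uri

# Relative references resolve against the base as usual
print(make_safe_absolute_uri("http://example.org/a/", "img.png"))
# -> http://example.org/a/img.png

# URIs with a scheme outside ACCEPTABLE_URI_SCHEMES collapse to ''
print(make_safe_absolute_uri("http://example.org/", "javascript:alert(1)"))
# -> ''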