Skip to content

Commit

Permalink
Fixed handling of templates.
Browse files Browse the repository at this point in the history
attardi committed Jan 24, 2023

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
1 parent f0ca16c commit 8f1b434
Showing 2 changed files with 20 additions and 17 deletions.
20 changes: 10 additions & 10 deletions wikiextractor/WikiExtractor.py
Original file line number Diff line number Diff line change
@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-

# =============================================================================
# Version: 3.0 (July 22, 2020)
# Version: 3.0 (January 24, 2023)
# Author: Giuseppe Attardi ([email protected]), University of Pisa
#
# Contributors:
@@ -17,7 +17,7 @@
# Nick Ulven (nulven@github)
#
# =============================================================================
# Copyright (c) 2009-2020. Giuseppe Attardi ([email protected]).
# Copyright (c) 2009-2023. Giuseppe Attardi ([email protected]).
# =============================================================================
# This file is part of Tanl.
#
@@ -68,7 +68,7 @@
# ===========================================================================

# Program version
__version__ = '3.0.6'
__version__ = '3.0.7'

##
# Defined in <siteinfo>
@@ -194,6 +194,7 @@ def load_templates(file, output_file=None):
"""
Load templates from :param file:.
:param output_file: file where to save templates and modules.
:return: number of templates loaded.
"""
global templateNamespace
global moduleNamespace, modulePrefix
@@ -335,14 +336,16 @@ def collect_pages(text):


def process_dump(input_file, template_file, out_file, file_size, file_compress,
process_count, html_safe):
process_count, html_safe, expand_templates=True):
"""
:param input_file: name of the wikipedia dump file; '-' to read from stdin
:param template_file: optional file with template definitions.
:param out_file: directory where to store extracted data, or '-' for stdout
:param file_size: max size of each extracted file, or None for no max (one file)
:param file_compress: whether to compress files with bzip.
:param process_count: number of extraction processes to spawn.
:param html_safe: whether to convert entities in text to HTML.
:param expand_templates: whether to expand templates.
"""
global knownNamespaces
global templateNamespace
@@ -528,7 +531,7 @@ def reduce_process(output_queue, output):

def main():
global acceptedNamespaces
global expand_templates, templateCache
global templateCache

parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
formatter_class=argparse.RawDescriptionHelpFormatter,
@@ -555,7 +558,7 @@ def main():
help="accepted namespaces")
groupP.add_argument("--templates",
help="use or create file containing templates")
groupP.add_argument("--no-templates", action="store_false",
groupP.add_argument("--no-templates", action="store_true",
help="Do not expand templates")
groupP.add_argument("--html-safe", default=True,
help="use to produce HTML safe output within <doc>...</doc>")
@@ -582,8 +585,6 @@ def main():
Extractor.keepLinks = True
Extractor.to_json = args.json

expand_templates = args.no_templates

try:
power = 'kmg'.find(args.bytes[-1].lower()) + 1
# 0 bytes means put a single article per file.
@@ -636,8 +637,7 @@ def main():
return

process_dump(input_file, args.templates, output_path, file_size,
args.compress, args.processes, args.html_safe)

args.compress, args.processes, args.html_safe, not args.no_templates)

if __name__ == '__main__':
main()
17 changes: 10 additions & 7 deletions wikiextractor/extract.py
Original file line number Diff line number Diff line change
@@ -26,7 +26,6 @@
from html.entities import name2codepoint
import logging
import time
import pdb # DEBUG

# ----------------------------------------------------------------------

@@ -82,7 +81,6 @@ def clean(extractor, text, expand_templates=False, html_safe=True):
if expand_templates:
# expand templates
# See: http://www.mediawiki.org/wiki/Help:Templates
pdb.set_trace() # DEBUG
text = extractor.expandTemplates(text)
else:
# Drop transclusions (template, parser functions)
@@ -830,7 +828,7 @@ def subst(self, params, extractor, depth=0):
# {{ppp|q=r|p=q}} gives r, but using Template:tvvv containing
# "{{{{{{{{{p}}}}}}}}}", {{tvvv|p=q|q=r|r=s}} gives s.

#logging.debug('subst tpl (%d, %d) %s', len(extractor.frame), depth, self)
logging.debug('subst tpl (%d, %d) %s', len(extractor.frame), depth, self)

if depth > extractor.maxParameterRecursionLevels:
extractor.recursion_exceeded_3_errs += 1
@@ -952,6 +950,7 @@ def clean_text(self, text, mark_headers=False, expand_templates=True,
e.g. "## Section 1"
"""
self.magicWords['namespace'] = self.title[:max(0, self.title.find(":"))]
#self.magicWords['namespacenumber'] = '0' # for article,
self.magicWords['pagename'] = self.title
self.magicWords['fullpagename'] = self.title
self.magicWords['currentyear'] = time.strftime('%Y')
@@ -1008,7 +1007,7 @@ def extract(self, out, html_safe=True):
# Expand templates

maxTemplateRecursionLevels = 30
maxParameterRecursionLevels = 10
maxParameterRecursionLevels = 16

# check for template beginning
reOpen = re.compile('(?<!{){{(?!{)', re.DOTALL)
@@ -1764,13 +1763,17 @@ def sharp_invoke(module, function, frame):

'int': lambda string, *rest: str(int(string)),

'padleft': lambda char, width, string: string.ljust(char, int(pad)), # CHECK_ME

}


def callParserFunction(functionName, args, frame):
"""
Parser functions have a syntax similar to templates, except that
the first argument is everything after the first colon.
:param functionName: name of the parser function
:param args: the arguments to the function
:return: the result of the invocation, None in case of failure.
http://meta.wikimedia.org/wiki/Help:ParserFunctions
@@ -1780,11 +1783,11 @@ def callParserFunction(functionName, args, frame):
if functionName == '#invoke':
# special handling of frame
ret = sharp_invoke(args[0].strip(), args[1].strip(), frame)
# logging.debug('parserFunction> %s %s', functionName, ret)
# logging.debug('parserFunction> %s %s', args[1], ret)
return ret
if functionName in parserFunctions:
ret = parserFunctions[functionName](*args)
# logging.debug('parserFunction> %s %s', functionName, ret)
# logging.debug('parserFunction> %s(%s) %s', functionName, args, ret)
return ret
except:
return "" # FIXME: fix errors
@@ -1851,6 +1854,6 @@ def define_template(title, page):
text = reIncludeonly.sub('', text)

if text:
if title in templates:
if title in templates and templates[title] != text:
logging.warn('Redefining: %s', title)
templates[title] = text

0 comments on commit 8f1b434

Please sign in to comment.