Fixed handling of templates.

attardi · Jan 24, 2023 · 8f1b434 · 8f1b434
1 parent f0ca16c
commit 8f1b434
Showing 2 changed files with 20 additions and 17 deletions.
diff --git a/wikiextractor/WikiExtractor.py b/wikiextractor/WikiExtractor.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 
 # =============================================================================
-#  Version: 3.0 (July 22, 2020)
+#  Version: 3.0 (January 24, 2023)
 #  Author: Giuseppe Attardi ([email protected]), University of Pisa
 #
 #  Contributors:
@@ -17,7 +17,7 @@
 #   Nick Ulven (nulven@github)
 #
 # =============================================================================
-#  Copyright (c) 2009-2020. Giuseppe Attardi ([email protected]).
+#  Copyright (c) 2009-2023. Giuseppe Attardi ([email protected]).
 # =============================================================================
 #  This file is part of Tanl.
 #
@@ -68,7 +68,7 @@
 # ===========================================================================
 
 # Program version
-__version__ = '3.0.6'
+__version__ = '3.0.7'
 
 ##
 # Defined in <siteinfo>
@@ -194,6 +194,7 @@ def load_templates(file, output_file=None):
     """
     Load templates from :param file:.
     :param output_file: file where to save templates and modules.
+    :return: number of templates loaded.
     """
     global templateNamespace
     global moduleNamespace, modulePrefix
@@ -335,14 +336,16 @@ def collect_pages(text):
 
 
 def process_dump(input_file, template_file, out_file, file_size, file_compress,
-                 process_count, html_safe):
+                 process_count, html_safe, expand_templates=True):
     """
     :param input_file: name of the wikipedia dump file; '-' to read from stdin
     :param template_file: optional file with template definitions.
     :param out_file: directory where to store extracted data, or '-' for stdout
     :param file_size: max size of each extracted file, or None for no max (one file)
     :param file_compress: whether to compress files with bzip.
     :param process_count: number of extraction processes to spawn.
+    :param html_safe: whether to convert entities in text to HTML.
+    :param expand_templates: whether to expand templates.
     """
     global knownNamespaces
     global templateNamespace
@@ -528,7 +531,7 @@ def reduce_process(output_queue, output):
 
 def main():
     global acceptedNamespaces
-    global expand_templates, templateCache
+    global templateCache
 
     parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                      formatter_class=argparse.RawDescriptionHelpFormatter,
@@ -555,7 +558,7 @@ def main():
                         help="accepted namespaces")
     groupP.add_argument("--templates",
                         help="use or create file containing templates")
-    groupP.add_argument("--no-templates", action="store_false",
+    groupP.add_argument("--no-templates", action="store_true",
                         help="Do not expand templates")
     groupP.add_argument("--html-safe", default=True,
                         help="use to produce HTML safe output within <doc>...</doc>")
@@ -582,8 +585,6 @@ def main():
         Extractor.keepLinks = True
     Extractor.to_json = args.json
 
-    expand_templates = args.no_templates
-
     try:
         power = 'kmg'.find(args.bytes[-1].lower()) + 1
         # 0 bytes means put a single article per file.
@@ -636,8 +637,7 @@ def main():
             return
 
     process_dump(input_file, args.templates, output_path, file_size,
-                 args.compress, args.processes, args.html_safe)
-
+                 args.compress, args.processes, args.html_safe, not args.no_templates)
 
 if __name__ == '__main__':
     main()
diff --git a/wikiextractor/extract.py b/wikiextractor/extract.py
@@ -26,7 +26,6 @@
 from html.entities import name2codepoint
 import logging
 import time
-import pdb                      # DEBUG
 
 # ----------------------------------------------------------------------
 
@@ -82,7 +81,6 @@ def clean(extractor, text, expand_templates=False, html_safe=True):
     if expand_templates:
         # expand templates
         # See: http://www.mediawiki.org/wiki/Help:Templates
-        pdb.set_trace()         # DEBUG
         text = extractor.expandTemplates(text)
     else:
         # Drop transclusions (template, parser functions)
@@ -830,7 +828,7 @@ def subst(self, params, extractor, depth=0):
         # {{ppp|q=r|p=q}} gives r, but using Template:tvvv containing
         # "{{{{{{{{{p}}}}}}}}}", {{tvvv|p=q|q=r|r=s}} gives s.
 
-        #logging.debug('subst tpl (%d, %d) %s', len(extractor.frame), depth, self)
+        logging.debug('subst tpl (%d, %d) %s', len(extractor.frame), depth, self)
 
         if depth > extractor.maxParameterRecursionLevels:
             extractor.recursion_exceeded_3_errs += 1
@@ -952,6 +950,7 @@ def clean_text(self, text, mark_headers=False, expand_templates=True,
           e.g. "## Section 1"
         """
         self.magicWords['namespace'] = self.title[:max(0, self.title.find(":"))]
+        #self.magicWords['namespacenumber'] = '0' # for article, 
         self.magicWords['pagename'] = self.title
         self.magicWords['fullpagename'] = self.title
         self.magicWords['currentyear'] = time.strftime('%Y')
@@ -1008,7 +1007,7 @@ def extract(self, out, html_safe=True):
     # Expand templates
 
     maxTemplateRecursionLevels = 30
-    maxParameterRecursionLevels = 10
+    maxParameterRecursionLevels = 16
 
     # check for template beginning
     reOpen = re.compile('(?<!{){{(?!{)', re.DOTALL)
@@ -1764,13 +1763,17 @@ def sharp_invoke(module, function, frame):
 
     'int': lambda string, *rest: str(int(string)),
 
+    'padleft': lambda string, width, pad='0', *rest: (pad * min(int(width), 500))[:max(0, min(int(width), 500) - len(string))] + string,  # {{padleft:string|width|pad}}: left-pad string to width with (possibly multi-char) pad, default '0'; MediaWiki caps width at 500
+
 }
 
 
 def callParserFunction(functionName, args, frame):
     """
     Parser functions have similar syntax as templates, except that
     the first argument is everything after the first colon.
+    :param functionName: name of the parser function
+    :param args: the arguments to the function
     :return: the result of the invocation, None in case of failure.
 
     http://meta.wikimedia.org/wiki/Help:ParserFunctions
@@ -1780,11 +1783,11 @@ def callParserFunction(functionName, args, frame):
         if functionName == '#invoke':
             # special handling of frame
             ret = sharp_invoke(args[0].strip(), args[1].strip(), frame)
-            # logging.debug('parserFunction> %s %s', functionName, ret)
+            # logging.debug('parserFunction> %s %s', args[1], ret)
             return ret
         if functionName in parserFunctions:
             ret = parserFunctions[functionName](*args)
-            # logging.debug('parserFunction> %s %s', functionName, ret)
+            # logging.debug('parserFunction> %s(%s) %s', functionName, args, ret)
             return ret
     except:
         return ""  # FIXME: fix errors
@@ -1851,6 +1854,6 @@ def define_template(title, page):
         text = reIncludeonly.sub('', text)
 
     if text:
-        if title in templates:
+        if title in templates and templates[title] != text:
             logging.warn('Redefining: %s', title)
         templates[title] = text