-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Allow use of truncated Dictionary for coherence measures #1349
Changes from 32 commits
9d06a1f
f69a2ff
26de547
dfe159b
2e3852e
ec7af1b
3f8fb7f
b12edef
91b8a05
c6224b7
f00d389
327b739
3c0752b
e06c7c3
2ca43f7
825b0e9
314a400
e785773
5f78cdb
bbd2748
5fb0b95
880b8d0
1d32b8e
8e04b41
e3ce402
7f7f55d
343da69
1ce8a72
96fd343
a631ab6
5f58bda
b941f3c
75fcac8
96d1349
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,12 +20,13 @@ | |
|
||
import bz2 | ||
import logging | ||
import re | ||
from xml.etree.cElementTree import iterparse # LXML isn't faster, so let's go with the built-in solution | ||
import multiprocessing | ||
import re | ||
import signal | ||
from xml.etree.cElementTree import \ | ||
iterparse # LXML isn't faster, so let's go with the built-in solution | ||
|
||
from gensim import utils | ||
|
||
# cannot import whole gensim.corpora, because that imports wikicorpus... | ||
from gensim.corpora.dictionary import Dictionary | ||
from gensim.corpora.textcorpus import TextCorpus | ||
|
@@ -249,6 +250,11 @@ def process_article(args): | |
return result, title, pageid | ||
|
||
|
||
def init_to_ignore_interrupt(): | ||
"""Should only be used when master is prepared to handle termination of child processes.""" | ||
signal.signal(signal.SIGINT, signal.SIG_IGN) | ||
|
||
|
||
class WikiCorpus(TextCorpus): | ||
""" | ||
Treat a wikipedia articles dump (\*articles.xml.bz2) as a (read-only) corpus. | ||
|
@@ -260,7 +266,8 @@ class WikiCorpus(TextCorpus): | |
>>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format plus file with id->word | ||
|
||
""" | ||
def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',)): | ||
def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, | ||
filter_namespaces=('0',)): | ||
""" | ||
Initialize the corpus. Unless a dictionary is provided, this scans the | ||
corpus once, to determine its vocabulary. | ||
|
@@ -299,28 +306,39 @@ def get_texts(self): | |
""" | ||
articles, articles_all = 0, 0 | ||
positions, positions_all = 0, 0 | ||
texts = ((text, self.lemmatize, title, pageid) for title, text, pageid in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces)) | ||
pool = multiprocessing.Pool(self.processes) | ||
# process the corpus in smaller chunks of docs, because multiprocessing.Pool | ||
# is dumb and would load the entire input into RAM at once... | ||
for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1): | ||
for tokens, title, pageid in pool.imap(process_article, group): # chunksize=10): | ||
articles_all += 1 | ||
positions_all += len(tokens) | ||
# article redirects and short stubs are pruned here | ||
if len(tokens) < ARTICLE_MIN_WORDS or any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES): | ||
continue | ||
articles += 1 | ||
positions += len(tokens) | ||
if self.metadata: | ||
yield (tokens, (pageid, title)) | ||
else: | ||
yield tokens | ||
pool.terminate() | ||
|
||
logger.info( | ||
"finished iterating over Wikipedia corpus of %i documents with %i positions" | ||
" (total %i articles, %i positions before pruning articles shorter than %i words)", | ||
articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS) | ||
self.length = articles # cache corpus length | ||
texts = \ | ||
((text, self.lemmatize, title, pageid) | ||
for title, text, pageid | ||
in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces)) | ||
pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt) | ||
|
||
try: | ||
# process the corpus in smaller chunks of docs, because multiprocessing.Pool | ||
# is dumb and would load the entire input into RAM at once... | ||
for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1): | ||
for tokens, title, pageid in pool.imap(process_article, group): | ||
articles_all += 1 | ||
positions_all += len(tokens) | ||
# article redirects and short stubs are pruned here | ||
if len(tokens) < ARTICLE_MIN_WORDS or any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES): | ||
continue | ||
articles += 1 | ||
positions += len(tokens) | ||
if self.metadata: | ||
yield (tokens, (pageid, title)) | ||
else: | ||
yield tokens | ||
except KeyboardInterrupt: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is this for? Masking user interrupts is an anti-pattern; deserves a detailed comment, at the very least. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've updated the code to handle the interrupt more appropriately. As for why I chose to handle the interrupt at all (at the risk of repeating the above comment): this pool may be active during many other phases of gensim execution if the underlying corpus/texts being iterated come from the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry, not sure I understand. When iterating over a Now when you do CTRL+C, what happened (old behaviour)? And what happens now (after your changes here)? Why is that preferable? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here is the code I am running:
Before:
After:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So the advantage is a cleaner log? |
||
logger.warn( | ||
"user terminated iteration over Wikipedia corpus after %i documents with %i positions" | ||
" (total %i articles, %i positions before pruning articles shorter than %i words)", | ||
articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS) | ||
else: | ||
logger.info( | ||
"finished iterating over Wikipedia corpus of %i documents with %i positions" | ||
" (total %i articles, %i positions before pruning articles shorter than %i words)", | ||
articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS) | ||
self.length = articles # cache corpus length | ||
finally: | ||
pool.terminate() | ||
# endclass WikiCorpus |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Shouldn't the original interrupts be restored, once the pool is over (exception/terminated).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The interrupts are only disabled in the children, which are terminated with the Pool. I'm not sure why, but there is no place to reply to your comment about the disadvantages. The disadvantage, to my understanding, is that termination of the children must now be performed by the parent, instead of occurring exactly when the
KeyboardInterrupt
is sent to them. In this case, that is being done explicitly by calling pool.terminate()
. However, according to the docs, even if a second KeyboardInterrupt
is issued just after the first and the explicit call is not made, the GC will take care of calling it. The advantage here is a cleaner log, yes, but also not leaking the details of the wikicorpus iteration (multiprocessing or not), which can confuse developers and users when they interrupt some other part of the code. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see, thanks.