-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Allow use of truncated Dictionary for coherence measures #1349
Changes from 32 commits
9d06a1f
f69a2ff
26de547
dfe159b
2e3852e
ec7af1b
3f8fb7f
b12edef
91b8a05
c6224b7
f00d389
327b739
3c0752b
e06c7c3
2ca43f7
825b0e9
314a400
e785773
5f78cdb
bbd2748
5fb0b95
880b8d0
1d32b8e
8e04b41
e3ce402
7f7f55d
343da69
1ce8a72
96fd343
a631ab6
5f58bda
b941f3c
75fcac8
96d1349
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,12 +20,13 @@ | |
|
||
import bz2 | ||
import logging | ||
import re | ||
from xml.etree.cElementTree import iterparse # LXML isn't faster, so let's go with the built-in solution | ||
import multiprocessing | ||
import re | ||
import signal | ||
from xml.etree.cElementTree import \ | ||
iterparse # LXML isn't faster, so let's go with the built-in solution | ||
|
||
from gensim import utils | ||
|
||
# cannot import whole gensim.corpora, because that imports wikicorpus... | ||
from gensim.corpora.dictionary import Dictionary | ||
from gensim.corpora.textcorpus import TextCorpus | ||
|
@@ -249,6 +250,11 @@ def process_article(args): | |
return result, title, pageid | ||
|
||
|
||
def init_to_ignore_interrupt(): | ||
"""Should only be used when master is prepared to handle termination of child processes.""" | ||
signal.signal(signal.SIGINT, signal.SIG_IGN) | ||
|
||
|
||
class WikiCorpus(TextCorpus): | ||
""" | ||
Treat a wikipedia articles dump (\*articles.xml.bz2) as a (read-only) corpus. | ||
|
@@ -260,7 +266,8 @@ class WikiCorpus(TextCorpus): | |
>>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format plus file with id->word | ||
|
||
""" | ||
def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',)): | ||
def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, | ||
filter_namespaces=('0',)): | ||
""" | ||
Initialize the corpus. Unless a dictionary is provided, this scans the | ||
corpus once, to determine its vocabulary. | ||
|
@@ -299,28 +306,39 @@ def get_texts(self): | |
""" | ||
articles, articles_all = 0, 0 | ||
positions, positions_all = 0, 0 | ||
texts = ((text, self.lemmatize, title, pageid) for title, text, pageid in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces)) | ||
pool = multiprocessing.Pool(self.processes) | ||
# process the corpus in smaller chunks of docs, because multiprocessing.Pool | ||
# is dumb and would load the entire input into RAM at once... | ||
for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1): | ||
for tokens, title, pageid in pool.imap(process_article, group): # chunksize=10): | ||
articles_all += 1 | ||
positions_all += len(tokens) | ||
# article redirects and short stubs are pruned here | ||
if len(tokens) < ARTICLE_MIN_WORDS or any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES): | ||
continue | ||
articles += 1 | ||
positions += len(tokens) | ||
if self.metadata: | ||
yield (tokens, (pageid, title)) | ||
else: | ||
yield tokens | ||
pool.terminate() | ||
|
||
logger.info( | ||
"finished iterating over Wikipedia corpus of %i documents with %i positions" | ||
" (total %i articles, %i positions before pruning articles shorter than %i words)", | ||
articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS) | ||
self.length = articles # cache corpus length | ||
texts = \ | ||
((text, self.lemmatize, title, pageid) | ||
for title, text, pageid | ||
in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces)) | ||
pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt) | ||
|
||
try: | ||
# process the corpus in smaller chunks of docs, because multiprocessing.Pool | ||
# is dumb and would load the entire input into RAM at once... | ||
for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1): | ||
for tokens, title, pageid in pool.imap(process_article, group): | ||
articles_all += 1 | ||
positions_all += len(tokens) | ||
# article redirects and short stubs are pruned here | ||
if len(tokens) < ARTICLE_MIN_WORDS or any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES): | ||
continue | ||
articles += 1 | ||
positions += len(tokens) | ||
if self.metadata: | ||
yield (tokens, (pageid, title)) | ||
else: | ||
yield tokens | ||
except KeyboardInterrupt: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is this for? Masking user interrupts is an anti-pattern; deserves a detailed comment, at the very least. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've updated the code to handle the interrupt more appropriately. As for why I chose to handle the interrupt at all (at the risk of repeating the above comment): this pool may be active during many other phases of gensim execution if the underlying corpus/texts being iterated come from the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry, not sure I understand. When iterating over a Now when you do CTRL+C, what happened (old behaviour)? And what happens now (after your changes here)? Why is that preferable? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here is the code I am running:
Before:
After:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So the advantage is a cleaner log? |
||
logger.warn( | ||
"user terminated iteration over Wikipedia corpus after %i documents with %i positions" | ||
" (total %i articles, %i positions before pruning articles shorter than %i words)", | ||
articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS) | ||
else: | ||
logger.info( | ||
"finished iterating over Wikipedia corpus of %i documents with %i positions" | ||
" (total %i articles, %i positions before pruning articles shorter than %i words)", | ||
articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS) | ||
self.length = articles # cache corpus length | ||
finally: | ||
pool.terminate() | ||
# endclass WikiCorpus |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Shouldn't the original interrupts be restored, once the pool is over (exception/terminated).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The interrupts are only disabled in the children, which are terminated with the Pool. I'm not sure why, but there is no place to reply to your comment about the disadvantages. The disadvantage, to my understanding, is that termination of the children must now be performed by the parent, instead of occurring exactly when the
KeyboardInterrupt
is sent to them. In this case, that is being done explicitly by calling pool.terminate()
. However, according to the docs, even if a second KeyboardInterrupt
is issued just after the first and the explicit call is not made, the GC will take care of calling it. The advantage here is a cleaner log, yes, but also not leaking the details of the wikicorpus iteration (multiprocessing or not), which can confuse developers and users when they interrupt some other part of the code. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see, thanks.