From b260d4b07114b1c449292cda492a0842b19445ce Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Sat, 30 Sep 2017 15:39:56 +0500 Subject: [PATCH 01/48] Fix typo --- gensim/corpora/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/corpora/__init__.py b/gensim/corpora/__init__.py index 0d51a9b903..aa122d1833 100644 --- a/gensim/corpora/__init__.py +++ b/gensim/corpora/__init__.py @@ -1,5 +1,5 @@ """ -This package contains implementations of various streaming corpus I/O format. +This package contains implementations of various streaming corpus I/O formats. """ # bring corpus classes directly into package namespace, to save some typing From 36d98d11eb464ed74f7e6c22b45adbec7e5618e0 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Mon, 2 Oct 2017 14:42:31 +0500 Subject: [PATCH 02/48] Make `save_corpus` private --- gensim/corpora/bleicorpus.py | 14 ++++++++------ gensim/corpora/indexedcorpus.py | 8 ++++---- gensim/corpora/lowcorpus.py | 2 +- gensim/corpora/malletcorpus.py | 2 +- gensim/corpora/mmcorpus.py | 2 +- gensim/corpora/sharded_corpus.py | 4 ++-- gensim/corpora/svmlightcorpus.py | 2 +- gensim/corpora/ucicorpus.py | 2 +- gensim/interfaces.py | 8 ++++++-- gensim/models/wrappers/dtmmodel.py | 2 +- gensim/test/test_corpora.py | 4 ++-- gensim/test/test_miislita.py | 2 +- 12 files changed, 29 insertions(+), 23 deletions(-) diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index 6bd96da716..273759aca6 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -5,9 +5,7 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Blei's LDA-C format. -""" +"""Blei's LDA-C format.""" from __future__ import with_statement @@ -41,8 +39,9 @@ def __init__(self, fname, fname_vocab=None): """ Initialize the corpus from a file. - `fname_vocab` is the file with vocabulary; if not specified, it defaults to - `fname.vocab`. + Args: + fname (str): serialized corpus's filename + fname_vocab (str): vocabulary file; takes precedence over fname.vocab """ IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s", fname) @@ -85,7 +84,7 @@ def line2doc(self, line): return doc @staticmethod - def save_corpus(fname, corpus, id2word=None, metadata=False): + def __save_corpus(fname, corpus, id2word=None, metadata=False): """ Save a corpus in the LDA-C format. @@ -94,6 +93,9 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): This function is automatically called by `BleiCorpus.serialize`; don't call it directly, call `serialize` instead. 
+ + Args: + """ if id2word is None: logger.info("no word id mapping provided; initializing from corpus") diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py index af79a2fd5f..ca0debbfd5 100644 --- a/gensim/corpora/indexedcorpus.py +++ b/gensim/corpora/indexedcorpus.py @@ -85,14 +85,14 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres if progress_cnt is not None: if labels is not None: - offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, progress_cnt=progress_cnt, metadata=metadata) + offsets = serializer.__save_corpus(fname, corpus, id2word, labels=labels, progress_cnt=progress_cnt, metadata=metadata) else: - offsets = serializer.save_corpus(fname, corpus, id2word, progress_cnt=progress_cnt, metadata=metadata) + offsets = serializer.__save_corpus(fname, corpus, id2word, progress_cnt=progress_cnt, metadata=metadata) else: if labels is not None: - offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, metadata=metadata) + offsets = serializer.__save_corpus(fname, corpus, id2word, labels=labels, metadata=metadata) else: - offsets = serializer.save_corpus(fname, corpus, id2word, metadata=metadata) + offsets = serializer.__save_corpus(fname, corpus, id2word, metadata=metadata) if offsets is None: raise NotImplementedError("called serialize on class %s which doesn't support indexing!" % serializer.__name__) diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py index d5265f6571..572470af2e 100644 --- a/gensim/corpora/lowcorpus.py +++ b/gensim/corpora/lowcorpus.py @@ -140,7 +140,7 @@ def __iter__(self): yield self.line2doc(line) @staticmethod - def save_corpus(fname, corpus, id2word=None, metadata=False): + def __save_corpus(fname, corpus, id2word=None, metadata=False): """ Save a corpus in the List-of-words format. diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py index cacf0074bd..b6dc482dcc 100644 --- a/gensim/corpora/malletcorpus.py +++ b/gensim/corpora/malletcorpus.py @@ -67,7 +67,7 @@ def line2doc(self, line): return doc @staticmethod - def save_corpus(fname, corpus, id2word=None, metadata=False): + def __save_corpus(fname, corpus, id2word=None, metadata=False): """ Save a corpus in the Mallet format. diff --git a/gensim/corpora/mmcorpus.py b/gensim/corpora/mmcorpus.py index 2158f0a526..1eaadfb332 100644 --- a/gensim/corpora/mmcorpus.py +++ b/gensim/corpora/mmcorpus.py @@ -38,7 +38,7 @@ def __iter__(self): yield doc # get rid of doc id, return the sparse vector only @staticmethod - def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): + def __save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): """ Save a corpus in the Matrix Market format to disk. diff --git a/gensim/corpora/sharded_corpus.py b/gensim/corpora/sharded_corpus.py index 4d0fde4999..d6596aa831 100644 --- a/gensim/corpora/sharded_corpus.py +++ b/gensim/corpora/sharded_corpus.py @@ -764,7 +764,7 @@ def load(cls, fname, mmap=None): return super(ShardedCorpus, cls).load(fname, mmap) @staticmethod - def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs): + def __save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs): """ Implement a serialization interface. Do not call directly; use the `serialize` method instead. 
@@ -799,4 +799,4 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres Ignore the parameters id2word, index_fname, progress_cnt, labels and metadata. They currently do nothing and are here only to provide a compatible method signature with superclass.""" - serializer.save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs) + serializer.__save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs) diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py index 290414836e..34527c31b7 100644 --- a/gensim/corpora/svmlightcorpus.py +++ b/gensim/corpora/svmlightcorpus.py @@ -79,7 +79,7 @@ def __iter__(self): self.length = lineno + 1 @staticmethod - def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): + def __save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): """ Save a corpus in the SVMlight format. diff --git a/gensim/corpora/ucicorpus.py b/gensim/corpora/ucicorpus.py index a8911ee07f..995ce3e6ad 100644 --- a/gensim/corpora/ucicorpus.py +++ b/gensim/corpora/ucicorpus.py @@ -192,7 +192,7 @@ def create_dictionary(self): return dictionary @staticmethod - def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False): + def __save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False): """ Save a corpus in the UCI Bag-of-Words format. diff --git a/gensim/interfaces.py b/gensim/interfaces.py index 81f85a8527..cf9a10b123 100644 --- a/gensim/interfaces.py +++ b/gensim/interfaces.py @@ -71,14 +71,14 @@ def __len__(self): # return sum(1 for doc in self) # sum(empty generator) == 0, so this works even for an empty corpus @staticmethod - def save_corpus(fname, corpus, id2word=None, metadata=False): + def __save_corpus(fname, corpus, id2word=None, metadata=False): """ Save an existing `corpus` to disk. Some formats also support saving the dictionary (`feature_id->word` mapping), which can in this case be provided by the optional `id2word` parameter. - >>> MmCorpus.save_corpus('file.mm', corpus) + >>> MmCorpus.__save_corpus('file.mm', corpus) Some corpora also support an index of where each document begins, so that the documents on disk can be accessed in O(1) time (see the @@ -100,6 +100,10 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): fmt = str(doc) # format the document appropriately... 
fout.write(utils.to_utf8("%s\n" % fmt)) # serialize the formatted document to disk + def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, + metadata=False): + pass + class TransformedCorpus(CorpusABC): def __init__(self, obj, corpus, chunksize=None, **kwargs): diff --git a/gensim/models/wrappers/dtmmodel.py b/gensim/models/wrappers/dtmmodel.py index 1f450a457a..34e9e7bc6b 100644 --- a/gensim/models/wrappers/dtmmodel.py +++ b/gensim/models/wrappers/dtmmodel.py @@ -175,7 +175,7 @@ def convert_input(self, corpus, time_slices): """ logger.info("serializing temporary corpus to %s", self.fcorpustxt()) # write out the corpus in a file format that DTM understands: - corpora.BleiCorpus.save_corpus(self.fcorpustxt(), corpus) + corpora.BleiCorpus.__save_corpus(self.fcorpustxt(), corpus) with utils.smart_open(self.ftimeslices(), 'wb') as fout: fout.write(utils.to_utf8(str(len(self.time_slices)) + "\n")) diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py index 3f7f4e8149..9f2c1967f0 100644 --- a/gensim/test/test_corpora.py +++ b/gensim/test/test_corpora.py @@ -109,7 +109,7 @@ def test_save(self): corpus = self.TEST_CORPUS # make sure the corpus can be saved - self.corpus_class.save_corpus(testfile(), corpus) + self.corpus_class.__save_corpus(testfile(), corpus) # and loaded back, resulting in exactly the same corpus corpus2 = list(self.corpus_class(testfile())) @@ -253,7 +253,7 @@ def setUp(self): def test_save_format_for_dtm(self): corpus = [[(1, 1.0)], [], [(0, 5.0), (2, 1.0)], []] test_file = testfile() - self.corpus_class.save_corpus(test_file, corpus) + self.corpus_class.__save_corpus(test_file, corpus) with open(test_file) as f: for line in f: # unique_word_count index1:count1 index2:count2 ... indexn:counnt diff --git a/gensim/test/test_miislita.py b/gensim/test/test_miislita.py index dd660f629f..e126d2ccb8 100644 --- a/gensim/test/test_miislita.py +++ b/gensim/test/test_miislita.py @@ -64,7 +64,7 @@ def test_textcorpus(self): # make sure serializing works ftmp = get_tmpfile('test_textcorpus.mm') - corpora.MmCorpus.save_corpus(ftmp, miislita) + corpora.MmCorpus.__save_corpus(ftmp, miislita) self.assertTrue(os.path.exists(ftmp)) # make sure deserializing gives the same result From 981ebbbbabcf95ae7e2629266bcfb7d9931b7694 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Mon, 2 Oct 2017 22:13:22 +0500 Subject: [PATCH 03/48] Annotate `bleicorpus.py` --- gensim/corpora/bleicorpus.py | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index 273759aca6..ef966b77cb 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -5,7 +5,9 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -"""Blei's LDA-C format.""" +""" +Blei's LDA-C format. 
+""" from __future__ import with_statement @@ -41,7 +43,9 @@ def __init__(self, fname, fname_vocab=None): Args: fname (str): serialized corpus's filename - fname_vocab (str): vocabulary file; takes precedence over fname.vocab + fname_vocab (:obj:`str`, optional): vocabulary file; takes precedence over fname.vocab + Raises: + IOError: If vocabulary file doesn't exist """ IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s", fname) @@ -76,6 +80,15 @@ def __iter__(self): self.length = lineno + 1 def line2doc(self, line): + """ + Args: + line (str): document's string representation + Returns: + :obj:`list` of (:obj:`int`, :obj:`float`): + document's list representation + Raises: + ValueError: If format is invalid + """ parts = utils.to_unicode(line).split() if int(parts[0]) != len(parts) - 1: raise ValueError("invalid format in %s: %s" % (self.fname, repr(line))) @@ -91,11 +104,14 @@ def __save_corpus(fname, corpus, id2word=None, metadata=False): There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file. - This function is automatically called by `BleiCorpus.serialize`; don't - call it directly, call `serialize` instead. - Args: - + fname (str): filename + corpus : yields documents + id2word (:obj:`dict` of (:obj:`str`, :obj:`str`), optional): + transforms id to word + metadata (bool): any additional info + Returns: + :obj:`list` of :obj:`int`: fields' offsets """ if id2word is None: logger.info("no word id mapping provided; initializing from corpus") @@ -124,7 +140,12 @@ def __save_corpus(fname, corpus, id2word=None, metadata=False): def docbyoffset(self, offset): """ - Return the document stored at file position `offset`. + Return document corresponding to `offset`. + + Args: + offset (int): position of the document in the file + Returns: + :obj:`list` of (:obj:`int`, :obj:`float`): document's list representation """ with utils.smart_open(self.fname) as f: f.seek(offset) From 342811371b368315786ac8097a90e6612bba9e45 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Mon, 2 Oct 2017 22:57:03 +0500 Subject: [PATCH 04/48] Make __save_corpus weakly private --- gensim/corpora/bleicorpus.py | 2 +- gensim/corpora/indexedcorpus.py | 8 ++++---- gensim/corpora/lowcorpus.py | 2 +- gensim/corpora/malletcorpus.py | 2 +- gensim/corpora/mmcorpus.py | 2 +- gensim/corpora/sharded_corpus.py | 4 ++-- gensim/corpora/svmlightcorpus.py | 2 +- gensim/corpora/ucicorpus.py | 2 +- gensim/interfaces.py | 4 ++-- gensim/test/test_corpora.py | 2 +- 10 files changed, 15 insertions(+), 15 deletions(-) diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index ef966b77cb..d5eb0da8be 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -97,7 +97,7 @@ def line2doc(self, line): return doc @staticmethod - def __save_corpus(fname, corpus, id2word=None, metadata=False): + def _save_corpus(fname, corpus, id2word=None, metadata=False): """ Save a corpus in the LDA-C format. 
diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py index ca0debbfd5..ec79d152dc 100644 --- a/gensim/corpora/indexedcorpus.py +++ b/gensim/corpora/indexedcorpus.py @@ -85,14 +85,14 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres if progress_cnt is not None: if labels is not None: - offsets = serializer.__save_corpus(fname, corpus, id2word, labels=labels, progress_cnt=progress_cnt, metadata=metadata) + offsets = serializer._save_corpus(fname, corpus, id2word, labels=labels, progress_cnt=progress_cnt, metadata=metadata) else: - offsets = serializer.__save_corpus(fname, corpus, id2word, progress_cnt=progress_cnt, metadata=metadata) + offsets = serializer._save_corpus(fname, corpus, id2word, progress_cnt=progress_cnt, metadata=metadata) else: if labels is not None: - offsets = serializer.__save_corpus(fname, corpus, id2word, labels=labels, metadata=metadata) + offsets = serializer._save_corpus(fname, corpus, id2word, labels=labels, metadata=metadata) else: - offsets = serializer.__save_corpus(fname, corpus, id2word, metadata=metadata) + offsets = serializer._save_corpus(fname, corpus, id2word, metadata=metadata) if offsets is None: raise NotImplementedError("called serialize on class %s which doesn't support indexing!" % serializer.__name__) diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py index 572470af2e..53ae8e21c4 100644 --- a/gensim/corpora/lowcorpus.py +++ b/gensim/corpora/lowcorpus.py @@ -140,7 +140,7 @@ def __iter__(self): yield self.line2doc(line) @staticmethod - def __save_corpus(fname, corpus, id2word=None, metadata=False): + def _save_corpus(fname, corpus, id2word=None, metadata=False): """ Save a corpus in the List-of-words format. diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py index b6dc482dcc..90de7a3c76 100644 --- a/gensim/corpora/malletcorpus.py +++ b/gensim/corpora/malletcorpus.py @@ -67,7 +67,7 @@ def line2doc(self, line): return doc @staticmethod - def __save_corpus(fname, corpus, id2word=None, metadata=False): + def _save_corpus(fname, corpus, id2word=None, metadata=False): """ Save a corpus in the Mallet format. diff --git a/gensim/corpora/mmcorpus.py b/gensim/corpora/mmcorpus.py index 1eaadfb332..0380e09066 100644 --- a/gensim/corpora/mmcorpus.py +++ b/gensim/corpora/mmcorpus.py @@ -38,7 +38,7 @@ def __iter__(self): yield doc # get rid of doc id, return the sparse vector only @staticmethod - def __save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): + def _save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): """ Save a corpus in the Matrix Market format to disk. diff --git a/gensim/corpora/sharded_corpus.py b/gensim/corpora/sharded_corpus.py index d6596aa831..412f9f4b1d 100644 --- a/gensim/corpora/sharded_corpus.py +++ b/gensim/corpora/sharded_corpus.py @@ -764,7 +764,7 @@ def load(cls, fname, mmap=None): return super(ShardedCorpus, cls).load(fname, mmap) @staticmethod - def __save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs): + def _save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs): """ Implement a serialization interface. Do not call directly; use the `serialize` method instead. @@ -799,4 +799,4 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres Ignore the parameters id2word, index_fname, progress_cnt, labels and metadata. 
They currently do nothing and are here only to provide a compatible method signature with superclass.""" - serializer.__save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs) + serializer._save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs) diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py index 34527c31b7..cfb821e085 100644 --- a/gensim/corpora/svmlightcorpus.py +++ b/gensim/corpora/svmlightcorpus.py @@ -79,7 +79,7 @@ def __iter__(self): self.length = lineno + 1 @staticmethod - def __save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): + def _save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): """ Save a corpus in the SVMlight format. diff --git a/gensim/corpora/ucicorpus.py b/gensim/corpora/ucicorpus.py index 995ce3e6ad..84efcb406a 100644 --- a/gensim/corpora/ucicorpus.py +++ b/gensim/corpora/ucicorpus.py @@ -192,7 +192,7 @@ def create_dictionary(self): return dictionary @staticmethod - def __save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False): + def _save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False): """ Save a corpus in the UCI Bag-of-Words format. diff --git a/gensim/interfaces.py b/gensim/interfaces.py index cf9a10b123..adcab45b1d 100644 --- a/gensim/interfaces.py +++ b/gensim/interfaces.py @@ -71,14 +71,14 @@ def __len__(self): # return sum(1 for doc in self) # sum(empty generator) == 0, so this works even for an empty corpus @staticmethod - def __save_corpus(fname, corpus, id2word=None, metadata=False): + def _save_corpus(fname, corpus, id2word=None, metadata=False): """ Save an existing `corpus` to disk. Some formats also support saving the dictionary (`feature_id->word` mapping), which can in this case be provided by the optional `id2word` parameter. - >>> MmCorpus.__save_corpus('file.mm', corpus) + >>> MmCorpus._save_corpus('file.mm', corpus) Some corpora also support an index of where each document begins, so that the documents on disk can be accessed in O(1) time (see the diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py index 9f2c1967f0..f6c32fc761 100644 --- a/gensim/test/test_corpora.py +++ b/gensim/test/test_corpora.py @@ -109,7 +109,7 @@ def test_save(self): corpus = self.TEST_CORPUS # make sure the corpus can be saved - self.corpus_class.__save_corpus(testfile(), corpus) + self.corpus_class._save_corpus(testfile(), corpus) # and loaded back, resulting in exactly the same corpus corpus2 = list(self.corpus_class(testfile())) From 69fc7e04a1c82cc7b72be231bbd3df207f50fe0b Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Mon, 2 Oct 2017 23:39:33 +0500 Subject: [PATCH 05/48] Fix _save_corpus in tests --- gensim/test/test_corpora.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py index f6c32fc761..6b648c9550 100644 --- a/gensim/test/test_corpora.py +++ b/gensim/test/test_corpora.py @@ -253,7 +253,7 @@ def setUp(self): def test_save_format_for_dtm(self): corpus = [[(1, 1.0)], [], [(0, 5.0), (2, 1.0)], []] test_file = testfile() - self.corpus_class.__save_corpus(test_file, corpus) + self.corpus_class._save_corpus(test_file, corpus) with open(test_file) as f: for line in f: # unique_word_count index1:count1 index2:count2 ... 
indexn:counnt From b65a69a4b0313a7670b620a28411478ed8715cca Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Tue, 3 Oct 2017 12:07:02 +0500 Subject: [PATCH 06/48] Fix _save_corpus[2] --- gensim/corpora/textcorpus.py | 2 +- gensim/models/wrappers/dtmmodel.py | 2 +- gensim/test/test_miislita.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py index 7265d20d0c..b83572ee20 100644 --- a/gensim/corpora/textcorpus.py +++ b/gensim/corpora/textcorpus.py @@ -44,7 +44,7 @@ def remove_stopwords(tokens, stopwords=STOPWORDS): - """Remove stopwords using list from `gensim.parsing.preprocessing.STOPWORDS.""" + """Remove stopwords using list from `gensim.parsing.preprocessing.STOPWORDS.`""" return [token for token in tokens if token not in stopwords] diff --git a/gensim/models/wrappers/dtmmodel.py b/gensim/models/wrappers/dtmmodel.py index 34e9e7bc6b..68e4f62e13 100644 --- a/gensim/models/wrappers/dtmmodel.py +++ b/gensim/models/wrappers/dtmmodel.py @@ -175,7 +175,7 @@ def convert_input(self, corpus, time_slices): """ logger.info("serializing temporary corpus to %s", self.fcorpustxt()) # write out the corpus in a file format that DTM understands: - corpora.BleiCorpus.__save_corpus(self.fcorpustxt(), corpus) + corpora.BleiCorpus._save_corpus(self.fcorpustxt(), corpus) with utils.smart_open(self.ftimeslices(), 'wb') as fout: fout.write(utils.to_utf8(str(len(self.time_slices)) + "\n")) diff --git a/gensim/test/test_miislita.py b/gensim/test/test_miislita.py index e126d2ccb8..5c7ec48916 100644 --- a/gensim/test/test_miislita.py +++ b/gensim/test/test_miislita.py @@ -64,7 +64,7 @@ def test_textcorpus(self): # make sure serializing works ftmp = get_tmpfile('test_textcorpus.mm') - corpora.MmCorpus.__save_corpus(ftmp, miislita) + corpora.MmCorpus._save_corpus(ftmp, miislita) self.assertTrue(os.path.exists(ftmp)) # make sure deserializing gives the same result From 78e207df79d7f0604a7e505caf70c697a6c43f26 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Tue, 24 Oct 2017 15:12:36 +0500 Subject: [PATCH 07/48] Document bleicorpus in Numpy style --- gensim/corpora/bleicorpus.py | 81 +++++++++++++++++++++++------------- 1 file changed, 52 insertions(+), 29 deletions(-) diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index d5eb0da8be..f5f98c21b2 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -5,9 +5,7 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Blei's LDA-C format. -""" +"""Blei's LDA-C format.""" from __future__ import with_statement @@ -41,11 +39,18 @@ def __init__(self, fname, fname_vocab=None): """ Initialize the corpus from a file. - Args: - fname (str): serialized corpus's filename - fname_vocab (:obj:`str`, optional): vocabulary file; takes precedence over fname.vocab - Raises: - IOError: If vocabulary file doesn't exist + Parameters + ---------- + fname : str + Serialized corpus's filename + fname_vocab : str or None, optional + Vocabulary file; takes precedence over + + Raises + ------ + IOError + If vocabulary file doesn't exist + """ IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s", fname) @@ -70,9 +75,7 @@ def __init__(self, fname, fname_vocab=None): self.id2word = dict(enumerate(words)) def __iter__(self): - """ - Iterate over the corpus, returning one sparse vector at a time. 
- """ + """Iterate over the corpus, returning one sparse vector at a time.""" lineno = -1 with utils.smart_open(self.fname) as fin: for lineno, line in enumerate(fin): @@ -81,12 +84,20 @@ def __iter__(self): def line2doc(self, line): """ - Args: - line (str): document's string representation - Returns: - :obj:`list` of (:obj:`int`, :obj:`float`): - document's list representation - Raises: + Convert line to document. + + Parameters + ---------- + line : str + Document's string representation + + Returns + ------- + list of (int, float) + document's list representation + + Raises + ------ ValueError: If format is invalid """ parts = utils.to_unicode(line).split() @@ -104,14 +115,21 @@ def _save_corpus(fname, corpus, id2word=None, metadata=False): There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file. - Args: - fname (str): filename - corpus : yields documents - id2word (:obj:`dict` of (:obj:`str`, :obj:`str`), optional): - transforms id to word - metadata (bool): any additional info - Returns: - :obj:`list` of :obj:`int`: fields' offsets + Parameters + ---------- + fname : str + Filename + corpus : iterable + Iterable of documents + id2word : dict of (str, str), optional + Transforms id to word + metadata : bool + Any additional info + + Returns + ------- + list of int + Fields' offsets """ if id2word is None: logger.info("no word id mapping provided; initializing from corpus") @@ -142,10 +160,15 @@ def docbyoffset(self, offset): """ Return document corresponding to `offset`. - Args: - offset (int): position of the document in the file - Returns: - :obj:`list` of (:obj:`int`, :obj:`float`): document's list representation + Parameters + ---------- + offset : int + Position of the document in the file + + Returns + ------- + list of (int, float) + Document's list representation """ with utils.smart_open(self.fname) as f: f.seek(offset) From 7519382c3c193341d96f28508bde050c4a750b3a Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Tue, 24 Oct 2017 16:45:50 +0500 Subject: [PATCH 08/48] Document indexedcorpus --- gensim/corpora/indexedcorpus.py | 84 +++++++++++++++++++++++---------- 1 file changed, 59 insertions(+), 25 deletions(-) diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py index ec79d152dc..d2d78bc89b 100644 --- a/gensim/corpora/indexedcorpus.py +++ b/gensim/corpora/indexedcorpus.py @@ -6,15 +6,7 @@ """ -Indexed corpus is a mechanism for random-accessing corpora. - -While the standard corpus interface in gensim allows iterating over corpus with -`for doc in corpus: pass`, indexed corpus allows accessing the documents with -`corpus[docno]` (in O(1) look-up time). - -This functionality is achieved by storing an extra file (by default named the same -as the corpus file plus '.index' suffix) that stores the byte offset of the beginning -of each document. +Base Indexed Corpus class """ import logging @@ -30,11 +22,25 @@ class IndexedCorpus(interfaces.CorpusABC): def __init__(self, fname, index_fname=None): """ - Initialize this abstract base class, by loading a previously saved index - from `index_fname` (or `fname.index` if `index_fname` is not set). - This index will allow subclasses to support the `corpus[docno]` syntax - (random access to document #`docno` in O(1)). + Indexed corpus is a mechanism for random-accessing corpora. 
+ + While the standard corpus interface in gensim allows iterating over + corpus with `for doc in corpus: pass`, indexed corpus allows accessing + the documents with `corpus[docno]` (in O(1) look-up time). + + This functionality is achieved by storing an extra file (by default + named the same as the '{corpus name}.index') that stores the byte + offset of the beginning of each document. + + Parameters + ---------- + fname : string + Corpus filename + index_fname : string or None + Index filename, or None for loading `fname`.index + Examples + -------- >>> # save corpus in SvmLightCorpus format with an index >>> corpus = [[(1, 0.5)], [(0, 1.0), (1, 2.0)]] >>> gensim.corpora.SvmLightCorpus.serialize('testfile.svmlight', corpus) @@ -56,23 +62,49 @@ def __init__(self, fname, index_fname=None): self.length = None @classmethod - def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False): + def serialize( + serializer, + fname, + corpus, + id2word=None, + index_fname=None, + progress_cnt=None, + labels=None, + metadata=False + ): """ - Iterate through the document stream `corpus`, saving the documents to `fname` - and recording byte offset of each document. Save the resulting index - structure to file `index_fname` (or `fname`.index is not set). + Iterate through the document stream `corpus`, saving the documents to + `fname` and recording byte offset of each document. + + Save the resulting index structure to file `index_fname` (or + `fname`.index is not set). This relies on the underlying corpus class `serializer` providing (in addition to standard iteration): * `save_corpus` method that returns a sequence of byte offsets, one for - each saved document, + each saved document * the `docbyoffset(offset)` method, which returns a document - positioned at `offset` bytes within the persistent storage (file). - * metadata if set to true will ensure that serialize will write out article titles to a pickle file. - - Example: - + positioned at `offset` bytes within the persistent storage (file) + * metadata if set to true will ensure that serialize will write out + article titles to a pickle file. + + Parameters + ---------- + fname : str + Filename + corpus : iterable + Iterable of documents + id2word : dict of (str, str), optional + Transforms id to word + index_fname : str + progress_cnt : int + labels : + metadata : bool + Any additional info + + Examples + -------- >>> MmCorpus.serialize('test.mm', corpus) >>> mm = MmCorpus('test.mm') # `mm` document stream now has random access >>> print(mm[42]) # retrieve document no. 42, etc. @@ -107,8 +139,10 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres def __len__(self): """ - Return the index length if the corpus is indexed. Otherwise, make a pass - over self to calculate the corpus length and cache this number. + Return the index length. + + If the corpus is not indexed, also count corpus length and cache this + value. 
""" if self.index is not None: return len(self.index) From ae698671691ca5eeea537af29da738c72eb3b623 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Fri, 3 Nov 2017 19:18:56 +0500 Subject: [PATCH 09/48] Annotate csvcorpus --- gensim/corpora/csvcorpus.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/gensim/corpora/csvcorpus.py b/gensim/corpora/csvcorpus.py index 969437e571..76d11e55b9 100644 --- a/gensim/corpora/csvcorpus.py +++ b/gensim/corpora/csvcorpus.py @@ -4,10 +4,7 @@ # Copyright (C) 2013 Zygmunt Zając # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Corpus in CSV format. - -""" +"""Corpus in CSV format.""" from __future__ import with_statement @@ -22,18 +19,24 @@ class CsvCorpus(interfaces.CorpusABC): - """ - Corpus in CSV format. The CSV delimiter, headers etc. are guessed automatically - based on the file content. - + """Corpus in CSV format. + + The CSV delimiter, headers etc. are guessed automatically based on the + file content. + All row values are expected to be ints/floats. """ def __init__(self, fname, labels): - """ - Initialize the corpus from a file. - `labels` = are class labels present in the input file? => skip the first column + """Initialize the corpus from a file. + + Parameters + ---------- + fname : str + Filename + labels : bool + Whether to skip the first column """ logger.info("loading corpus from %s", fname) @@ -48,10 +51,7 @@ def __init__(self, fname, labels): logger.info("sniffed CSV delimiter=%r, headers=%s", self.dialect.delimiter, self.headers) def __iter__(self): - """ - Iterate over the corpus, returning one sparse vector at a time. - - """ + """Iterate over the corpus, returning one sparse vector at a time.""" reader = csv.reader(utils.smart_open(self.fname), self.dialect) if self.headers: next(reader) # skip the headers From c2765ed5958d9eb6078b54afa569faac2c3ba189 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Fri, 3 Nov 2017 19:22:13 +0500 Subject: [PATCH 10/48] Add "Yields" section --- gensim/corpora/csvcorpus.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/gensim/corpora/csvcorpus.py b/gensim/corpora/csvcorpus.py index 76d11e55b9..dbee31dc30 100644 --- a/gensim/corpora/csvcorpus.py +++ b/gensim/corpora/csvcorpus.py @@ -51,7 +51,13 @@ def __init__(self, fname, labels): logger.info("sniffed CSV delimiter=%r, headers=%s", self.dialect.delimiter, self.headers) def __iter__(self): - """Iterate over the corpus, returning one sparse vector at a time.""" + """Iterate over the corpus, returning one sparse vector at a time. + + Yields + ------ + list of (int, float) + + """ reader = csv.reader(utils.smart_open(self.fname), self.dialect) if self.headers: next(reader) # skip the headers From 40add2186f0c34b6cfe19980c4110c114e8575d2 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Fri, 3 Nov 2017 19:28:03 +0500 Subject: [PATCH 11/48] Make `_save_corpus` public --- gensim/corpora/bleicorpus.py | 42 ++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index f5f98c21b2..8816de7023 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -21,18 +21,18 @@ class BleiCorpus(IndexedCorpus): - """ - Corpus in Blei's LDA-C format. - + """Corpus in Blei's LDA-C format. + The corpus is represented as two files: one describing the documents, and another describing the mapping between words and their ids. 
- + Each document is one line:: - + N fieldId1:fieldValue1 fieldId2:fieldValue2 ... fieldIdN:fieldValueN - + The vocabulary is a file with words, one word per line; word at line K has an implicit ``id=K``. + """ def __init__(self, fname, fname_vocab=None): @@ -83,8 +83,7 @@ def __iter__(self): self.length = lineno + 1 def line2doc(self, line): - """ - Convert line to document. + """Convert line to document. Parameters ---------- @@ -95,10 +94,7 @@ def line2doc(self, line): ------- list of (int, float) document's list representation - - Raises - ------ - ValueError: If format is invalid + """ parts = utils.to_unicode(line).split() if int(parts[0]) != len(parts) - 1: @@ -108,10 +104,9 @@ def line2doc(self, line): return doc @staticmethod - def _save_corpus(fname, corpus, id2word=None, metadata=False): - """ - Save a corpus in the LDA-C format. - + def save_corpus(fname, corpus, id2word=None, metadata=False): + """Save a corpus in the LDA-C format. + There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file. @@ -122,14 +117,14 @@ def _save_corpus(fname, corpus, id2word=None, metadata=False): corpus : iterable Iterable of documents id2word : dict of (str, str), optional - Transforms id to word + Transforms id to word (Default value = None) metadata : bool - Any additional info + Any additional info (Default value = False) Returns ------- - list of int - Fields' offsets + + """ if id2word is None: logger.info("no word id mapping provided; initializing from corpus") @@ -157,8 +152,7 @@ def _save_corpus(fname, corpus, id2word=None, metadata=False): return offsets def docbyoffset(self, offset): - """ - Return document corresponding to `offset`. + """Return document corresponding to `offset`. Parameters ---------- @@ -167,8 +161,8 @@ def docbyoffset(self, offset): Returns ------- - list of (int, float) - Document's list representation + list of (int, float) + """ with utils.smart_open(self.fname) as f: f.seek(offset) From e044c3a42464817fc9bfdbfdc5f13890efa1fe4c Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Fri, 3 Nov 2017 19:38:56 +0500 Subject: [PATCH 12/48] Annotate bleicorpus --- gensim/corpora/bleicorpus.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index 8816de7023..a5c5a83753 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -94,7 +94,7 @@ def line2doc(self, line): ------- list of (int, float) document's list representation - + """ parts = utils.to_unicode(line).split() if int(parts[0]) != len(parts) - 1: @@ -106,7 +106,7 @@ def line2doc(self, line): @staticmethod def save_corpus(fname, corpus, id2word=None, metadata=False): """Save a corpus in the LDA-C format. - + There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file. 
@@ -123,8 +123,8 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): Returns ------- - - + list of int + """ if id2word is None: logger.info("no word id mapping provided; initializing from corpus") From 123327d7d11dd6f938ce760e32e9e83a2f0f3cf0 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Fri, 3 Nov 2017 20:00:34 +0500 Subject: [PATCH 13/48] Fix indentation in bleicorpus --- gensim/corpora/bleicorpus.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index a5c5a83753..fb5a735228 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -123,7 +123,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): Returns ------- - list of int + list of int """ if id2word is None: @@ -161,7 +161,7 @@ def docbyoffset(self, offset): Returns ------- - list of (int, float) + list of (int, float) """ with utils.smart_open(self.fname) as f: From 2382d018f80d816c16953ce28305791a3f99a0f0 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Tue, 21 Nov 2017 11:00:39 +0500 Subject: [PATCH 14/48] `_save_corpus` -> `save_corpus` --- gensim/corpora/lowcorpus.py | 2 +- gensim/corpora/malletcorpus.py | 2 +- gensim/corpora/mmcorpus.py | 36 +++++++++++++++++++++--------- gensim/corpora/sharded_corpus.py | 4 ++-- gensim/corpora/svmlightcorpus.py | 2 +- gensim/corpora/ucicorpus.py | 2 +- gensim/interfaces.py | 4 ++-- gensim/models/wrappers/dtmmodel.py | 2 +- gensim/test/test_corpora.py | 4 ++-- gensim/test/test_miislita.py | 2 +- 10 files changed, 38 insertions(+), 22 deletions(-) diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py index 53ae8e21c4..d5265f6571 100644 --- a/gensim/corpora/lowcorpus.py +++ b/gensim/corpora/lowcorpus.py @@ -140,7 +140,7 @@ def __iter__(self): yield self.line2doc(line) @staticmethod - def _save_corpus(fname, corpus, id2word=None, metadata=False): + def save_corpus(fname, corpus, id2word=None, metadata=False): """ Save a corpus in the List-of-words format. diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py index 90de7a3c76..cacf0074bd 100644 --- a/gensim/corpora/malletcorpus.py +++ b/gensim/corpora/malletcorpus.py @@ -67,7 +67,7 @@ def line2doc(self, line): return doc @staticmethod - def _save_corpus(fname, corpus, id2word=None, metadata=False): + def save_corpus(fname, corpus, id2word=None, metadata=False): """ Save a corpus in the Mallet format. diff --git a/gensim/corpora/mmcorpus.py b/gensim/corpora/mmcorpus.py index 0380e09066..e8133d6b59 100644 --- a/gensim/corpora/mmcorpus.py +++ b/gensim/corpora/mmcorpus.py @@ -20,9 +20,7 @@ class MmCorpus(matutils.MmReader, IndexedCorpus): - """ - Corpus in the Matrix Market format. - """ + """Corpus in the Matrix Market format.""" def __init__(self, fname): # avoid calling super(), too confusing @@ -30,20 +28,38 @@ def __init__(self, fname): matutils.MmReader.__init__(self, fname) def __iter__(self): - """ - Interpret a matrix in Matrix Market format as a streamed gensim corpus - (yielding one document at a time). + """Iterate over the corpus. + + Yields + ------ + document : """ for doc_id, doc in super(MmCorpus, self).__iter__(): yield doc # get rid of doc id, return the sparse vector only @staticmethod - def _save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): - """ - Save a corpus in the Matrix Market format to disk. 
- + def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): + """Save a corpus in the Matrix Market format to disk. + This function is automatically called by `MmCorpus.serialize`; don't call it directly, call `serialize` instead. + + Parameters + ---------- + fname : + + corpus : + + id2word : + (Default value = None) + progress_cnt : + (Default value = 1000) + metadata : + (Default value = False) + + Returns + ------- + """ logger.info("storing corpus in Matrix Market format to %s", fname) num_terms = len(id2word) if id2word is not None else None diff --git a/gensim/corpora/sharded_corpus.py b/gensim/corpora/sharded_corpus.py index 412f9f4b1d..4d0fde4999 100644 --- a/gensim/corpora/sharded_corpus.py +++ b/gensim/corpora/sharded_corpus.py @@ -764,7 +764,7 @@ def load(cls, fname, mmap=None): return super(ShardedCorpus, cls).load(fname, mmap) @staticmethod - def _save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs): + def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs): """ Implement a serialization interface. Do not call directly; use the `serialize` method instead. @@ -799,4 +799,4 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres Ignore the parameters id2word, index_fname, progress_cnt, labels and metadata. They currently do nothing and are here only to provide a compatible method signature with superclass.""" - serializer._save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs) + serializer.save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs) diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py index cfb821e085..290414836e 100644 --- a/gensim/corpora/svmlightcorpus.py +++ b/gensim/corpora/svmlightcorpus.py @@ -79,7 +79,7 @@ def __iter__(self): self.length = lineno + 1 @staticmethod - def _save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): + def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): """ Save a corpus in the SVMlight format. diff --git a/gensim/corpora/ucicorpus.py b/gensim/corpora/ucicorpus.py index 84efcb406a..a8911ee07f 100644 --- a/gensim/corpora/ucicorpus.py +++ b/gensim/corpora/ucicorpus.py @@ -192,7 +192,7 @@ def create_dictionary(self): return dictionary @staticmethod - def _save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False): + def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False): """ Save a corpus in the UCI Bag-of-Words format. diff --git a/gensim/interfaces.py b/gensim/interfaces.py index adcab45b1d..51c4bc102b 100644 --- a/gensim/interfaces.py +++ b/gensim/interfaces.py @@ -71,14 +71,14 @@ def __len__(self): # return sum(1 for doc in self) # sum(empty generator) == 0, so this works even for an empty corpus @staticmethod - def _save_corpus(fname, corpus, id2word=None, metadata=False): + def save_corpus(fname, corpus, id2word=None, metadata=False): """ Save an existing `corpus` to disk. Some formats also support saving the dictionary (`feature_id->word` mapping), which can in this case be provided by the optional `id2word` parameter. 
- >>> MmCorpus._save_corpus('file.mm', corpus) + >>> MmCorpus.save_corpus('file.mm', corpus) Some corpora also support an index of where each document begins, so that the documents on disk can be accessed in O(1) time (see the diff --git a/gensim/models/wrappers/dtmmodel.py b/gensim/models/wrappers/dtmmodel.py index 68e4f62e13..1f450a457a 100644 --- a/gensim/models/wrappers/dtmmodel.py +++ b/gensim/models/wrappers/dtmmodel.py @@ -175,7 +175,7 @@ def convert_input(self, corpus, time_slices): """ logger.info("serializing temporary corpus to %s", self.fcorpustxt()) # write out the corpus in a file format that DTM understands: - corpora.BleiCorpus._save_corpus(self.fcorpustxt(), corpus) + corpora.BleiCorpus.save_corpus(self.fcorpustxt(), corpus) with utils.smart_open(self.ftimeslices(), 'wb') as fout: fout.write(utils.to_utf8(str(len(self.time_slices)) + "\n")) diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py index 6b648c9550..3f7f4e8149 100644 --- a/gensim/test/test_corpora.py +++ b/gensim/test/test_corpora.py @@ -109,7 +109,7 @@ def test_save(self): corpus = self.TEST_CORPUS # make sure the corpus can be saved - self.corpus_class._save_corpus(testfile(), corpus) + self.corpus_class.save_corpus(testfile(), corpus) # and loaded back, resulting in exactly the same corpus corpus2 = list(self.corpus_class(testfile())) @@ -253,7 +253,7 @@ def setUp(self): def test_save_format_for_dtm(self): corpus = [[(1, 1.0)], [], [(0, 5.0), (2, 1.0)], []] test_file = testfile() - self.corpus_class._save_corpus(test_file, corpus) + self.corpus_class.save_corpus(test_file, corpus) with open(test_file) as f: for line in f: # unique_word_count index1:count1 index2:count2 ... indexn:counnt diff --git a/gensim/test/test_miislita.py b/gensim/test/test_miislita.py index 5c7ec48916..dd660f629f 100644 --- a/gensim/test/test_miislita.py +++ b/gensim/test/test_miislita.py @@ -64,7 +64,7 @@ def test_textcorpus(self): # make sure serializing works ftmp = get_tmpfile('test_textcorpus.mm') - corpora.MmCorpus._save_corpus(ftmp, miislita) + corpora.MmCorpus.save_corpus(ftmp, miislita) self.assertTrue(os.path.exists(ftmp)) # make sure deserializing gives the same result From 42409bfa5b503743b3bc80b5f0bd8b96998050b4 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Tue, 21 Nov 2017 11:04:13 +0500 Subject: [PATCH 15/48] Annotate bleicorpus --- gensim/corpora/bleicorpus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index fb5a735228..6d2ee86a73 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -106,7 +106,7 @@ def line2doc(self, line): @staticmethod def save_corpus(fname, corpus, id2word=None, metadata=False): """Save a corpus in the LDA-C format. - + There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file. From 7cb5bbf8c7b65cf5030c7b349792cdd0ddc528f7 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Tue, 21 Nov 2017 11:17:24 +0500 Subject: [PATCH 16/48] Convert dictionary docs to numpy style --- gensim/corpora/dictionary.py | 248 ++++++++++++++++++++++++++--------- 1 file changed, 188 insertions(+), 60 deletions(-) diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py index 08c4097f03..716e891c05 100644 --- a/gensim/corpora/dictionary.py +++ b/gensim/corpora/dictionary.py @@ -6,13 +6,13 @@ """ -This module implements the concept of Dictionary -- a mapping between words and -their integer ids. 
+General mapping between normalized words and their ids. Dictionaries can be created from a corpus and can later be pruned according to -document frequency (removing (un)common words via the :func:`Dictionary.filter_extremes` method), -save/loaded from disk (via :func:`Dictionary.save` and :func:`Dictionary.load` methods), merged -with other dictionary (:func:`Dictionary.merge_with`) etc. +document frequency (removing (un)common words via the +:func:`Dictionary.filter_extremes` method), save/loaded from disk (via +:func:`Dictionary.save` and :func:`Dictionary.load` methods), merged with +other dictionary (:func:`Dictionary.merge_with`) etc. """ from __future__ import with_statement @@ -36,15 +36,23 @@ class Dictionary(utils.SaveLoad, Mapping): - """ - Dictionary encapsulates the mapping between normalized words and their integer ids. - + """Mapping between normalized words and their ids. + The main function is `doc2bow`, which converts a collection of words to its bag-of-words representation: a list of (word_id, word_frequency) 2-tuples. + """ + def __init__(self, documents=None, prune_at=2000000): - """ - If `documents` are given, use them to initialize Dictionary (see `add_documents()`). + """Initialize the dictionary. + + Parameters + ---------- + documents : (iterable of (list of str)) or None + If not None, used to initialize dictionary + prune_at : int + Number of unique words to keep + """ self.token2id = {} # token -> tokenId self.id2token = {} # reverse mapping for token2id; only formed on request, to save memory @@ -58,6 +66,17 @@ def __init__(self, documents=None, prune_at=2000000): self.add_documents(documents, prune_at=prune_at) def __getitem__(self, tokenid): + """Return token. + + If :param:``token2id`` has changed (presumably via + :func:``add_documents``), + update :param:``id2token``. + + Returns + ------- + str + + """ if len(self.id2token) != len(self.token2id): # the word->id mapping has changed (presumably via add_documents); # recompute id->word accordingly @@ -65,6 +84,13 @@ def __getitem__(self, tokenid): return self.id2token[tokenid] # will throw for non-existent ids def __iter__(self): + """Iterate through keys. + + Returns + ------- + iterable of str + + """ return iter(self.keys()) if PY3: @@ -72,9 +98,11 @@ def __iter__(self): iterkeys = __iter__ def iteritems(self): + """Iterate through items.""" return self.items() def itervalues(self): + """Iterate through values.""" return self.values() def keys(self): @@ -82,31 +110,65 @@ def keys(self): return list(self.token2id.values()) def __len__(self): - """ - Return the number of token->id mappings in the dictionary. + """Return the number of token->id mappings in the dictionary. + + Returns + ------- + int + """ return len(self.token2id) def __str__(self): + """Return string representation. + + Returns + ------- + str + + """ some_keys = list(itertools.islice(iterkeys(self.token2id), 5)) return "Dictionary(%i unique tokens: %s%s)" % (len(self), some_keys, '...' if len(self) > 5 else '') @staticmethod def from_documents(documents): - return Dictionary(documents=documents) + """Build Dictionary from documents set. - def add_documents(self, documents, prune_at=2000000): + Parameters + ---------- + documents : iterable of (list of str) + + + Returns + ------- + :class:`Dictionary` + """ - Update dictionary from a collection of documents. Each document is a list - of tokens = **tokenized and normalized** strings (either utf8 or unicode). 
+ return Dictionary(documents=documents) + def add_documents(self, documents, prune_at=2000000): + u""" + Update dictionary from a collection of documents. + + Each document is a list of tokens = **tokenized and normalized** + strings (either utf8 or unicode). + This is a convenience wrapper for calling `doc2bow` on each document with `allow_update=True`, which also prunes infrequent words, keeping the total number of unique words <= `prune_at`. This is to save memory on very large inputs. To disable this pruning, set `prune_at=None`. + Parameters + ---------- + documents : iterable of str + prune_at : int + Number of unique words to keep + + Examples + -------- >>> print(Dictionary(["máma mele maso".split(), "ema má máma".split()])) Dictionary(5 unique tokens) + """ for docno, document in enumerate(documents): # log progress & run a regular check for pruning, once every 10k docs @@ -124,19 +186,27 @@ def add_documents(self, documents, prune_at=2000000): ) def doc2bow(self, document, allow_update=False, return_missing=False): - """ - Convert `document` (a list of words) into the bag-of-words format = list - of `(token_id, token_count)` 2-tuples. Each word is assumed to be a - **tokenized and normalized** string (either unicode or utf8-encoded). No further preprocessing - is done on the words in `document`; apply tokenization, stemming etc. before + """Convert document to the bag-of-words format. + + Each word is assumed to be a **tokenized and normalized** string ( + either unicode or utf8-encoded). No further preprocessing is done on + the words in `document`; apply tokenization, stemming etc. before calling this method. - If `allow_update` is set, then also update dictionary in the process: create - ids for new words. At the same time, update document frequencies -- for - each word appearing in this document, increase its document frequency (`self.dfs`) - by one. + Parameters + ---------- + document : list of str + + allow_update : bool + Whether to update the dictionary in the process + (Default value = False) + return_missing : + (Default value = False) + + Returns + ------- + dict of (int, int) - If `allow_update` is **not** set, this function is `const`, aka read-only. """ if isinstance(document, string_types): raise TypeError("doc2bow expects an array of unicode tokens on input, not a single string") @@ -174,9 +244,8 @@ def doc2bow(self, document, allow_update=False, return_missing=False): return result def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None): - """ - Filter out tokens that appear in - + """Filter out tokens that appear in + 1. less than `no_below` documents (absolute number) or 2. more than `no_above` documents (fraction of total corpus size, *not* absolute number). @@ -184,11 +253,23 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=N the `no_below` and `no_above` settings 4. after (1), (2) and (3), keep only the first `keep_n` most frequent tokens (or keep all if `None`). - + After the pruning, shrink resulting gaps in word ids. - + **Note**: Due to the gap shrinking, the same word may have a different word id before and after the call to this function! 
+ + Parameters + ---------- + no_below : + (Default value = 5) + no_above : + (Default value = 0.5) + keep_n : + (Default value = 100000) + keep_tokens : + (Default value = None) + """ no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold @@ -219,13 +300,18 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=N logger.info("resulting dictionary: %s", self) def filter_n_most_frequent(self, remove_n): - """ - Filter out the 'remove_n' most frequent tokens that appear in the documents. - + """Filter out the 'remove_n' most frequent tokens that appear in the + documents. + After the pruning, shrink resulting gaps in word ids. - + **Note**: Due to the gap shrinking, the same word may have a different word id before and after the call to this function! + + Parameters + ---------- + remove_n : + """ # determine which tokens to keep most_frequent_ids = (v for v in itervalues(self.token2id)) @@ -239,11 +325,18 @@ def filter_n_most_frequent(self, remove_n): logger.info("resulting dictionary: %s", self) def filter_tokens(self, bad_ids=None, good_ids=None): - """ - Remove the selected `bad_ids` tokens from all dictionary mappings, or, keep - selected `good_ids` in the mapping and remove the rest. - + """Remove the selected `bad_ids` tokens from all dictionary mappings, + or keep selected `good_ids` in the mapping and remove the rest. + `bad_ids` and `good_ids` are collections of word ids to be removed. + + Parameters + ---------- + bad_ids : + (Default value = None) + good_ids : + (Default value = None) + """ if bad_ids is not None: bad_ids = set(bad_ids) @@ -256,12 +349,15 @@ def filter_tokens(self, bad_ids=None, good_ids=None): self.compactify() def compactify(self): - """ - Assign new word ids to all words. - + """Assign new word ids to all words. + This is done to make the ids more compact, e.g. after some tokens have been removed via :func:`filter_tokens` and there are gaps in the id series. Calling this method will remove the gaps. + + Parameters + ---------- + """ logger.debug("rebuilding dictionary, shrinking gaps") @@ -274,14 +370,21 @@ def compactify(self): self.dfs = {idmap[tokenid]: freq for tokenid, freq in iteritems(self.dfs)} def save_as_text(self, fname, sort_by_word=True): - """ - Save this Dictionary to a text file, in format: + """Save this Dictionary to a text file, in format: `num_docs` `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`. Sorted by word, or by decreasing word frequency. - + Note: text format should be use for corpus inspection. Use `save`/`load` to store in binary format (pickle) for improved performance. + + Parameters + ---------- + fname : + + sort_by_word : + (Default value = True) + """ logger.info("saving dictionary mapping to %s", fname) with utils.smart_open(fname, 'wb') as fout: @@ -297,19 +400,26 @@ def save_as_text(self, fname, sort_by_word=True): fout.write(utils.to_utf8(line)) def merge_with(self, other): - """ - Merge another dictionary into this dictionary, mapping same tokens to the + """Merge another dictionary into this dictionary, mapping same tokens to the same ids and new tokens to new ids. The purpose is to merge two corpora created using two different dictionaries, one from `self` and one from `other`. - + `other` can be any id=>word mapping (a dict, a Dictionary object, ...). 
- + Return a transformation object which, when accessed as `result[doc_from_other_corpus]`, will convert documents from a corpus built using the `other` dictionary into a document using the new, merged dictionary (see :class:`gensim.interfaces.TransformationABC`). - Example: + Parameters + ---------- + other : + + + Returns + ------- + Examples + -------- >>> dict1 = Dictionary(some_documents) >>> dict2 = Dictionary(other_documents) # ids not compatible with dict1! >>> dict2_to_dict1 = dict1.merge_with(dict2) @@ -343,9 +453,17 @@ def merge_with(self, other): @staticmethod def load_from_text(fname): - """ - Load a previously stored Dictionary from a text file. + """Load a previously stored Dictionary from a text file. Mirror function to `save_as_text`. + + Parameters + ---------- + fname : + + + Returns + ------- + """ result = Dictionary() with utils.smart_open(fname) as f: @@ -372,18 +490,28 @@ def load_from_text(fname): @staticmethod def from_corpus(corpus, id2word=None): - """ - Create Dictionary from an existing corpus. This can be useful if you only - have a term-document BOW matrix (represented by `corpus`), but not the - original text corpus. - + """Create Dictionary from an existing corpus. This can be useful if you + only have a term-document BOW matrix (represented by `corpus`), + but not the original text corpus. + This will scan the term-document count matrix for all word ids that appear in it, then construct and return Dictionary which maps each `word_id -> id2word[word_id]`. + + `id2word` is an optional dictionary that maps the `word_id` to a + token. In case `id2word` isn't specified the mapping `id2word[ + word_id] = str(word_id)` will be used. + + Parameters + ---------- + corpus : + + id2word : + (Default value = None) + + Returns + ------- - `id2word` is an optional dictionary that maps the `word_id` to a token. In - case `id2word` isn't specified the mapping `id2word[word_id] = str(word_id)` - will be used. """ result = Dictionary() From 56f19e6c78a1bf8a23575822c99aafa3424879b4 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Tue, 21 Nov 2017 11:22:08 +0500 Subject: [PATCH 17/48] Convert hashdictionary docs to numpy style --- gensim/corpora/hashdictionary.py | 90 ++++++++++++++++++++++++-------- 1 file changed, 69 insertions(+), 21 deletions(-) diff --git a/gensim/corpora/hashdictionary.py b/gensim/corpora/hashdictionary.py index 687ec241ac..a4c1304795 100644 --- a/gensim/corpora/hashdictionary.py +++ b/gensim/corpora/hashdictionary.py @@ -35,14 +35,13 @@ class HashDictionary(utils.SaveLoad, dict): - """ - HashDictionary encapsulates the mapping between normalized words and their + """HashDictionary encapsulates the mapping between normalized words and their integer ids. - + Unlike `Dictionary`, building a `HashDictionary` before using it is not a necessary step. The documents can be computed immediately, from an uninitialized `HashDictionary`, without seeing the rest of the corpus first. - + The main function is `doc2bow`, which converts a collection of words to its bag-of-words representation: a list of (word_id, word_frequency) 2-tuples. @@ -80,9 +79,17 @@ def __getitem__(self, tokenid): return self.id2token.get(tokenid, set()) def restricted_hash(self, token): - """ - Calculate id of the given token. Also keep track of what words were mapped + """Calculate id of the given token. Also keep track of what words were mapped to what ids, for debugging reasons. 
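
A rough illustration of the hashing trick `restricted_hash` and `doc2bow` implement here; the `id_range` value is illustrative:

    from gensim.corpora import HashDictionary

    # no corpus pass needed up front: ids come straight from hashing the token
    hd = HashDictionary(id_range=32000, debug=True)
    bow = hd.doc2bow("human computer interaction".split(), allow_update=True)

    tid = hd.restricted_hash("computer")   # myhash(utf8("computer")) % id_range
    print(hd[tid])                         # with debug=True: the set of words mapped to this id
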
+ + Parameters + ---------- + token : + + + Returns + ------- + """ h = self.myhash(utils.to_utf8(token)) % self.id_range if self.debug: @@ -97,7 +104,7 @@ def __len__(self): return self.id_range def keys(self): - """Return a list of all token ids.""" + """ """ return range(len(self)) def __str__(self): @@ -105,15 +112,32 @@ def __str__(self): @staticmethod def from_documents(*args, **kwargs): + """ + + Parameters + ---------- + *args : + + **kwargs : + + + Returns + ------- + + """ return HashDictionary(*args, **kwargs) def add_documents(self, documents): - """ - Build dictionary from a collection of documents. Each document is a list + """Build dictionary from a collection of documents. Each document is a list of tokens = **tokenized and normalized** utf-8 encoded strings. - + This is only a convenience wrapper for calling `doc2bow` on each document with `allow_update=True`. + + Parameters + ---------- + documents : + """ for docno, document in enumerate(documents): if docno % 10000 == 0: @@ -125,18 +149,29 @@ def add_documents(self, documents): ) def doc2bow(self, document, allow_update=False, return_missing=False): - """ - Convert `document` (a list of words) into the bag-of-words format = list + """Convert `document` (a list of words) into the bag-of-words format = list of `(token_id, token_count)` 2-tuples. Each word is assumed to be a **tokenized and normalized** utf-8 encoded string. No further preprocessing is done on the words in `document`; apply tokenization, stemming etc. before calling this method. - + If `allow_update` or `self.allow_update` is set, then also update dictionary in the process: update overall corpus statistics and document frequencies. For each id appearing in this document, increase its document frequency (`self.dfs`) by one. + Parameters + ---------- + document : + + allow_update : + (Default value = False) + return_missing : + (Default value = False) + + Returns + ------- + """ result = {} missing = {} @@ -167,19 +202,28 @@ def doc2bow(self, document, allow_update=False, return_missing=False): return result def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000): - """ - Remove document frequency statistics for tokens that appear in - + """Remove document frequency statistics for tokens that appear in + 1. less than `no_below` documents (absolute number) or 2. more than `no_above` documents (fraction of total corpus size, *not* absolute number). 3. after (1) and (2), keep only the first `keep_n` most frequent tokens (or keep all if `None`). - + **Note:** since HashDictionary's id range is fixed and doesn't depend on the number of tokens seen, this doesn't really "remove" anything. It only clears some supplementary statistics, for easier debugging and a smaller RAM footprint. + + Parameters + ---------- + no_below : + (Default value = 5) + no_above : + (Default value = 0.5) + keep_n : + (Default value = 100000) + """ no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold ok = [item for item in iteritems(self.dfs_debug) if no_below <= item[1] <= no_above_abs] @@ -200,13 +244,17 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000): ) def save_as_text(self, fname): - """ - Save this HashDictionary to a text file, for easier debugging. - + """Save this HashDictionary to a text file, for easier debugging. + The format is: `id[TAB]document frequency of this id[TAB]tab-separated set of words in UTF8 that map to this id[NEWLINE]`. 
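
The `save_as_text` output described below is a debugging aid only; a hedged sketch of it together with `filter_extremes` (path and thresholds illustrative):

    from gensim.corpora import HashDictionary

    hd = HashDictionary(debug=True)
    hd.add_documents([["cat", "meows"], ["dog", "barks", "dog"]])

    # only clears supplementary df statistics; the hashed ids themselves never change
    hd.filter_extremes(no_below=1, no_above=0.9, keep_n=100)

    # one "id<TAB>document frequency<TAB>words mapping to this id" line per hash bucket
    hd.save_as_text('/tmp/hash_dict_debug.txt')
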
- + Note: use `save`/`load` to store in binary format instead (pickle). + + Parameters + ---------- + fname : + """ logger.info("saving HashDictionary mapping to %s" % fname) with utils.smart_open(fname, 'wb') as fout: From 9162a7e2f6554cf481cc521177d4e0427cf062cd Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Tue, 21 Nov 2017 11:30:40 +0500 Subject: [PATCH 18/48] Convert indexedcorpus docs to numpy style --- gensim/corpora/indexedcorpus.py | 58 ++++++++++++++++----------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py index d2d78bc89b..234b6dd547 100644 --- a/gensim/corpora/indexedcorpus.py +++ b/gensim/corpora/indexedcorpus.py @@ -5,9 +5,7 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Base Indexed Corpus class -""" +"""Base Indexed Corpus class""" import logging import six @@ -21,8 +19,7 @@ class IndexedCorpus(interfaces.CorpusABC): def __init__(self, fname, index_fname=None): - """ - Indexed corpus is a mechanism for random-accessing corpora. + """Indexed corpus is a mechanism for random-accessing corpora. While the standard corpus interface in gensim allows iterating over corpus with `for doc in corpus: pass`, indexed corpus allows accessing @@ -42,12 +39,12 @@ def __init__(self, fname, index_fname=None): Examples -------- >>> # save corpus in SvmLightCorpus format with an index - >>> corpus = [[(1, 0.5)], [(0, 1.0), (1, 2.0)]] - >>> gensim.corpora.SvmLightCorpus.serialize('testfile.svmlight', corpus) - >>> # load back as a document stream (*not* plain Python list) - >>> corpus_with_random_access = gensim.corpora.SvmLightCorpus('tstfile.svmlight') - >>> print(corpus_with_random_access[1]) - [(0, 1.0), (1, 2.0)] + >>> corpus = [[(1, 0.5)], [(0, 1.0), (1, 2.0)]] + >>> gensim.corpora.SvmLightCorpus.serialize('testfile.svmlight', corpus) + >>> # load back as a document stream (*not* plain Python list) + >>> corpus_with_random_access = gensim.corpora.SvmLightCorpus('tstfile.svmlight') + >>> print(corpus_with_random_access[1]) + [(0, 1.0), (1, 2.0)] """ try: @@ -72,22 +69,21 @@ def serialize( labels=None, metadata=False ): - """ - Iterate through the document stream `corpus`, saving the documents to + """Iterate through the document stream `corpus`, saving the documents to `fname` and recording byte offset of each document. - + Save the resulting index structure to file `index_fname` (or `fname`.index is not set). - + This relies on the underlying corpus class `serializer` providing (in - addition to standard iteration): - - * `save_corpus` method that returns a sequence of byte offsets, one for - each saved document - * the `docbyoffset(offset)` method, which returns a document - positioned at `offset` bytes within the persistent storage (file) - * metadata if set to true will ensure that serialize will write out - article titles to a pickle file. + addition to standard iteration):: + + * `save_corpus` method that returns a sequence of byte offsets, one for + each saved document + * the `docbyoffset(offset)` method, which returns a document + positioned at `offset` bytes within the persistent storage (file) + * metadata if set to true will ensure that serialize will write out + article titles to a pickle file. 
Parameters ---------- @@ -96,12 +92,15 @@ def serialize( corpus : iterable Iterable of documents id2word : dict of (str, str), optional - Transforms id to word + Transforms id to word (Default value = None) index_fname : str + (Default value = None) progress_cnt : int + (Default value = None) labels : + (Default value = None) metadata : bool - Any additional info + Any additional info (Default value = False) Examples -------- @@ -117,14 +116,15 @@ def serialize( if progress_cnt is not None: if labels is not None: - offsets = serializer._save_corpus(fname, corpus, id2word, labels=labels, progress_cnt=progress_cnt, metadata=metadata) + offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, progress_cnt=progress_cnt, metadata=metadata) else: - offsets = serializer._save_corpus(fname, corpus, id2word, progress_cnt=progress_cnt, metadata=metadata) + offsets = serializer.save_corpus(fname, corpus, id2word, progress_cnt=progress_cnt, metadata=metadata) else: if labels is not None: - offsets = serializer._save_corpus(fname, corpus, id2word, labels=labels, metadata=metadata) + offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, metadata=metadata) else: - offsets = serializer._save_corpus(fname, corpus, id2word, metadata=metadata) + offsets = serializer.save_corpus(fname, corpus, id2word, + metadata=metadata) if offsets is None: raise NotImplementedError("called serialize on class %s which doesn't support indexing!" % serializer.__name__) From 5eaaac40a34dffe5134a46c53f11dfa7c8172da0 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Tue, 21 Nov 2017 11:37:26 +0500 Subject: [PATCH 19/48] Convert lowcorpus docs to numpy style --- gensim/corpora/lowcorpus.py | 79 +++++++++++++++++++++++++++++++------ 1 file changed, 66 insertions(+), 13 deletions(-) diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py index d5265f6571..9484b63e11 100644 --- a/gensim/corpora/lowcorpus.py +++ b/gensim/corpora/lowcorpus.py @@ -23,32 +23,43 @@ def split_on_space(s): - return [word for word in utils.to_unicode(s).strip().split(' ') if word] + """ + Parameters + ---------- + s : + + + Returns + ------- -class LowCorpus(IndexedCorpus): """ - List_Of_Words corpus handles input in GibbsLda++ format. + return [word for word in utils.to_unicode(s).strip().split(' ') if word] - Quoting http://gibbslda.sourceforge.net/#3.2_Input_Data_Format:: +class LowCorpus(IndexedCorpus): + """List_Of_Words corpus handles input in GibbsLda++ format. + + Quoting http://gibbslda.sourceforge.net/#3.2_Input_Data_Format:: + Both data for training/estimating the model and new data (i.e., previously unseen data) have the same format as follows: - + [M] [document1] [document2] ... [documentM] - + in which the first line is the total number for documents [M]. Each line after that is one document. [documenti] is the ith document of the dataset that consists of a list of Ni words/terms. - + [documenti] = [wordi1] [wordi2] ... [wordiNi] - + in which all [wordij] (i=1..M, j=1..Ni) are text strings and they are separated by the blank character. + """ def __init__(self, fname, id2word=None, line2words=split_on_space): """ @@ -90,6 +101,7 @@ def __init__(self, fname, id2word=None, line2words=split_on_space): ) def _calculate_num_docs(self): + """ """ # the first line in input data is the number of documents (integer). throws exception on bad input. 
with utils.smart_open(self.fname) as fin: try: @@ -103,6 +115,17 @@ def __len__(self): return self.num_docs def line2doc(self, line): + """ + + Parameters + ---------- + line : + + + Returns + ------- + + """ words = self.line2words(line) if self.use_wordids: @@ -141,11 +164,25 @@ def __iter__(self): @staticmethod def save_corpus(fname, corpus, id2word=None, metadata=False): - """ - Save a corpus in the List-of-words format. - + """Save a corpus in the List-of-words format. + This function is automatically called by `LowCorpus.serialize`; don't call it directly, call `serialize` instead. + + Parameters + ---------- + fname : + + corpus : + + id2word : + (Default value = None) + metadata : + (Default value = False) + + Returns + ------- + """ if id2word is None: logger.info("no word id mapping provided; initializing from corpus") @@ -173,8 +210,16 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): return offsets def docbyoffset(self, offset): - """ - Return the document stored at file position `offset`. + """Return the document stored at file position `offset`. + + Parameters + ---------- + offset : + + + Returns + ------- + """ with utils.smart_open(self.fname) as f: f.seek(offset) @@ -182,9 +227,17 @@ def docbyoffset(self, offset): @property def id2word(self): + """ """ return self._id2word @id2word.setter def id2word(self, val): + """ + + Parameters + ---------- + val : + + """ self._id2word = val self.word2id = utils.revdict(val) From 3b6b0763494ca406b31fcf0ade3c8f37c3d93a8c Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Tue, 21 Nov 2017 11:39:56 +0500 Subject: [PATCH 20/48] Convert malletcorpus docs to numpy style --- gensim/corpora/malletcorpus.py | 63 ++++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 15 deletions(-) diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py index cacf0074bd..b41d5cd1a3 100644 --- a/gensim/corpora/malletcorpus.py +++ b/gensim/corpora/malletcorpus.py @@ -19,20 +19,20 @@ class MalletCorpus(LowCorpus): - """ - Quoting http://mallet.cs.umass.edu/import.php: - - One file, one instance per line - Assume the data is in the following format: - + """Quoting http://mallet.cs.umass.edu/import.php: + + One file, one instance per line + Assume the data is in the following format: + [URL] [language] [text of the page...] - + Or, more generally, + [document #1 id] [label] [text of the document...] [document #2 id] [label] [text of the document...] ... [document #N id] [label] [text of the document...] - + Note that language/label is *not* considered in Gensim. """ @@ -41,6 +41,7 @@ def __init__(self, fname, id2word=None, metadata=False): LowCorpus.__init__(self, fname, id2word) def _calculate_num_docs(self): + """ """ with utils.smart_open(self.fname) as fin: result = sum(1 for _ in fin) return result @@ -56,6 +57,17 @@ def __iter__(self): yield self.line2doc(line) def line2doc(self, line): + """ + + Parameters + ---------- + line : + + + Returns + ------- + + """ splited_line = [word for word in utils.to_unicode(line).strip().split(' ') if word] docid, doclang, words = splited_line[0], splited_line[1], splited_line[2:] @@ -68,18 +80,31 @@ def line2doc(self, line): @staticmethod def save_corpus(fname, corpus, id2word=None, metadata=False): - """ - Save a corpus in the Mallet format. - + """Save a corpus in the Mallet format. + The document id will be generated by enumerating the corpus. That is, it will range between 0 and number of documents in the corpus. 
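
For orientation, a sketch of serializing a small bag-of-words corpus in the two plain-text formats touched by these patches; paths are illustrative, and both serializers take an id2word mapping (or build one from the corpus):

    from gensim.corpora import Dictionary, LowCorpus, MalletCorpus

    texts = [["human", "interface"], ["graph", "trees"], ["graph", "minors"]]
    dct = Dictionary(texts)
    corpus = [dct.doc2bow(t) for t in texts]

    # GibbsLDA++ List-of-Words format: first line is the document count, then one doc per line
    LowCorpus.serialize('/tmp/corpus.low', corpus, id2word=dct)

    # Mallet format: "<docid> <language> <token> <token> ..."; language defaults to '__unknown__'
    MalletCorpus.serialize('/tmp/corpus.mallet', corpus, id2word=dct)
    print(MalletCorpus('/tmp/corpus.mallet')[1])   # random access via the saved .index file
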
- + Since Mallet has a language field in the format, this defaults to the string '__unknown__'. If the language needs to be saved, post-processing will be required. - + This function is automatically called by `MalletCorpus.serialize`; don't call it directly, call `serialize` instead. + Parameters + ---------- + fname : + + corpus : + + id2word : + (Default value = None) + metadata : + (Default value = False) + + Returns + ------- + """ if id2word is None: logger.info("no word id mapping provided; initializing from corpus") @@ -114,8 +139,16 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): return offsets def docbyoffset(self, offset): - """ - Return the document stored at file position `offset`. + """Return the document stored at file position `offset`. + + Parameters + ---------- + offset : + + + Returns + ------- + """ with utils.smart_open(self.fname) as f: f.seek(offset) From d7f3fc80c795c1c7f75af0c499fd6fae407fa4fe Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Tue, 21 Nov 2017 11:41:21 +0500 Subject: [PATCH 21/48] Convert mmcorpus docs to numpy style --- gensim/corpora/mmcorpus.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gensim/corpora/mmcorpus.py b/gensim/corpora/mmcorpus.py index e8133d6b59..ee6f1401eb 100644 --- a/gensim/corpora/mmcorpus.py +++ b/gensim/corpora/mmcorpus.py @@ -51,11 +51,11 @@ def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): corpus : id2word : - (Default value = None) + (Default value = None) progress_cnt : - (Default value = 1000) + (Default value = 1000) metadata : - (Default value = False) + (Default value = False) Returns ------- From c46bff4ff94227df140b17b39db861ce3a96bb18 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Tue, 21 Nov 2017 11:59:23 +0500 Subject: [PATCH 22/48] Convert sharded_corpus docs to numpy style --- gensim/corpora/sharded_corpus.py | 299 +++++++++++++++++++++++++------ 1 file changed, 241 insertions(+), 58 deletions(-) diff --git a/gensim/corpora/sharded_corpus.py b/gensim/corpora/sharded_corpus.py index 4d0fde4999..b49bfdf0db 100644 --- a/gensim/corpora/sharded_corpus.py +++ b/gensim/corpora/sharded_corpus.py @@ -44,23 +44,22 @@ class ShardedCorpus(IndexedCorpus): - """ - This corpus is designed for situations where you need to train a model + """This corpus is designed for situations where you need to train a model on matrices, with a large number of iterations. (It should be faster than gensim's other IndexedCorpus implementations for this use case; check the `benchmark_datasets.py` script. It should also serialize faster.) - + The corpus stores its data in separate files called "shards". This is a compromise between speed (keeping the whole dataset in memory) and memory footprint (keeping the data on disk and reading from it on demand). Persistence is done using the standard gensim load/save methods. - + .. note:: - + The dataset is **read-only**, there is - as opposed to gensim's Similarity class, which works similarly - no way of adding documents to the dataset (for now). - + You can use ShardedCorpus to serialize your data just like any other gensim corpus that implements serialization. 
However, because the data is saved as numpy 2-dimensional ndarrays (or scipy sparse matrices), you need to @@ -77,15 +76,15 @@ class ShardedCorpus(IndexedCorpus): is essentially a re-serialization into new-size shards), but note that this operation will temporarily take twice as much disk space, because the old shards are not deleted until the new shards are safely in place. - + After serializing the data, the corpus will then save itself to the file `output_prefix`. - + On further initialization with the same `output_prefix`, the corpus will load the already built dataset unless the `overwrite` option is given. (A new object is "cloned" from the one saved to `output_prefix` previously.) - + To retrieve data, you can load the corpus and use it like a list: >>> sh_corpus = ShardedCorpus.load(output_prefix) @@ -99,7 +98,7 @@ class ShardedCorpus(IndexedCorpus): >>> batch = sh_corpus[100:150] The batch now will be a generator of gensim vectors. - + Since the corpus needs the data serialized in order to be able to operate, it will serialize data right away on initialization. Instead of calling `ShardedCorpus.serialize()`, you can just initialize and use the corpus @@ -131,13 +130,14 @@ class ShardedCorpus(IndexedCorpus): of a ShardedCorpus object, you should definitely not touch ` `sharded_serialization`! Changing the attribute will not miraculously re-serialize the data in the requested format. - + The CSR format is used for sparse data throughout. - + Internally, to retrieve data, the dataset keeps track of which shard is currently open and on a `__getitem__` request, either returns an item from the current shard, or opens a new one. The shard size is constant, except for the last shard. + """ def __init__(self, output_prefix, corpus, dim=None, shardsize=4096, overwrite=False, sparse_serialization=False, @@ -249,7 +249,20 @@ def __init__(self, output_prefix, corpus, dim=None, self.init_by_clone() def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtype): - """Initialize shards from the corpus.""" + """Initialize shards from the corpus. + + Parameters + ---------- + output_prefix : + + corpus : + + shardsize : + (Default value = 4096) + dtype : + (Default value = _default_dtype) + + """ is_corpus, corpus = gensim.utils.is_corpus(corpus) if not is_corpus: @@ -294,8 +307,7 @@ def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtyp logger.info('Built %d shards in %f s.', self.n_shards, end_time - start_time) def init_by_clone(self): - """ - Initialize by copying over attributes of another ShardedCorpus + """Initialize by copying over attributes of another ShardedCorpus instance saved to the output_prefix given at __init__(). """ @@ -317,13 +329,21 @@ def init_by_clone(self): self.dim = temp.dim # To be consistent with the loaded data! def save_shard(self, shard, n=None, filename=None): - """ - Pickle the given shard. If `n` is not given, will consider the shard + """Pickle the given shard. If `n` is not given, will consider the shard a new one. - + If `filename` is given, will use that file name instead of generating one. + Parameters + ---------- + shard : + + n : + (Default value = None) + filename : + (Default value = None) + """ new_shard = False if n is None: @@ -340,9 +360,14 @@ def save_shard(self, shard, n=None, filename=None): self.n_shards += 1 def load_shard(self, n): + """Load (unpickle) the n-th shard as the "live" part of the dataset + into the Dataset object. 
+ + Parameters + ---------- + n : + """ - Load (unpickle) the n-th shard as the "live" part of the dataset - into the Dataset object.""" # No-op if the shard is already open. if self.current_shard_n == n: @@ -358,22 +383,26 @@ def load_shard(self, n): self.current_offset = self.offsets[n] def reset(self): - """ - Reset to no shard at all. Used for saving. - - """ + """Reset to no shard at all. Used for saving.""" self.current_shard = None self.current_shard_n = None self.current_offset = None def shard_by_offset(self, offset): - """ - Determine which shard the given offset belongs to. If the offset + """Determine which shard the given offset belongs to. If the offset is greater than the number of available documents, raises a `ValueError`. - + Assumes that all shards have the same size. + Parameters + ---------- + offset : + + + Returns + ------- + """ k = int(offset / self.shardsize) if offset >= self.n_docs: @@ -385,33 +414,48 @@ def shard_by_offset(self, offset): return k def in_current(self, offset): - """ - Determine whether the given offset falls within the current shard. + """Determine whether the given offset falls within the current shard. + + Parameters + ---------- + offset : + + + Returns + ------- """ return (self.current_offset <= offset) and (offset < self.offsets[self.current_shard_n + 1]) def in_next(self, offset): - """ - Determine whether the given offset falls within the next shard. + """Determine whether the given offset falls within the next shard. This is a very small speedup: typically, we will be iterating through the data forward. Could save considerable time with a very large number of smaller shards. + Parameters + ---------- + offset : + + + Returns + ------- + """ if self.current_shard_n == self.n_shards: return False # There's no next shard. return (self.offsets[self.current_shard_n + 1] <= offset) and (offset < self.offsets[self.current_shard_n + 2]) def resize_shards(self, shardsize): - """ - Re-process the dataset to new shard size. This may take pretty long. + """Re-process the dataset to new shard size. This may take pretty long. Also, note that you need some space on disk for this one (we're assuming there is enough disk space for double the size of the dataset and that there is enough memory for old + new shardsize). - :type shardsize: int - :param shardsize: The new shard size. + Parameters + ---------- + shardsize : + The new shard size. """ # Determine how many new shards there will be @@ -475,18 +519,46 @@ def resize_shards(self, shardsize): self.reset() def _shard_name(self, n): - """Generate the name for the n-th shard.""" + """Generate the name for the n-th shard. + + Parameters + ---------- + n : + + + Returns + ------- + + """ return self.output_prefix + '.' + str(n) def _resized_shard_name(self, n): - """ - Generate the name for the n-th new shard temporary file when + """Generate the name for the n-th new shard temporary file when resizing dataset. The file will then be re-named to standard shard name. + + Parameters + ---------- + n : + + + Returns + ------- + """ return self.output_prefix + '.resize-temp.' + str(n) def _guess_n_features(self, corpus): - """Attempt to guess number of features in `corpus`.""" + """Attempt to guess number of features in `corpus`. + + Parameters + ---------- + corpus : + + + Returns + ------- + + """ n_features = None if hasattr(corpus, 'dim'): # print 'Guessing from \'dim\' attribute.' 
@@ -532,6 +604,14 @@ def __len__(self): return self.n_docs def _ensure_shard(self, offset): + """ + + Parameters + ---------- + offset : + + + """ # No shard loaded if self.current_shard is None: shard_n = self.shard_by_offset(offset) @@ -545,7 +625,17 @@ def _ensure_shard(self, offset): self.load_shard(shard_n) def get_by_offset(self, offset): - """As opposed to getitem, this one only accepts ints as offsets.""" + """As opposed to getitem, this one only accepts ints as offsets. + + Parameters + ---------- + offset : + + + Returns + ------- + + """ self._ensure_shard(offset) result = self.current_shard[offset - self.current_offset] return result @@ -692,6 +782,17 @@ def __add_to_slice(self, s_result, result_start, result_stop, start, stop): return s_result def _getitem_format(self, s_result): + """ + + Parameters + ---------- + s_result : + + + Returns + ------- + + """ if self.sparse_serialization: if self.gensim: s_result = self._getitem_sparse2gensim(s_result) @@ -705,13 +806,33 @@ def _getitem_format(self, s_result): return s_result def _getitem_sparse2gensim(self, result): - """ - Change given sparse result matrix to gensim sparse vectors. - + """Change given sparse result matrix to gensim sparse vectors. + Uses the internals of the sparse matrix to make this fast. + Parameters + ---------- + result : + + + Returns + ------- + """ def row_sparse2gensim(row_idx, csr_matrix): + """ + + Parameters + ---------- + row_idx : + + csr_matrix : + + + Returns + ------- + + """ indices = csr_matrix.indices[csr_matrix.indptr[row_idx]:csr_matrix.indptr[row_idx + 1]] g_row = [(col_idx, csr_matrix[row_idx, col_idx]) for col_idx in indices] return g_row @@ -721,7 +842,17 @@ def row_sparse2gensim(row_idx, csr_matrix): return output def _getitem_dense2gensim(self, result): - """Change given dense result matrix to gensim sparse vectors.""" + """Change given dense result matrix to gensim sparse vectors. + + Parameters + ---------- + result : + + + Returns + ------- + + """ if len(result.shape) == 1: output = gensim.matutils.full2sparse(result) else: @@ -739,11 +870,17 @@ def __iter__(self): yield self[i] def save(self, *args, **kwargs): - """ - Save itself (the wrapper) in clean state (after calling `reset()`) + """Save itself (the wrapper) in clean state (after calling `reset()`) to the output_prefix file. If you wish to save to a different file, use the `fname` argument as the first positional arg. + Parameters + ---------- + *args : + + **kwargs : + + """ # Can we save to a different file than output_prefix? Well, why not? if len(args) == 0: @@ -758,45 +895,91 @@ def save(self, *args, **kwargs): @classmethod def load(cls, fname, mmap=None): - """ - Load itself in clean state. `mmap` has no effect here. + """Load itself in clean state. `mmap` has no effect here. + + Parameters + ---------- + fname : + + mmap : + (Default value = None) + + Returns + ------- + """ return super(ShardedCorpus, cls).load(fname, mmap) @staticmethod def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs): - """ - Implement a serialization interface. Do not call directly; + """Implement a serialization interface. Do not call directly; use the `serialize` method instead. - + Note that you might need some ShardedCorpus init parameters, most likely the dimension (`dim`). Again, pass these as `kwargs` to the `serialize` method. - + All this thing does is initialize a ShardedCorpus from a corpus with the `output_prefix` argument set to the `fname` parameter of this method. 
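
A hedged sketch of the serialize path described in this docstring; the dimension, shard size and output prefix are illustrative:

    from gensim.corpora.sharded_corpus import ShardedCorpus

    # any gensim-style stream of sparse vectors works as input
    corpus = [[(0, 1.0), (3, 2.0)], [(2, 1.0)], [(1, 1.0), (3, 1.0)]]

    # `dim` must be supplied when it cannot be inferred from the corpus,
    # as with this plain list of sparse vectors
    ShardedCorpus.serialize('/tmp/shcorp', corpus, dim=4, shardsize=2)

    sh = ShardedCorpus.load('/tmp/shcorp')
    row = sh[1]       # dense numpy row by default
    batch = sh[0:2]   # 2-D ndarray; pass gensim=True at creation for gensim-style vectors
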
The initialization of a ShardedCorpus takes care of serializing the data (in dense form) to shards. - + Ignore the parameters id2word, progress_cnt and metadata. They currently do nothing and are here only to provide a compatible method signature with superclass. + Parameters + ---------- + fname : + + corpus : + + id2word : + (Default value = None) + progress_cnt : + (Default value = 1000) + metadata : + (Default value = False) + **kwargs : + + """ ShardedCorpus(fname, corpus, **kwargs) @classmethod def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False, **kwargs): - """ - Iterate through the document stream `corpus`, saving the documents + """Iterate through the document stream `corpus`, saving the documents as a ShardedCorpus to `fname`. - + Use this method instead of calling `save_corpus` directly. You may need to supply some kwargs that are used upon dataset creation (namely: `dim`, unless the dataset can infer the dimension from the given corpus). - + Ignore the parameters id2word, index_fname, progress_cnt, labels and metadata. They currently do nothing and are here only to - provide a compatible method signature with superclass.""" + provide a compatible method signature with superclass. + + Parameters + ---------- + serializer : + + fname : + + corpus : + + id2word : + (Default value = None) + index_fname : + (Default value = None) + progress_cnt : + (Default value = None) + labels : + (Default value = None) + metadata : + (Default value = False) + **kwargs : + + + """ serializer.save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs) From 78235469b3fd4ef52ebfaef3ef9cf0cceffc915f Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Tue, 21 Nov 2017 12:06:19 +0500 Subject: [PATCH 23/48] Convert svmlightcorpus docs to numpy style --- gensim/corpora/svmlightcorpus.py | 74 +++++++++++++++++++++++++------- 1 file changed, 58 insertions(+), 16 deletions(-) diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py index 290414836e..b13998289f 100644 --- a/gensim/corpora/svmlightcorpus.py +++ b/gensim/corpora/svmlightcorpus.py @@ -22,22 +22,21 @@ class SvmLightCorpus(IndexedCorpus): - """ - Corpus in SVMlight format. - + """Corpus in SVMlight format. + Quoting http://svmlight.joachims.org/: The input file contains the training examples. The first lines may contain comments and are ignored if they start with #. Each of the following lines represents one training example and is of the following format:: - + .=. : : ... : # .=. +1 | -1 | 0 | .=. | "qid" .=. .=. - + The "qid" feature (used for SVMlight ranking), if present, is ignored. - + Although not mentioned in the specification above, SVMlight also expect its feature ids to be 1-based (counting starts at 1). We convert features to 0-base internally by decrementing all ids when loading a SVMlight input file, and @@ -80,14 +79,30 @@ def __iter__(self): @staticmethod def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): - """ - Save a corpus in the SVMlight format. - + """Save a corpus in the SVMlight format. + The SVMlight `` class tag is taken from the `labels` array, or set to 0 for all documents if `labels` is not supplied. - + This function is automatically called by `SvmLightCorpus.serialize`; don't call it directly, call `serialize` instead. 
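
A small sketch of the SVMlight round trip documented here; the file name is illustrative:

    from gensim.corpora import SvmLightCorpus

    corpus = [[(0, 0.5), (2, 1.0)], [(1, 2.0)]]

    # one "<label> <featureid+1>:<value> ..." line per document; labels default to 0
    SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus)

    print(list(SvmLightCorpus('/tmp/corpus.svmlight')))            # ids converted back to 0-based
    print(SvmLightCorpus.doc2line([(0, 0.5), (2, 1.0)], label=1))  # "1 1:0.5 3:1.0" plus newline
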
+ + Parameters + ---------- + fname : + + corpus : + + id2word : + (Default value = None) + labels : + (Default value = False) + metadata : + (Default value = False) + + Returns + ------- + """ logger.info("converting corpus to SVMlight format: %s", fname) @@ -100,16 +115,33 @@ def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): return offsets def docbyoffset(self, offset): - """ - Return the document stored at file position `offset`. + """Return the document stored at file position `offset`. + + Parameters + ---------- + offset : + + + Returns + ------- + + """ with utils.smart_open(self.fname) as f: f.seek(offset) return self.line2doc(f.readline())[0] def line2doc(self, line): - """ - Create a document from a single line (string) in SVMlight format + """Create a document from a single line (string) in SVMlight format + + Parameters + ---------- + line : + + + Returns + ------- + """ line = utils.to_unicode(line) line = line[: line.find('#')].strip() @@ -124,8 +156,18 @@ def line2doc(self, line): @staticmethod def doc2line(doc, label=0): - """ - Output the document in SVMlight format, as a string. Inverse function to `line2doc`. + """Output the document in SVMlight format, as a string. Inverse function to `line2doc`. + + Parameters + ---------- + doc : + + label : + (Default value = 0) + + Returns + ------- + """ pairs = ' '.join("%i:%s" % (termid + 1, termval) for termid, termval in doc) # +1 to convert 0-base to 1-base return "%s %s\n" % (label, pairs) From 98781339233385c6a3f7d74f18f96e672c42df8c Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Tue, 21 Nov 2017 12:23:50 +0500 Subject: [PATCH 24/48] Convert textcorpus docs to numpy style --- gensim/corpora/textcorpus.py | 219 ++++++++++++++++++++++++++++++----- 1 file changed, 188 insertions(+), 31 deletions(-) diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py index 23f5fa3bd1..d9c4271460 100644 --- a/gensim/corpora/textcorpus.py +++ b/gensim/corpora/textcorpus.py @@ -44,56 +44,104 @@ def remove_stopwords(tokens, stopwords=STOPWORDS): - """Remove stopwords using list from `gensim.parsing.preprocessing.STOPWORDS`.""" + """Remove stopwords using list from `gensim.parsing.preprocessing.STOPWORDS`. + + Parameters + ---------- + tokens : + + stopwords : + (Default value = STOPWORDS) + + Returns + ------- + + """ return [token for token in tokens if token not in stopwords] def remove_short(tokens, minsize=3): - """Remove tokens smaller than `minsize` chars, which is 3 by default.""" + """Remove tokens smaller than `minsize` chars, which is 3 by default. + + Parameters + ---------- + tokens : + + minsize : + (Default value = 3) + + Returns + ------- + + """ return [token for token in tokens if len(token) >= minsize] def lower_to_unicode(text, encoding='utf8', errors='strict'): - """Lowercase `text` and convert to unicode.""" + """Lowercase `text` and convert to unicode. + + Parameters + ---------- + text : + + encoding : + (Default value = 'utf8') + errors : + (Default value = 'strict') + + Returns + ------- + + """ return utils.to_unicode(text.lower(), encoding, errors) def strip_multiple_whitespaces(s): - """Collapse multiple whitespace characters into a single space.""" + """Collapse multiple whitespace characters into a single space. + + Parameters + ---------- + s : + + + Returns + ------- + + """ return RE_WHITESPACE.sub(" ", s) class TextCorpus(interfaces.CorpusABC): """Helper class to simplify the pipeline of getting bag-of-words vectors (= a gensim corpus) from plain text. 
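
The helper functions defined above make up TextCorpus's default preprocessing chain; a quick sketch of composing them by hand (the input string is illustrative):

    from gensim.corpora.textcorpus import (
        lower_to_unicode, strip_multiple_whitespaces, remove_stopwords, remove_short)

    text = strip_multiple_whitespaces(lower_to_unicode("The  Quick   Brown Fox"))
    tokens = remove_short(remove_stopwords(text.split()), minsize=3)
    print(tokens)   # ['quick', 'brown', 'fox']
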
- + This is an abstract base class: override the `get_texts()` and `__len__()` methods to match your particular input. - + Given a filename (or a file-like object) in constructor, the corpus object will be automatically initialized with a dictionary in `self.dictionary` and will support the `iter` corpus method. You have a few different ways of utilizing this class via subclassing or by construction with different preprocessing arguments. - + The `iter` method converts the lists of tokens produced by `get_texts` to BoW format using `Dictionary.doc2bow`. `get_texts` does the following: - + 1. Calls `getstream` to get a generator over the texts. It yields each document in turn from the underlying text file or files. 2. For each document from the stream, calls `preprocess_text` to produce a list of tokens; if metadata is enabled, it yields a 2-`tuple` with the document number as the second element. - - + + Preprocessing consists of 0+ `character_filters`, a `tokenizer`, and 0+ `token_filters`. - + The preprocessing consists of calling each filter in `character_filters` with the document text; unicode is not guaranteed, and if desired, the first filter should convert to unicode. The output of each character filter should be another string. The output from the final filter is fed to the `tokenizer`, which should split the string into a list of tokens (strings). Afterwards, the list of tokens is fed through each filter in `token_filters`. The final output returned from `preprocess_text` is the output from the final token filter. - + So to use this class, you can either pass in different preprocessing functions using the `character_filters`, `tokenizer`, and `token_filters` arguments, or you can subclass it. If subclassing: override `getstream` to take text from different input sources in different @@ -101,9 +149,9 @@ class TextCorpus(interfaces.CorpusABC): then call the `TextCorpus.preprocess_text` method to apply the normal preprocessing. You can also overrride `get_texts` in order to tag the documents (token lists) with different metadata. - + The default preprocessing consists of: - + 1. lowercase and convert to unicode; assumes utf8 encoding 2. deaccent (asciifolding) 3. collapse multiple whitespaces into a single one @@ -158,6 +206,12 @@ def init_dictionary(self, dictionary): """If `dictionary` is None, initialize to an empty Dictionary, and then if there is an `input` for the corpus, add all documents from that `input`. If the `dictionary` is already initialized, simply set it as the corpus's `dictionary`. + + Parameters + ---------- + dictionary : + + """ self.dictionary = dictionary if dictionary is not None else Dictionary() if self.input is not None: @@ -188,6 +242,13 @@ def getstream(self): """Yield documents from the underlying plain text collection (of one or more files). Each item yielded from this method will be considered a document by subsequent preprocessing methods. + + Parameters + ---------- + + Yields + ------ + """ num_texts = 0 with utils.file_or_filename(self.input) as f: @@ -201,11 +262,16 @@ def preprocess_text(self, text): """Apply preprocessing to a single text document. This should perform tokenization in addition to any other desired preprocessing steps. - Args: - text (str): document text read from plain-text file. + Parameters + ---------- + text : str + document text read from plain-text file. + + Returns + ------- + iterable of str + tokens produced from `text` as a result of preprocessing. 
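
A hedged sketch of swapping in a custom pipeline via the `character_filters`, `tokenizer` and `token_filters` arguments mentioned above; the file path is illustrative and the keyword names are taken from this docstring rather than from a visible constructor signature:

    from gensim import utils
    from gensim.corpora.textcorpus import TextCorpus, remove_short, remove_stopwords

    corpus = TextCorpus(
        '/data/docs.txt',                      # one document per line
        character_filters=[utils.to_unicode],  # replace the default lowercase/deaccent chain
        tokenizer=lambda text: list(utils.tokenize(text, lowercase=True)),
        token_filters=[remove_stopwords, remove_short],
    )

    for tokens in corpus.get_texts():   # token lists after the custom pipeline
        pass
    for bow in corpus:                  # the same documents as bag-of-words vectors
        pass
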
- Returns: - iterable of str: tokens produced from `text` as a result of preprocessing. """ for character_filter in self.character_filters: text = character_filter(text) @@ -219,6 +285,15 @@ def preprocess_text(self, text): def step_through_preprocess(self, text): """Yield tuples of functions and their output for each stage of preprocessing. This is useful for debugging issues with the corpus preprocessing pipeline. + + Parameters + ---------- + text : + + + Yields + ------ + """ for character_filter in self.character_filters: text = character_filter(text) @@ -238,9 +313,12 @@ def get_texts(self): to be overridden if the metadata you'd like to yield differs from the line number. - Returns: - generator of lists of tokens (strings); each list corresponds to a preprocessed + Yields + ------ + list of strings + each list corresponds to a preprocessed document from the corpus `input`. + """ lines = self.getstream() if self.metadata: @@ -252,24 +330,31 @@ def get_texts(self): def sample_texts(self, n, seed=None, length=None): """Yield n random documents from the corpus without replacement. - + Given the number of remaining documents in a corpus, we need to choose n elements. The probability for the current element to be chosen is n/remaining. If we choose it, we just decrease the n and move to the next element. Computing the corpus length may be a costly operation so you can use the optional parameter `length` instead. - Args: - n (int): number of documents we want to sample. - seed (int|None): if specified, use it as a seed for local random generator. - length (int|None): if specified, use it as a guess of corpus length. - It must be positive and not greater than actual corpus length. + Parameters + ---------- + n : int + number of documents we want to sample. + seed : int or None + if specified, use it as a seed for local random generator. (Default value = None) + length : int or None + if specified, use it as a guess of corpus length. + It must be positive and not greater than actual corpus length. (Default value = None) + + Yields + ------ - Yields: - list[str]: document represented as a list of tokens. See get_texts method. + Raises + ------ + ValueError + when n is invalid or length was set incorrectly. - Raises: - ValueError: when n is invalid or length was set incorrectly. """ random_generator = random if seed is None else random.Random(seed) if length is None: @@ -309,6 +394,7 @@ def __len__(self): class TextDirectoryCorpus(TextCorpus): """Read documents recursively from a directory, where each file (or line of each file) is interpreted as a plain text document. 
+ """ def __init__(self, input, dictionary=None, metadata=False, min_depth=0, max_depth=None, @@ -339,46 +425,91 @@ def __init__(self, input, dictionary=None, metadata=False, min_depth=0, max_dept @property def lines_are_documents(self): + """ """ return self._lines_are_documents @lines_are_documents.setter def lines_are_documents(self, lines_are_documents): + """ + + Parameters + ---------- + lines_are_documents : + + + """ self._lines_are_documents = lines_are_documents self.length = None @property def pattern(self): + """ """ return self._pattern @pattern.setter def pattern(self, pattern): + """ + + Parameters + ---------- + pattern : + + + """ self._pattern = None if pattern is None else re.compile(pattern) self.length = None @property def exclude_pattern(self): + """ """ return self._exclude_pattern @exclude_pattern.setter def exclude_pattern(self, pattern): + """ + + Parameters + ---------- + pattern : + + + """ self._exclude_pattern = None if pattern is None else re.compile(pattern) self.length = None @property def min_depth(self): + """ """ return self._min_depth @min_depth.setter def min_depth(self, min_depth): + """ + + Parameters + ---------- + min_depth : + + + """ self._min_depth = min_depth self.length = None @property def max_depth(self): + """ """ return self._max_depth @max_depth.setter def max_depth(self, max_depth): + """ + + Parameters + ---------- + max_depth : + + + """ self._max_depth = max_depth self.length = None @@ -386,6 +517,10 @@ def iter_filepaths(self): """Lazily yield paths to each file in the directory structure within the specified range of depths. If a filename pattern to match was given, further filter to only those filenames that match. + + Yields + ------ + """ for depth, dirpath, dirnames, filenames in walk(self.input): if self.min_depth <= depth <= self.max_depth: @@ -401,9 +536,13 @@ def getstream(self): """Yield documents from the underlying plain text collection (of one or more files). Each item yielded from this method will be considered a document by subsequent preprocessing methods. - + If `lines_are_documents` was set to True, items will be lines from files. Otherwise there will be one item per file, containing the entire contents of the file. + + Yields + ------ + """ num_texts = 0 for path in self.iter_filepaths(): @@ -424,6 +563,7 @@ def __len__(self): return self.length def _cache_corpus_length(self): + """ """ if not self.lines_are_documents: self.length = sum(1 for _ in self.iter_filepaths()) else: @@ -434,6 +574,23 @@ def walk(top, topdown=True, onerror=None, followlinks=False, depth=0): """This is a mostly copied version of `os.walk` from the Python 2 source code. The only difference is that it returns the depth in the directory tree structure at which each yield is taking place. 
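
A sketch of how this depth-aware walk is consumed, and of the corresponding corpus-level filtering; the directory path is illustrative:

    from gensim.corpora.textcorpus import TextDirectoryCorpus, walk

    # like os.walk, but each tuple also carries its depth below the root
    for depth, dirpath, dirnames, filenames in walk('/data/plain_text_docs'):
        print(depth, dirpath, filenames)

    # the corpus filters on the same information; each matching file becomes one document
    corpus = TextDirectoryCorpus('/data/plain_text_docs', min_depth=0, max_depth=1)
    print(len(corpus))          # number of documents found within the depth range
    for bow in corpus:          # standard bag-of-words stream
        pass
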
+ + Parameters + ---------- + top : + + topdown : + (Default value = True) + onerror : + (Default value = None) + followlinks : + (Default value = False) + depth : + (Default value = 0) + + Yields + ------ + """ islink, join, isdir = os.path.islink, os.path.join, os.path.isdir From dba4429435f32f54327e5df48f8b713d90060140 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Tue, 21 Nov 2017 12:32:02 +0500 Subject: [PATCH 25/48] Convert ucicorpus docs to numpy style --- gensim/corpora/ucicorpus.py | 86 +++++++++++++++++++++++++++++-------- 1 file changed, 67 insertions(+), 19 deletions(-) diff --git a/gensim/corpora/ucicorpus.py b/gensim/corpora/ucicorpus.py index a8911ee07f..931c773930 100644 --- a/gensim/corpora/ucicorpus.py +++ b/gensim/corpora/ucicorpus.py @@ -29,11 +29,11 @@ class UciReader(MmReader): def __init__(self, input): - """ - Initialize the reader. + """Initialize the reader. The `input` parameter refers to a file on the local filesystem, which is expected to be in the UCI Bag-of-Words format. + """ logger.info('Initializing corpus reader from %s', input) @@ -55,20 +55,27 @@ def __init__(self, input): ) def skip_headers(self, input_file): + """ + + Parameters + ---------- + input_file : + + + """ for lineno, _ in enumerate(input_file): if lineno == 2: break class UciWriter(MmWriter): - """ - Store a corpus in UCI Bag-of-Words format. - + """Store a corpus in UCI Bag-of-Words format. + This corpus format is identical to MM format, except for different file headers. There is no format line, and the first three lines of the file contain number_docs, num_terms, and num_nnz, one value per line. - + This implementation is based on matutils.MmWriter, and works the same way. """ @@ -76,8 +83,9 @@ class UciWriter(MmWriter): FAKE_HEADER = utils.to_utf8(' ' * MAX_HEADER_LENGTH + '\n') def write_headers(self): - """ - Write blank header lines. Will be updated later, once corpus stats are known. + """Write blank header lines. + + Will be updated later, once corpus stats are known. """ for _ in range(3): self.fout.write(self.FAKE_HEADER) @@ -86,8 +94,17 @@ def write_headers(self): self.headers_written = True def update_headers(self, num_docs, num_terms, num_nnz): - """ - Update headers with actual values. + """Update headers with actual values. + + Parameters + ---------- + num_docs : + + num_terms : + + num_nnz : + + """ offset = 0 values = [utils.to_utf8(str(n)) for n in [num_docs, num_terms, num_nnz]] @@ -101,6 +118,20 @@ def update_headers(self, num_docs, num_terms, num_nnz): @staticmethod def write_corpus(fname, corpus, progress_cnt=1000, index=False): + """ + + Parameters + ---------- + fname : + + corpus : + + progress_cnt : + (Default value = 1000) + index : + (Default value = False) + + """ writer = UciWriter(fname) writer.write_headers() @@ -139,9 +170,7 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False): class UciCorpus(UciReader, IndexedCorpus): - """ - Corpus in the UCI bag-of-words format. - """ + """Corpus in the UCI bag-of-words format.""" def __init__(self, fname, fname_vocab=None): IndexedCorpus.__init__(self, fname) UciReader.__init__(self, fname) @@ -165,9 +194,12 @@ def __iter__(self): yield doc # get rid of docId, return the sparse vector only def create_dictionary(self): - """ - Utility method to generate gensim-style Dictionary directly from + """Utility method to generate gensim-style Dictionary directly from the corpus and vocabulary data. 
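
A sketch of the UCI round trip these classes provide; paths are illustrative:

    from gensim.corpora import Dictionary, UciCorpus

    texts = [["human", "interface"], ["graph", "trees", "graph"]]
    dct = Dictionary(texts)

    # writes the document file plus a companion .vocab file with one word per line
    UciCorpus.serialize('/tmp/corpus.uci', [dct.doc2bow(t) for t in texts], id2word=dct)

    uci = UciCorpus('/tmp/corpus.uci')
    dictionary = uci.create_dictionary()   # rebuilt from the .vocab file and corpus statistics
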
+ + Returns + ------- + """ dictionary = Dictionary() @@ -193,14 +225,30 @@ def create_dictionary(self): @staticmethod def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False): - """ - Save a corpus in the UCI Bag-of-Words format. - + """Save a corpus in the UCI Bag-of-Words format. + There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file. - + This function is automatically called by `UciCorpus.serialize`; don't call it directly, call `serialize` instead. + + Parameters + ---------- + fname : + + corpus : + + id2word : + (Default value = None) + progress_cnt : + (Default value = 10000) + metadata : + (Default value = False) + + Returns + ------- + """ if id2word is None: logger.info("no word id mapping provided; initializing from corpus") From 6a95c94f76b68d89e6d3f1c4029046e2781db672 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Tue, 21 Nov 2017 12:42:12 +0500 Subject: [PATCH 26/48] Convert wikicorpus docs to numpy style --- gensim/corpora/wikicorpus.py | 151 +++++++++++++++++++++++++++++------ 1 file changed, 126 insertions(+), 25 deletions(-) mode change 100755 => 100644 gensim/corpora/wikicorpus.py diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py old mode 100755 new mode 100644 index 07cb70630c..082bd20219 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -69,9 +69,17 @@ def filter_wiki(raw): - """ - Filter out wiki mark-up from `raw`, leaving only text. `raw` is either unicode + """Filter out wiki mark-up from `raw`, leaving only text. `raw` is either unicode or utf-8 encoded string. + + Parameters + ---------- + raw : + + + Returns + ------- + """ # parsing of the wiki markup is not perfect, but sufficient for our purposes # contributions to improving this code are welcome :) @@ -81,6 +89,17 @@ def filter_wiki(raw): def remove_markup(text): + """ + + Parameters + ---------- + text : + + + Returns + ------- + + """ text = re.sub(RE_P2, "", text) # remove the last list (=languages) # the wiki markup is recursive (markup inside markup etc) # instead of writing a recursive grammar, here we deal with that by removing @@ -121,8 +140,19 @@ def remove_template(s): http://meta.wikimedia.org/wiki/Help:Template for wikimedia templates details. - Note: Since template can be nested, it is difficult remove them using + Parameters + ---------- + s : + + + Returns + ------- + + Notes + ----- + Since template can be nested, it is difficult remove them using regular expresssions. + """ # Find the start and end position of each template by finding the opening @@ -158,6 +188,15 @@ def remove_file(s): Return a copy of `s` with all the 'File:' and 'Image:' markup replaced by their corresponding captions. See http://www.mediawiki.org/wiki/Help:Images for the markup details. + + Parameters + ---------- + s : + + + Returns + ------- + """ # The regex RE_P15 match a File: or Image: markup for match in re.finditer(RE_P15, s): @@ -168,13 +207,26 @@ def remove_file(s): def tokenize(content, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True): - """ - Tokenize a piece of text from wikipedia. The input string `content` is assumed - to be mark-up free (see `filter_wiki()`). - - Set `token_min_len`, `token_max_len` as character length (not bytes!) thresholds for individual tokens. + """Tokenize a piece of text from wikipedia. The input string `content` is + assumed to be mark-up free (see `filter_wiki()`). 
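
A minimal sketch of these two helpers on a toy snippet of wiki markup; the exact token list depends on the markup parsing, which the module itself notes is not perfect:

    from gensim.corpora.wikicorpus import filter_wiki, tokenize

    raw = "'''Anarchism''' is a [[political philosophy]]<ref>citation</ref> and movement."
    text = filter_wiki(raw)   # markup stripped, only plain text is left
    print(tokenize(text, token_min_len=2, token_max_len=15, lower=True))
    # roughly: ['anarchism', 'is', 'political', 'philosophy', 'and', 'movement']
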
+ + Set `token_min_len`, `token_max_len` as character length (not bytes!) + thresholds for individual tokens. + + Parameters + ---------- + content : + + token_min_len : + (Default value = TOKEN_MIN_LEN) + token_max_len : + (Default value = TOKEN_MAX_LEN) + lower : + (Default value = True) + + Returns + ------- - Return list of tokens as utf8 bytestrings. """ # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.) return [ @@ -184,7 +236,17 @@ def tokenize(content, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, def get_namespace(tag): - """Returns the namespace of tag.""" + """Returns the namespace of tag. + + Parameters + ---------- + tag : + + + Returns + ------- + + """ m = re.match("^{(.*?)}", tag) namespace = m.group(1) if m else "" if not namespace.startswith("http://www.mediawiki.org/xml/export-"): @@ -196,10 +258,17 @@ def get_namespace(tag): def extract_pages(f, filter_namespaces=False): - """ - Extract pages from a MediaWiki database dump = open file-like object `f`. + """Extract pages from a MediaWiki database dump = open file-like object `f`. + + Parameters + ---------- + f : + + filter_namespaces : + (Default value = False) - Return an iterable over (str, str, str) which generates (title, content, pageid) triplets. + Yields + ------ """ elems = (elem for _, elem in iterparse(f, events=("end",))) @@ -244,12 +313,28 @@ def extract_pages(f, filter_namespaces=False): def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True): - """ - Parse a wikipedia article, returning its content as a list of tokens + """Parse a wikipedia article, returning its content as a list of tokens (utf8-encoded strings). - + Set `tokenizer_func` (defaults to `tokenize`) parameter for languages like japanese or thai to perform better tokenization. The `tokenizer_func` needs to take 4 parameters: (text, token_min_len, token_max_len, lower). + + Parameters + ---------- + args : + + tokenizer_func : + (Default value = tokenize) + token_min_len : + (Default value = TOKEN_MIN_LEN) + token_max_len : + (Default value = TOKEN_MAX_LEN) + lower : + (Default value = True) + + Returns + ------- + """ text, lemmatize, title, pageid = args text = filter_wiki(text) @@ -266,7 +351,17 @@ def init_to_ignore_interrupt(): def _process_article(args): - """Should not be called explicitly. Use `process_article` instead.""" + """Should not be called explicitly. Use `process_article` instead. + + Parameters + ---------- + args : + + + Returns + ------- + + """ tokenizer_func, token_min_len, token_max_len, lower = args[-1] args = args[:-1] @@ -278,16 +373,17 @@ def _process_article(args): class WikiCorpus(TextCorpus): - """ - Treat a wikipedia articles dump (wiki--pages-articles.xml.bz2 or wiki-latest-pages-articles.xml.bz2) as a (read-only) corpus. - + """Treat a wikipedia articles dump (wiki--pages-articles.xml.bz2 or wiki-latest-pages-articles.xml.bz2) as a (read-only) corpus. + The documents are extracted on-the-fly, so that the whole (massive) dump can stay compressed on disk. - + **Note:** "multistream" archives are *not* supported in Python 2 due to `limitations in the core bz2 library `_. 
+ Examples + -------- >>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id mapping, takes almost 8h >>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format plus file with id->word @@ -336,18 +432,23 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction self.dictionary = dictionary def get_texts(self): - """ - Iterate over the dump, returning text version of each article as a list + """Iterate over the dump, returning text version of each article as a list of tokens. - + Only articles of sufficient length are returned (short articles & redirects etc are ignored). This is control by `article_min_tokens` on the class instance. - + Note that this iterates over the **texts**; if you want vectors, just use the standard corpus interface instead of this function:: + Yields + ------ + + Examples + -------- >>> for vec in wiki_corpus: >>> print(vec) + """ articles, articles_all = 0, 0 From 6dcfb07b0278f912397ece1baf4e007ed2e6a904 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Tue, 21 Nov 2017 12:44:06 +0500 Subject: [PATCH 27/48] Add sphinx tweaks --- docs/src/conf.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/src/conf.py b/docs/src/conf.py index cce21dc95e..7b3a6793e4 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -25,7 +25,12 @@ # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinxcontrib.napoleon'] +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.napoleon'] + +# napoleon_google_docstring = False +# napoleon_use_param = False +# napoleon_use_ivar = True + autoclass_content = "both" # Add any paths that contain templates here, relative to this directory. From 833ec649343ffc2bed1372ec1f5e2089b6a120fb Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Tue, 21 Nov 2017 17:36:11 +0500 Subject: [PATCH 28/48] Remove trailing whitespaces --- gensim/corpora/bleicorpus.py | 12 ++--- gensim/corpora/csvcorpus.py | 4 +- gensim/corpora/dictionary.py | 44 ++++++++--------- gensim/corpora/hashdictionary.py | 24 ++++----- gensim/corpora/indexedcorpus.py | 6 +-- gensim/corpora/lowcorpus.py | 24 ++++----- gensim/corpora/malletcorpus.py | 22 ++++----- gensim/corpora/mmcorpus.py | 6 +-- gensim/corpora/sharded_corpus.py | 84 ++++++++++++++++---------------- gensim/corpora/svmlightcorpus.py | 24 ++++----- gensim/corpora/textcorpus.py | 46 ++++++++--------- gensim/corpora/ucicorpus.py | 22 ++++----- gensim/corpora/wikicorpus.py | 28 +++++------ 13 files changed, 173 insertions(+), 173 deletions(-) diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index 6d2ee86a73..69f7d2d1d7 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -22,14 +22,14 @@ class BleiCorpus(IndexedCorpus): """Corpus in Blei's LDA-C format. - + The corpus is represented as two files: one describing the documents, and another describing the mapping between words and their ids. - + Each document is one line:: - + N fieldId1:fieldValue1 fieldId2:fieldValue2 ... fieldIdN:fieldValueN - + The vocabulary is a file with words, one word per line; word at line K has an implicit ``id=K``. @@ -106,7 +106,7 @@ def line2doc(self, line): @staticmethod def save_corpus(fname, corpus, id2word=None, metadata=False): """Save a corpus in the LDA-C format. 
- + There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file. @@ -162,7 +162,7 @@ def docbyoffset(self, offset): Returns ------- list of (int, float) - + """ with utils.smart_open(self.fname) as f: f.seek(offset) diff --git a/gensim/corpora/csvcorpus.py b/gensim/corpora/csvcorpus.py index dbee31dc30..7ce633af11 100644 --- a/gensim/corpora/csvcorpus.py +++ b/gensim/corpora/csvcorpus.py @@ -20,10 +20,10 @@ class CsvCorpus(interfaces.CorpusABC): """Corpus in CSV format. - + The CSV delimiter, headers etc. are guessed automatically based on the file content. - + All row values are expected to be ints/floats. """ diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py index 70899d4a85..03482de9a0 100644 --- a/gensim/corpora/dictionary.py +++ b/gensim/corpora/dictionary.py @@ -37,7 +37,7 @@ class Dictionary(utils.SaveLoad, Mapping): """Mapping between normalized words and their ids. - + The main function is `doc2bow`, which converts a collection of words to its bag-of-words representation: a list of (word_id, word_frequency) 2-tuples. @@ -142,17 +142,17 @@ def from_documents(documents): Returns ------- :class:`Dictionary` - + """ return Dictionary(documents=documents) def add_documents(self, documents, prune_at=2000000): u""" Update dictionary from a collection of documents. - + Each document is a list of tokens = **tokenized and normalized** strings (either utf8 or unicode). - + This is a convenience wrapper for calling `doc2bow` on each document with `allow_update=True`, which also prunes infrequent words, keeping the total number of unique words <= `prune_at`. This is to save memory on very @@ -187,7 +187,7 @@ def add_documents(self, documents, prune_at=2000000): def doc2bow(self, document, allow_update=False, return_missing=False): """Convert document to the bag-of-words format. - + Each word is assumed to be a **tokenized and normalized** string ( either unicode or utf8-encoded). No further preprocessing is done on the words in `document`; apply tokenization, stemming etc. before @@ -196,7 +196,7 @@ def doc2bow(self, document, allow_update=False, return_missing=False): Parameters ---------- document : list of str - + allow_update : bool Whether to update the dictionary in the process (Default value = False) @@ -285,7 +285,7 @@ def doc2idx(self, document, unknown_word_index=-1): def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None): """Filter out tokens that appear in - + 1. less than `no_below` documents (absolute number) or 2. more than `no_above` documents (fraction of total corpus size, *not* absolute number). @@ -293,9 +293,9 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=N the `no_below` and `no_above` settings 4. after (1), (2) and (3), keep only the first `keep_n` most frequent tokens (or keep all if `None`). - + After the pruning, shrink resulting gaps in word ids. - + **Note**: Due to the gap shrinking, the same word may have a different word id before and after the call to this function! @@ -342,9 +342,9 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=N def filter_n_most_frequent(self, remove_n): """Filter out the 'remove_n' most frequent tokens that appear in the documents. - + After the pruning, shrink resulting gaps in word ids. - + **Note**: Due to the gap shrinking, the same word may have a different word id before and after the call to this function! 
@@ -367,7 +367,7 @@ def filter_n_most_frequent(self, remove_n): def filter_tokens(self, bad_ids=None, good_ids=None): """Remove the selected `bad_ids` tokens from all dictionary mappings, or keep selected `good_ids` in the mapping and remove the rest. - + `bad_ids` and `good_ids` are collections of word ids to be removed. Parameters @@ -390,7 +390,7 @@ def filter_tokens(self, bad_ids=None, good_ids=None): def compactify(self): """Assign new word ids to all words. - + This is done to make the ids more compact, e.g. after some tokens have been removed via :func:`filter_tokens` and there are gaps in the id series. Calling this method will remove the gaps. @@ -414,14 +414,14 @@ def save_as_text(self, fname, sort_by_word=True): `num_docs` `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`. Sorted by word, or by decreasing word frequency. - + Note: text format should be use for corpus inspection. Use `save`/`load` to store in binary format (pickle) for improved performance. Parameters ---------- fname : - + sort_by_word : (Default value = True) @@ -443,9 +443,9 @@ def merge_with(self, other): """Merge another dictionary into this dictionary, mapping same tokens to the same ids and new tokens to new ids. The purpose is to merge two corpora created using two different dictionaries, one from `self` and one from `other`. - + `other` can be any id=>word mapping (a dict, a Dictionary object, ...). - + Return a transformation object which, when accessed as `result[doc_from_other_corpus]`, will convert documents from a corpus built using the `other` dictionary into a document using the new, merged dictionary (see :class:`gensim.interfaces.TransformationABC`). @@ -453,7 +453,7 @@ def merge_with(self, other): Parameters ---------- other : - + Returns ------- @@ -499,7 +499,7 @@ def load_from_text(fname): Parameters ---------- fname : - + Returns ------- @@ -533,11 +533,11 @@ def from_corpus(corpus, id2word=None): """Create Dictionary from an existing corpus. This can be useful if you only have a term-document BOW matrix (represented by `corpus`), but not the original text corpus. - + This will scan the term-document count matrix for all word ids that appear in it, then construct and return Dictionary which maps each `word_id -> id2word[word_id]`. - + `id2word` is an optional dictionary that maps the `word_id` to a token. In case `id2word` isn't specified the mapping `id2word[ word_id] = str(word_id)` will be used. @@ -545,7 +545,7 @@ def from_corpus(corpus, id2word=None): Parameters ---------- corpus : - + id2word : (Default value = None) diff --git a/gensim/corpora/hashdictionary.py b/gensim/corpora/hashdictionary.py index a4c1304795..730e9527da 100644 --- a/gensim/corpora/hashdictionary.py +++ b/gensim/corpora/hashdictionary.py @@ -37,11 +37,11 @@ class HashDictionary(utils.SaveLoad, dict): """HashDictionary encapsulates the mapping between normalized words and their integer ids. - + Unlike `Dictionary`, building a `HashDictionary` before using it is not a necessary step. The documents can be computed immediately, from an uninitialized `HashDictionary`, without seeing the rest of the corpus first. - + The main function is `doc2bow`, which converts a collection of words to its bag-of-words representation: a list of (word_id, word_frequency) 2-tuples. 
@@ -85,7 +85,7 @@ def restricted_hash(self, token): Parameters ---------- token : - + Returns ------- @@ -117,9 +117,9 @@ def from_documents(*args, **kwargs): Parameters ---------- *args : - + **kwargs : - + Returns ------- @@ -130,7 +130,7 @@ def from_documents(*args, **kwargs): def add_documents(self, documents): """Build dictionary from a collection of documents. Each document is a list of tokens = **tokenized and normalized** utf-8 encoded strings. - + This is only a convenience wrapper for calling `doc2bow` on each document with `allow_update=True`. @@ -154,7 +154,7 @@ def doc2bow(self, document, allow_update=False, return_missing=False): **tokenized and normalized** utf-8 encoded string. No further preprocessing is done on the words in `document`; apply tokenization, stemming etc. before calling this method. - + If `allow_update` or `self.allow_update` is set, then also update dictionary in the process: update overall corpus statistics and document frequencies. For each id appearing in this document, increase its document frequency @@ -163,7 +163,7 @@ def doc2bow(self, document, allow_update=False, return_missing=False): Parameters ---------- document : - + allow_update : (Default value = False) return_missing : @@ -203,13 +203,13 @@ def doc2bow(self, document, allow_update=False, return_missing=False): def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000): """Remove document frequency statistics for tokens that appear in - + 1. less than `no_below` documents (absolute number) or 2. more than `no_above` documents (fraction of total corpus size, *not* absolute number). 3. after (1) and (2), keep only the first `keep_n` most frequent tokens (or keep all if `None`). - + **Note:** since HashDictionary's id range is fixed and doesn't depend on the number of tokens seen, this doesn't really "remove" anything. It only clears some supplementary statistics, for easier debugging and a smaller RAM @@ -245,10 +245,10 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000): def save_as_text(self, fname): """Save this HashDictionary to a text file, for easier debugging. - + The format is: `id[TAB]document frequency of this id[TAB]tab-separated set of words in UTF8 that map to this id[NEWLINE]`. - + Note: use `save`/`load` to store in binary format instead (pickle). Parameters diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py index 1b0738a6d0..a5d05b496f 100644 --- a/gensim/corpora/indexedcorpus.py +++ b/gensim/corpora/indexedcorpus.py @@ -63,13 +63,13 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False): """Iterate through the document stream `corpus`, saving the documents to `fname` and recording byte offset of each document. - + Save the resulting index structure to file `index_fname` (or `fname`.index is not set). - + This relies on the underlying corpus class `serializer` providing (in addition to standard iteration):: - + * `save_corpus` method that returns a sequence of byte offsets, one for each saved document * the `docbyoffset(offset)` method, which returns a document diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py index c1beac3c32..f0edb59f49 100644 --- a/gensim/corpora/lowcorpus.py +++ b/gensim/corpora/lowcorpus.py @@ -28,7 +28,7 @@ def split_on_space(s): Parameters ---------- s : - + Returns ------- @@ -39,24 +39,24 @@ def split_on_space(s): class LowCorpus(IndexedCorpus): """List_Of_Words corpus handles input in GibbsLda++ format. 
- + Quoting http://gibbslda.sourceforge.net/#3.2_Input_Data_Format:: - + Both data for training/estimating the model and new data (i.e., previously unseen data) have the same format as follows: - + [M] [document1] [document2] ... [documentM] - + in which the first line is the total number for documents [M]. Each line after that is one document. [documenti] is the ith document of the dataset that consists of a list of Ni words/terms. - + [documenti] = [wordi1] [wordi2] ... [wordiNi] - + in which all [wordij] (i=1..M, j=1..Ni) are text strings and they are separated by the blank character. @@ -121,7 +121,7 @@ def line2doc(self, line): Parameters ---------- line : - + Returns ------- @@ -166,16 +166,16 @@ def __iter__(self): @staticmethod def save_corpus(fname, corpus, id2word=None, metadata=False): """Save a corpus in the List-of-words format. - + This function is automatically called by `LowCorpus.serialize`; don't call it directly, call `serialize` instead. Parameters ---------- fname : - + corpus : - + id2word : (Default value = None) metadata : @@ -216,7 +216,7 @@ def docbyoffset(self, offset): Parameters ---------- offset : - + Returns ------- diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py index b41d5cd1a3..f67d141fd1 100644 --- a/gensim/corpora/malletcorpus.py +++ b/gensim/corpora/malletcorpus.py @@ -20,19 +20,19 @@ class MalletCorpus(LowCorpus): """Quoting http://mallet.cs.umass.edu/import.php: - + One file, one instance per line Assume the data is in the following format: - + [URL] [language] [text of the page...] - + Or, more generally, [document #1 id] [label] [text of the document...] [document #2 id] [label] [text of the document...] ... [document #N id] [label] [text of the document...] - + Note that language/label is *not* considered in Gensim. """ @@ -62,7 +62,7 @@ def line2doc(self, line): Parameters ---------- line : - + Returns ------- @@ -81,22 +81,22 @@ def line2doc(self, line): @staticmethod def save_corpus(fname, corpus, id2word=None, metadata=False): """Save a corpus in the Mallet format. - + The document id will be generated by enumerating the corpus. That is, it will range between 0 and number of documents in the corpus. - + Since Mallet has a language field in the format, this defaults to the string '__unknown__'. If the language needs to be saved, post-processing will be required. - + This function is automatically called by `MalletCorpus.serialize`; don't call it directly, call `serialize` instead. Parameters ---------- fname : - + corpus : - + id2word : (Default value = None) metadata : @@ -144,7 +144,7 @@ def docbyoffset(self, offset): Parameters ---------- offset : - + Returns ------- diff --git a/gensim/corpora/mmcorpus.py b/gensim/corpora/mmcorpus.py index ee6f1401eb..cbe69e0cde 100644 --- a/gensim/corpora/mmcorpus.py +++ b/gensim/corpora/mmcorpus.py @@ -40,16 +40,16 @@ def __iter__(self): @staticmethod def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): """Save a corpus in the Matrix Market format to disk. - + This function is automatically called by `MmCorpus.serialize`; don't call it directly, call `serialize` instead. 
Parameters ---------- fname : - + corpus : - + id2word : (Default value = None) progress_cnt : diff --git a/gensim/corpora/sharded_corpus.py b/gensim/corpora/sharded_corpus.py index 2b299d195f..175a82de28 100644 --- a/gensim/corpora/sharded_corpus.py +++ b/gensim/corpora/sharded_corpus.py @@ -48,18 +48,18 @@ class ShardedCorpus(IndexedCorpus): on matrices, with a large number of iterations. (It should be faster than gensim's other IndexedCorpus implementations for this use case; check the `benchmark_datasets.py` script. It should also serialize faster.) - + The corpus stores its data in separate files called "shards". This is a compromise between speed (keeping the whole dataset in memory) and memory footprint (keeping the data on disk and reading from it on demand). Persistence is done using the standard gensim load/save methods. - + .. note:: - + The dataset is **read-only**, there is - as opposed to gensim's Similarity class, which works similarly - no way of adding documents to the dataset (for now). - + You can use ShardedCorpus to serialize your data just like any other gensim corpus that implements serialization. However, because the data is saved as numpy 2-dimensional ndarrays (or scipy sparse matrices), you need to @@ -76,15 +76,15 @@ class ShardedCorpus(IndexedCorpus): is essentially a re-serialization into new-size shards), but note that this operation will temporarily take twice as much disk space, because the old shards are not deleted until the new shards are safely in place. - + After serializing the data, the corpus will then save itself to the file `output_prefix`. - + On further initialization with the same `output_prefix`, the corpus will load the already built dataset unless the `overwrite` option is given. (A new object is "cloned" from the one saved to `output_prefix` previously.) - + To retrieve data, you can load the corpus and use it like a list: >>> sh_corpus = ShardedCorpus.load(output_prefix) @@ -98,7 +98,7 @@ class ShardedCorpus(IndexedCorpus): >>> batch = sh_corpus[100:150] The batch now will be a generator of gensim vectors. - + Since the corpus needs the data serialized in order to be able to operate, it will serialize data right away on initialization. Instead of calling `ShardedCorpus.serialize()`, you can just initialize and use the corpus @@ -130,9 +130,9 @@ class ShardedCorpus(IndexedCorpus): of a ShardedCorpus object, you should definitely not touch ` `sharded_serialization`! Changing the attribute will not miraculously re-serialize the data in the requested format. - + The CSR format is used for sparse data throughout. - + Internally, to retrieve data, the dataset keeps track of which shard is currently open and on a `__getitem__` request, either returns an item from the current shard, or opens a new one. The shard size is constant, except @@ -254,9 +254,9 @@ def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtyp Parameters ---------- output_prefix : - + corpus : - + shardsize : (Default value = 4096) dtype : @@ -331,14 +331,14 @@ def init_by_clone(self): def save_shard(self, shard, n=None, filename=None): """Pickle the given shard. If `n` is not given, will consider the shard a new one. - + If `filename` is given, will use that file name instead of generating one. Parameters ---------- shard : - + n : (Default value = None) filename : @@ -392,13 +392,13 @@ def shard_by_offset(self, offset): """Determine which shard the given offset belongs to. 
If the offset is greater than the number of available documents, raises a `ValueError`. - + Assumes that all shards have the same size. Parameters ---------- offset : - + Returns ------- @@ -419,7 +419,7 @@ def in_current(self, offset): Parameters ---------- offset : - + Returns ------- @@ -436,7 +436,7 @@ def in_next(self, offset): Parameters ---------- offset : - + Returns ------- @@ -527,7 +527,7 @@ def _shard_name(self, n): Parameters ---------- n : - + Returns ------- @@ -542,7 +542,7 @@ def _resized_shard_name(self, n): Parameters ---------- n : - + Returns ------- @@ -556,7 +556,7 @@ def _guess_n_features(self, corpus): Parameters ---------- corpus : - + Returns ------- @@ -633,7 +633,7 @@ def get_by_offset(self, offset): Parameters ---------- offset : - + Returns ------- @@ -796,7 +796,7 @@ def _getitem_format(self, s_result): Parameters ---------- s_result : - + Returns ------- @@ -816,13 +816,13 @@ def _getitem_format(self, s_result): def _getitem_sparse2gensim(self, result): """Change given sparse result matrix to gensim sparse vectors. - + Uses the internals of the sparse matrix to make this fast. Parameters ---------- result : - + Returns ------- @@ -834,9 +834,9 @@ def row_sparse2gensim(row_idx, csr_matrix): Parameters ---------- row_idx : - + csr_matrix : - + Returns ------- @@ -856,7 +856,7 @@ def _getitem_dense2gensim(self, result): Parameters ---------- result : - + Returns ------- @@ -886,9 +886,9 @@ def save(self, *args, **kwargs): Parameters ---------- *args : - + **kwargs : - + """ # Can we save to a different file than output_prefix? Well, why not? @@ -909,7 +909,7 @@ def load(cls, fname, mmap=None): Parameters ---------- fname : - + mmap : (Default value = None) @@ -923,16 +923,16 @@ def load(cls, fname, mmap=None): def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs): """Implement a serialization interface. Do not call directly; use the `serialize` method instead. - + Note that you might need some ShardedCorpus init parameters, most likely the dimension (`dim`). Again, pass these as `kwargs` to the `serialize` method. - + All this thing does is initialize a ShardedCorpus from a corpus with the `output_prefix` argument set to the `fname` parameter of this method. The initialization of a ShardedCorpus takes care of serializing the data (in dense form) to shards. - + Ignore the parameters id2word, progress_cnt and metadata. They currently do nothing and are here only to provide a compatible method signature with superclass. @@ -940,9 +940,9 @@ def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, Parameters ---------- fname : - + corpus : - + id2word : (Default value = None) progress_cnt : @@ -960,12 +960,12 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres labels=None, metadata=False, **kwargs): """Iterate through the document stream `corpus`, saving the documents as a ShardedCorpus to `fname`. - + Use this method instead of calling `save_corpus` directly. You may need to supply some kwargs that are used upon dataset creation (namely: `dim`, unless the dataset can infer the dimension from the given corpus). - + Ignore the parameters id2word, index_fname, progress_cnt, labels and metadata. They currently do nothing and are here only to provide a compatible method signature with superclass. 
@@ -973,11 +973,11 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres Parameters ---------- serializer : - + fname : - + corpus : - + id2word : (Default value = None) index_fname : @@ -989,7 +989,7 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres metadata : (Default value = False) **kwargs : - + """ serializer.save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs) diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py index ac1bf495bb..17b100afb2 100644 --- a/gensim/corpora/svmlightcorpus.py +++ b/gensim/corpora/svmlightcorpus.py @@ -23,20 +23,20 @@ class SvmLightCorpus(IndexedCorpus): """Corpus in SVMlight format. - + Quoting http://svmlight.joachims.org/: The input file contains the training examples. The first lines may contain comments and are ignored if they start with #. Each of the following lines represents one training example and is of the following format:: - + .=. : : ... : # .=. +1 | -1 | 0 | .=. | "qid" .=. .=. - + The "qid" feature (used for SVMlight ranking), if present, is ignored. - + Although not mentioned in the specification above, SVMlight also expect its feature ids to be 1-based (counting starts at 1). We convert features to 0-base internally by decrementing all ids when loading a SVMlight input file, and @@ -80,19 +80,19 @@ def __iter__(self): @staticmethod def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): """Save a corpus in the SVMlight format. - + The SVMlight `` class tag is taken from the `labels` array, or set to 0 for all documents if `labels` is not supplied. - + This function is automatically called by `SvmLightCorpus.serialize`; don't call it directly, call `serialize` instead. Parameters ---------- fname : - + corpus : - + id2word : (Default value = None) labels : @@ -120,11 +120,11 @@ def docbyoffset(self, offset): Parameters ---------- offset : - + Returns ------- - + """ with utils.smart_open(self.fname) as f: @@ -137,7 +137,7 @@ def line2doc(self, line): Parameters ---------- line : - + Returns ------- @@ -162,7 +162,7 @@ def doc2line(doc, label=0): Parameters ---------- doc : - + label : (Default value = 0) diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py index c41fd87f8a..ac12cc2dcd 100644 --- a/gensim/corpora/textcorpus.py +++ b/gensim/corpora/textcorpus.py @@ -49,7 +49,7 @@ def remove_stopwords(tokens, stopwords=STOPWORDS): Parameters ---------- tokens : - + stopwords : (Default value = STOPWORDS) @@ -66,7 +66,7 @@ def remove_short(tokens, minsize=3): Parameters ---------- tokens : - + minsize : (Default value = 3) @@ -83,7 +83,7 @@ def lower_to_unicode(text, encoding='utf8', errors='strict'): Parameters ---------- text : - + encoding : (Default value = 'utf8') errors : @@ -102,7 +102,7 @@ def strip_multiple_whitespaces(s): Parameters ---------- s : - + Returns ------- @@ -114,34 +114,34 @@ def strip_multiple_whitespaces(s): class TextCorpus(interfaces.CorpusABC): """Helper class to simplify the pipeline of getting bag-of-words vectors (= a gensim corpus) from plain text. - + This is an abstract base class: override the `get_texts()` and `__len__()` methods to match your particular input. - + Given a filename (or a file-like object) in constructor, the corpus object will be automatically initialized with a dictionary in `self.dictionary` and will support the `iter` corpus method. 
You have a few different ways of utilizing this class via subclassing or by construction with different preprocessing arguments. - + The `iter` method converts the lists of tokens produced by `get_texts` to BoW format using `Dictionary.doc2bow`. `get_texts` does the following: - + 1. Calls `getstream` to get a generator over the texts. It yields each document in turn from the underlying text file or files. 2. For each document from the stream, calls `preprocess_text` to produce a list of tokens; if metadata is enabled, it yields a 2-`tuple` with the document number as the second element. - - + + Preprocessing consists of 0+ `character_filters`, a `tokenizer`, and 0+ `token_filters`. - + The preprocessing consists of calling each filter in `character_filters` with the document text; unicode is not guaranteed, and if desired, the first filter should convert to unicode. The output of each character filter should be another string. The output from the final filter is fed to the `tokenizer`, which should split the string into a list of tokens (strings). Afterwards, the list of tokens is fed through each filter in `token_filters`. The final output returned from `preprocess_text` is the output from the final token filter. - + So to use this class, you can either pass in different preprocessing functions using the `character_filters`, `tokenizer`, and `token_filters` arguments, or you can subclass it. If subclassing: override `getstream` to take text from different input sources in different @@ -149,9 +149,9 @@ class TextCorpus(interfaces.CorpusABC): then call the `TextCorpus.preprocess_text` method to apply the normal preprocessing. You can also overrride `get_texts` in order to tag the documents (token lists) with different metadata. - + The default preprocessing consists of: - + 1. lowercase and convert to unicode; assumes utf8 encoding 2. deaccent (asciifolding) 3. collapse multiple whitespaces into a single one @@ -212,7 +212,7 @@ def init_dictionary(self, dictionary): Parameters ---------- dictionary : - + """ self.dictionary = dictionary if dictionary is not None else Dictionary() @@ -291,7 +291,7 @@ def step_through_preprocess(self, text): Parameters ---------- text : - + Yields ------ @@ -332,7 +332,7 @@ def get_texts(self): def sample_texts(self, n, seed=None, length=None): """Yield n random documents from the corpus without replacement. - + Given the number of remaining documents in a corpus, we need to choose n elements. The probability for the current element to be chosen is n/remaining. If we choose it, we just decrease the n and move to the next element. @@ -437,7 +437,7 @@ def lines_are_documents(self, lines_are_documents): Parameters ---------- lines_are_documents : - + """ self._lines_are_documents = lines_are_documents @@ -455,7 +455,7 @@ def pattern(self, pattern): Parameters ---------- pattern : - + """ self._pattern = None if pattern is None else re.compile(pattern) @@ -491,7 +491,7 @@ def min_depth(self, min_depth): Parameters ---------- min_depth : - + """ self._min_depth = min_depth @@ -509,7 +509,7 @@ def max_depth(self, max_depth): Parameters ---------- max_depth : - + """ self._max_depth = max_depth @@ -538,7 +538,7 @@ def getstream(self): """Yield documents from the underlying plain text collection (of one or more files). Each item yielded from this method will be considered a document by subsequent preprocessing methods. - + If `lines_are_documents` was set to True, items will be lines from files. 
Otherwise there will be one item per file, containing the entire contents of the file. @@ -580,7 +580,7 @@ def walk(top, topdown=True, onerror=None, followlinks=False, depth=0): Parameters ---------- top : - + topdown : (Default value = True) onerror : diff --git a/gensim/corpora/ucicorpus.py b/gensim/corpora/ucicorpus.py index 931c773930..0e8ea5c500 100644 --- a/gensim/corpora/ucicorpus.py +++ b/gensim/corpora/ucicorpus.py @@ -70,12 +70,12 @@ def skip_headers(self, input_file): class UciWriter(MmWriter): """Store a corpus in UCI Bag-of-Words format. - + This corpus format is identical to MM format, except for different file headers. There is no format line, and the first three lines of the file contain number_docs, num_terms, and num_nnz, one value per line. - + This implementation is based on matutils.MmWriter, and works the same way. """ @@ -99,11 +99,11 @@ def update_headers(self, num_docs, num_terms, num_nnz): Parameters ---------- num_docs : - + num_terms : - + num_nnz : - + """ offset = 0 @@ -123,9 +123,9 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False): Parameters ---------- fname : - + corpus : - + progress_cnt : (Default value = 1000) index : @@ -226,19 +226,19 @@ def create_dictionary(self): @staticmethod def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False): """Save a corpus in the UCI Bag-of-Words format. - + There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file. - + This function is automatically called by `UciCorpus.serialize`; don't call it directly, call `serialize` instead. Parameters ---------- fname : - + corpus : - + id2word : (Default value = None) progress_cnt : diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index 6f6fe99776..86e3a3394a 100644 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -75,7 +75,7 @@ def filter_wiki(raw): Parameters ---------- raw : - + Returns ------- @@ -94,7 +94,7 @@ def remove_markup(text): Parameters ---------- text : - + Returns ------- @@ -144,7 +144,7 @@ def remove_template(s): Parameters ---------- s : - + Returns ------- @@ -193,7 +193,7 @@ def remove_file(s): Parameters ---------- s : - + Returns ------- @@ -210,14 +210,14 @@ def remove_file(s): def tokenize(content, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True): """Tokenize a piece of text from wikipedia. The input string `content` is assumed to be mark-up free (see `filter_wiki()`). - + Set `token_min_len`, `token_max_len` as character length (not bytes!) thresholds for individual tokens. Parameters ---------- content : - + token_min_len : (Default value = TOKEN_MIN_LEN) token_max_len : @@ -242,7 +242,7 @@ def get_namespace(tag): Parameters ---------- tag : - + Returns ------- @@ -264,7 +264,7 @@ def extract_pages(f, filter_namespaces=False): Parameters ---------- f : - + filter_namespaces : (Default value = False) @@ -317,14 +317,14 @@ def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True): """Parse a wikipedia article, returning its content as a list of tokens (utf8-encoded strings). - + Set `tokenizer_func` (defaults to `tokenize`) parameter for languages like japanese or thai to perform better tokenization. The `tokenizer_func` needs to take 4 parameters: (text, token_min_len, token_max_len, lower). 
Parameters ---------- args : - + tokenizer_func : (Default value = tokenize) token_min_len : @@ -358,7 +358,7 @@ def _process_article(args): Parameters ---------- args : - + Returns ------- @@ -381,7 +381,7 @@ class WikiCorpus(TextCorpus): The documents are extracted on-the-fly, so that the whole (massive) dump can stay compressed on disk. - + **Note:** "multistream" archives are *not* supported in Python 2 due to `limitations in the core bz2 library `_. @@ -438,10 +438,10 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction def get_texts(self): """Iterate over the dump, returning text version of each article as a list of tokens. - + Only articles of sufficient length are returned (short articles & redirects etc are ignored). This is control by `article_min_tokens` on the class instance. - + Note that this iterates over the **texts**; if you want vectors, just use the standard corpus interface instead of this function:: From 3e597feb87abe48b7446a3408c7a5e8f3db805a6 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Tue, 28 Nov 2017 17:29:57 +0500 Subject: [PATCH 29/48] Annotate wikicorpus --- gensim/corpora/wikicorpus.py | 250 ++++++++++++++++++++++------------- 1 file changed, 160 insertions(+), 90 deletions(-) diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index 86e3a3394a..ef4c8eed93 100644 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -9,12 +9,19 @@ """ Construct a corpus from a Wikipedia (or other MediaWiki-based) database dump. +Notes +----- If you have the `pattern` package installed, this module will use a fancy lemmatization to get a lemma of each token (instead of plain alphabetic -tokenizer). The package is available at https://github.com/clips/pattern . +tokenizer). The package is available at [1]_ . See scripts/process_wiki.py for a canned (example) script based on this module. + +References +---------- +.. 
[1] https://github.com/clips/pattern + """ @@ -33,52 +40,72 @@ logger = logging.getLogger(__name__) -# ignore articles shorter than ARTICLE_MIN_WORDS characters (after full preprocessing) ARTICLE_MIN_WORDS = 50 +"""Ignore shorter articles (after full preprocessing)""" # default thresholds for lengths of individual tokens TOKEN_MIN_LEN = 2 TOKEN_MAX_LEN = 15 -RE_P0 = re.compile(r'', re.DOTALL | re.UNICODE) # comments -RE_P1 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE) # footnotes -RE_P2 = re.compile(r'(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$', re.UNICODE) # links to languages -RE_P3 = re.compile(r'{{([^}{]*)}}', re.DOTALL | re.UNICODE) # template -RE_P4 = re.compile(r'{{([^}]*)}}', re.DOTALL | re.UNICODE) # template -RE_P5 = re.compile(r'\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE) # remove URL, keep description -RE_P6 = re.compile(r'\[([^][]*)\|([^][]*)\]', re.DOTALL | re.UNICODE) # simplify links, keep description -RE_P7 = re.compile(r'\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of images -RE_P8 = re.compile(r'\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of files -RE_P9 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE) # outside links -RE_P10 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE) # math content -RE_P11 = re.compile(r'<(.*?)>', re.DOTALL | re.UNICODE) # all other tags -RE_P12 = re.compile(r'\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE) # table formatting -RE_P13 = re.compile(r'\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE) # table cell formatting -RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE) # categories -# Remove File and Image template +RE_P0 = re.compile(r'', re.DOTALL | re.UNICODE) +"""Comments""" +RE_P1 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE) +"""Footnotes""" +RE_P2 = re.compile(r'(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$', re.UNICODE) +"""Links to languages""" +RE_P3 = re.compile(r'{{([^}{]*)}}', re.DOTALL | re.UNICODE) +"""Template""" +RE_P4 = re.compile(r'{{([^}]*)}}', re.DOTALL | re.UNICODE) +"""Template""" +RE_P5 = re.compile(r'\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE) +"""Remove URL, keep description""" +RE_P6 = re.compile(r'\[([^][]*)\|([^][]*)\]', re.DOTALL | re.UNICODE) +"""Simplify links, keep description""" +RE_P7 = re.compile(r'\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) +"""Keep description of images""" +RE_P8 = re.compile(r'\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) +"""Keep description of files""" +RE_P9 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE) +"""External links""" +RE_P10 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE) +"""Math content""" +RE_P11 = re.compile(r'<(.*?)>', re.DOTALL | re.UNICODE) +"""All other tags""" +RE_P12 = re.compile(r'\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE) +"""Table formatting""" +RE_P13 = re.compile(r'\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE) +"""Table cell formatting""" +RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE) +"""Categories""" RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE) +"""Remove File and Image templates""" -# MediaWiki namespaces (https://www.mediawiki.org/wiki/Manual:Namespace) that -# ought to be ignored IGNORED_NAMESPACES = [ 'Wikipedia', 'Category', 'File', 'Portal', 'Template', 'MediaWiki', 'User', 'Help', 'Book', 'Draft', 'WikiProject', 'Special', 'Talk' ] +"""MediaWiki namespaces [2]_ that ought to be ignored. + +References +---------- +.. 
[2] https://www.mediawiki.org/wiki/Manual:Namespace + +""" def filter_wiki(raw): - """Filter out wiki mark-up from `raw`, leaving only text. `raw` is either unicode - or utf-8 encoded string. + """Filter out wiki markup from `raw`, leaving only text. Parameters ---------- - raw : - + raw : str + Unicode or utf-8 encoded string. Returns ------- + str """ # parsing of the wiki markup is not perfect, but sufficient for our purposes @@ -89,15 +116,16 @@ def filter_wiki(raw): def remove_markup(text): - """ + """Filter out wiki markup from `text`, leaving only text. Parameters ---------- - text : - + text : str + String containing markup Returns ------- + str """ text = re.sub(RE_P2, '', text) # remove the last list (=languages) @@ -138,22 +166,27 @@ def remove_template(s): """Remove template wikimedia markup. Return a copy of `s` with all the wikimedia markup template removed. See - http://meta.wikimedia.org/wiki/Help:Template for wikimedia templates + [4]_ for wikimedia templates details. Parameters ---------- - s : - + s : str + String containing markup template Returns ------- + str Notes ----- Since template can be nested, it is difficult remove them using regular expresssions. + References + ---------- + .. [4] http://meta.wikimedia.org/wiki/Help:Template + """ # Find the start and end position of each template by finding the opening @@ -187,16 +220,20 @@ def remove_file(s): """Remove the 'File:' and 'Image:' markup, keeping the file caption. Return a copy of `s` with all the 'File:' and 'Image:' markup replaced by - their corresponding captions. See http://www.mediawiki.org/wiki/Help:Images - for the markup details. + their corresponding captions. [3]_ Parameters ---------- - s : - + s : str + String containing 'File:' and 'Image:' markup Returns ------- + str + + References + ---------- + .. [3] http://www.mediawiki.org/wiki/Help:Images """ # The regex RE_P15 match a File: or Image: markup @@ -208,25 +245,23 @@ def remove_file(s): def tokenize(content, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True): - """Tokenize a piece of text from wikipedia. The input string `content` is - assumed to be mark-up free (see `filter_wiki()`). + """Tokenize a piece of text from wikipedia. Set `token_min_len`, `token_max_len` as character length (not bytes!) thresholds for individual tokens. Parameters ---------- - content : - - token_min_len : - (Default value = TOKEN_MIN_LEN) - token_max_len : - (Default value = TOKEN_MAX_LEN) - lower : - (Default value = True) + content : str + String without markup (see `filter_wiki()`) + token_min_len : int + token_max_len : int + lower : bool + Whether to lowercase content Returns ------- + list of str """ # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.) 
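
The signature documented above can be exercised directly on markup-free text. A minimal sketch, assuming the default TOKEN_MIN_LEN=2 / TOKEN_MAX_LEN=15 thresholds; the token lists shown are indicative rather than verbatim doctest output:

>>> from gensim.corpora.wikicorpus import tokenize
>>> tokenize("Anarchism is an anti-authoritarian political philosophy")  # lowercased, length-filtered tokens
['anarchism', 'is', 'an', 'anti', 'authoritarian', 'political', 'philosophy']
>>> tokenize("Anarchism is an anti-authoritarian political philosophy", token_min_len=5)  # drop short tokens
['anarchism', 'authoritarian', 'political', 'philosophy']
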
@@ -241,11 +276,11 @@ def get_namespace(tag): Parameters ---------- - tag : - + tag : str Returns ------- + str """ m = re.match("^{(.*?)}", tag) @@ -263,13 +298,15 @@ def extract_pages(f, filter_namespaces=False): Parameters ---------- - f : + f : File + File-like object - filter_namespaces : - (Default value = False) + filter_namespaces : list of str or bool + Namespaces to consider Yields ------ + tuple of (str or None, str, str) """ elems = (elem for _, elem in iterparse(f, events=("end",))) @@ -327,15 +364,14 @@ def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN, tokenizer_func : (Default value = tokenize) - token_min_len : - (Default value = TOKEN_MIN_LEN) - token_max_len : - (Default value = TOKEN_MAX_LEN) - lower : - (Default value = True) + token_min_len : int + token_max_len : int + lower : bool + Whether to lowercase result Returns ------- + tuple(list of str, str, str) """ text, lemmatize, title, pageid = args @@ -348,20 +384,31 @@ def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN, def init_to_ignore_interrupt(): - """Should only be used when master is prepared to handle termination of child processes.""" + """Enables interruption ignoring. + + Notes + ----- + Should only be used when master is prepared to handle termination of + child processes. + + """ signal.signal(signal.SIGINT, signal.SIG_IGN) def _process_article(args): - """Should not be called explicitly. Use `process_article` instead. + """Same as `process_article`, but with args in list format. Parameters ---------- - args : - + args : list of (function, int, int, bool) Returns ------- + tuple(list of str, str, str) + + Notes + ----- + Should not be called explicitly. Use `process_article` instead. """ @@ -376,13 +423,20 @@ def _process_article(args): class WikiCorpus(TextCorpus): """ - Treat a wikipedia articles dump (wiki--pages-articles.xml.bz2 - or wiki-latest-pages-articles.xml.bz2) as a (read-only) corpus. + Treat a wikipedia articles dump as a (read-only) corpus. + + Supported dump formats: + + *wiki--pages-articles.xml.bz2* + + *wiki-latest-pages-articles.xml.bz2* The documents are extracted on-the-fly, so that the whole (massive) dump can stay compressed on disk. - **Note:** "multistream" archives are *not* supported in Python 2 due to + Notes + ----- + "Multistream" archives are *not* supported in Python 2 due to `limitations in the core bz2 library `_. @@ -395,26 +449,39 @@ class WikiCorpus(TextCorpus): def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True): - """ - Initialize the corpus. Unless a dictionary is provided, this scans the - corpus once, to determine its vocabulary. - - If `pattern` package is installed, use fancier shallow parsing to get - token lemmas. Otherwise, use simple regexp tokenization. You can override - this automatic logic by forcing the `lemmatize` parameter explicitly. - self.metadata if set to true will ensure that serialize will write out article titles to a pickle file. - - Set `article_min_tokens` as a min threshold for article token count (defaults to 50). Any article below this is - ignored. - - Set `tokenizer_func` (defaults to `tokenize`) with a custom function reference to control tokenization else use - the default regexp tokenization. Set this parameter for languages like japanese or thai to perform better - tokenization. 
The `tokenizer_func` needs to take 4 parameters: (text, token_min_len, token_max_len, lower). The - parameter values are as configured on the class instance by default. - - Set `lower` to control if everything should be converted to lowercase or not (default True). - - Set `token_min_len`, `token_max_len` as thresholds for token lengths that are returned (default to 2 and 15). + """Initialize the corpus. + + Unless a dictionary is provided, this scans the corpus once, + to determine its vocabulary. + + Parameters + ---------- + fname : str + Filename + processes : int or None + Number of processes to run, defaults to *number of cpu - 1* + lemmatize : bool + Whether to use lemmatization instead of simple regexp + tokenization. Defaults to `True` if *pattern* package installed + and to `False` otherwise + dictionary : `corpora.Dictionary` or None + filter_namespaces : tuple of str + Namespaces to consider + tokenizer_func : function(text, token_min_len, token_max_len, lower) + Returns list of tokens. Set this parameter for languages like + japanese or thai to perform better tokenization + article_min_tokens : int + Minimum tokens in article. Article ignored if number of tokens is + less + token_min_len : int + token_max_len : int + lower : bool + Whether to lowercase texts + + Attributes + ---------- + metadata : bool + Whether to write articles titles to serialized corpus. """ self.fname = fname @@ -436,20 +503,23 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction self.dictionary = dictionary def get_texts(self): - """Iterate over the dump, returning text version of each article as a list - of tokens. - - Only articles of sufficient length are returned (short articles & redirects - etc are ignored). This is control by `article_min_tokens` on the class instance. - - Note that this iterates over the **texts**; if you want vectors, just use - the standard corpus interface instead of this function:: + """Iterate over the dump, yielding list of tokens for each article. Yields ------ + (list of str) or tuple(list of str, tuple(str, str))) + + Notes + ----- + Only articles of sufficient length are returned (short articles, + redirects, etc. are ignored). This is control by + `article_min_tokens` on the class instance. Examples -------- + Note that this iterates over the **texts**; if you want vectors, + just use the standard corpus interface instead of this function: + >>> for vec in wiki_corpus: >>> print(vec) From da1d5c213a3122d056712a698474dc92b5b84d89 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Tue, 5 Dec 2017 17:29:04 +0500 Subject: [PATCH 30/48] SVMLight Corpus annotated --- gensim/corpora/svmlightcorpus.py | 68 +++++++++++++++++++------------- 1 file changed, 40 insertions(+), 28 deletions(-) diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py index 17b100afb2..a68239a0ca 100644 --- a/gensim/corpora/svmlightcorpus.py +++ b/gensim/corpora/svmlightcorpus.py @@ -48,10 +48,14 @@ def __init__(self, fname, store_labels=True): """ Initialize the corpus from a file. - Although vector labels (~SVM target class) are not used in gensim in any way, - they are parsed and stored in `self.labels` for convenience. Set `store_labels=False` - to skip storing these labels (e.g. if there are too many vectors to store - the self.labels array in memory). + Parameters + ---------- + fname: str + Corpus filename + store_labels : bool + Whether to store labels (~SVM target class). 
They currently have + no application but stored in `self.labels` for convenience by + default. """ IndexedCorpus.__init__(self, fname) @@ -65,6 +69,11 @@ def __init__(self, fname, store_labels=True): def __iter__(self): """ Iterate over the corpus, returning one sparse vector at a time. + + Yields + ------ + list of (int, float) + """ lineno = -1 self.labels = [] @@ -84,24 +93,22 @@ def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): The SVMlight `` class tag is taken from the `labels` array, or set to 0 for all documents if `labels` is not supplied. - This function is automatically called by `SvmLightCorpus.serialize`; don't - call it directly, call `serialize` instead. - Parameters ---------- - fname : - - corpus : - - id2word : - (Default value = None) - labels : - (Default value = False) - metadata : - (Default value = False) + fname : str + Corpus filename + corpus : iterable + Iterable of documents + id2word : dict of (str, str), optional + Transforms id to word (Default value = None) + labels : list or False + An SVMlight `` class tags or False if not present + metadata : bool + Any additional info (Default value = False) Returns ------- + list of int """ logger.info("converting corpus to SVMlight format: %s", fname) @@ -119,28 +126,30 @@ def docbyoffset(self, offset): Parameters ---------- - offset : - + offset : int + Document's position Returns ------- - + tuple of (int, float) """ with utils.smart_open(self.fname) as f: f.seek(offset) return self.line2doc(f.readline())[0] + #TODO: it brokes if gets None from line2doc def line2doc(self, line): - """Create a document from a single line (string) in SVMlight format + """Create a document from a single line (string) in SVMlight format. Parameters ---------- - line : - + line : str + Line in SVMLight format Returns ------- + (tuple of (int, float)) or None """ line = utils.to_unicode(line) @@ -157,17 +166,20 @@ def line2doc(self, line): @staticmethod def doc2line(doc, label=0): - """Output the document in SVMlight format, as a string. Inverse function to `line2doc`. + """Output the document in SVMlight format, as a string. + + Inverse function to `line2doc`. Parameters ---------- - doc : - - label : - (Default value = 0) + doc : tuple of (int, float) + Document + label : int + (Default value = 0) Returns ------- + str """ pairs = ' '.join("%i:%s" % (termid + 1, termval) for termid, termval in doc) # +1 to convert 0-base to 1-base From 89f60989950e313f5b17607552955a4de18cc005 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Tue, 5 Dec 2017 18:05:59 +0500 Subject: [PATCH 31/48] Fix TODO --- gensim/corpora/svmlightcorpus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py index a68239a0ca..2f1f4a2c40 100644 --- a/gensim/corpora/svmlightcorpus.py +++ b/gensim/corpora/svmlightcorpus.py @@ -137,7 +137,7 @@ def docbyoffset(self, offset): with utils.smart_open(self.fname) as f: f.seek(offset) return self.line2doc(f.readline())[0] - #TODO: it brokes if gets None from line2doc + # TODO: it brokes if gets None from line2doc def line2doc(self, line): """Create a document from a single line (string) in SVMlight format. 
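
The SVMlight helpers documented above compose into a simple round trip. A minimal sketch; the temporary file path is illustrative and the exact float formatting of the output may differ:

>>> from gensim.corpora.svmlightcorpus import SvmLightCorpus
>>> SvmLightCorpus.doc2line([(0, 0.5), (2, 2.0)])  # 0-based ids become 1-based, label defaults to 0
'0 1:0.5 3:2.0\n'
>>> SvmLightCorpus.serialize('/tmp/corpus.svmlight', [[(0, 0.5), (2, 2.0)], [(1, 1.0)]])
>>> list(SvmLightCorpus('/tmp/corpus.svmlight'))  # labels are parsed into self.labels, only the vectors are yielded
[[(0, 0.5), (2, 2.0)], [(1, 1.0)]]
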
From 9eeea2191f442b1be7d65c74884030943f2079b7 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Wed, 6 Dec 2017 19:29:45 +0500 Subject: [PATCH 32/48] Fix grammar mistake --- gensim/corpora/svmlightcorpus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py index 2f1f4a2c40..a93f1bfce1 100644 --- a/gensim/corpora/svmlightcorpus.py +++ b/gensim/corpora/svmlightcorpus.py @@ -137,7 +137,7 @@ def docbyoffset(self, offset): with utils.smart_open(self.fname) as f: f.seek(offset) return self.line2doc(f.readline())[0] - # TODO: it brokes if gets None from line2doc + # TODO: it brakes if gets None from line2doc def line2doc(self, line): """Create a document from a single line (string) in SVMlight format. From 2b6aeaf9a4adbc585abc4a34a308da7631de9e62 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Thu, 7 Dec 2017 22:45:04 +0500 Subject: [PATCH 33/48] Undo changes to dictionary --- gensim/corpora/dictionary.py | 222 ++++++++--------------------------- 1 file changed, 47 insertions(+), 175 deletions(-) diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py index 03482de9a0..4894a365b9 100644 --- a/gensim/corpora/dictionary.py +++ b/gensim/corpora/dictionary.py @@ -6,13 +6,13 @@ """ -General mapping between normalized words and their ids. +This module implements the concept of Dictionary -- a mapping between words and +their integer ids. Dictionaries can be created from a corpus and can later be pruned according to -document frequency (removing (un)common words via the -:func:`Dictionary.filter_extremes` method), save/loaded from disk (via -:func:`Dictionary.save` and :func:`Dictionary.load` methods), merged with -other dictionary (:func:`Dictionary.merge_with`) etc. +document frequency (removing (un)common words via the :func:`Dictionary.filter_extremes` method), +save/loaded from disk (via :func:`Dictionary.save` and :func:`Dictionary.load` methods), merged +with other dictionary (:func:`Dictionary.merge_with`) etc. """ from __future__ import with_statement @@ -36,23 +36,15 @@ class Dictionary(utils.SaveLoad, Mapping): - """Mapping between normalized words and their ids. + """ + Dictionary encapsulates the mapping between normalized words and their integer ids. The main function is `doc2bow`, which converts a collection of words to its bag-of-words representation: a list of (word_id, word_frequency) 2-tuples. - """ - def __init__(self, documents=None, prune_at=2000000): - """Initialize the dictionary. - - Parameters - ---------- - documents : (iterable of (list of str)) or None - If not None, used to initialize dictionary - prune_at : int - Number of unique words to keep - + """ + If `documents` are given, use them to initialize Dictionary (see `add_documents()`). """ self.token2id = {} # token -> tokenId self.id2token = {} # reverse mapping for token2id; only formed on request, to save memory @@ -66,17 +58,6 @@ def __init__(self, documents=None, prune_at=2000000): self.add_documents(documents, prune_at=prune_at) def __getitem__(self, tokenid): - """Return token. - - If :param:``token2id`` has changed (presumably via - :func:``add_documents``), - update :param:``id2token``. 
- - Returns - ------- - str - - """ if len(self.id2token) != len(self.token2id): # the word->id mapping has changed (presumably via add_documents); # recompute id->word accordingly @@ -84,13 +65,6 @@ def __getitem__(self, tokenid): return self.id2token[tokenid] # will throw for non-existent ids def __iter__(self): - """Iterate through keys. - - Returns - ------- - iterable of str - - """ return iter(self.keys()) if PY3: @@ -98,11 +72,9 @@ def __iter__(self): iterkeys = __iter__ def iteritems(self): - """Iterate through items.""" return self.items() def itervalues(self): - """Iterate through values.""" return self.values() def keys(self): @@ -110,65 +82,31 @@ def keys(self): return list(self.token2id.values()) def __len__(self): - """Return the number of token->id mappings in the dictionary. - - Returns - ------- - int - + """ + Return the number of token->id mappings in the dictionary. """ return len(self.token2id) def __str__(self): - """Return string representation. - - Returns - ------- - str - - """ some_keys = list(itertools.islice(iterkeys(self.token2id), 5)) return "Dictionary(%i unique tokens: %s%s)" % (len(self), some_keys, '...' if len(self) > 5 else '') @staticmethod def from_documents(documents): - """Build Dictionary from documents set. - - Parameters - ---------- - documents : iterable of (list of str) - - - Returns - ------- - :class:`Dictionary` - - """ return Dictionary(documents=documents) def add_documents(self, documents, prune_at=2000000): - u""" - Update dictionary from a collection of documents. - - Each document is a list of tokens = **tokenized and normalized** - strings (either utf8 or unicode). + """ + Update dictionary from a collection of documents. Each document is a list + of tokens = **tokenized and normalized** strings (either utf8 or unicode). This is a convenience wrapper for calling `doc2bow` on each document with `allow_update=True`, which also prunes infrequent words, keeping the total number of unique words <= `prune_at`. This is to save memory on very large inputs. To disable this pruning, set `prune_at=None`. - Parameters - ---------- - documents : iterable of str - prune_at : int - Number of unique words to keep - - Examples - -------- >>> print(Dictionary(["máma mele maso".split(), "ema má máma".split()])) Dictionary(5 unique tokens) - """ for docno, document in enumerate(documents): # log progress & run a regular check for pruning, once every 10k docs @@ -186,27 +124,19 @@ def add_documents(self, documents, prune_at=2000000): ) def doc2bow(self, document, allow_update=False, return_missing=False): - """Convert document to the bag-of-words format. - - Each word is assumed to be a **tokenized and normalized** string ( - either unicode or utf8-encoded). No further preprocessing is done on - the words in `document`; apply tokenization, stemming etc. before + """ + Convert `document` (a list of words) into the bag-of-words format = list + of `(token_id, token_count)` 2-tuples. Each word is assumed to be a + **tokenized and normalized** string (either unicode or utf8-encoded). No further preprocessing + is done on the words in `document`; apply tokenization, stemming etc. before calling this method. - Parameters - ---------- - document : list of str - - allow_update : bool - Whether to update the dictionary in the process - (Default value = False) - return_missing : - (Default value = False) - - Returns - ------- - dict of (int, int) + If `allow_update` is set, then also update dictionary in the process: create + ids for new words. 
At the same time, update document frequencies -- for + each word appearing in this document, increase its document frequency (`self.dfs`) + by one. + If `allow_update` is **not** set, this function is `const`, aka read-only. """ if isinstance(document, string_types): raise TypeError("doc2bow expects an array of unicode tokens on input, not a single string") @@ -284,7 +214,8 @@ def doc2idx(self, document, unknown_word_index=-1): return [self.token2id.get(word, unknown_word_index) for word in document] def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None): - """Filter out tokens that appear in + """ + Filter out tokens that appear in 1. less than `no_below` documents (absolute number) or 2. more than `no_above` documents (fraction of total corpus size, *not* @@ -298,18 +229,6 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=N **Note**: Due to the gap shrinking, the same word may have a different word id before and after the call to this function! - - Parameters - ---------- - no_below : - (Default value = 5) - no_above : - (Default value = 0.5) - keep_n : - (Default value = 100000) - keep_tokens : - (Default value = None) - """ no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold @@ -340,18 +259,13 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=N logger.info("resulting dictionary: %s", self) def filter_n_most_frequent(self, remove_n): - """Filter out the 'remove_n' most frequent tokens that appear in the - documents. + """ + Filter out the 'remove_n' most frequent tokens that appear in the documents. After the pruning, shrink resulting gaps in word ids. **Note**: Due to the gap shrinking, the same word may have a different word id before and after the call to this function! - - Parameters - ---------- - remove_n : - """ # determine which tokens to keep most_frequent_ids = (v for v in itervalues(self.token2id)) @@ -365,18 +279,11 @@ def filter_n_most_frequent(self, remove_n): logger.info("resulting dictionary: %s", self) def filter_tokens(self, bad_ids=None, good_ids=None): - """Remove the selected `bad_ids` tokens from all dictionary mappings, - or keep selected `good_ids` in the mapping and remove the rest. + """ + Remove the selected `bad_ids` tokens from all dictionary mappings, or, keep + selected `good_ids` in the mapping and remove the rest. `bad_ids` and `good_ids` are collections of word ids to be removed. - - Parameters - ---------- - bad_ids : - (Default value = None) - good_ids : - (Default value = None) - """ if bad_ids is not None: bad_ids = set(bad_ids) @@ -389,15 +296,12 @@ def filter_tokens(self, bad_ids=None, good_ids=None): self.compactify() def compactify(self): - """Assign new word ids to all words. + """ + Assign new word ids to all words. This is done to make the ids more compact, e.g. after some tokens have been removed via :func:`filter_tokens` and there are gaps in the id series. Calling this method will remove the gaps. - - Parameters - ---------- - """ logger.debug("rebuilding dictionary, shrinking gaps") @@ -410,21 +314,14 @@ def compactify(self): self.dfs = {idmap[tokenid]: freq for tokenid, freq in iteritems(self.dfs)} def save_as_text(self, fname, sort_by_word=True): - """Save this Dictionary to a text file, in format: + """ + Save this Dictionary to a text file, in format: `num_docs` `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`. Sorted by word, or by decreasing word frequency. 
Note: text format should be use for corpus inspection. Use `save`/`load` to store in binary format (pickle) for improved performance. - - Parameters - ---------- - fname : - - sort_by_word : - (Default value = True) - """ logger.info("saving dictionary mapping to %s", fname) with utils.smart_open(fname, 'wb') as fout: @@ -440,7 +337,8 @@ def save_as_text(self, fname, sort_by_word=True): fout.write(utils.to_utf8(line)) def merge_with(self, other): - """Merge another dictionary into this dictionary, mapping same tokens to the + """ + Merge another dictionary into this dictionary, mapping same tokens to the same ids and new tokens to new ids. The purpose is to merge two corpora created using two different dictionaries, one from `self` and one from `other`. @@ -450,16 +348,8 @@ def merge_with(self, other): will convert documents from a corpus built using the `other` dictionary into a document using the new, merged dictionary (see :class:`gensim.interfaces.TransformationABC`). - Parameters - ---------- - other : - - - Returns - ------- + Example: - Examples - -------- >>> dict1 = Dictionary(some_documents) >>> dict2 = Dictionary(other_documents) # ids not compatible with dict1! >>> dict2_to_dict1 = dict1.merge_with(dict2) @@ -493,17 +383,9 @@ def merge_with(self, other): @staticmethod def load_from_text(fname): - """Load a previously stored Dictionary from a text file. + """ + Load a previously stored Dictionary from a text file. Mirror function to `save_as_text`. - - Parameters - ---------- - fname : - - - Returns - ------- - """ result = Dictionary() with utils.smart_open(fname) as f: @@ -530,28 +412,18 @@ def load_from_text(fname): @staticmethod def from_corpus(corpus, id2word=None): - """Create Dictionary from an existing corpus. This can be useful if you - only have a term-document BOW matrix (represented by `corpus`), - but not the original text corpus. + """ + Create Dictionary from an existing corpus. This can be useful if you only + have a term-document BOW matrix (represented by `corpus`), but not the + original text corpus. This will scan the term-document count matrix for all word ids that appear in it, then construct and return Dictionary which maps each `word_id -> id2word[word_id]`. - `id2word` is an optional dictionary that maps the `word_id` to a - token. In case `id2word` isn't specified the mapping `id2word[ - word_id] = str(word_id)` will be used. - - Parameters - ---------- - corpus : - - id2word : - (Default value = None) - - Returns - ------- - + `id2word` is an optional dictionary that maps the `word_id` to a token. In + case `id2word` isn't specified the mapping `id2word[word_id] = str(word_id)` + will be used. """ result = Dictionary() From 9b170577e614a6b60d42a152b2b4fe81b951db62 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Thu, 7 Dec 2017 22:45:32 +0500 Subject: [PATCH 34/48] Undo changes to hashdictionary --- gensim/corpora/hashdictionary.py | 74 ++++++-------------------------- 1 file changed, 13 insertions(+), 61 deletions(-) diff --git a/gensim/corpora/hashdictionary.py b/gensim/corpora/hashdictionary.py index 730e9527da..687ec241ac 100644 --- a/gensim/corpora/hashdictionary.py +++ b/gensim/corpora/hashdictionary.py @@ -35,7 +35,8 @@ class HashDictionary(utils.SaveLoad, dict): - """HashDictionary encapsulates the mapping between normalized words and their + """ + HashDictionary encapsulates the mapping between normalized words and their integer ids. 
Unlike `Dictionary`, building a `HashDictionary` before using it is not a necessary @@ -79,17 +80,9 @@ def __getitem__(self, tokenid): return self.id2token.get(tokenid, set()) def restricted_hash(self, token): - """Calculate id of the given token. Also keep track of what words were mapped + """ + Calculate id of the given token. Also keep track of what words were mapped to what ids, for debugging reasons. - - Parameters - ---------- - token : - - - Returns - ------- - """ h = self.myhash(utils.to_utf8(token)) % self.id_range if self.debug: @@ -104,7 +97,7 @@ def __len__(self): return self.id_range def keys(self): - """ """ + """Return a list of all token ids.""" return range(len(self)) def __str__(self): @@ -112,32 +105,15 @@ def __str__(self): @staticmethod def from_documents(*args, **kwargs): - """ - - Parameters - ---------- - *args : - - **kwargs : - - - Returns - ------- - - """ return HashDictionary(*args, **kwargs) def add_documents(self, documents): - """Build dictionary from a collection of documents. Each document is a list + """ + Build dictionary from a collection of documents. Each document is a list of tokens = **tokenized and normalized** utf-8 encoded strings. This is only a convenience wrapper for calling `doc2bow` on each document with `allow_update=True`. - - Parameters - ---------- - documents : - """ for docno, document in enumerate(documents): if docno % 10000 == 0: @@ -149,7 +125,8 @@ def add_documents(self, documents): ) def doc2bow(self, document, allow_update=False, return_missing=False): - """Convert `document` (a list of words) into the bag-of-words format = list + """ + Convert `document` (a list of words) into the bag-of-words format = list of `(token_id, token_count)` 2-tuples. Each word is assumed to be a **tokenized and normalized** utf-8 encoded string. No further preprocessing is done on the words in `document`; apply tokenization, stemming etc. before @@ -160,18 +137,6 @@ def doc2bow(self, document, allow_update=False, return_missing=False): For each id appearing in this document, increase its document frequency (`self.dfs`) by one. - Parameters - ---------- - document : - - allow_update : - (Default value = False) - return_missing : - (Default value = False) - - Returns - ------- - """ result = {} missing = {} @@ -202,7 +167,8 @@ def doc2bow(self, document, allow_update=False, return_missing=False): return result def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000): - """Remove document frequency statistics for tokens that appear in + """ + Remove document frequency statistics for tokens that appear in 1. less than `no_below` documents (absolute number) or 2. more than `no_above` documents (fraction of total corpus size, *not* @@ -214,16 +180,6 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000): the number of tokens seen, this doesn't really "remove" anything. It only clears some supplementary statistics, for easier debugging and a smaller RAM footprint. - - Parameters - ---------- - no_below : - (Default value = 5) - no_above : - (Default value = 0.5) - keep_n : - (Default value = 100000) - """ no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold ok = [item for item in iteritems(self.dfs_debug) if no_below <= item[1] <= no_above_abs] @@ -244,17 +200,13 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000): ) def save_as_text(self, fname): - """Save this HashDictionary to a text file, for easier debugging. 
+ """ + Save this HashDictionary to a text file, for easier debugging. The format is: `id[TAB]document frequency of this id[TAB]tab-separated set of words in UTF8 that map to this id[NEWLINE]`. Note: use `save`/`load` to store in binary format instead (pickle). - - Parameters - ---------- - fname : - """ logger.info("saving HashDictionary mapping to %s" % fname) with utils.smart_open(fname, 'wb') as fout: From de3ea0f384ed0398449d5f4fd455af8cceae92f2 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Sun, 10 Dec 2017 04:50:36 +0500 Subject: [PATCH 35/48] Document indexedcorpus --- gensim/corpora/indexedcorpus.py | 70 ++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 28 deletions(-) diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py index a5d05b496f..f9cdb367d1 100644 --- a/gensim/corpora/indexedcorpus.py +++ b/gensim/corpora/indexedcorpus.py @@ -5,7 +5,7 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -"""Base Indexed Corpus class""" +"""Base Indexed Corpus class.""" import logging import six @@ -18,16 +18,22 @@ class IndexedCorpus(interfaces.CorpusABC): - def __init__(self, fname, index_fname=None): - """Indexed corpus is a mechanism for random-accessing corpora. + """Indexed corpus is a mechanism for random-accessing corpora. + + While the standard corpus interface in gensim allows iterating over + corpus with `for doc in corpus: pass`, indexed corpus allows accessing + the documents with `corpus[docno]` (in O(1) look-up time). - While the standard corpus interface in gensim allows iterating over - corpus with `for doc in corpus: pass`, indexed corpus allows accessing - the documents with `corpus[docno]` (in O(1) look-up time). + Notes + ----- + This functionality is achieved by storing an extra file (by default + named the same as the '{corpus name}.index') that stores the byte + offset of the beginning of each document. - This functionality is achieved by storing an extra file (by default - named the same as the '{corpus name}.index') that stores the byte - offset of the beginning of each document. + """ + + def __init__(self, fname, index_fname=None): + """Initialize the corpus. Parameters ---------- @@ -61,21 +67,10 @@ def __init__(self, fname, index_fname=None): @classmethod def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False): - """Iterate through the document stream `corpus`, saving the documents to - `fname` and recording byte offset of each document. - - Save the resulting index structure to file `index_fname` (or - `fname`.index is not set). + """Iterate through the document stream `corpus`. - This relies on the underlying corpus class `serializer` providing (in - addition to standard iteration):: - - * `save_corpus` method that returns a sequence of byte offsets, one for - each saved document - * the `docbyoffset(offset)` method, which returns a document - positioned at `offset` bytes within the persistent storage (file) - * metadata if set to true will ensure that serialize will write out - article titles to a pickle file. + Saving the documents to + `fname` and recording byte offset of each document. Parameters ---------- @@ -86,13 +81,14 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, id2word : dict of (str, str), optional Transforms id to word (Default value = None) index_fname : str - (Default value = None) + Where to save resulting index. Saved to `fname`.index if None. 
progress_cnt : int - (Default value = None) - labels : - (Default value = None) + Number of documents after which progress info is printed + labels : bool + Whether to skip the first column (class labels) metadata : bool - Any additional info (Default value = False) + If True will ensure that serialize will write out + article titles to a pickle file. (Default value = False) Examples -------- @@ -134,6 +130,11 @@ def __len__(self): If the corpus is not indexed, also count corpus length and cache this value. + + Returns + ------- + int + """ if self.index is not None: return len(self.index) @@ -143,11 +144,24 @@ def __len__(self): return self.length def __getitem__(self, docno): + """Return certain document. + + Parameters + ---------- + docno : int + Document number + + Returns + ------- + `utils.SlicedCorpus` + + """ if self.index is None: raise RuntimeError("Cannot call corpus[docid] without an index") if isinstance(docno, (slice, list, numpy.ndarray)): return utils.SlicedCorpus(self, docno) elif isinstance(docno, six.integer_types + (numpy.integer,)): return self.docbyoffset(self.index[docno]) + # TODO: no `docbyoffset` method, should be defined in this class else: raise ValueError('Unrecognised value for docno, use either a single integer, a slice or a numpy.ndarray') From dafc37352992048402f266ff216f7e9752693c09 Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Sun, 10 Dec 2017 08:01:09 +0500 Subject: [PATCH 36/48] Document indexedcorpus[2] Fix identation --- gensim/corpora/indexedcorpus.py | 12 ++-- gensim/corpora/mmcorpus.py | 2 +- gensim/corpora/sharded_corpus.py | 99 ++++++++++++++++++-------------- 3 files changed, 63 insertions(+), 50 deletions(-) diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py index f9cdb367d1..2e89061076 100644 --- a/gensim/corpora/indexedcorpus.py +++ b/gensim/corpora/indexedcorpus.py @@ -45,12 +45,12 @@ def __init__(self, fname, index_fname=None): Examples -------- >>> # save corpus in SvmLightCorpus format with an index - >>> corpus = [[(1, 0.5)], [(0, 1.0), (1, 2.0)]] - >>> gensim.corpora.SvmLightCorpus.serialize('testfile.svmlight', corpus) - >>> # load back as a document stream (*not* plain Python list) - >>> corpus_with_random_access = gensim.corpora.SvmLightCorpus('tstfile.svmlight') - >>> print(corpus_with_random_access[1]) - [(0, 1.0), (1, 2.0)] + >>> corpus = [[(1, 0.5)], [(0, 1.0), (1, 2.0)]] + >>> gensim.corpora.SvmLightCorpus.serialize('testfile.svmlight', corpus) + >>> # load back as a document stream (*not* plain Python list) + >>> corpus_with_random_access = gensim.corpora.SvmLightCorpus('tstfile.svmlight') + >>> print(corpus_with_random_access[1]) + [(0, 1.0), (1, 2.0)] """ try: diff --git a/gensim/corpora/mmcorpus.py b/gensim/corpora/mmcorpus.py index cbe69e0cde..ecc7798bc1 100644 --- a/gensim/corpora/mmcorpus.py +++ b/gensim/corpora/mmcorpus.py @@ -53,7 +53,7 @@ def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): id2word : (Default value = None) progress_cnt : - (Default value = 1000) + Number of documents after which progress info is printed metadata : (Default value = False) diff --git a/gensim/corpora/sharded_corpus.py b/gensim/corpora/sharded_corpus.py index 175a82de28..8749404265 100644 --- a/gensim/corpora/sharded_corpus.py +++ b/gensim/corpora/sharded_corpus.py @@ -145,8 +145,10 @@ def __init__(self, output_prefix, corpus, dim=None, """Initializes the dataset. If `output_prefix` is not found, builds the shards. 
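        As a rough construction sketch for the parameters documented below (the prefix
        path, `bow_corpus` and the `dim` value are illustrative placeholders, not fixed names):

        >>> sharded = ShardedCorpus('/tmp/shards_prefix', bow_corpus, dim=1000, shardsize=4096)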
- :type output_prefix: str - :param output_prefix: The absolute path to the file from which shard + Parameters + ---------- + output_prefix : + The absolute path to the file from which shard filenames should be derived. The individual shards will be saved as `output_prefix.0`, `output_prefix.1`, etc. @@ -164,53 +166,44 @@ def __init__(self, output_prefix, corpus, dim=None, Of course, you can save your corpus separately as well using the `save()` method. - - :type corpus: gensim.interfaces.CorpusABC - :param corpus: The source corpus from which to build the dataset. - - :type dim: int - :param dim: Specify beforehand what the dimension of a dataset item + corpus : + The source corpus from which to build the dataset. + dim : + Specify beforehand what the dimension of a dataset item should be. This is useful when initializing from a corpus that doesn't advertise its dimension, or when it does and you want to check that the corpus matches the expected dimension. **If `dim` is left unused and `corpus` does not provide its dimension in - an expected manner, initialization will fail.** - - :type shardsize: int - :param shardsize: How many data points should be in one shard. More + an expected manner, initialization will fail.** (Default value = None) + shardsize : + How many data points should be in one shard. More data per shard means less shard reloading but higher memory usage - and vice versa. - - :type overwrite: bool - :param overwrite: If set, will build dataset from given corpus even - if `output_prefix` already exists. - - :type sparse_serialization: bool - :param sparse_serialization: If set, will save the data in a sparse + and vice versa. (Default value = 4096) + overwrite : + If set, will build dataset from given corpus even + if `output_prefix` already exists. (Default value = False) + sparse_serialization : + If set, will save the data in a sparse form (as csr matrices). This is to speed up retrieval when you know you will be using sparse matrices. - ..note:: - - This property **should not change** during the lifetime of - the dataset. (If you find out you need to change from a sparse - to a dense representation, the best practice is to create - another ShardedCorpus object.) - - :type sparse_retrieval: bool - :param sparse_retrieval: If set, will retrieve data as sparse vectors + This property **should not change** during the lifetime of + the dataset. (If you find out you need to change from a sparse + to a dense representation, the best practice is to create + another ShardedCorpus object.) (Default value = False) + sparse_retrieval : + If set, will retrieve data as sparse vectors (numpy csr matrices). If unset, will return ndarrays. Note that retrieval speed for this option depends on how the dataset was serialized. If `sparse_serialization` was set, then setting `sparse_retrieval` will be faster. However, if the two settings do not correspond, the conversion on the fly will slow the dataset - down. - - :type gensim: bool - :param gensim: If set, will convert the output to gensim + down. (Default value = False) + gensim : + If set, will convert the output to gensim sparse vectors (list of tuples (id, value)) to make it behave like - any other gensim corpus. This **will** slow the dataset down. + any other gensim corpus. This **will** slow the dataset down. 
(Default value = False) """ self.output_prefix = output_prefix @@ -236,7 +229,7 @@ def __init__(self, output_prefix, corpus, dim=None, logger.info('Initializing sharded corpus with prefix %s', output_prefix) if (not os.path.isfile(output_prefix)) or overwrite: - logger.info('Building from corpus...') + # logger.info('Building from corpus...') self.init_shards(output_prefix, corpus, shardsize) # Save automatically, to facilitate re-loading @@ -644,8 +637,15 @@ def get_by_offset(self, offset): return result def __getitem__(self, offset): - """ - Retrieve the given row of the dataset. Supports slice notation. + """Retrieve the given row of the dataset. Supports slice notation. + + Parameters + ---------- + offset : + + + Returns + ------- """ if isinstance(offset, list): @@ -753,8 +753,7 @@ def __getitem__(self, offset): return s_result def __add_to_slice(self, s_result, result_start, result_stop, start, stop): - """ - Add the rows of the current shard from `start` to `stop` + """Add the rows of the current shard from `start` to `stop` into rows `result_start` to `result_stop` of `s_result`. Operation is based on the self.sparse_serialize setting. If the shard @@ -764,6 +763,23 @@ def __add_to_slice(self, s_result, result_start, result_stop, start, stop): and we should add them up to `result_stop`. Returns the resulting s_result. + + Parameters + ---------- + s_result : + + result_start : + + result_stop : + + start : + + stop : + + + Returns + ------- + """ if (result_stop - result_start) != (stop - start): raise ValueError( @@ -871,10 +887,7 @@ def _getitem_dense2gensim(self, result): # Overriding the IndexedCorpus and other corpus superclass methods def __iter__(self): - """ - Yield dataset items one by one (generator). - - """ + """Yield dataset items one by one (generator).""" for i in xrange(len(self)): yield self[i] From 0189d8d96c5cea902d6f65ebf7b8b8b4b8abd0fc Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Fri, 12 Jan 2018 00:10:33 +0500 Subject: [PATCH 37/48] Remove redundant files --- gensim/corpora/__init__.py | 2 +- gensim/corpora/lowcorpus.py | 65 +----- gensim/corpora/malletcorpus.py | 49 +---- gensim/corpora/mmcorpus.py | 32 +-- gensim/corpora/sharded_corpus.py | 362 +++++++------------------------ gensim/corpora/textcorpus.py | 195 ++--------------- gensim/corpora/ucicorpus.py | 78 ++----- 7 files changed, 140 insertions(+), 643 deletions(-) diff --git a/gensim/corpora/__init__.py b/gensim/corpora/__init__.py index aa122d1833..0d51a9b903 100644 --- a/gensim/corpora/__init__.py +++ b/gensim/corpora/__init__.py @@ -1,5 +1,5 @@ """ -This package contains implementations of various streaming corpus I/O formats. +This package contains implementations of various streaming corpus I/O format. """ # bring corpus classes directly into package namespace, to save some typing diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py index f0edb59f49..e293c998a1 100644 --- a/gensim/corpora/lowcorpus.py +++ b/gensim/corpora/lowcorpus.py @@ -23,22 +23,12 @@ def split_on_space(s): - """ - - Parameters - ---------- - s : - - - Returns - ------- - - """ return [word for word in utils.to_unicode(s).strip().split(' ') if word] class LowCorpus(IndexedCorpus): - """List_Of_Words corpus handles input in GibbsLda++ format. + """ + List_Of_Words corpus handles input in GibbsLda++ format. 
Quoting http://gibbslda.sourceforge.net/#3.2_Input_Data_Format:: @@ -59,7 +49,6 @@ class LowCorpus(IndexedCorpus): in which all [wordij] (i=1..M, j=1..Ni) are text strings and they are separated by the blank character. - """ def __init__(self, fname, id2word=None, line2words=split_on_space): """ @@ -102,7 +91,6 @@ def __init__(self, fname, id2word=None, line2words=split_on_space): ) def _calculate_num_docs(self): - """ """ # the first line in input data is the number of documents (integer). throws exception on bad input. with utils.smart_open(self.fname) as fin: try: @@ -116,17 +104,6 @@ def __len__(self): return self.num_docs def line2doc(self, line): - """ - - Parameters - ---------- - line : - - - Returns - ------- - - """ words = self.line2words(line) if self.use_wordids: @@ -165,25 +142,11 @@ def __iter__(self): @staticmethod def save_corpus(fname, corpus, id2word=None, metadata=False): - """Save a corpus in the List-of-words format. + """ + Save a corpus in the List-of-words format. This function is automatically called by `LowCorpus.serialize`; don't call it directly, call `serialize` instead. - - Parameters - ---------- - fname : - - corpus : - - id2word : - (Default value = None) - metadata : - (Default value = False) - - Returns - ------- - """ if id2word is None: logger.info("no word id mapping provided; initializing from corpus") @@ -211,16 +174,8 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): return offsets def docbyoffset(self, offset): - """Return the document stored at file position `offset`. - - Parameters - ---------- - offset : - - - Returns - ------- - + """ + Return the document stored at file position `offset`. """ with utils.smart_open(self.fname) as f: f.seek(offset) @@ -228,17 +183,9 @@ def docbyoffset(self, offset): @property def id2word(self): - """ """ return self._id2word @id2word.setter def id2word(self, val): - """ - - Parameters - ---------- - val : - - """ self._id2word = val self.word2id = utils.revdict(val) diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py index f67d141fd1..cacf0074bd 100644 --- a/gensim/corpora/malletcorpus.py +++ b/gensim/corpora/malletcorpus.py @@ -19,15 +19,15 @@ class MalletCorpus(LowCorpus): - """Quoting http://mallet.cs.umass.edu/import.php: + """ + Quoting http://mallet.cs.umass.edu/import.php: - One file, one instance per line - Assume the data is in the following format: + One file, one instance per line + Assume the data is in the following format: [URL] [language] [text of the page...] Or, more generally, - [document #1 id] [label] [text of the document...] [document #2 id] [label] [text of the document...] ... @@ -41,7 +41,6 @@ def __init__(self, fname, id2word=None, metadata=False): LowCorpus.__init__(self, fname, id2word) def _calculate_num_docs(self): - """ """ with utils.smart_open(self.fname) as fin: result = sum(1 for _ in fin) return result @@ -57,17 +56,6 @@ def __iter__(self): yield self.line2doc(line) def line2doc(self, line): - """ - - Parameters - ---------- - line : - - - Returns - ------- - - """ splited_line = [word for word in utils.to_unicode(line).strip().split(' ') if word] docid, doclang, words = splited_line[0], splited_line[1], splited_line[2:] @@ -80,7 +68,8 @@ def line2doc(self, line): @staticmethod def save_corpus(fname, corpus, id2word=None, metadata=False): - """Save a corpus in the Mallet format. + """ + Save a corpus in the Mallet format. The document id will be generated by enumerating the corpus. 
That is, it will range between 0 and number of documents in the corpus. @@ -91,20 +80,6 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): This function is automatically called by `MalletCorpus.serialize`; don't call it directly, call `serialize` instead. - Parameters - ---------- - fname : - - corpus : - - id2word : - (Default value = None) - metadata : - (Default value = False) - - Returns - ------- - """ if id2word is None: logger.info("no word id mapping provided; initializing from corpus") @@ -139,16 +114,8 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): return offsets def docbyoffset(self, offset): - """Return the document stored at file position `offset`. - - Parameters - ---------- - offset : - - - Returns - ------- - + """ + Return the document stored at file position `offset`. """ with utils.smart_open(self.fname) as f: f.seek(offset) diff --git a/gensim/corpora/mmcorpus.py b/gensim/corpora/mmcorpus.py index ecc7798bc1..2158f0a526 100644 --- a/gensim/corpora/mmcorpus.py +++ b/gensim/corpora/mmcorpus.py @@ -20,7 +20,9 @@ class MmCorpus(matutils.MmReader, IndexedCorpus): - """Corpus in the Matrix Market format.""" + """ + Corpus in the Matrix Market format. + """ def __init__(self, fname): # avoid calling super(), too confusing @@ -28,38 +30,20 @@ def __init__(self, fname): matutils.MmReader.__init__(self, fname) def __iter__(self): - """Iterate over the corpus. - - Yields - ------ - document : + """ + Interpret a matrix in Matrix Market format as a streamed gensim corpus + (yielding one document at a time). """ for doc_id, doc in super(MmCorpus, self).__iter__(): yield doc # get rid of doc id, return the sparse vector only @staticmethod def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): - """Save a corpus in the Matrix Market format to disk. + """ + Save a corpus in the Matrix Market format to disk. This function is automatically called by `MmCorpus.serialize`; don't call it directly, call `serialize` instead. - - Parameters - ---------- - fname : - - corpus : - - id2word : - (Default value = None) - progress_cnt : - Number of documents after which progress info is printed - metadata : - (Default value = False) - - Returns - ------- - """ logger.info("storing corpus in Matrix Market format to %s", fname) num_terms = len(id2word) if id2word is not None else None diff --git a/gensim/corpora/sharded_corpus.py b/gensim/corpora/sharded_corpus.py index 8749404265..049e22f226 100644 --- a/gensim/corpora/sharded_corpus.py +++ b/gensim/corpora/sharded_corpus.py @@ -44,7 +44,8 @@ class ShardedCorpus(IndexedCorpus): - """This corpus is designed for situations where you need to train a model + """ + This corpus is designed for situations where you need to train a model on matrices, with a large number of iterations. (It should be faster than gensim's other IndexedCorpus implementations for this use case; check the `benchmark_datasets.py` script. It should also serialize faster.) @@ -137,7 +138,6 @@ class ShardedCorpus(IndexedCorpus): currently open and on a `__getitem__` request, either returns an item from the current shard, or opens a new one. The shard size is constant, except for the last shard. - """ def __init__(self, output_prefix, corpus, dim=None, shardsize=4096, overwrite=False, sparse_serialization=False, @@ -145,10 +145,8 @@ def __init__(self, output_prefix, corpus, dim=None, """Initializes the dataset. If `output_prefix` is not found, builds the shards. 
- Parameters - ---------- - output_prefix : - The absolute path to the file from which shard + :type output_prefix: str + :param output_prefix: The absolute path to the file from which shard filenames should be derived. The individual shards will be saved as `output_prefix.0`, `output_prefix.1`, etc. @@ -166,44 +164,53 @@ def __init__(self, output_prefix, corpus, dim=None, Of course, you can save your corpus separately as well using the `save()` method. - corpus : - The source corpus from which to build the dataset. - dim : - Specify beforehand what the dimension of a dataset item + + :type corpus: gensim.interfaces.CorpusABC + :param corpus: The source corpus from which to build the dataset. + + :type dim: int + :param dim: Specify beforehand what the dimension of a dataset item should be. This is useful when initializing from a corpus that doesn't advertise its dimension, or when it does and you want to check that the corpus matches the expected dimension. **If `dim` is left unused and `corpus` does not provide its dimension in - an expected manner, initialization will fail.** (Default value = None) - shardsize : - How many data points should be in one shard. More + an expected manner, initialization will fail.** + + :type shardsize: int + :param shardsize: How many data points should be in one shard. More data per shard means less shard reloading but higher memory usage - and vice versa. (Default value = 4096) - overwrite : - If set, will build dataset from given corpus even - if `output_prefix` already exists. (Default value = False) - sparse_serialization : - If set, will save the data in a sparse + and vice versa. + + :type overwrite: bool + :param overwrite: If set, will build dataset from given corpus even + if `output_prefix` already exists. + + :type sparse_serialization: bool + :param sparse_serialization: If set, will save the data in a sparse form (as csr matrices). This is to speed up retrieval when you know you will be using sparse matrices. - This property **should not change** during the lifetime of - the dataset. (If you find out you need to change from a sparse - to a dense representation, the best practice is to create - another ShardedCorpus object.) (Default value = False) - sparse_retrieval : - If set, will retrieve data as sparse vectors + ..note:: + + This property **should not change** during the lifetime of + the dataset. (If you find out you need to change from a sparse + to a dense representation, the best practice is to create + another ShardedCorpus object.) + + :type sparse_retrieval: bool + :param sparse_retrieval: If set, will retrieve data as sparse vectors (numpy csr matrices). If unset, will return ndarrays. Note that retrieval speed for this option depends on how the dataset was serialized. If `sparse_serialization` was set, then setting `sparse_retrieval` will be faster. However, if the two settings do not correspond, the conversion on the fly will slow the dataset - down. (Default value = False) - gensim : - If set, will convert the output to gensim + down. + + :type gensim: bool + :param gensim: If set, will convert the output to gensim sparse vectors (list of tuples (id, value)) to make it behave like - any other gensim corpus. This **will** slow the dataset down. (Default value = False) + any other gensim corpus. This **will** slow the dataset down. 
""" self.output_prefix = output_prefix @@ -229,7 +236,7 @@ def __init__(self, output_prefix, corpus, dim=None, logger.info('Initializing sharded corpus with prefix %s', output_prefix) if (not os.path.isfile(output_prefix)) or overwrite: - # logger.info('Building from corpus...') + logger.info('Building from corpus...') self.init_shards(output_prefix, corpus, shardsize) # Save automatically, to facilitate re-loading @@ -242,20 +249,7 @@ def __init__(self, output_prefix, corpus, dim=None, self.init_by_clone() def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtype): - """Initialize shards from the corpus. - - Parameters - ---------- - output_prefix : - - corpus : - - shardsize : - (Default value = 4096) - dtype : - (Default value = _default_dtype) - - """ + """Initialize shards from the corpus.""" is_corpus, corpus = gensim.utils.is_corpus(corpus) if not is_corpus: @@ -300,7 +294,8 @@ def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtyp logger.info('Built %d shards in %f s.', self.n_shards, end_time - start_time) def init_by_clone(self): - """Initialize by copying over attributes of another ShardedCorpus + """ + Initialize by copying over attributes of another ShardedCorpus instance saved to the output_prefix given at __init__(). """ @@ -322,21 +317,13 @@ def init_by_clone(self): self.dim = temp.dim # To be consistent with the loaded data! def save_shard(self, shard, n=None, filename=None): - """Pickle the given shard. If `n` is not given, will consider the shard + """ + Pickle the given shard. If `n` is not given, will consider the shard a new one. If `filename` is given, will use that file name instead of generating one. - Parameters - ---------- - shard : - - n : - (Default value = None) - filename : - (Default value = None) - """ new_shard = False if n is None: @@ -353,14 +340,9 @@ def save_shard(self, shard, n=None, filename=None): self.n_shards += 1 def load_shard(self, n): - """Load (unpickle) the n-th shard as the "live" part of the dataset - into the Dataset object. - - Parameters - ---------- - n : - """ + Load (unpickle) the n-th shard as the "live" part of the dataset + into the Dataset object.""" # No-op if the shard is already open. if self.current_shard_n == n: @@ -376,26 +358,22 @@ def load_shard(self, n): self.current_offset = self.offsets[n] def reset(self): - """Reset to no shard at all. Used for saving.""" + """ + Reset to no shard at all. Used for saving. + + """ self.current_shard = None self.current_shard_n = None self.current_offset = None def shard_by_offset(self, offset): - """Determine which shard the given offset belongs to. If the offset + """ + Determine which shard the given offset belongs to. If the offset is greater than the number of available documents, raises a `ValueError`. Assumes that all shards have the same size. - Parameters - ---------- - offset : - - - Returns - ------- - """ k = int(offset / self.shardsize) if offset >= self.n_docs: @@ -407,48 +385,33 @@ def shard_by_offset(self, offset): return k def in_current(self, offset): - """Determine whether the given offset falls within the current shard. - - Parameters - ---------- - offset : - - - Returns - ------- + """ + Determine whether the given offset falls within the current shard. """ return (self.current_offset <= offset) and (offset < self.offsets[self.current_shard_n + 1]) def in_next(self, offset): - """Determine whether the given offset falls within the next shard. + """ + Determine whether the given offset falls within the next shard. 
This is a very small speedup: typically, we will be iterating through the data forward. Could save considerable time with a very large number of smaller shards. - Parameters - ---------- - offset : - - - Returns - ------- - """ if self.current_shard_n == self.n_shards: return False # There's no next shard. return (self.offsets[self.current_shard_n + 1] <= offset) and (offset < self.offsets[self.current_shard_n + 2]) def resize_shards(self, shardsize): - """Re-process the dataset to new shard size. This may take pretty long. + """ + Re-process the dataset to new shard size. This may take pretty long. Also, note that you need some space on disk for this one (we're assuming there is enough disk space for double the size of the dataset and that there is enough memory for old + new shardsize). - Parameters - ---------- - shardsize : - The new shard size. + :type shardsize: int + :param shardsize: The new shard size. """ # Determine how many new shards there will be @@ -515,46 +478,18 @@ def resize_shards(self, shardsize): self.reset() def _shard_name(self, n): - """Generate the name for the n-th shard. - - Parameters - ---------- - n : - - - Returns - ------- - - """ + """Generate the name for the n-th shard.""" return self.output_prefix + '.' + str(n) def _resized_shard_name(self, n): - """Generate the name for the n-th new shard temporary file when + """ + Generate the name for the n-th new shard temporary file when resizing dataset. The file will then be re-named to standard shard name. - - Parameters - ---------- - n : - - - Returns - ------- - """ return self.output_prefix + '.resize-temp.' + str(n) def _guess_n_features(self, corpus): - """Attempt to guess number of features in `corpus`. - - Parameters - ---------- - corpus : - - - Returns - ------- - - """ + """Attempt to guess number of features in `corpus`.""" n_features = None if hasattr(corpus, 'dim'): # print 'Guessing from \'dim\' attribute.' @@ -600,14 +535,6 @@ def __len__(self): return self.n_docs def _ensure_shard(self, offset): - """ - - Parameters - ---------- - offset : - - - """ # No shard loaded if self.current_shard is None: shard_n = self.shard_by_offset(offset) @@ -621,31 +548,14 @@ def _ensure_shard(self, offset): self.load_shard(shard_n) def get_by_offset(self, offset): - """As opposed to getitem, this one only accepts ints as offsets. - - Parameters - ---------- - offset : - - - Returns - ------- - - """ + """As opposed to getitem, this one only accepts ints as offsets.""" self._ensure_shard(offset) result = self.current_shard[offset - self.current_offset] return result def __getitem__(self, offset): - """Retrieve the given row of the dataset. Supports slice notation. - - Parameters - ---------- - offset : - - - Returns - ------- + """ + Retrieve the given row of the dataset. Supports slice notation. """ if isinstance(offset, list): @@ -753,7 +663,8 @@ def __getitem__(self, offset): return s_result def __add_to_slice(self, s_result, result_start, result_stop, start, stop): - """Add the rows of the current shard from `start` to `stop` + """ + Add the rows of the current shard from `start` to `stop` into rows `result_start` to `result_stop` of `s_result`. Operation is based on the self.sparse_serialize setting. If the shard @@ -763,23 +674,6 @@ def __add_to_slice(self, s_result, result_start, result_stop, start, stop): and we should add them up to `result_stop`. Returns the resulting s_result. 
- - Parameters - ---------- - s_result : - - result_start : - - result_stop : - - start : - - stop : - - - Returns - ------- - """ if (result_stop - result_start) != (stop - start): raise ValueError( @@ -807,17 +701,6 @@ def __add_to_slice(self, s_result, result_start, result_stop, start, stop): return s_result def _getitem_format(self, s_result): - """ - - Parameters - ---------- - s_result : - - - Returns - ------- - - """ if self.sparse_serialization: if self.gensim: s_result = self._getitem_sparse2gensim(s_result) @@ -831,33 +714,13 @@ def _getitem_format(self, s_result): return s_result def _getitem_sparse2gensim(self, result): - """Change given sparse result matrix to gensim sparse vectors. + """ + Change given sparse result matrix to gensim sparse vectors. Uses the internals of the sparse matrix to make this fast. - Parameters - ---------- - result : - - - Returns - ------- - """ def row_sparse2gensim(row_idx, csr_matrix): - """ - - Parameters - ---------- - row_idx : - - csr_matrix : - - - Returns - ------- - - """ indices = csr_matrix.indices[csr_matrix.indptr[row_idx]:csr_matrix.indptr[row_idx + 1]] g_row = [(col_idx, csr_matrix[row_idx, col_idx]) for col_idx in indices] return g_row @@ -867,17 +730,7 @@ def row_sparse2gensim(row_idx, csr_matrix): return output def _getitem_dense2gensim(self, result): - """Change given dense result matrix to gensim sparse vectors. - - Parameters - ---------- - result : - - - Returns - ------- - - """ + """Change given dense result matrix to gensim sparse vectors.""" if len(result.shape) == 1: output = gensim.matutils.full2sparse(result) else: @@ -887,22 +740,19 @@ def _getitem_dense2gensim(self, result): # Overriding the IndexedCorpus and other corpus superclass methods def __iter__(self): - """Yield dataset items one by one (generator).""" + """ + Yield dataset items one by one (generator). + + """ for i in xrange(len(self)): yield self[i] def save(self, *args, **kwargs): - """Save itself (the wrapper) in clean state (after calling `reset()`) + """ + Save itself (the wrapper) in clean state (after calling `reset()`) to the output_prefix file. If you wish to save to a different file, use the `fname` argument as the first positional arg. - Parameters - ---------- - *args : - - **kwargs : - - """ # Can we save to a different file than output_prefix? Well, why not? if len(args) == 0: @@ -917,24 +767,15 @@ def save(self, *args, **kwargs): @classmethod def load(cls, fname, mmap=None): - """Load itself in clean state. `mmap` has no effect here. - - Parameters - ---------- - fname : - - mmap : - (Default value = None) - - Returns - ------- - + """ + Load itself in clean state. `mmap` has no effect here. """ return super(ShardedCorpus, cls).load(fname, mmap) @staticmethod def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs): - """Implement a serialization interface. Do not call directly; + """ + Implement a serialization interface. Do not call directly; use the `serialize` method instead. Note that you might need some ShardedCorpus init parameters, most @@ -950,28 +791,14 @@ def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, currently do nothing and are here only to provide a compatible method signature with superclass. 
- Parameters - ---------- - fname : - - corpus : - - id2word : - (Default value = None) - progress_cnt : - (Default value = 1000) - metadata : - (Default value = False) - **kwargs : - - """ ShardedCorpus(fname, corpus, **kwargs) @classmethod def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False, **kwargs): - """Iterate through the document stream `corpus`, saving the documents + """ + Iterate through the document stream `corpus`, saving the documents as a ShardedCorpus to `fname`. Use this method instead of calling `save_corpus` directly. @@ -981,28 +808,5 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres Ignore the parameters id2word, index_fname, progress_cnt, labels and metadata. They currently do nothing and are here only to - provide a compatible method signature with superclass. - - Parameters - ---------- - serializer : - - fname : - - corpus : - - id2word : - (Default value = None) - index_fname : - (Default value = None) - progress_cnt : - (Default value = None) - labels : - (Default value = None) - metadata : - (Default value = False) - **kwargs : - - - """ + provide a compatible method signature with superclass.""" serializer.save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs) diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py index ac12cc2dcd..7f78f5ca91 100644 --- a/gensim/corpora/textcorpus.py +++ b/gensim/corpora/textcorpus.py @@ -44,70 +44,22 @@ def remove_stopwords(tokens, stopwords=STOPWORDS): - """Remove stopwords using list from `gensim.parsing.preprocessing.STOPWORDS`. - - Parameters - ---------- - tokens : - - stopwords : - (Default value = STOPWORDS) - - Returns - ------- - - """ + """Remove stopwords using list from `gensim.parsing.preprocessing.STOPWORDS`.""" return [token for token in tokens if token not in stopwords] def remove_short(tokens, minsize=3): - """Remove tokens smaller than `minsize` chars, which is 3 by default. - - Parameters - ---------- - tokens : - - minsize : - (Default value = 3) - - Returns - ------- - - """ + """Remove tokens smaller than `minsize` chars, which is 3 by default.""" return [token for token in tokens if len(token) >= minsize] def lower_to_unicode(text, encoding='utf8', errors='strict'): - """Lowercase `text` and convert to unicode. - - Parameters - ---------- - text : - - encoding : - (Default value = 'utf8') - errors : - (Default value = 'strict') - - Returns - ------- - - """ + """Lowercase `text` and convert to unicode.""" return utils.to_unicode(text.lower(), encoding, errors) def strip_multiple_whitespaces(s): - """Collapse multiple whitespace characters into a single space. - - Parameters - ---------- - s : - - - Returns - ------- - - """ + """Collapse multiple whitespace characters into a single space.""" return RE_WHITESPACE.sub(" ", s) @@ -208,12 +160,6 @@ def init_dictionary(self, dictionary): """If `dictionary` is None, initialize to an empty Dictionary, and then if there is an `input` for the corpus, add all documents from that `input`. If the `dictionary` is already initialized, simply set it as the corpus's `dictionary`. - - Parameters - ---------- - dictionary : - - """ self.dictionary = dictionary if dictionary is not None else Dictionary() if self.input is not None: @@ -244,13 +190,6 @@ def getstream(self): """Yield documents from the underlying plain text collection (of one or more files). 
Each item yielded from this method will be considered a document by subsequent preprocessing methods. - - Parameters - ---------- - - Yields - ------ - """ num_texts = 0 with utils.file_or_filename(self.input) as f: @@ -264,16 +203,11 @@ def preprocess_text(self, text): """Apply preprocessing to a single text document. This should perform tokenization in addition to any other desired preprocessing steps. - Parameters - ---------- - text : str - document text read from plain-text file. - - Returns - ------- - iterable of str - tokens produced from `text` as a result of preprocessing. + Args: + text (str): document text read from plain-text file. + Returns: + iterable of str: tokens produced from `text` as a result of preprocessing. """ for character_filter in self.character_filters: text = character_filter(text) @@ -287,15 +221,6 @@ def preprocess_text(self, text): def step_through_preprocess(self, text): """Yield tuples of functions and their output for each stage of preprocessing. This is useful for debugging issues with the corpus preprocessing pipeline. - - Parameters - ---------- - text : - - - Yields - ------ - """ for character_filter in self.character_filters: text = character_filter(text) @@ -315,12 +240,9 @@ def get_texts(self): to be overridden if the metadata you'd like to yield differs from the line number. - Yields - ------ - list of strings - each list corresponds to a preprocessed + Returns: + generator of lists of tokens (strings); each list corresponds to a preprocessed document from the corpus `input`. - """ lines = self.getstream() if self.metadata: @@ -339,24 +261,17 @@ def sample_texts(self, n, seed=None, length=None): Computing the corpus length may be a costly operation so you can use the optional parameter `length` instead. - Parameters - ---------- - n : int - number of documents we want to sample. - seed : int or None - if specified, use it as a seed for local random generator. (Default value = None) - length : int or None - if specified, use it as a guess of corpus length. - It must be positive and not greater than actual corpus length. (Default value = None) - - Yields - ------ + Args: + n (int): number of documents we want to sample. + seed (int|None): if specified, use it as a seed for local random generator. + length (int|None): if specified, use it as a guess of corpus length. + It must be positive and not greater than actual corpus length. - Raises - ------ - ValueError - when n is invalid or length was set incorrectly. + Yields: + list[str]: document represented as a list of tokens. See get_texts method. + Raises: + ValueError: when n is invalid or length was set incorrectly. """ random_generator = random if seed is None else random.Random(seed) if length is None: @@ -396,7 +311,6 @@ def __len__(self): class TextDirectoryCorpus(TextCorpus): """Read documents recursively from a directory, where each file (or line of each file) is interpreted as a plain text document. 
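    A minimal usage sketch, assuming a directory of plain-text files at an example path:

    >>> corpus = TextDirectoryCorpus('/path/to/texts')  # example directory path
    >>> for tokens in corpus.get_texts():  # each item: one preprocessed document as a list of tokens
    ...     pass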
- """ def __init__(self, input, dictionary=None, metadata=False, min_depth=0, max_depth=None, @@ -427,91 +341,46 @@ def __init__(self, input, dictionary=None, metadata=False, min_depth=0, max_dept @property def lines_are_documents(self): - """ """ return self._lines_are_documents @lines_are_documents.setter def lines_are_documents(self, lines_are_documents): - """ - - Parameters - ---------- - lines_are_documents : - - - """ self._lines_are_documents = lines_are_documents self.length = None @property def pattern(self): - """ """ return self._pattern @pattern.setter def pattern(self, pattern): - """ - - Parameters - ---------- - pattern : - - - """ self._pattern = None if pattern is None else re.compile(pattern) self.length = None @property def exclude_pattern(self): - """ """ return self._exclude_pattern @exclude_pattern.setter def exclude_pattern(self, pattern): - """ - - Parameters - ---------- - pattern : - - - """ self._exclude_pattern = None if pattern is None else re.compile(pattern) self.length = None @property def min_depth(self): - """ """ return self._min_depth @min_depth.setter def min_depth(self, min_depth): - """ - - Parameters - ---------- - min_depth : - - - """ self._min_depth = min_depth self.length = None @property def max_depth(self): - """ """ return self._max_depth @max_depth.setter def max_depth(self, max_depth): - """ - - Parameters - ---------- - max_depth : - - - """ self._max_depth = max_depth self.length = None @@ -519,10 +388,6 @@ def iter_filepaths(self): """Lazily yield paths to each file in the directory structure within the specified range of depths. If a filename pattern to match was given, further filter to only those filenames that match. - - Yields - ------ - """ for depth, dirpath, dirnames, filenames in walk(self.input): if self.min_depth <= depth <= self.max_depth: @@ -541,10 +406,6 @@ def getstream(self): If `lines_are_documents` was set to True, items will be lines from files. Otherwise there will be one item per file, containing the entire contents of the file. - - Yields - ------ - """ num_texts = 0 for path in self.iter_filepaths(): @@ -565,7 +426,6 @@ def __len__(self): return self.length def _cache_corpus_length(self): - """ """ if not self.lines_are_documents: self.length = sum(1 for _ in self.iter_filepaths()) else: @@ -576,23 +436,6 @@ def walk(top, topdown=True, onerror=None, followlinks=False, depth=0): """This is a mostly copied version of `os.walk` from the Python 2 source code. The only difference is that it returns the depth in the directory tree structure at which each yield is taking place. - - Parameters - ---------- - top : - - topdown : - (Default value = True) - onerror : - (Default value = None) - followlinks : - (Default value = False) - depth : - (Default value = 0) - - Yields - ------ - """ islink, join, isdir = os.path.islink, os.path.join, os.path.isdir diff --git a/gensim/corpora/ucicorpus.py b/gensim/corpora/ucicorpus.py index 0e8ea5c500..a8911ee07f 100644 --- a/gensim/corpora/ucicorpus.py +++ b/gensim/corpora/ucicorpus.py @@ -29,11 +29,11 @@ class UciReader(MmReader): def __init__(self, input): - """Initialize the reader. + """ + Initialize the reader. The `input` parameter refers to a file on the local filesystem, which is expected to be in the UCI Bag-of-Words format. 
- """ logger.info('Initializing corpus reader from %s', input) @@ -55,21 +55,14 @@ def __init__(self, input): ) def skip_headers(self, input_file): - """ - - Parameters - ---------- - input_file : - - - """ for lineno, _ in enumerate(input_file): if lineno == 2: break class UciWriter(MmWriter): - """Store a corpus in UCI Bag-of-Words format. + """ + Store a corpus in UCI Bag-of-Words format. This corpus format is identical to MM format, except for different file headers. There is no format line, and the first @@ -83,9 +76,8 @@ class UciWriter(MmWriter): FAKE_HEADER = utils.to_utf8(' ' * MAX_HEADER_LENGTH + '\n') def write_headers(self): - """Write blank header lines. - - Will be updated later, once corpus stats are known. + """ + Write blank header lines. Will be updated later, once corpus stats are known. """ for _ in range(3): self.fout.write(self.FAKE_HEADER) @@ -94,17 +86,8 @@ def write_headers(self): self.headers_written = True def update_headers(self, num_docs, num_terms, num_nnz): - """Update headers with actual values. - - Parameters - ---------- - num_docs : - - num_terms : - - num_nnz : - - + """ + Update headers with actual values. """ offset = 0 values = [utils.to_utf8(str(n)) for n in [num_docs, num_terms, num_nnz]] @@ -118,20 +101,6 @@ def update_headers(self, num_docs, num_terms, num_nnz): @staticmethod def write_corpus(fname, corpus, progress_cnt=1000, index=False): - """ - - Parameters - ---------- - fname : - - corpus : - - progress_cnt : - (Default value = 1000) - index : - (Default value = False) - - """ writer = UciWriter(fname) writer.write_headers() @@ -170,7 +139,9 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False): class UciCorpus(UciReader, IndexedCorpus): - """Corpus in the UCI bag-of-words format.""" + """ + Corpus in the UCI bag-of-words format. + """ def __init__(self, fname, fname_vocab=None): IndexedCorpus.__init__(self, fname) UciReader.__init__(self, fname) @@ -194,12 +165,9 @@ def __iter__(self): yield doc # get rid of docId, return the sparse vector only def create_dictionary(self): - """Utility method to generate gensim-style Dictionary directly from + """ + Utility method to generate gensim-style Dictionary directly from the corpus and vocabulary data. - - Returns - ------- - """ dictionary = Dictionary() @@ -225,30 +193,14 @@ def create_dictionary(self): @staticmethod def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False): - """Save a corpus in the UCI Bag-of-Words format. + """ + Save a corpus in the UCI Bag-of-Words format. There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file. This function is automatically called by `UciCorpus.serialize`; don't call it directly, call `serialize` instead. - - Parameters - ---------- - fname : - - corpus : - - id2word : - (Default value = None) - progress_cnt : - (Default value = 10000) - metadata : - (Default value = False) - - Returns - ------- - """ if id2word is None: logger.info("no word id mapping provided; initializing from corpus") From 57cb5a3f54ae739334735394049bf436f917625e Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Tue, 16 Jan 2018 13:56:22 +0500 Subject: [PATCH 38/48] Add more dots. 
:) --- gensim/corpora/bleicorpus.py | 21 ++++----- gensim/corpora/csvcorpus.py | 4 +- gensim/corpora/indexedcorpus.py | 18 ++++---- gensim/corpora/svmlightcorpus.py | 20 ++++----- gensim/corpora/wikicorpus.py | 73 ++++++++++++++++---------------- 5 files changed, 68 insertions(+), 68 deletions(-) diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index 69f7d2d1d7..70485eafb0 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -42,14 +42,15 @@ def __init__(self, fname, fname_vocab=None): Parameters ---------- fname : str - Serialized corpus's filename + Serialized corpus's filename. fname_vocab : str or None, optional - Vocabulary file; takes precedence over + Vocabulary file. If not given, searching for the + ``vocab``/``vocab.txt `` file. Raises ------ IOError - If vocabulary file doesn't exist + If vocabulary file doesn't exist. """ IndexedCorpus.__init__(self, fname) @@ -88,12 +89,12 @@ def line2doc(self, line): Parameters ---------- line : str - Document's string representation + Document's string representation. Returns ------- list of (int, float) - document's list representation + Document's list representation. """ parts = utils.to_unicode(line).split() @@ -113,13 +114,13 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): Parameters ---------- fname : str - Filename + Filename. corpus : iterable - Iterable of documents + Iterable of documents. id2word : dict of (str, str), optional - Transforms id to word (Default value = None) + Transforms id to word. metadata : bool - Any additional info (Default value = False) + Any additional info. Returns ------- @@ -157,7 +158,7 @@ def docbyoffset(self, offset): Parameters ---------- offset : int - Position of the document in the file + Position of the document in the file. Returns ------- diff --git a/gensim/corpora/csvcorpus.py b/gensim/corpora/csvcorpus.py index 7ce633af11..b40efc94d3 100644 --- a/gensim/corpora/csvcorpus.py +++ b/gensim/corpora/csvcorpus.py @@ -34,9 +34,9 @@ def __init__(self, fname, labels): Parameters ---------- fname : str - Filename + Filename. labels : bool - Whether to skip the first column + Whether to skip the first column. """ logger.info("loading corpus from %s", fname) diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py index 2e89061076..8319d67482 100644 --- a/gensim/corpora/indexedcorpus.py +++ b/gensim/corpora/indexedcorpus.py @@ -38,9 +38,9 @@ def __init__(self, fname, index_fname=None): Parameters ---------- fname : string - Corpus filename + Filename. index_fname : string or None - Index filename, or None for loading `fname`.index + Index filename, or None for loading `fname`.index. Examples -------- @@ -75,20 +75,20 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, Parameters ---------- fname : str - Filename + Filename. corpus : iterable - Iterable of documents + Iterable of documents. id2word : dict of (str, str), optional - Transforms id to word (Default value = None) + Transforms id to word. index_fname : str Where to save resulting index. Saved to `fname`.index if None. progress_cnt : int - Number of documents after which progress info is printed + Number of documents after which progress info is printed. labels : bool - Whether to skip the first column (class labels) + Whether to skip the first column (class labels). metadata : bool If True will ensure that serialize will write out - article titles to a pickle file. (Default value = False) + article titles to a pickle file. 
(Default value = False). Examples -------- @@ -149,7 +149,7 @@ def __getitem__(self, docno): Parameters ---------- docno : int - Document number + Document number. Returns ------- diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py index a93f1bfce1..97a0975efb 100644 --- a/gensim/corpora/svmlightcorpus.py +++ b/gensim/corpora/svmlightcorpus.py @@ -51,7 +51,7 @@ def __init__(self, fname, store_labels=True): Parameters ---------- fname: str - Corpus filename + Filename. store_labels : bool Whether to store labels (~SVM target class). They currently have no application but stored in `self.labels` for convenience by @@ -96,15 +96,15 @@ def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): Parameters ---------- fname : str - Corpus filename + Filename. corpus : iterable - Iterable of documents + Iterable of documents. id2word : dict of (str, str), optional - Transforms id to word (Default value = None) + Transforms id to word. labels : list or False - An SVMlight `` class tags or False if not present + An SVMlight `` class tags or False if not present. metadata : bool - Any additional info (Default value = False) + Any additional info. Returns ------- @@ -127,7 +127,7 @@ def docbyoffset(self, offset): Parameters ---------- offset : int - Document's position + Document's position. Returns ------- @@ -145,7 +145,7 @@ def line2doc(self, line): Parameters ---------- line : str - Line in SVMLight format + Line in SVMLight format. Returns ------- @@ -173,9 +173,9 @@ def doc2line(doc, label=0): Parameters ---------- doc : tuple of (int, float) - Document + Document. label : int - (Default value = 0) + Document label. Returns ------- diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index f13ac0165e..850090e316 100644 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -42,7 +42,7 @@ logger = logging.getLogger(__name__) ARTICLE_MIN_WORDS = 50 -"""Ignore shorter articles (after full preprocessing)""" +"""Ignore shorter articles (after full preprocessing).""" # default thresholds for lengths of individual tokens TOKEN_MIN_LEN = 2 @@ -50,37 +50,37 @@ RE_P0 = re.compile(r'', re.DOTALL | re.UNICODE) -"""Comments""" +"""Comments.""" RE_P1 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE) -"""Footnotes""" +"""Footnotes.""" RE_P2 = re.compile(r'(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$', re.UNICODE) -"""Links to languages""" +"""Links to languages.""" RE_P3 = re.compile(r'{{([^}{]*)}}', re.DOTALL | re.UNICODE) -"""Template""" +"""Template.""" RE_P4 = re.compile(r'{{([^}]*)}}', re.DOTALL | re.UNICODE) -"""Template""" +"""Template.""" RE_P5 = re.compile(r'\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE) -"""Remove URL, keep description""" +"""Remove URL, keep description.""" RE_P6 = re.compile(r'\[([^][]*)\|([^][]*)\]', re.DOTALL | re.UNICODE) -"""Simplify links, keep description""" +"""Simplify links, keep description.""" RE_P7 = re.compile(r'\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) -"""Keep description of images""" +"""Keep description of images.""" RE_P8 = re.compile(r'\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) -"""Keep description of files""" +"""Keep description of files.""" RE_P9 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE) -"""External links""" +"""External links.""" RE_P10 = re.compile(r' ].*?)(|/>)', re.DOTALL | re.UNICODE) -"""Math content""" +"""Math content.""" RE_P11 = re.compile(r'<(.*?)>', re.DOTALL | re.UNICODE) -"""All other tags""" +"""All other tags.""" RE_P12 = 
re.compile(r'\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE) -"""Table formatting""" +"""Table formatting.""" RE_P13 = re.compile(r'\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE) -"""Table cell formatting""" +"""Table cell formatting.""" RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE) -"""Categories""" +"""Categories.""" RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE) -"""Remove File and Image templates""" +"""Remove File and Image templates.""" IGNORED_NAMESPACES = [ 'Wikipedia', 'Category', 'File', 'Portal', 'Template', @@ -122,7 +122,7 @@ def remove_markup(text): Parameters ---------- text : str - String containing markup + String containing markup. Returns ------- @@ -226,7 +226,7 @@ def remove_file(s): Parameters ---------- s : str - String containing 'File:' and 'Image:' markup + String containing 'File:' and 'Image:' markup. Returns ------- @@ -254,11 +254,11 @@ def tokenize(content, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, Parameters ---------- content : str - String without markup (see `filter_wiki()`) + String without markup (see `filter_wiki()`). token_min_len : int token_max_len : int lower : bool - Whether to lowercase content + Whether to lowercase content. Returns ------- @@ -300,10 +300,10 @@ def extract_pages(f, filter_namespaces=False): Parameters ---------- f : File - File-like object + File-like object. filter_namespaces : list of str or bool - Namespaces to consider + Namespaces to consider. Yields ------ @@ -353,26 +353,25 @@ def extract_pages(f, filter_namespaces=False): def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True): - """Parse a wikipedia article, returning its content as a list of tokens - (utf8-encoded strings). + """Parse a wikipedia article, returning its content as a list of tokens. Set `tokenizer_func` (defaults to `tokenize`) parameter for languages like japanese or thai to perform better tokenization. The `tokenizer_func` needs to take 4 parameters: (text, token_min_len, token_max_len, lower). Parameters ---------- - args : - + args : list of (function, int, int, bool) + Meta info. tokenizer_func : - (Default value = tokenize) token_min_len : int token_max_len : int lower : bool - Whether to lowercase result + Whether to lowercase result. Returns ------- - tuple(list of str, str, str) + tokens : tuple(list of str, str, str) + UTF-8 encoded. """ text, lemmatize, title, pageid = args @@ -458,26 +457,26 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction Parameters ---------- fname : str - Filename + Filename. processes : int or None - Number of processes to run, defaults to *number of cpu - 1* + Number of processes to run, defaults to *number of cpu - 1*. lemmatize : bool Whether to use lemmatization instead of simple regexp tokenization. Defaults to `True` if *pattern* package installed - and to `False` otherwise + and to `False` otherwise. dictionary : `corpora.Dictionary` or None filter_namespaces : tuple of str - Namespaces to consider + Namespaces to consider. tokenizer_func : function(text, token_min_len, token_max_len, lower) Returns list of tokens. Set this parameter for languages like - japanese or thai to perform better tokenization + japanese or thai to perform better tokenization. article_min_tokens : int Minimum tokens in article. Article ignored if number of tokens is - less + less. token_min_len : int token_max_len : int lower : bool - Whether to lowercase texts + Whether to lowercase texts. 
Attributes ---------- From 08ca492136205dffcac4ae64ea64eb7e11abe8cf Mon Sep 17 00:00:00 2001 From: Timofey Yefimov Date: Tue, 16 Jan 2018 18:58:59 +0500 Subject: [PATCH 39/48] Fix monospace --- gensim/corpora/bleicorpus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index 70485eafb0..dd0bd95cc1 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -45,7 +45,7 @@ def __init__(self, fname, fname_vocab=None): Serialized corpus's filename. fname_vocab : str or None, optional Vocabulary file. If not given, searching for the - ``vocab``/``vocab.txt `` file. + vocab/vocab.txt file. Raises ------ From 381fb9757e18e3158c0041385a6cfc7d5aa0eb65 Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 18 Jan 2018 13:17:23 +0500 Subject: [PATCH 40/48] remove useless method --- gensim/interfaces.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/gensim/interfaces.py b/gensim/interfaces.py index 76e67dde8c..6cc7e8d872 100644 --- a/gensim/interfaces.py +++ b/gensim/interfaces.py @@ -103,10 +103,6 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): fmt = str(doc) # format the document appropriately... fout.write(utils.to_utf8("%s\n" % fmt)) # serialize the formatted document to disk - def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, - metadata=False): - pass - class TransformedCorpus(CorpusABC): def __init__(self, obj, corpus, chunksize=None, **kwargs): From 5b5701a84b81b9591943a339d4efe8dfea3bb9a5 Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 18 Jan 2018 13:54:57 +0500 Subject: [PATCH 41/48] fix bleicorpus --- gensim/corpora/bleicorpus.py | 57 +++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index dd0bd95cc1..605212b50c 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -5,7 +5,7 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -"""Blei's LDA-C format.""" +"""Сorpus in Blei's LDA-C format.""" from __future__ import with_statement @@ -17,7 +17,7 @@ from six.moves import xrange -logger = logging.getLogger('gensim.corpora.bleicorpus') +logger = logging.getLogger(__name__) class BleiCorpus(IndexedCorpus): @@ -28,24 +28,22 @@ class BleiCorpus(IndexedCorpus): Each document is one line:: - N fieldId1:fieldValue1 fieldId2:fieldValue2 ... fieldIdN:fieldValueN + N fieldId1:fieldValue1 fieldId2:fieldValue2 ... fieldIdN:fieldValueN - The vocabulary is a file with words, one word per line; word at line K has an - implicit ``id=K``. + + The vocabulary is a file with words, one word per line; word at line K has an implicit `id=K`. """ def __init__(self, fname, fname_vocab=None): """ - Initialize the corpus from a file. Parameters ---------- fname : str - Serialized corpus's filename. - fname_vocab : str or None, optional - Vocabulary file. If not given, searching for the - vocab/vocab.txt file. + File path to Serialized corpus. + fname_vocab : str, optional + Vocabulary file. If `fname_vocab` is None, searching for the vocab.txt or `fname_vocab`.vocab file. Raises ------ @@ -76,7 +74,14 @@ def __init__(self, fname, fname_vocab=None): self.id2word = dict(enumerate(words)) def __iter__(self): - """Iterate over the corpus, returning one sparse vector at a time.""" + """Iterate over the corpus, returning one sparse (BoW) vector at a time. 
+ + Yields + ------ + list of (int, float) + Document's BoW representation. + + """ lineno = -1 with utils.smart_open(self.fname) as fin: for lineno, line in enumerate(fin): @@ -84,17 +89,17 @@ def __iter__(self): self.length = lineno + 1 def line2doc(self, line): - """Convert line to document. + """Convert line in Blei LDA-C format to document (BoW representation). Parameters ---------- line : str - Document's string representation. + Line in Blei's LDA-C format. Returns ------- list of (int, float) - Document's list representation. + Document's BoW representation. """ parts = utils.to_unicode(line).split() @@ -108,23 +113,25 @@ def line2doc(self, line): def save_corpus(fname, corpus, id2word=None, metadata=False): """Save a corpus in the LDA-C format. - There are actually two files saved: `fname` and `fname.vocab`, where - `fname.vocab` is the vocabulary file. + Notes + ----- + There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file. Parameters ---------- fname : str - Filename. - corpus : iterable - Iterable of documents. + Path to output filename. + corpus : iterable of iterable of (int, float) + Input corpus id2word : dict of (str, str), optional - Transforms id to word. - metadata : bool - Any additional info. + Mapping id -> word for `corpus`. + metadata : bool, optional + THIS PARAMETER WILL BE IGNORED. Returns ------- list of int + Offsets for each line in file (in bytes). """ if id2word is None: @@ -153,16 +160,18 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): return offsets def docbyoffset(self, offset): - """Return document corresponding to `offset`. + """Get document corresponding to `offset`, + offset can be given from :meth:`~gensim.corpora.bleicorpus.BleiCorpus.save_corpus`. Parameters ---------- offset : int - Position of the document in the file. + Position of the document in the file (in bytes). Returns ------- list of (int, float) + Document in BoW format. """ with utils.smart_open(self.fname) as f: From 0e5c0cf963dd26c9bafa189a3d7dd12e844b5741 Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 18 Jan 2018 14:00:44 +0500 Subject: [PATCH 42/48] fix csvcorpus --- gensim/corpora/csvcorpus.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/gensim/corpora/csvcorpus.py b/gensim/corpora/csvcorpus.py index b40efc94d3..069e9acf3a 100644 --- a/gensim/corpora/csvcorpus.py +++ b/gensim/corpora/csvcorpus.py @@ -15,28 +15,28 @@ from gensim import interfaces, utils -logger = logging.getLogger('gensim.corpora.csvcorpus') +logger = logging.getLogger(__name__) class CsvCorpus(interfaces.CorpusABC): """Corpus in CSV format. - The CSV delimiter, headers etc. are guessed automatically based on the - file content. - + Notes + ----- + The CSV delimiter, headers etc. are guessed automatically based on the file content. All row values are expected to be ints/floats. """ def __init__(self, fname, labels): - """Initialize the corpus from a file. + """ Parameters ---------- fname : str - Filename. + Path to corpus in CSV format. labels : bool - Whether to skip the first column. + If True - skip first line (header). """ logger.info("loading corpus from %s", fname) @@ -51,11 +51,12 @@ def __init__(self, fname, labels): logger.info("sniffed CSV delimiter=%r, headers=%s", self.dialect.delimiter, self.headers) def __iter__(self): - """Iterate over the corpus, returning one sparse vector at a time. + """Iterate over the corpus, returning one BoW vector at a time. 
Yields ------ list of (int, float) + Document in BoW format. """ reader = csv.reader(utils.smart_open(self.fname), self.dialect) From 627c0e563e8d0c60216039a56d20842b1012ca55 Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 18 Jan 2018 14:46:26 +0500 Subject: [PATCH 43/48] fix indexedcorpus --- gensim/corpora/csvcorpus.py | 2 +- gensim/corpora/indexedcorpus.py | 114 ++++++++++++++++++-------------- 2 files changed, 66 insertions(+), 50 deletions(-) diff --git a/gensim/corpora/csvcorpus.py b/gensim/corpora/csvcorpus.py index 069e9acf3a..aa43dcac9e 100644 --- a/gensim/corpora/csvcorpus.py +++ b/gensim/corpora/csvcorpus.py @@ -36,7 +36,7 @@ def __init__(self, fname, labels): fname : str Path to corpus in CSV format. labels : bool - If True - skip first line (header). + If True - ignore first column (class labels). """ logger.info("loading corpus from %s", fname) diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py index 8319d67482..574eb79ee7 100644 --- a/gensim/corpora/indexedcorpus.py +++ b/gensim/corpora/indexedcorpus.py @@ -14,43 +14,44 @@ from gensim import interfaces, utils -logger = logging.getLogger('gensim.corpora.indexedcorpus') +logger = logging.getLogger(__name__) class IndexedCorpus(interfaces.CorpusABC): """Indexed corpus is a mechanism for random-accessing corpora. - While the standard corpus interface in gensim allows iterating over - corpus with `for doc in corpus: pass`, indexed corpus allows accessing - the documents with `corpus[docno]` (in O(1) look-up time). + While the standard corpus interface in gensim allows iterating over corpus, + we'll show it with :class:`~gensim.corpora.mmcorpus.MmCorpus`. + + >>> from gensim.corpora import MmCorpus + >>> from gensim.test.utils import datapath + >>> + >>> corpus = MmCorpus(datapath('testcorpus.mm')) + >>> for doc in corpus: + ... pass + + :class:`~gensim.corpora.indexedcorpus.IndexedCorpus` allows accessing the documents with index + in :math:`{O}(1)` look-up time. + + >>> document_index = 3 + >>> doc = corpus[document_index] Notes ----- - This functionality is achieved by storing an extra file (by default - named the same as the '{corpus name}.index') that stores the byte - offset of the beginning of each document. + This functionality is achieved by storing an extra file (by default named the same as the `fname.index`) + that stores the byte offset of the beginning of each document. """ def __init__(self, fname, index_fname=None): - """Initialize the corpus. + """ Parameters ---------- - fname : string - Filename. - index_fname : string or None - Index filename, or None for loading `fname`.index. - - Examples - -------- - >>> # save corpus in SvmLightCorpus format with an index - >>> corpus = [[(1, 0.5)], [(0, 1.0), (1, 2.0)]] - >>> gensim.corpora.SvmLightCorpus.serialize('testfile.svmlight', corpus) - >>> # load back as a document stream (*not* plain Python list) - >>> corpus_with_random_access = gensim.corpora.SvmLightCorpus('tstfile.svmlight') - >>> print(corpus_with_random_access[1]) - [(0, 1.0), (1, 2.0)] + fname : str + Path to indexed corpus. + index_fname : str, optional + Path to index, if not provided - used `fname.index`. """ try: @@ -67,34 +68,38 @@ def __init__(self, fname, index_fname=None): @classmethod def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False): - """Iterate through the document stream `corpus`. - - Saving the documents to - `fname` and recording byte offset of each document. 
+ """Serialize corpus with offset metadata, allows to use direct indexes after loading. Parameters ---------- fname : str - Filename. - corpus : iterable - Iterable of documents. + Path to output filename + corpus : iterable of iterable of (int, float) + Corpus in BoW format id2word : dict of (str, str), optional - Transforms id to word. - index_fname : str - Where to save resulting index. Saved to `fname`.index if None. - progress_cnt : int + Mapping id -> word. + index_fname : str, optional + Where to save resulting index, if None - store index to `fname`.index. + progress_cnt : int, optional Number of documents after which progress info is printed. - labels : bool - Whether to skip the first column (class labels). - metadata : bool - If True will ensure that serialize will write out - article titles to a pickle file. (Default value = False). + labels : bool, optional + If True - ignore first column (class labels). + metadata : bool, optional + If True - ensure that serialize will write out article titles to a pickle file. Examples -------- - >>> MmCorpus.serialize('test.mm', corpus) - >>> mm = MmCorpus('test.mm') # `mm` document stream now has random access - >>> print(mm[42]) # retrieve document no. 42, etc. + >>> from gensim.corpora import MmCorpus + >>> from gensim.test.utils import get_tmpfile + >>> + >>> corpus = [[(1, 0.3), (2, 0.1)], [(1, 0.1)], [(2, 0.3)]] + >>> output_fname = get_tmpfile("test.mm") + >>> + >>> MmCorpus.serialize(output_fname, corpus) + >>> mm = MmCorpus(output_fname) # `mm` document stream now has random access + >>> print(mm[1]) # retrieve document no. 42, etc. + [(1, 0.1)] + """ if getattr(corpus, 'fname', None) == fname: raise ValueError("identical input vs. output corpus filename, refusing to serialize: %s" % fname) @@ -125,15 +130,16 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, utils.pickle(offsets, index_fname) def __len__(self): - """ - Return the index length. + """Get the index length. - If the corpus is not indexed, also count corpus length and cache this - value. + Notes + ----- + If the corpus is not indexed, also count corpus length and cache this value. Returns ------- int + Length of index. """ if self.index is not None: @@ -144,16 +150,26 @@ def __len__(self): return self.length def __getitem__(self, docno): - """Return certain document. + """Get document by `docno` index. Parameters ---------- - docno : int - Document number. + docno : {int, iterable of int} + Document number or iterable of numbers (like a list of str). Returns ------- - `utils.SlicedCorpus` + list of (int, float) + If `docno` is int - return document in BoW format. + + :class:`~gensim.utils.SlicedCorpus` + If `docno` is iterable of int - return several documents in BoW format + wrapped to :class:`~gensim.utils.SlicedCorpus`. + + Raises + ------ + RuntimeError + If index isn't exist. """ if self.index is None: From b771bb54d6235f877210d2b3f18c4c9dcf15550f Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 18 Jan 2018 15:17:33 +0500 Subject: [PATCH 44/48] fix svmlightcorpus --- gensim/corpora/svmlightcorpus.py | 72 ++++++++++++++++---------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py index 97a0975efb..5bdd0a0c23 100644 --- a/gensim/corpora/svmlightcorpus.py +++ b/gensim/corpora/svmlightcorpus.py @@ -5,9 +5,7 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Corpus in SVMlight format. 
-""" +"""Corpus in SVMlight format.""" from __future__ import with_statement @@ -18,16 +16,16 @@ from gensim.corpora import IndexedCorpus -logger = logging.getLogger('gensim.corpora.svmlightcorpus') +logger = logging.getLogger(__name__) class SvmLightCorpus(IndexedCorpus): """Corpus in SVMlight format. Quoting http://svmlight.joachims.org/: - The input file contains the training examples. The first lines - may contain comments and are ignored if they start with #. Each of the following - lines represents one training example and is of the following format:: + The input file contains the training examples. The first lines may contain comments and are ignored + if they start with #. Each of the following lines represents one training example + and is of the following format:: .=. : : ... : # .=. +1 | -1 | 0 | @@ -37,25 +35,24 @@ class SvmLightCorpus(IndexedCorpus): The "qid" feature (used for SVMlight ranking), if present, is ignored. - Although not mentioned in the specification above, SVMlight also expect its - feature ids to be 1-based (counting starts at 1). We convert features to 0-base - internally by decrementing all ids when loading a SVMlight input file, and - increment them again when saving as SVMlight. + Notes + ----- + Although not mentioned in the specification above, SVMlight also expect its feature ids to be 1-based + (counting starts at 1). We convert features to 0-base internally by decrementing all ids when loading a SVMlight + input file, and increment them again when saving as SVMlight. """ def __init__(self, fname, store_labels=True): """ - Initialize the corpus from a file. Parameters ---------- fname: str - Filename. - store_labels : bool - Whether to store labels (~SVM target class). They currently have - no application but stored in `self.labels` for convenience by - default. + Path to corpus in SVMlight format. + store_labels : bool, optional + Whether to store labels (~SVM target class). They currently have no application but stored + in `self.labels` for convenience by default. """ IndexedCorpus.__init__(self, fname) @@ -67,12 +64,12 @@ def __init__(self, fname, store_labels=True): self.labels = [] def __iter__(self): - """ - Iterate over the corpus, returning one sparse vector at a time. + """ Iterate over the corpus, returning one sparse (BoW) vector at a time. Yields ------ list of (int, float) + Document in BoW format. """ lineno = -1 @@ -90,25 +87,26 @@ def __iter__(self): def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): """Save a corpus in the SVMlight format. - The SVMlight `` class tag is taken from the `labels` array, or set - to 0 for all documents if `labels` is not supplied. + The SVMlight `` class tag is taken from the `labels` array, or set to 0 for all documents + if `labels` is not supplied. Parameters ---------- fname : str - Filename. - corpus : iterable - Iterable of documents. + Path to output file. + corpus : iterable of iterable of (int, float) + Corpus in BoW format. id2word : dict of (str, str), optional - Transforms id to word. + Mapping id -> word. labels : list or False An SVMlight `` class tags or False if not present. metadata : bool - Any additional info. + ARGUMENT WILL BE IGNORED. Returns ------- list of int + Offsets for each line in file (in bytes). 
""" logger.info("converting corpus to SVMlight format: %s", fname) @@ -122,7 +120,7 @@ def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): return offsets def docbyoffset(self, offset): - """Return the document stored at file position `offset`. + """Get the document stored at file position `offset`. Parameters ---------- @@ -140,7 +138,8 @@ def docbyoffset(self, offset): # TODO: it brakes if gets None from line2doc def line2doc(self, line): - """Create a document from a single line (string) in SVMlight format. + """Get a document from a single line in SVMlight format, + inverse of :meth:`~gensim.corpora.svmlightcorpus.SvmLightCorpus.doc2line`. Parameters ---------- @@ -149,7 +148,8 @@ def line2doc(self, line): Returns ------- - (tuple of (int, float)) or None + (list of (int, float), str) + Document in BoW format and target class label. """ line = utils.to_unicode(line) @@ -166,20 +166,20 @@ def line2doc(self, line): @staticmethod def doc2line(doc, label=0): - """Output the document in SVMlight format, as a string. - - Inverse function to `line2doc`. + """Convert BoW representation of document in SVMlight format, + inverse of :meth:`~gensim.corpora.svmlightcorpus.SvmLightCorpus.line2doc`. Parameters ---------- - doc : tuple of (int, float) - Document. - label : int - Document label. + doc : list of (int, float) + Document in BoW format. + label : int, optional + Document label (if provided). Returns ------- str + `doc` in SVMlight format. """ pairs = ' '.join("%i:%s" % (termid + 1, termval) for termid, termval in doc) # +1 to convert 0-base to 1-base From d76af8da39ac3825a274c7fbc4922bb5d6f20fd4 Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 18 Jan 2018 15:37:11 +0500 Subject: [PATCH 45/48] fix wikicorpus[1] --- gensim/corpora/wikicorpus.py | 47 +++++++++++++++++------------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index 850090e316..578b514fca 100644 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -7,17 +7,14 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Construct a corpus from a Wikipedia (or other MediaWiki-based) database dump. +"""Construct a corpus from a Wikipedia (or other MediaWiki-based) database dump. Notes ----- -If you have the `pattern` package installed, this module will use a fancy -lemmatization to get a lemma of each token (instead of plain alphabetic -tokenizer). The package is available at [1]_ . +If you have the `pattern` package installed, this module will use a fancy lemmatization to get a lemma +of each token (instead of plain alphabetic tokenizer). The package is available at [1]_ . -See scripts/process_wiki.py for a canned (example) script based on this -module. +See :mod:`~gensim.scripts.make_wiki` for a canned (example) script based on this module. References ---------- @@ -107,6 +104,7 @@ def filter_wiki(raw): Returns ------- str + `raw` without markup. """ # parsing of the wiki markup is not perfect, but sufficient for our purposes @@ -127,6 +125,7 @@ def remove_markup(text): Returns ------- str + `text` without markup. """ text = re.sub(RE_P2, '', text) # remove the last list (=languages) @@ -166,10 +165,6 @@ def remove_markup(text): def remove_template(s): """Remove template wikimedia markup. - Return a copy of `s` with all the wikimedia markup template removed. See - [4]_ for wikimedia templates - details. 
- Parameters ---------- s : str @@ -178,11 +173,11 @@ def remove_template(s): Returns ------- str + Сopy of `s` with all the wikimedia markup template removed. See [4]_ for wikimedia templates details. Notes ----- - Since template can be nested, it is difficult remove them using - regular expresssions. + Since template can be nested, it is difficult remove them using regular expressions. References ---------- @@ -220,9 +215,6 @@ def remove_template(s): def remove_file(s): """Remove the 'File:' and 'Image:' markup, keeping the file caption. - Return a copy of `s` with all the 'File:' and 'Image:' markup replaced by - their corresponding captions. [3]_ - Parameters ---------- s : str @@ -231,6 +223,7 @@ def remove_file(s): Returns ------- str + Сopy of `s` with all the 'File:' and 'Image:' markup replaced by their corresponding captions. [3]_ References ---------- @@ -248,21 +241,23 @@ def remove_file(s): def tokenize(content, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True): """Tokenize a piece of text from wikipedia. - Set `token_min_len`, `token_max_len` as character length (not bytes!) - thresholds for individual tokens. + Set `token_min_len`, `token_max_len` as character length (not bytes!) thresholds for individual tokens. Parameters ---------- content : str - String without markup (see `filter_wiki()`). + String without markup (see :func:`~gensim.corpora.wikicorpus.filter_wiki`). token_min_len : int + Minimal token length. token_max_len : int + Maximal token length lower : bool - Whether to lowercase content. + If True - convert `content` to lower case. Returns ------- list of str + List of tokens from `content`. """ # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.) @@ -273,15 +268,17 @@ def tokenize(content, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, def get_namespace(tag): - """Returns the namespace of tag. + """Get the namespace of tag. Parameters ---------- tag : str + Namespace or tag. Returns ------- str + Matched namespace or tag. """ m = re.match("^{(.*?)}", tag) @@ -295,19 +292,19 @@ def get_namespace(tag): def extract_pages(f, filter_namespaces=False): - """Extract pages from a MediaWiki database dump = open file-like object `f`. + """Extract pages from a MediaWiki database dump. Parameters ---------- - f : File + f : file File-like object. - filter_namespaces : list of str or bool - Namespaces to consider. + Namespaces that will be extracted Yields ------ tuple of (str or None, str, str) + Title, text and page id. """ elems = (elem for _, elem in iterparse(f, events=("end",))) From 7fe753f2cd315783a152b8d1eb2740e670465120 Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 18 Jan 2018 15:56:52 +0500 Subject: [PATCH 46/48] fix wikicorpus[2] --- gensim/corpora/wikicorpus.py | 46 ++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index 578b514fca..0299cc3c70 100644 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -350,25 +350,33 @@ def extract_pages(f, filter_namespaces=False): def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True): - """Parse a wikipedia article, returning its content as a list of tokens. + """Parse a wikipedia article, extract all tokens. - Set `tokenizer_func` (defaults to `tokenize`) parameter for languages like japanese or thai to perform better - tokenization. 
The `tokenizer_func` needs to take 4 parameters: (text, token_min_len, token_max_len, lower). + Notes + ----- + Set `tokenizer_func` (defaults is :func:`~gensim.corpora.wikicorpus.tokenize`) parameter for languages + like japanese or thai to perform better tokenization. + The `tokenizer_func` needs to take 4 parameters: (text: str, token_min_len: int, token_max_len: int, lower: bool). Parameters ---------- - args : list of (function, int, int, bool) - Meta info. - tokenizer_func : + args : (str, bool, str, int) + Article text, lemmatize flag (if True, :func:`~gensim.utils.lemmatize` will be used), article title, + page identificator. + tokenizer_func : function + Function for tokenization (defaults is :func:`~gensim.corpora.wikicorpus.tokenize`). + Needs to take 4 parameters: (text: str, token_min_len: int, token_max_len: int, lower: bool). token_min_len : int + Minimal token length. token_max_len : int + Maximal token length. lower : bool - Whether to lowercase result. + If True - convert article text to lower case. Returns ------- - tokens : tuple(list of str, str, str) - UTF-8 encoded. + (list of str, str, int) + List of tokens from article, title and page id. """ text, lemmatize, title, pageid = args @@ -383,8 +391,8 @@ def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN, def init_to_ignore_interrupt(): """Enables interruption ignoring. - Notes - ----- + Warnings + -------- Should only be used when master is prepared to handle termination of child processes. @@ -393,22 +401,24 @@ def init_to_ignore_interrupt(): def _process_article(args): - """Same as `process_article`, but with args in list format. + """Same as :func:`~gensim.corpora.wikicorpus.process_article`, but with args in list format. Parameters ---------- - args : list of (function, int, int, bool) + args : [(str, bool, str, int), (function, int, int, bool)] + First element - same as `args` from :func:`~gensim.corpora.wikicorpus.process_article`, + second element is tokenizer function, token minimal length, token maximal length, lowercase flag. Returns ------- - tuple(list of str, str, str) + (list of str, str, int) + List of tokens from article, title and page id. - Notes - ----- - Should not be called explicitly. Use `process_article` instead. + Warnings + -------- + Should not be called explicitly. Use :func:`~gensim.corpora.wikicorpus.process_article` instead. """ - tokenizer_func, token_min_len, token_max_len, lower = args[-1] args = args[:-1] From a9eb1a3f4e1db35543cf1b2643e7165564080cc3 Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 18 Jan 2018 16:19:05 +0500 Subject: [PATCH 47/48] fix wikicorpus[3] --- gensim/corpora/wikicorpus.py | 91 +++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 44 deletions(-) diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index 0299cc3c70..6219ac47c9 100644 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -365,7 +365,8 @@ def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN, page identificator. tokenizer_func : function Function for tokenization (defaults is :func:`~gensim.corpora.wikicorpus.tokenize`). - Needs to take 4 parameters: (text: str, token_min_len: int, token_max_len: int, lower: bool). + Needs to have interface: + tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str. token_min_len : int Minimal token length. 
token_max_len : int @@ -429,26 +430,33 @@ def _process_article(args): class WikiCorpus(TextCorpus): - """ - Treat a wikipedia articles dump as a (read-only) corpus. + """Treat a wikipedia articles dump as a **read-only** corpus. Supported dump formats: - *wiki--pages-articles.xml.bz2* - - *wiki-latest-pages-articles.xml.bz2* + * wiki--pages-articles.xml.bz2 + * wiki-latest-pages-articles.xml.bz2 - The documents are extracted on-the-fly, so that the whole (massive) dump - can stay compressed on disk. + The documents are extracted on-the-fly, so that the whole (massive) dump can stay compressed on disk. Notes ----- - "Multistream" archives are *not* supported in Python 2 due to - `limitations in the core bz2 library + Dumps for English wikipedia can be founded `here `_. + + Attributes + ---------- + metadata : bool + Whether to write articles titles to serialized corpus. + + Warnings + -------- + "Multistream" archives are *not* supported in Python 2 due to `limitations in the core bz2 library `_. Examples -------- + >>> from gensim.corpora import WikiCorpus, MmCorpus + >>> >>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id mapping, takes almost 8h >>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format and mapping @@ -464,31 +472,29 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction Parameters ---------- fname : str - Filename. - processes : int or None - Number of processes to run, defaults to *number of cpu - 1*. + Path to file with wikipedia dump. + processes : int, optional + Number of processes to run, defaults to **number of cpu - 1**. lemmatize : bool - Whether to use lemmatization instead of simple regexp - tokenization. Defaults to `True` if *pattern* package installed - and to `False` otherwise. - dictionary : `corpora.Dictionary` or None + Whether to use lemmatization instead of simple regexp tokenization. + Defaults to `True` if *pattern* package installed. + dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional + Dictionary, if not provided, this scans the corpus once, to determine its vocabulary + (this needs **really long time**). filter_namespaces : tuple of str Namespaces to consider. - tokenizer_func : function(text, token_min_len, token_max_len, lower) - Returns list of tokens. Set this parameter for languages like - japanese or thai to perform better tokenization. - article_min_tokens : int - Minimum tokens in article. Article ignored if number of tokens is - less. - token_min_len : int - token_max_len : int - lower : bool - Whether to lowercase texts. - - Attributes - ---------- - metadata : bool - Whether to write articles titles to serialized corpus. + tokenizer_func : function, optional + Function that will be used for tokenization. By default, use :func:`~gensim.corpora.wikicorpus.tokenize`. + Need to support interface: + tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str. + article_min_tokens : int, optional + Minimum tokens in article. Article will be ignored if number of tokens is less. + token_min_len : int, optional + Minimal token length. + token_max_len : int, optional + Maximal token length. + lower : bool, optional + If True - convert all text to lower case. """ self.fname = fname @@ -508,24 +514,21 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction def get_texts(self): """Iterate over the dump, yielding list of tokens for each article. 
- Yields - ------ - (list of str) or tuple(list of str, tuple(str, str))) - Notes ----- - Only articles of sufficient length are returned (short articles, - redirects, etc. are ignored). This is controlled by - `article_min_tokens` on the class instance. - - Examples - -------- - Note that this iterates over the **texts**; if you want vectors, - just use the standard corpus interface instead of this function: + This iterates over the **texts**. If you want vectors, just use the standard corpus interface + instead of this method >>> for vec in wiki_corpus: >>> print(vec) + Yields + ------ + list of str + If `metadata` is False, yield only list of token extracted from the article. + (list of str, (int, str)) + List of tokens (extracted from the article), page id and article title otherwise. + """ articles, articles_all = 0, 0 From e3a8ebf71fb9f6f8b248bad16d0d94baabf9a28d Mon Sep 17 00:00:00 2001 From: ivan Date: Mon, 22 Jan 2018 15:23:17 +0500 Subject: [PATCH 48/48] fix review comments --- gensim/corpora/bleicorpus.py | 17 +++++++++++------ gensim/corpora/csvcorpus.py | 2 +- gensim/corpora/indexedcorpus.py | 6 +++--- gensim/corpora/svmlightcorpus.py | 10 +++++----- gensim/corpora/wikicorpus.py | 8 ++++---- 5 files changed, 24 insertions(+), 19 deletions(-) diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index 605212b50c..b0e5094ac0 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -41,9 +41,14 @@ def __init__(self, fname, fname_vocab=None): Parameters ---------- fname : str - File path to Serialized corpus. + Path to corpus. fname_vocab : str, optional - Vocabulary file. If `fname_vocab` is None, searching for the vocab.txt or `fname_vocab`.vocab file. + Vocabulary file. If `fname_vocab` is None, searching one of variants: + + * `fname`.vocab + * `fname`/vocab.txt + * `fname_without_ext`.vocab + * `fname_folder`/vocab.txt Raises ------ @@ -120,9 +125,9 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): Parameters ---------- fname : str - Path to output filename. + Path to output file. corpus : iterable of iterable of (int, float) - Input corpus + Input corpus in BoW format. id2word : dict of (str, str), optional Mapping id -> word for `corpus`. metadata : bool, optional @@ -160,8 +165,8 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): return offsets def docbyoffset(self, offset): - """Get document corresponding to `offset`, - offset can be given from :meth:`~gensim.corpora.bleicorpus.BleiCorpus.save_corpus`. + """Get document corresponding to `offset`. + Offset can be given from :meth:`~gensim.corpora.bleicorpus.BleiCorpus.save_corpus`. Parameters ---------- diff --git a/gensim/corpora/csvcorpus.py b/gensim/corpora/csvcorpus.py index aa43dcac9e..16a88a93e9 100644 --- a/gensim/corpora/csvcorpus.py +++ b/gensim/corpora/csvcorpus.py @@ -34,7 +34,7 @@ def __init__(self, fname, labels): Parameters ---------- fname : str - Path to corpus in CSV format. + Path to corpus. labels : bool If True - ignore first column (class labels). diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py index 574eb79ee7..c4e58cb95a 100644 --- a/gensim/corpora/indexedcorpus.py +++ b/gensim/corpora/indexedcorpus.py @@ -49,7 +49,7 @@ def __init__(self, fname, index_fname=None): Parameters ---------- fname : str - Path to indexed corpus. + Path to corpus. index_fname : str, optional Path to index, if not provided - used `fname.index`. 
 
@@ -73,9 +73,9 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None,
         Parameters
         ----------
         fname : str
-            Path to output filename
+            Path to output file.
         corpus : iterable of iterable of (int, float)
-            Corpus in BoW format
+            Corpus in BoW format.
         id2word : dict of (str, str), optional
             Mapping id -> word.
         index_fname : str, optional
diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py
index 5bdd0a0c23..459274cfae 100644
--- a/gensim/corpora/svmlightcorpus.py
+++ b/gensim/corpora/svmlightcorpus.py
@@ -49,7 +49,7 @@ def __init__(self, fname, store_labels=True):
         Parameters
         ----------
         fname: str
-            Path to corpus in SVMlight format.
+            Path to corpus.
         store_labels : bool, optional
             Whether to store labels (~SVM target class). They currently have no application but stored
             in `self.labels` for convenience by default.
@@ -138,8 +138,8 @@ def docbyoffset(self, offset):
 
     # TODO: it brakes if gets None from line2doc
     def line2doc(self, line):
-        """Get a document from a single line in SVMlight format,
-        inverse of :meth:`~gensim.corpora.svmlightcorpus.SvmLightCorpus.doc2line`.
+        """Get a document from a single line in SVMlight format.
+        This method is the inverse of :meth:`~gensim.corpora.svmlightcorpus.SvmLightCorpus.doc2line`.
 
         Parameters
         ----------
@@ -166,8 +166,8 @@ def line2doc(self, line):
 
     @staticmethod
     def doc2line(doc, label=0):
-        """Convert BoW representation of document in SVMlight format,
-        inverse of :meth:`~gensim.corpora.svmlightcorpus.SvmLightCorpus.line2doc`.
+        """Convert BoW representation of document in SVMlight format.
+        This method is the inverse of :meth:`~gensim.corpora.svmlightcorpus.SvmLightCorpus.line2doc`.
 
         Parameters
         ----------
diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py
index 6219ac47c9..cd57f20109 100644
--- a/gensim/corpora/wikicorpus.py
+++ b/gensim/corpora/wikicorpus.py
@@ -168,7 +168,7 @@ def remove_template(s):
     Parameters
     ----------
     s : str
-        String containing markup template
+        String containing markup template.
 
     Returns
     -------
@@ -250,7 +250,7 @@ def tokenize(content, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN,
     token_min_len : int
         Minimal token length.
     token_max_len : int
-        Maximal token length
+        Maximal token length.
     lower : bool
         If True - convert `content` to lower case.
 
@@ -299,7 +299,7 @@ def extract_pages(f, filter_namespaces=False):
     f : file
         File-like object.
     filter_namespaces : list of str or bool
-        Namespaces that will be extracted
+        Namespaces that will be extracted.
 
     Yields
     ------
@@ -517,7 +517,7 @@ def get_texts(self):
         Notes
         -----
         This iterates over the **texts**. If you want vectors, just use the standard corpus interface
-        instead of this method
+        instead of this method:
 
         >>> for vec in wiki_corpus:
         >>>     print(vec)