Merge pull request #4 from piskvorky/develop

catchup gojomo/gensim develop
piskvorky · Jun 6, 2015 · 1f23a65 · 1f23a65
2 parents 3768a6b + 7e542db
commit 1f23a65
Show file tree

Hide file tree

Showing 38 changed files with 3,167 additions and 1,121 deletions.
diff --git a/CHANGELOG.txt b/CHANGELOG.txt
@@ -1,10 +1,19 @@
 Changes
 =======
 
-0.10.4
-
-* make utils.decode_htmlentities more robust and reliable on narrow Python builds
-* Move wrappers for external modeling programs into a submodule (Christopher Corley, #295)
+0.11.0 = 0.11.1 = 0.11.1-1, 10/04/2015
+
+* added "topic ranking" to sort topics by coherence in LdaModel (jtmcmc, #311)
+* new fast ShardedCorpus out-of-core corpus (Jan Hajic jr., #284)
+* utils.smart_open now uses the smart_open package (#316)
+* new wrapper for LDA in Vowpal Wabbit (Dave Challis, #304)
+* improvements to the DtmModel wrapper (Yang Han, #272, #277)
+* move wrappers for external modeling programs into a submodule (Christopher Corley, #295)
+* allow transparent compression of NumPy files in save/load (Christopher Corley, #248)
+* save/load methods now accept file handles, in addition to file names (macks22, #292)
+* fixes to LdaMulticore on Windows (Feng Mai, #305)
+* lots of small fixes & py3k compatibility improvements (Chyi-Kwei Yau, Daniel Nouri, Timothy Emerick, Juarez Bochi, Christopher Corley, Chirag Nagpal, Jan Hajic jr., Flávio Codeço Coelho)
+* re-released as 0.11.1 and 0.11.1-1 because of a packaging bug
 
 0.10.3, 17/11/2014
 

diff --git a/README.rst b/README.rst
@@ -48,7 +48,7 @@ The simple way to install `gensim` is::
     pip install -U gensim
 
 Or, if you have instead downloaded and unzipped the `source tar.gz <http://pypi.python.org/pypi/gensim>`_ package,
-you'll need to run::
+you'd run::
 
     python setup.py test
     python setup.py install

diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst
@@ -37,6 +37,7 @@ Modules:
     models/phrases
     models/wrappers/ldamallet
     models/wrappers/dtmmodel
+    models/wrappers/ldavowpalwabbit
     similarities/docsim
     similarities/simserver
 
diff --git a/docs/src/conf.py b/docs/src/conf.py
@@ -52,9 +52,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '0.10.3'
+version = '0.11.1'
 # The full version, including alpha/beta/rc tags.
-release = '0.10.3'
+release = '0.11.1'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/docs/src/models/wrappers/ldavowpalwabbit.rst b/docs/src/models/wrappers/ldavowpalwabbit.rst
@@ -0,0 +1,9 @@
+:mod:`models.wrappers.ldavowpalwabbit` -- Latent Dirichlet Allocation via Vowpal Wabbit
+=======================================================================================
+
+.. automodule:: gensim.models.wrappers.ldavowpalwabbit
+    :synopsis: Latent Dirichlet Allocation via Vowpal Wabbit
+    :members:
+    :inherited-members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/gensim/__init__.py b/gensim/__init__.py
@@ -3,7 +3,7 @@
 similarities within a corpus of documents.
 """
 
-from gensim import utils, matutils, interfaces, corpora, models, similarities
+from gensim import parsing, matutils, interfaces, corpora, models, similarities
 import logging
 
 try:

diff --git a/gensim/corpora/__init__.py b/gensim/corpora/__init__.py
@@ -15,3 +15,4 @@
 from .textcorpus import TextCorpus
 from .ucicorpus import UciCorpus
 from .malletcorpus import MalletCorpus
+from .sharded_corpus import ShardedCorpus
diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py
@@ -51,10 +51,10 @@ def __init__(self, fname, fname_vocab=None):
             fname_base, _ = path.splitext(fname)
             fname_dir = path.dirname(fname)
             for fname_vocab in [
-                        fname + '.vocab',
-                        fname + '/vocab.txt',
-                        fname_base + '.vocab',
-                        fname_dir + '/vocab.txt',
+                        utils.smart_extension(fname, '.vocab'),
+                        utils.smart_extension(fname, '/vocab.txt'),
+                        utils.smart_extension(fname_base, '.vocab'),
+                        utils.smart_extension(fname_dir, '/vocab.txt'),
                         ]:
                 if path.exists(fname_vocab):
                     break
@@ -112,7 +112,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
                 fout.write(utils.to_utf8("%i %s\n" % (len(doc), ' '.join(parts))))
 
         # write out vocabulary, in a format compatible with Blei's topics.py script
-        fname_vocab = fname + '.vocab'
+        fname_vocab = utils.smart_extension(fname, '.vocab')
         logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab))
         with utils.smart_open(fname_vocab, 'wb') as fout:
             for featureid in xrange(num_terms):

diff --git a/gensim/corpora/csvcorpus.py b/gensim/corpora/csvcorpus.py
@@ -16,7 +16,7 @@
 import csv
 import itertools
 
-from gensim import interfaces
+from gensim import interfaces, utils
 
 logger = logging.getLogger('gensim.corpora.csvcorpus')
 
@@ -42,7 +42,7 @@ def __init__(self, fname, labels):
         self.labels = labels
 
         # load the first few lines, to guess the CSV dialect
-        head = ''.join(itertools.islice(open(self.fname), 5))
+        head = ''.join(itertools.islice(utils.smart_open(self.fname), 5))
         self.headers = csv.Sniffer().has_header(head)
         self.dialect = csv.Sniffer().sniff(head)
         logger.info("sniffed CSV delimiter=%r, headers=%s" % (self.dialect.delimiter, self.headers))
@@ -52,7 +52,7 @@ def __iter__(self):
         Iterate over the corpus, returning one sparse vector at a time.
 
         """
-        reader = csv.reader(open(self.fname), self.dialect)
+        reader = csv.reader(utils.smart_open(self.fname), self.dialect)
         if self.headers:
             next(reader)    # skip the headers
 

diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py
@@ -213,7 +213,6 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
 
         # do the actual filtering, then rebuild dictionary to remove gaps in ids
         self.filter_tokens(good_ids=good_ids)
-        self.compactify()
         logger.info("resulting dictionary: %s" % self)
 
 
@@ -240,6 +239,7 @@ def filter_tokens(self, bad_ids=None, good_ids=None):
             self.dfs = dict((tokenid, freq)
                             for tokenid, freq in iteritems(self.dfs)
                             if tokenid in good_ids)
+        self.compactify()
 
 
     def compactify(self):
@@ -392,4 +392,5 @@ def from_corpus(corpus, id2word=None):
         logger.info("built %s from %i documents (total %i corpus positions)" %
                      (result, result.num_docs, result.num_pos))
         return result
+
 #endclass Dictionary
diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py
@@ -44,7 +44,7 @@ def __init__(self, fname, index_fname=None):
         """
         try:
             if index_fname is None:
-                index_fname = fname + '.index'
+                index_fname = utils.smart_extension(fname, '.index')
             self.index = utils.unpickle(index_fname)
             logger.info("loaded corpus index from %s" % index_fname)
         except:
@@ -76,7 +76,7 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres
             raise ValueError("identical input vs. output corpus filename, refusing to serialize: %s" % fname)
 
         if index_fname is None:
-            index_fname = fname + '.index'
+            index_fname = utils.smart_extension(fname, '.index')
 
         if progress_cnt is not None:
             if labels is not None: