Skip to content

Commit

Permalink
Merge pull request #4 from piskvorky/develop
Browse files Browse the repository at this point in the history
catchup gojomo/gensim develop
  • Loading branch information
gojomo committed Jun 6, 2015
2 parents 3768a6b + 7e542db commit 1f23a65
Show file tree
Hide file tree
Showing 38 changed files with 3,167 additions and 1,121 deletions.
17 changes: 13 additions & 4 deletions CHANGELOG.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
Changes
=======

0.10.4

* make utils.decode_htmlentities more robust and reliable on narrow Python builds
* Move wrappers for external modeling programs into a submodule (Christopher Corley, #295)
0.11.0 = 0.11.1 = 0.11.1-1, 10/04/2015

* added "topic ranking" to sort topics by coherence in LdaModel (jtmcmc, #311)
* new fast ShardedCorpus out-of-core corpus (Jan Hajic jr., #284)
* utils.smart_open now uses the smart_open package (#316)
* new wrapper for LDA in Vowpal Wabbit (Dave Challis, #304)
* improvements to the DtmModel wrapper (Yang Han, #272, #277)
* move wrappers for external modeling programs into a submodule (Christopher Corley, #295)
* allow transparent compression of NumPy files in save/load (Christopher Corley, #248)
* save/load methods now accept file handles, in addition to file names (macks22, #292)
* fixes to LdaMulticore on Windows (Feng Mai, #305)
* lots of small fixes & py3k compatibility improvements (Chyi-Kwei Yau, Daniel Nouri, Timothy Emerick, Juarez Bochi, Christopher Corley, Chirag Nagpal, Jan Hajic jr., Flávio Codeço Coelho)
* re-released as 0.11.1 and 0.11.1-1 because of a packaging bug

0.10.3, 17/11/2014

Expand Down
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ The simple way to install `gensim` is::
pip install -U gensim

Or, if you have instead downloaded and unzipped the `source tar.gz <http://pypi.python.org/pypi/gensim>`_ package,
you'll need to run::
you'd run::

python setup.py test
python setup.py install
Expand Down
1 change: 1 addition & 0 deletions docs/src/apiref.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ Modules:
models/phrases
models/wrappers/ldamallet
models/wrappers/dtmmodel
models/wrappers/ldavowpalwabbit
similarities/docsim
similarities/simserver

4 changes: 2 additions & 2 deletions docs/src/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,9 @@
# built documents.
#
# The short X.Y version.
version = '0.10.3'
version = '0.11.1'
# The full version, including alpha/beta/rc tags.
release = '0.10.3'
release = '0.11.1'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
9 changes: 9 additions & 0 deletions docs/src/models/wrappers/ldavowpalwabbit.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
:mod:`models.wrappers.ldavowpalwabbit` -- Latent Dirichlet Allocation via Vowpal Wabbit
=======================================================================================

.. automodule:: gensim.models.wrappers.ldavowpalwabbit
    :synopsis: Latent Dirichlet Allocation via Vowpal Wabbit
    :members:
    :inherited-members:
    :undoc-members:
    :show-inheritance:
2 changes: 1 addition & 1 deletion gensim/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
similarities within a corpus of documents.
"""

from gensim import utils, matutils, interfaces, corpora, models, similarities
from gensim import parsing, matutils, interfaces, corpora, models, similarities
import logging

try:
Expand Down
1 change: 1 addition & 0 deletions gensim/corpora/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@
from .textcorpus import TextCorpus
from .ucicorpus import UciCorpus
from .malletcorpus import MalletCorpus
from .sharded_corpus import ShardedCorpus
10 changes: 5 additions & 5 deletions gensim/corpora/bleicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,10 @@ def __init__(self, fname, fname_vocab=None):
fname_base, _ = path.splitext(fname)
fname_dir = path.dirname(fname)
for fname_vocab in [
fname + '.vocab',
fname + '/vocab.txt',
fname_base + '.vocab',
fname_dir + '/vocab.txt',
utils.smart_extension(fname, '.vocab'),
utils.smart_extension(fname, '/vocab.txt'),
utils.smart_extension(fname_base, '.vocab'),
utils.smart_extension(fname_dir, '/vocab.txt'),
]:
if path.exists(fname_vocab):
break
Expand Down Expand Up @@ -112,7 +112,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
fout.write(utils.to_utf8("%i %s\n" % (len(doc), ' '.join(parts))))

# write out vocabulary, in a format compatible with Blei's topics.py script
fname_vocab = fname + '.vocab'
fname_vocab = utils.smart_extension(fname, '.vocab')
logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab))
with utils.smart_open(fname_vocab, 'wb') as fout:
for featureid in xrange(num_terms):
Expand Down
6 changes: 3 additions & 3 deletions gensim/corpora/csvcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import csv
import itertools

from gensim import interfaces
from gensim import interfaces, utils

logger = logging.getLogger('gensim.corpora.csvcorpus')

Expand All @@ -42,7 +42,7 @@ def __init__(self, fname, labels):
self.labels = labels

# load the first few lines, to guess the CSV dialect
head = ''.join(itertools.islice(open(self.fname), 5))
head = ''.join(itertools.islice(utils.smart_open(self.fname), 5))
self.headers = csv.Sniffer().has_header(head)
self.dialect = csv.Sniffer().sniff(head)
logger.info("sniffed CSV delimiter=%r, headers=%s" % (self.dialect.delimiter, self.headers))
Expand All @@ -52,7 +52,7 @@ def __iter__(self):
Iterate over the corpus, returning one sparse vector at a time.
"""
reader = csv.reader(open(self.fname), self.dialect)
reader = csv.reader(utils.smart_open(self.fname), self.dialect)
if self.headers:
next(reader) # skip the headers

Expand Down
3 changes: 2 additions & 1 deletion gensim/corpora/dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,6 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):

# do the actual filtering, then rebuild dictionary to remove gaps in ids
self.filter_tokens(good_ids=good_ids)
self.compactify()
logger.info("resulting dictionary: %s" % self)


Expand All @@ -240,6 +239,7 @@ def filter_tokens(self, bad_ids=None, good_ids=None):
self.dfs = dict((tokenid, freq)
for tokenid, freq in iteritems(self.dfs)
if tokenid in good_ids)
self.compactify()


def compactify(self):
Expand Down Expand Up @@ -392,4 +392,5 @@ def from_corpus(corpus, id2word=None):
logger.info("built %s from %i documents (total %i corpus positions)" %
(result, result.num_docs, result.num_pos))
return result

#endclass Dictionary
4 changes: 2 additions & 2 deletions gensim/corpora/indexedcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def __init__(self, fname, index_fname=None):
"""
try:
if index_fname is None:
index_fname = fname + '.index'
index_fname = utils.smart_extension(fname, '.index')
self.index = utils.unpickle(index_fname)
logger.info("loaded corpus index from %s" % index_fname)
except:
Expand Down Expand Up @@ -76,7 +76,7 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres
raise ValueError("identical input vs. output corpus filename, refusing to serialize: %s" % fname)

if index_fname is None:
index_fname = fname + '.index'
index_fname = utils.smart_extension(fname, '.index')

if progress_cnt is not None:
if labels is not None:
Expand Down
Loading

0 comments on commit 1f23a65

Please sign in to comment.