piskvorky · menshikh-iv · Sep 19, 2017 · Aug 21, 2017 · Aug 22, 2017 · Aug 25, 2017
diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
@@ -847,7 +847,7 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*
                     fout.write(utils.to_utf8("%s %s\n" % (total_vec, self.vector_size)))
                 # store as in input order
                 for i in range(len(self.docvecs)):
-                    doctag = prefix + str(self.docvecs.index_to_doctag(i))
+                    doctag = "%s%s" % (prefix, self.docvecs.index_to_doctag(i))
                     row = self.docvecs.doctag_syn0[i]
                     if binary:
                         fout.write(utils.to_utf8(doctag) + b" " + row.tostring())

diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py
@@ -30,11 +30,16 @@
 
 
 class DocsLeeCorpus(object):
-    def __init__(self, string_tags=False):
+    def __init__(self, string_tags=False, unicode_tags=False):
         self.string_tags = string_tags
+        self.unicode_tags = unicode_tags
 
     def _tag(self, i):
-        return i if not self.string_tags else '_*%d' % i
+        if self.unicode_tags:
+            return u'_\xa1_%d' % i
+        elif self.string_tags:
+            return '_*%d' % i
+        return i
 
     def __iter__(self):
         with open(datapath('lee_background.cor')) as f:
@@ -95,6 +100,13 @@ def testPersistenceWord2VecFormat(self):
         binary_model_dv = keyedvectors.KeyedVectors.load_word2vec_format(test_word, binary=True)
         self.assertEqual(len(model.wv.vocab), len(binary_model_dv.vocab))
 
+    def test_unicode_in_doctag(self):
+        """Test storing word and document vectors of a model with non-ASCII unicode doctags."""
+        model = doc2vec.Doc2Vec(DocsLeeCorpus(unicode_tags=True), min_count=1)
+        model.save_word2vec_format(testfile(), doctag_vec=True, word_vec=True, binary=True)
+        binary_model_dv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True)
+        self.assertEqual(len(model.wv.vocab) + len(model.docvecs), len(binary_model_dv.vocab))
+
     def test_load_mmap(self):
         """Test storing/loading the entire model."""
         model = doc2vec.Doc2Vec(sentences, min_count=1)