Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix encoding problems (Windows, python >= 3) #1469

Merged
merged 3 commits into from
Jul 6, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,7 @@ test_script:
- "cd empty_folder"
- "pip install pyemd testfixtures unittest2 sklearn Morfessor==2.0.2a4"

# Use run instead of main to avoid a non-zero exit code on failure (TODO: remove this)
- "python -c \"import nose; nose.run()\" -s -v gensim"
- "python -c \"import nose; nose.main()\" -s -v gensim"
# Move back to the project folder
- "cd .."

Expand Down
30 changes: 16 additions & 14 deletions gensim/test/test_corpora_dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,14 @@
import logging
import tempfile
import unittest
import codecs
import os
import os.path

import scipy
import gensim
from gensim.corpora import Dictionary
from gensim.utils import to_utf8
from six import PY3
from six.moves import zip

Expand Down Expand Up @@ -189,32 +191,32 @@ def test_saveAsText(self):
d = Dictionary(small_text)

d.save_as_text(tmpf)
with open(tmpf) as file:
with codecs.open(tmpf, 'r', encoding='utf-8') as file:
Copy link
Owner

@piskvorky piskvorky Jul 15, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

-1: please use smart_open, binary mode rb, and convert to unicode explicitly (avoid codecs).

serialized_lines = file.readlines()
self.assertEqual(serialized_lines[0], "3\n")
self.assertEqual(serialized_lines[0], u"3\n")
self.assertEqual(len(serialized_lines), 4)
# We do not know which word will have which index
self.assertEqual(serialized_lines[1][1:], "\tdruhé\t2\n")
self.assertEqual(serialized_lines[2][1:], "\tprvé\t1\n")
self.assertEqual(serialized_lines[3][1:], "\tslovo\t3\n")
self.assertEqual(serialized_lines[1][1:], u"\tdruhé\t2\n")
self.assertEqual(serialized_lines[2][1:], u"\tprvé\t1\n")
self.assertEqual(serialized_lines[3][1:], u"\tslovo\t3\n")

d.save_as_text(tmpf, sort_by_word=False)
with open(tmpf) as file:
with codecs.open(tmpf, 'r', encoding='utf-8') as file:
Copy link
Owner

@piskvorky piskvorky Jul 15, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

-1: please use smart_open, binary mode rb, and convert to unicode explicitly (avoid codecs).

serialized_lines = file.readlines()
self.assertEqual(serialized_lines[0], "3\n")
self.assertEqual(serialized_lines[0], u"3\n")
self.assertEqual(len(serialized_lines), 4)
self.assertEqual(serialized_lines[1][1:], "\tslovo\t3\n")
self.assertEqual(serialized_lines[2][1:], "\tdruhé\t2\n")
self.assertEqual(serialized_lines[3][1:], "\tprvé\t1\n")
self.assertEqual(serialized_lines[1][1:], u"\tslovo\t3\n")
self.assertEqual(serialized_lines[2][1:], u"\tdruhé\t2\n")
self.assertEqual(serialized_lines[3][1:], u"\tprvé\t1\n")

def test_loadFromText_legacy(self):
"""
`Dictionary` can be loaded from textfile in legacy format.
Legacy format does not have num_docs on the first line.
"""
tmpf = get_tmpfile('load_dict_test_legacy.txt')
no_num_docs_serialization = "1\tprvé\t1\n2\tslovo\t2\n"
with open(tmpf, "w") as file:
no_num_docs_serialization = to_utf8("1\tprvé\t1\n2\tslovo\t2\n")
with open(tmpf, "wb") as file:
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Prefer smart_open.

file.write(no_num_docs_serialization)

d = Dictionary.load_from_text(tmpf)
Expand All @@ -227,8 +229,8 @@ def test_loadFromText_legacy(self):
def test_loadFromText(self):
"""`Dictionary` can be loaded from textfile."""
tmpf = get_tmpfile('load_dict_test.txt')
no_num_docs_serialization = "2\n1\tprvé\t1\n2\tslovo\t2\n"
with open(tmpf, "w") as file:
no_num_docs_serialization = to_utf8("2\n1\tprvé\t1\n2\tslovo\t2\n")
with open(tmpf, "wb") as file:
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Prefer smart_open.

file.write(no_num_docs_serialization)

d = Dictionary.load_from_text(tmpf)
Expand Down