-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix encoding problems (Windows, python >= 3) #1469
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,12 +12,14 @@ | |
import logging | ||
import tempfile | ||
import unittest | ||
import codecs | ||
import os | ||
import os.path | ||
|
||
import scipy | ||
import gensim | ||
from gensim.corpora import Dictionary | ||
from gensim.utils import to_utf8 | ||
from six import PY3 | ||
from six.moves import zip | ||
|
||
|
@@ -189,32 +191,32 @@ def test_saveAsText(self): | |
d = Dictionary(small_text) | ||
|
||
d.save_as_text(tmpf) | ||
with open(tmpf) as file: | ||
with codecs.open(tmpf, 'r', encoding='utf-8') as file: | ||
serialized_lines = file.readlines() | ||
self.assertEqual(serialized_lines[0], "3\n") | ||
self.assertEqual(serialized_lines[0], u"3\n") | ||
self.assertEqual(len(serialized_lines), 4) | ||
# We do not know, which word will have which index | ||
self.assertEqual(serialized_lines[1][1:], "\tdruhé\t2\n") | ||
self.assertEqual(serialized_lines[2][1:], "\tprvé\t1\n") | ||
self.assertEqual(serialized_lines[3][1:], "\tslovo\t3\n") | ||
self.assertEqual(serialized_lines[1][1:], u"\tdruhé\t2\n") | ||
self.assertEqual(serialized_lines[2][1:], u"\tprvé\t1\n") | ||
self.assertEqual(serialized_lines[3][1:], u"\tslovo\t3\n") | ||
|
||
d.save_as_text(tmpf, sort_by_word=False) | ||
with open(tmpf) as file: | ||
with codecs.open(tmpf, 'r', encoding='utf-8') as file: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. -1: please use `smart_open`, binary mode `rb`, and convert to unicode explicitly (avoid `codecs`). |
||
serialized_lines = file.readlines() | ||
self.assertEqual(serialized_lines[0], "3\n") | ||
self.assertEqual(serialized_lines[0], u"3\n") | ||
self.assertEqual(len(serialized_lines), 4) | ||
self.assertEqual(serialized_lines[1][1:], "\tslovo\t3\n") | ||
self.assertEqual(serialized_lines[2][1:], "\tdruhé\t2\n") | ||
self.assertEqual(serialized_lines[3][1:], "\tprvé\t1\n") | ||
self.assertEqual(serialized_lines[1][1:], u"\tslovo\t3\n") | ||
self.assertEqual(serialized_lines[2][1:], u"\tdruhé\t2\n") | ||
self.assertEqual(serialized_lines[3][1:], u"\tprvé\t1\n") | ||
|
||
def test_loadFromText_legacy(self): | ||
""" | ||
`Dictionary` can be loaded from textfile in legacy format. | ||
Legacy format does not have num_docs on the first line. | ||
""" | ||
tmpf = get_tmpfile('load_dict_test_legacy.txt') | ||
no_num_docs_serialization = "1\tprvé\t1\n2\tslovo\t2\n" | ||
with open(tmpf, "w") as file: | ||
no_num_docs_serialization = to_utf8("1\tprvé\t1\n2\tslovo\t2\n") | ||
with open(tmpf, "wb") as file: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Prefer |
||
file.write(no_num_docs_serialization) | ||
|
||
d = Dictionary.load_from_text(tmpf) | ||
|
@@ -227,8 +229,8 @@ def test_loadFromText_legacy(self): | |
def test_loadFromText(self): | ||
"""`Dictionary` can be loaded from textfile.""" | ||
tmpf = get_tmpfile('load_dict_test.txt') | ||
no_num_docs_serialization = "2\n1\tprvé\t1\n2\tslovo\t2\n" | ||
with open(tmpf, "w") as file: | ||
no_num_docs_serialization = to_utf8("2\n1\tprvé\t1\n2\tslovo\t2\n") | ||
with open(tmpf, "wb") as file: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Prefer |
||
file.write(no_num_docs_serialization) | ||
|
||
d = Dictionary.load_from_text(tmpf) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
-1: please use `smart_open`, binary mode `rb`, and convert to unicode explicitly (avoid `codecs`).