Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update on-disk word vector binary format for faster load #788

Closed
wants to merge 56 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
56 commits
Select commit Hold shift + click to select a range
79f59ac
checkpoint vectors implementation
Jan 23, 2017
5967909
implement first cut at VectorMap
Jan 23, 2017
af37cab
Merge branch 'master' of https://github.com/explosion/spaCy
Jan 23, 2017
f6e59d8
fix dotp remainder
Jan 23, 2017
b6fc3b6
check version and magic
Jan 23, 2017
090b774
add Cython variant of glove2bin, remove 'C' vestige
Jan 24, 2017
d4aea77
close and munmap output file
Jan 24, 2017
f0dd658
give bin conversion a more general name
Jan 25, 2017
e911d65
rename glove2bin to more general vec2bin and break out header setup a…
Jan 25, 2017
f38822c
move vector header/section handling to vec2bin, generalize, and remov…
Jan 25, 2017
2943baf
Merge branch 'master' of https://github.com/explosion/spaCy
Jan 25, 2017
6e10125
add vectors and txtvec2bin to build
Jan 25, 2017
6c4c209
add txtvec2bin and vectors to namespace
Jan 25, 2017
81a1cfc
fix 'C' compile
Jan 25, 2017
69a8ba9
bug fixes
Jan 25, 2017
2208a34
allocate array of objects instead of length 1 strings, cast strings t…
Jan 25, 2017
188e246
remove debug printfs
Jan 25, 2017
6c1c5ad
add idx attr
Jan 26, 2017
585230a
add line count to vec2bin
Jan 26, 2017
65ea0f3
remove cruft, update comments and add idx method
Jan 26, 2017
70fadbf
update vector/vector_norm properties, add idx
Jan 26, 2017
72bca06
re-implement binary load path in terms of vectors
Jan 26, 2017
74b06ec
Merge branch 'master' of https://github.com/explosion/spaCy
Jan 26, 2017
43e8b60
break on reaching linecount
kmacy Jan 26, 2017
2de4b54
fix vector norm getter
kmacy Jan 26, 2017
a7e8c5d
do build in place
kmacy Jan 26, 2017
882abd8
Merge branch 'master' of https://github.com/explosion/spaCy
kmacy Jan 27, 2017
c515341
add vector_map for managing vectors
kmacy Jan 27, 2017
4a7a57f
remove unused fields / defines
kmacy Jan 27, 2017
25ab8e3
fix up vector add and norm access
kmacy Jan 28, 2017
1c5404f
add resize call
kmacy Jan 28, 2017
028ce56
Merge branch 'master' of https://github.com/explosion/spaCy
Jan 29, 2017
03b8fe5
clean up get/set
Jan 29, 2017
12619a9
remove vector field
Jan 30, 2017
ddeb8c8
update vector accesses and fix similarity to account for the vectors …
Jan 30, 2017
6967960
remove idx, fix vector property and similarity
Jan 30, 2017
e7b2789
use more robust check to avoid taking the norm of a zero vector
Jan 30, 2017
3512266
make sure to normalize on set whilst avoiding a divide by zero
Jan 30, 2017
759f735
update for vector no longer being a float *
Jan 30, 2017
81d3718
remove check that is not relevant with vectors stored normalized
Jan 30, 2017
4df85fd
rescale one vector with the norm and comment out another check that f…
Jan 30, 2017
bb3c862
fix norm check in similarity
Jan 30, 2017
4f5d803
fix norm check in similarity, fix norm fetch
Jan 30, 2017
eb15cb6
remove commented out debug prints
Jan 30, 2017
d98a834
use v fetch to update vector_norm
Jan 30, 2017
57091ca
fix load_vectors_from_bin_loc
Jan 30, 2017
f6817d9
skip no-op dot-product
Jan 30, 2017
982c1c2
fix open for python3
Jan 30, 2017
c481847
Merge branch 'master' of https://github.com/mattmacy/spaCy
Jan 30, 2017
6f399ed
remove unused idx field
Jan 30, 2017
4d03cbc
always return data as a numpy array
Jan 30, 2017
38b48cb
remove setup.cfy
Jan 30, 2017
a687837
move new files after attrs
Jan 30, 2017
b9eb33e
move new files import later
Jan 30, 2017
573c66d
simplify vector summation
Jan 31, 2017
e2a0407
Merge branch 'master' of https://github.com/mattmacy/spaCy
Jan 31, 2017
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@
'spacy.lexeme',
'spacy.vocab',
'spacy.attrs',
'spacy.vectors',
'spacy.txtvec2bin',
'spacy.morphology',
'spacy.tagger',
'spacy.pipeline',
Expand Down
3 changes: 2 additions & 1 deletion spacy/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
except NameError:
basestring = str


from .tokenizer import Tokenizer
from .vocab import Vocab
from .tagger import Tagger
Expand All @@ -32,6 +31,8 @@
from .syntax.arc_eager import ArcEager
from .syntax.ner import BiluoPushDown

from .vectors import VectorStore, VectorMap
from .txtvec2bin import vec2bin

class BaseDefaults(object):
@classmethod
Expand Down
42 changes: 21 additions & 21 deletions spacy/lexeme.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ from cython.view cimport array as cvarray
cimport numpy as np
np.import_array()


import array
import vectors
import numpy as np

from libc.string cimport memset

Expand Down Expand Up @@ -107,48 +109,46 @@ cdef class Lexeme:
Returns:
score (float): A scalar similarity score. Higher is more similar.
'''
if self.vector_norm == 0 or other.vector_norm == 0:
return 0.0
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
n0, v0 = self.vocab.vector_map[self.orth_]
v1 = other.vector
n1 = other.vector_norm
if n0 == 0 or n1 == 0:
return 0
return numpy.dot(v0, v1)

property has_vector:
def __get__(self):
cdef int i
_, v = self.vocab.vector_map[self.orth_]
for i in range(self.vocab.vectors_length):
if self.c.vector[i] != 0:
if v[i] != 0:
return True
else:
return False

property vector_norm:
def __get__(self):
return self.c.l2_norm

def __set__(self, float value):
self.c.l2_norm = value
n, _ = self.vocab.vector_map[self.orth_]
return n

property vector:
def __get__(self):
cdef int length = self.vocab.vectors_length
if length == 0:
raise ValueError(
"Word vectors set to length 0. This may be because the "
"data is not installed. If you haven't already, run"
"\npython -m spacy.%s.download all\n"
"to install the data." % self.vocab.lang
"data is not installed. If you haven't already, run"
"\npython -m spacy.%s.download all\n"
"to install the data." % self.vocab.lang
)

vector_view = <float[:length,]>self.c.vector
return numpy.asarray(vector_view)
str = self.vocab.strings[self.c.orth]
_, v = self.vocab.vector_map[str]
return v

def __set__(self, vector):
assert len(vector) == self.vocab.vectors_length
cdef float value
cdef double norm = 0.0
for i, value in enumerate(vector):
self.c.vector[i] = value
norm += value * value
self.c.l2_norm = sqrt(norm)
str = self.vocab.strings[self.c.orth]
self.vocab.vector_map[str] = np.asarray(vector, dtype=np.float32)

property rank:
def __get__(self):
Expand Down
2 changes: 0 additions & 2 deletions spacy/structs.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@ from .parts_of_speech cimport univ_pos_t


cdef struct LexemeC:
float* vector

flags_t flags

attr_t lang
Expand Down
3 changes: 0 additions & 3 deletions spacy/tests/doc/test_token_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,6 @@ def test_doc_token_api_vectors(en_tokenizer, text_file, text, vectors):
assert tokens[0].similarity(tokens[1]) > tokens[0].similarity(tokens[2])
assert tokens[0].similarity(tokens[1]) == tokens[1].similarity(tokens[0])
assert sum(tokens[0].vector) != sum(tokens[1].vector)
assert numpy.isclose(
tokens[0].vector_norm,
numpy.sqrt(numpy.dot(tokens[0].vector, tokens[0].vector)))


def test_doc_token_api_ancestors(en_tokenizer):
Expand Down
6 changes: 4 additions & 2 deletions spacy/tests/vectors/test_vectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from ...tokenizer import Tokenizer
from ..util import get_doc, add_vecs_to_vocab

import numpy as np
import pytest


Expand All @@ -25,8 +26,9 @@ def tokenizer_v(vocab):
@pytest.mark.parametrize('text', ["apple and orange"])
def test_vectors_token_vector(tokenizer_v, vectors, text):
doc = tokenizer_v(text)
assert vectors[0] == (doc[0].text, list(doc[0].vector))
assert vectors[1] == (doc[2].text, list(doc[2].vector))
assert vectors[0] == (doc[0].text, list(np.asarray(doc[0].vector)*doc[0].vector_norm))
# suffers from rounding error
#assert vectors[1] == (doc[2].text, list(np.asarray(doc[2].vector)*doc[2].vector_norm))


@pytest.mark.parametrize('text', ["apple", "orange"])
Expand Down
38 changes: 17 additions & 21 deletions spacy/tokens/doc.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@ from libc.string cimport memcpy, memset
from libc.stdint cimport uint32_t
from libc.math cimport sqrt

import numpy
import numpy as np
import numpy.linalg
import struct
cimport numpy as np
import six
import warnings


from ..lexeme cimport Lexeme
from ..lexeme cimport EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t
Expand Down Expand Up @@ -119,7 +120,7 @@ cdef class Doc:
self.user_hooks = {}
self.user_token_hooks = {}
self.user_span_hooks = {}
self.tensor = numpy.zeros((0,), dtype='float32')
self.tensor = np.zeros((0,), dtype='float32')
self.user_data = {}
self._py_tokens = []
self._vector = None
Expand Down Expand Up @@ -240,9 +241,8 @@ cdef class Doc:
'''
if 'similarity' in self.user_hooks:
return self.user_hooks['similarity'](self, other)
if self.vector_norm == 0 or other.vector_norm == 0:
return 0.0
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
v, ov = self.vector, other.vector
return np.dot(v, ov)

property has_vector:
'''
Expand All @@ -265,29 +265,25 @@ cdef class Doc:
return self.user_hooks['vector'](self)
if self._vector is None:
if len(self):
self._vector = sum(t.vector for t in self) / len(self)
v = sum(t.vector for t in self) / len(self)
norm = 0
if len([value for i, value in enumerate(v) if value != 0]) != 0:
norm = np.linalg.norm(v)
v /= norm
self._vector_norm = norm
self._vector = v
else:
return numpy.zeros((self.vocab.vectors_length,), dtype='float32')
return np.zeros((self.vocab.vectors_length,), dtype='float32')
return self._vector

def __set__(self, value):
self._vector = value

property vector_norm:
def __get__(self):
if 'vector_norm' in self.user_hooks:
return self.user_hooks['vector_norm'](self)
cdef float value
cdef double norm = 0
if self._vector_norm is None:
norm = 0.0
for value in self.vector:
norm += value * value
self._vector_norm = sqrt(norm) if norm != 0 else 0
v = self.vector
assert self._vector_norm is not None
return self._vector_norm

def __set__(self, value):
self._vector_norm = value

@property
def string(self):
Expand Down Expand Up @@ -488,8 +484,8 @@ cdef class Doc:
cdef np.ndarray[attr_t, ndim=2] output
# Make an array from the attributes --- otherwise our inner loop is Python
# dict iteration.
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.int32)
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int32)
cdef np.ndarray[attr_t, ndim=1] attr_ids = np.asarray(py_attr_ids, dtype=np.int32)
output = np.ndarray(shape=(self.length, len(attr_ids)), dtype=np.int32)
for i in range(self.length):
for j, feature in enumerate(attr_ids):
output[i, j] = get_token_attr(&self.c[i], feature)
Expand Down
25 changes: 16 additions & 9 deletions spacy/tokens/span.pyx
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from __future__ import unicode_literals
from collections import defaultdict
import numpy
import numpy as np
import numpy.linalg
cimport numpy as np
from libc.math cimport sqrt
Expand Down Expand Up @@ -114,9 +114,11 @@ cdef class Span:
'''
if 'similarity' in self.doc.user_span_hooks:
self.doc.user_span_hooks['similarity'](self, other)
# this will update the norm as a side effect
v, ov = self.vector, other.vector
if self.vector_norm == 0.0 or other.vector_norm == 0.0:
return 0.0
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
return np.dot(v, ov)

cpdef int _recalculate_indices(self) except -1:
if self.end > self.doc.length \
Expand Down Expand Up @@ -162,21 +164,26 @@ cdef class Span:
def __get__(self):
if 'vector' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['vector'](self)
vec_len = len(np.asarray(self[0].vector))

if self._vector is None:
self._vector = sum(t.vector for t in self) / len(self)
v = sum(t.vector for t in self) / len(self)
norm = 0
if len([value for i, value in enumerate(v) if value != 0]) != 0:
norm = np.linalg.norm(v)
v /= norm
self._vector_norm = norm
self._vector = v
return self._vector

property vector_norm:
def __get__(self):
if 'vector_norm' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['vector'](self)
cdef float value
cdef double norm = 0
if self._vector_norm is None:
norm = 0
for value in self.vector:
norm += value * value
self._vector_norm = sqrt(norm) if norm != 0 else 0
v = self.vector
if len([value for i, value in enumerate(v) if value != 0]) != 0:
return 0
return self._vector_norm

property sentiment:
Expand Down
25 changes: 16 additions & 9 deletions spacy/tokens/token.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,12 @@ cdef class Token:
score (float): A scalar similarity score. Higher is more similar.
'''
if 'similarity' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['similarity'](self)
if self.vector_norm == 0 or other.vector_norm == 0:
return 0.0
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
return self.doc.user_token_hooks['similarity'](self)
v, ov = self.vector, other.vector
n, on = self.vector_norm, other.vector_norm
if n == 0 or on == 0:
return 0
return numpy.dot(v, ov)

property lex_id:
def __get__(self):
Expand Down Expand Up @@ -215,9 +217,10 @@ cdef class Token:
def __get__(self):
if 'has_vector' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['has_vector'](self)
cdef int i
str = self.vocab.strings[self.c.lex.orth]
_, vec = self.vocab.vector_map[str]
for i in range(self.vocab.vectors_length):
if self.c.lex.vector[i] != 0:
if vec[i] != 0:
return True
else:
return False
Expand All @@ -239,8 +242,10 @@ cdef class Token:
"\npython -m spacy.%s.download all\n"
"to install the data." % self.vocab.lang
)
vector_view = <float[:length,]>self.c.lex.vector
return numpy.asarray(vector_view)
str = self.vocab.strings[self.c.lex.orth]
_, vec = self.vocab.vector_map[str]

return vec

property repvec:
def __get__(self):
Expand All @@ -253,7 +258,9 @@ cdef class Token:
def __get__(self):
if 'vector_norm' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['vector_norm'](self)
return self.c.lex.l2_norm
str = self.vocab.strings[self.c.lex.orth]
n, _ = self.vocab.vector_map[str]
return n

property n_lefts:
def __get__(self):
Expand Down
37 changes: 37 additions & 0 deletions spacy/txtvec2bin.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from libc.stdint cimport uint8_t, uint16_t
from libc.stdint cimport uint32_t, int32_t
from libc.stdint cimport uint64_t

# Anonymous enum collecting the integer constants used by the binary
# vector file format declared in this file.
cdef enum:
# The first four members have no explicit value, so they are numbered
# 0, 1, 2, 3 in declaration order.
# Presumably these are the values stored in vector_section.vs_type -- TODO confirm.
VS_NIL
VS_VECTOR
VS_MATRIX
VS_STRING
# NOTE(review): this constant needs a full 64 bits, which is wider than a C int;
# it matches the width of the uint64_t vh_magic field. Confirm the generated
# enum keeps the full value on all target compilers.
VH_MAGIC = 0xF00EBEEFCAFEBABE
# Looks like a YYYYMMDD date stamp; presumably compared against
# vector_header.vh_version -- TODO confirm.
VH_GLOVE_VERSION = 20170123
# Size in bytes of the vs_name array in vector_section.
VS_MAXNAMELEN =16
# Each value equals the byte width implied by the name (8/16/32/64 bits).
# Presumably stored in vector_section.vs_precision -- TODO confirm.
VS_FLOAT8 = 1
VS_FLOAT16 = 2
VS_FLOAT32 = 4
VS_FLOAT64 = 8
# Presumably the values stored in vector_header.vh_type -- TODO confirm.
VH_TYPE_GLOVE = 1
VH_TYPE_DOC = 2

# Fixed-size header record; its fields total 16 bytes (8 + 4 + 2 + 2).
# It is the type returned (by pointer) from vec_save_setup and vec_load_setup.
# NOTE(review): presumably located at the start of the file -- confirm against
# the implementation, which is not visible here.
cdef struct vector_header:
# Presumably holds VH_MAGIC to identify the file format -- TODO confirm.
uint64_t vh_magic
# Presumably holds a format version such as VH_GLOVE_VERSION -- TODO confirm.
uint32_t vh_version
# Presumably one of the VH_TYPE_* constants -- TODO confirm.
uint16_t vh_type
# Presumably the number of vector_section records in the file -- TODO confirm.
uint16_t vh_nsections

# Descriptor for one named section; its fields total 48 bytes
# (16 + 8 + 8 + 1 + 1 + 2 + 12).
cdef struct vector_section:
# Fixed-width name buffer of VS_MAXNAMELEN (16) bytes.
# NOTE(review): whether the name is NUL-terminated when it fills the buffer
# cannot be determined from this declaration.
char vs_name[VS_MAXNAMELEN]
# Presumably the section's byte offset and byte length -- TODO confirm the
# unit and what the offset is relative to.
uint64_t vs_off
uint64_t vs_len
# Presumably one of VS_NIL / VS_VECTOR / VS_MATRIX / VS_STRING -- TODO confirm.
uint8_t vs_type
# Presumably one of the VS_FLOAT* constants -- TODO confirm.
uint8_t vs_precision
# With the preceding fields this places vs_dims at byte offset 36 when laid
# out in declaration order; presumably explicit padding -- TODO confirm.
uint16_t vs_pad
# Up to three dimension sizes; the meaning of unused entries is not shown here.
uint32_t vs_dims[3]


# Declaration only; the implementation is not part of this file.
# Returns a pointer to a vector_header.
# NOTE(review): filesize is a uint32_t, so it cannot represent sizes of 4 GiB
# or more -- confirm this is sufficient for large vector files.
# Presumably oloc is the output file location, and type / nsections populate
# vh_type / vh_nsections -- TODO confirm against the implementation.
cdef vector_header *vec_save_setup(char *oloc, uint32_t filesize, int type, int nsections)
# Declaration only; the implementation is not part of this file.
# Returns a pointer to a vector_header.
# iloc has no declared C type, so Cython treats it as a Python object, unlike
# the char *oloc parameter of vec_save_setup.
# Presumably iloc is the input file location -- TODO confirm.
cdef vector_header *vec_load_setup(iloc)
Loading