Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change Dataset interface to support sparse matrix. #165

Merged
merged 7 commits into from
Jul 3, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions examples/albl_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,7 @@ def run(trn_ds, tst_ds, lbr, model, qs, quota):

for _ in range(quota):
ask_id = qs.make_query()
X, _ = zip(*trn_ds.data)
lb = lbr.label(X[ask_id])
lb = lbr.label(trn_ds.data[ask_id][0])
trn_ds.update(ask_id, lb)

model.train(trn_ds)
Expand Down
3 changes: 1 addition & 2 deletions examples/alce_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,7 @@ def run(trn_ds, tst_ds, lbr, model, qs, quota, cost_matrix):
# Standard usage of libact objects
if i > 0:
ask_id = qs.make_query()
X, _ = zip(*trn_ds.data)
lb = lbr.label(X[ask_id])
lb = lbr.label(trn_ds.data[ask_id][0])
trn_ds.update(ask_id, lb)

model.train(trn_ds)
Expand Down
5 changes: 2 additions & 3 deletions examples/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,8 @@ def run(trn_ds, tst_ds, lbr, model, qs, quota):

for _ in range(quota):
# Standard usage of libact objects
ask_id = qs.make_query()
X, _ = zip(*trn_ds.data)
lb = lbr.label(X[ask_id])
ask_id = qs.make_query()
lb = lbr.label(trn_ds.data[ask_id][0])
trn_ds.update(ask_id, lb)

model.train(trn_ds)
Expand Down
86 changes: 54 additions & 32 deletions libact/base/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import random
import numpy as np
import scipy.sparse as sp

from libact.utils import zip

Expand All @@ -33,9 +34,15 @@ class Dataset(object):
"""

def __init__(self, X=None, y=None):
if X is None: X = []
if X is None: X = np.array([])
elif not isinstance(X, sp.csr_matrix):
X = np.array(X)

if y is None: y = []
self.data = list(zip(X, y))
y = np.array(y)

self._X = X
self._y = y
self.modified = True
self._update_callback = set()

Expand All @@ -47,7 +54,24 @@ def __len__(self):
-------
n_samples : int
"""
return len(self.data)
return self._X.shape[0]

def __getitem__(self, idx):
    """Return the ``(feature, label)`` pair stored at position *idx*.

    Parameters
    ----------
    idx : int or array-like index
        Index (or fancy index) into the underlying feature matrix and
        label array.

    Returns
    -------
    feature, label : tuple
        Row(s) of ``_X`` and the matching entry of ``_y``; the label is
        ``None`` for unlabeled samples.
    """
    # still provide the interface to direct access the data by index
    # (kept after the internal storage moved from a list of
    # (feature, label) tuples to separate _X / _y arrays)
    return self._X[idx], self._y[idx]

@property
def data(self):
    # Backward-compatibility shim: the old API exposed ``self.data`` as a
    # list of (feature, label) tuples.  Returning ``self`` keeps legacy
    # code of the form ``dataset.data[idx]`` / ``len(dataset.data)``
    # working through ``__getitem__`` and ``__len__``.
    return self

def get_labeled_mask(self):
    """Return a boolean mask flagging which entries carry a label.

    Returns
    -------
    mask : numpy array of bool, shape = (n_samples,)
        ``True`` where the corresponding entry of ``_y`` has a label,
        ``False`` where it is still ``None``.
    """
    has_label = [label is not None for label in self._y]
    return np.asarray(has_label, dtype=bool)

def len_labeled(self):
"""
Expand All @@ -57,7 +81,7 @@ def len_labeled(self):
-------
n_samples : int
"""
return len(self.get_labeled_entries())
return self.get_labeled_mask().sum()

def len_unlabeled(self):
"""
Expand All @@ -67,7 +91,7 @@ def len_unlabeled(self):
-------
n_samples : int
"""
return len(list(filter(lambda entry: entry[1] is None, self.data)))
return (~self.get_labeled_mask()).sum()

def get_num_of_labels(self):
"""
Expand All @@ -77,7 +101,7 @@ def get_num_of_labels(self):
-------
n_labels : int
"""
return len({entry[1] for entry in self.data if entry[1] is not None})
return np.unique(self._y[self.get_labeled_mask()]).size

def append(self, feature, label=None):
"""
Expand All @@ -97,9 +121,14 @@ def append(self, feature, label=None):
entry_id : {int}
entry_id for the appended sample.
"""
self.data.append((feature, label))
if isinstance(self._X, np.ndarray):
self._X = np.vstack([self._X, feature])
else: # sp.csr_matrix
self._X = sp.vstack([self._X, feature])
self._y = np.append(self._y, label)

self.modified = True
return len(self.data) - 1
return len(self) - 1

def update(self, entry_id, new_label):
"""
Expand All @@ -113,7 +142,7 @@ def update(self, entry_id, new_label):
label : {int, None}
Label of the sample to be update.
"""
self.data[entry_id] = (self.data[entry_id][0], new_label)
self._y[entry_id] = new_label
self.modified = True
for callback in self._update_callback:
callback(entry_id, new_label)
Expand Down Expand Up @@ -142,44 +171,42 @@ def format_sklearn(self):
y : numpy array, shape = (n_samples)
Sample labels.
"""
X, y = zip(*self.get_labeled_entries())
return np.array(X), np.array(y)
# becomes the same as get_labeled_entries
X, y = self.get_labeled_entries()
return X, np.array(y)

def get_entries(self):
"""
Return the list of all sample feature and ground truth tuple.

Returns
-------
data : list, shape = (n_samples)
List of all sample feature and label tuple.
X: numpy array or scipy matrix, shape = (n_samples, n_features)
y: numpy array, shape = (n_samples)
"""
return self.data
return self._X, self._y

def get_labeled_entries(self):
"""
Returns the labeled features and their labels

Returns
-------
labeled_entries : list of (feature, label) tuple
Labeled entries
X: numpy array or scipy matrix, shape = (n_samples labeled, n_features)
y: list, shape = (n_samples labeled)
"""
return list(filter(lambda entry: entry[1] is not None, self.data))
return self._X[self.get_labeled_mask()], self._y[self.get_labeled_mask()].tolist()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to make y into a list, can we just keep it numpy array?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We actually need this.
For some multi-label code, I think they are taking advantage of the nested structure of out output. I removed it ran against the unit test and got back with a lot of fails. So I would suggest just keep it as a list. And I don't think the performance would not be affected too much.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, I think we can merge first and I'll look into this later.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cool. I am starting to adapt libact to my own research experiments. So I might start opening more pull requests for improvements in the future :)


def get_unlabeled_entries(self):
"""
Returns list of unlabeled features, along with their entry_ids

Returns
-------
unlabeled_entries : list of (entry_id, feature) tuple
Labeled entries
idx: numpy array, shape = (n_samples unlabeled)
X: numpy array or scipy matrix, shape = (n_samples unlabeled, n_features)
"""
return [
(idx, entry[0]) for idx, entry in enumerate(self.data)
if entry[1] is None
]
return np.where(~self.get_labeled_mask())[0], self._X[~self.get_labeled_mask()]

def labeled_uniform_sample(self, sample_size, replace=True):
"""Returns a Dataset object with labeled data only, which is
Expand All @@ -190,21 +217,16 @@ def labeled_uniform_sample(self, sample_size, replace=True):
----------
sample_size
"""
if replace:
samples = [
random.choice(self.get_labeled_entries())
for _ in range(sample_size)
]
else:
samples = random.sample(self.get_labeled_entries(), sample_size)
return Dataset(*zip(*samples))
idx = np.random.choice(np.where(self.get_labeled_mask())[0],
size=sample_size, replace=replace )
return Dataset(self._X[idx], self._y[idx])


def import_libsvm_sparse(filename):
"""Imports dataset file in libsvm sparse format"""
from sklearn.datasets import load_svmlight_file
X, y = load_svmlight_file(filename)
return Dataset(X.toarray().tolist(), y.tolist())
return Dataset(X.toarray(), y)


def import_scipy_mat(filename):
Expand Down
40 changes: 20 additions & 20 deletions libact/base/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@ def test_append(self):
dataset = self.setup_dataset()
# labeled
dataset.append(np.array([9, 8, 7]), 2)
last_labeled_entry = dataset.get_labeled_entries()[-1]
last_labeled_entry = [e[-1] for e in dataset.get_labeled_entries()]
self.assertEqual(last_labeled_entry[0], np.array([9, 8, 7]))
self.assertEqual(last_labeled_entry[1], 2)
# unlabeled
idx = dataset.append(np.array([8, 7, 6]))
last_unlabeled_entry = dataset.get_unlabeled_entries()[-1]
last_unlabeled_entry = [e[-1] for e in dataset.get_unlabeled_entries()]
self.assertEqual(last_unlabeled_entry[0], idx)
self.assertEqual(last_unlabeled_entry[1], np.array([8, 7, 6]))

Expand All @@ -53,7 +53,7 @@ def test_update(self):
dataset.update(idx, 2)
self.assertEqual(self.cb_index, idx)
self.assertEqual(self.cb_label, 2)
last_labeled_entry = dataset.get_labeled_entries()[-1]
last_labeled_entry = [e[-1] for e in dataset.get_labeled_entries()]
self.assertEqual(last_labeled_entry[0], np.array([8, 7, 6]))
self.assertEqual(last_labeled_entry[1], 2)

Expand All @@ -65,38 +65,38 @@ def test_format_sklearn(self):

def test_get_labeled_entries(self):
dataset = self.setup_dataset()
entries = dataset.get_labeled_entries()
self.assertEqual(entries[0][0], np.array([0, 1, 2]))
self.assertEqual(entries[1][0], np.array([3, 4, 5]))
self.assertEqual(entries[2][0], np.array([9, 10, 11]))
self.assertEqual(entries[0][1], 1)
self.assertEqual(entries[1][1], 2)
self.assertEqual(entries[2][1], 1)
X, y = dataset.get_labeled_entries()
self.assertEqual(X[0], np.array([0, 1, 2]))
self.assertEqual(X[1], np.array([3, 4, 5]))
self.assertEqual(X[2], np.array([9, 10, 11]))
self.assertEqual(y[0], 1)
self.assertEqual(y[1], 2)
self.assertEqual(y[2], 1)

def test_get_unlabeled_entries(self):
dataset = self.setup_dataset()
entries = dataset.get_unlabeled_entries()
self.assertTrue(np.array_equal(entries[0][1], np.array([6, 7, 8])))
self.assertTrue(np.array_equal(entries[1][1], np.array([12, 13, 14])))
idx, X = dataset.get_unlabeled_entries()
self.assertTrue(np.array_equal(X[0], np.array([6, 7, 8])))
self.assertTrue(np.array_equal(X[1], np.array([12, 13, 14])))

def test_labeled_uniform_sample(self):
dataset = self.setup_dataset()
pool = dataset.get_labeled_entries()
pool_X, pool_y = dataset.get_labeled_entries()
# with replacement
dataset_s = dataset.labeled_uniform_sample(10)
for entry_s in dataset_s.get_labeled_entries():
for entry in pool:
if entry_s[0] is entry[0] and entry_s[1] == entry[1]:
for entry_s in zip(*dataset_s.get_labeled_entries()):
for entry in zip(pool_X, pool_y):
if np.array_equal(entry_s[0], entry[0]) and entry_s[1] == entry[1]:
break
else:
self.fail()
# without replacement
dataset_s = dataset.labeled_uniform_sample(3, replace=False)
used_indexes = set()
for entry_s in dataset_s.get_labeled_entries():
for idx, entry in enumerate(pool):
for entry_s in zip(*dataset_s.get_labeled_entries()):
for idx, entry in enumerate(zip(pool_X, pool_y)):
if (
entry_s[0] is entry[0] and entry_s[1] == entry[1]
np.array_equal(entry_s[0], entry[0]) and entry_s[1] == entry[1]
and idx not in used_indexes
):
used_indexes.add(idx)
Expand Down
2 changes: 1 addition & 1 deletion libact/labelers/ideal_labeler.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class IdealLabeler(Labeler):
"""

def __init__(self, dataset, **kwargs):
X, y = zip(*dataset.get_entries())
X, y = dataset.get_entries()
# make sure the input dataset is fully labeled
assert (np.array(y) != np.array(None)).all()
self.X = X
Expand Down
2 changes: 1 addition & 1 deletion libact/models/multilabel/dummy_clf.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def fit(self, X, y):
self.cls = int(y[0]) # 1 or 0

def train(self, dataset):
_, y = zip(*dataset.get_labeled_entries())
_, y = dataset.get_labeled_entries()
self.cls = int(y[0])

def predict(self, X):
Expand Down
9 changes: 4 additions & 5 deletions libact/query_strategies/active_learning_by_learning.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,11 +134,10 @@ def __init__(self, *args, **kwargs):
"__init__() missing required keyword-only argument: 'T'"
)

self.unlabeled_entry_ids, _ = \
zip(*self.dataset.get_unlabeled_entries())
self.unlabeled_entry_ids, _ = self.dataset.get_unlabeled_entries()
self.unlabeled_invert_id_idx = {}
for i, entry in enumerate(self.dataset.get_unlabeled_entries()):
self.unlabeled_invert_id_idx[entry[0]] = i
for i, idx in enumerate(self.dataset.get_unlabeled_entries()[0]):
self.unlabeled_invert_id_idx[idx] = i

self.uniform_sampler = kwargs.pop('uniform_sampler', True)
if not isinstance(self.uniform_sampler, bool):
Expand Down Expand Up @@ -219,7 +218,7 @@ def update(self, entry_id, label):
def make_query(self):
dataset = self.dataset
try:
unlabeled_entry_ids, _ = zip(*dataset.get_unlabeled_entries())
unlabeled_entry_ids, _ = dataset.get_unlabeled_entries()
except ValueError:
# might be no more unlabeled data left
return
Expand Down
2 changes: 1 addition & 1 deletion libact/query_strategies/density_weighted_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def _get_scores(self):
dataset = self.dataset
X, _ = zip(*dataset.data)
scores = self.base_query_strategy._get_scores()
_, X_pool = zip(*dataset.get_unlabeled_entries())
_, X_pool = dataset.get_unlabeled_entries()
unlabeled_entry_ids, base_scores = zip(*scores)

self.clustering_method.fit(X)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def __init__(self, *args, **kwargs):

@inherit_docstring_from(QueryStrategy)
def make_query(self):
unlabeled_entry_ids, _ = zip(*self.dataset.get_unlabeled_entries())
unlabeled_entry_ids, _ = self.dataset.get_unlabeled_entries()
labeled_entry_ids = np.array([eid
for eid, x in enumerate(self.dataset.data)
if x[1] != None])
Expand Down
5 changes: 2 additions & 3 deletions libact/query_strategies/hintsvm.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,8 @@ def __init__(self, *args, **kwargs):
@inherit_docstring_from(QueryStrategy)
def make_query(self):
dataset = self.dataset
unlabeled_entry_ids, unlabeled_pool = zip(
*dataset.get_unlabeled_entries())
labeled_pool, y = zip(*dataset.get_labeled_entries())
unlabeled_entry_ids, unlabeled_pool = dataset.get_unlabeled_entries()
labeled_pool, y = dataset.get_labeled_entries()
if len(np.unique(y)) > 2:
raise ValueError("HintSVM query strategy support binary class "
"active learning only. Found %s classes" % len(np.unique(y)))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,9 +119,9 @@ def __init__(self,
@inherit_docstring_from(QueryStrategy)
def make_query(self):
dataset = self.dataset
unlabeled_entry_ids, pool_X = zip(*dataset.get_unlabeled_entries())
unlabeled_entry_ids, pool_X = dataset.get_unlabeled_entries()
# The input class should be 0-n_classes
X, y = zip(*dataset.get_labeled_entries())
X, y = dataset.get_labeled_entries()

pred_embed = np.zeros((len(pool_X), self.embed_dim))
for i in range(self.embed_dim):
Expand Down
Loading