From f497ac84d909cffd80d0d6b9c9166b65633c3711 Mon Sep 17 00:00:00 2001 From: Eugene Yang Date: Sun, 30 Jun 2019 14:32:49 -0500 Subject: [PATCH 1/7] change dataset interface to support numpy arrays and scipy csr sparse matrix --- libact/base/dataset.py | 91 ++++++++++++------- libact/base/tests/test_dataset.py | 40 ++++---- libact/labelers/ideal_labeler.py | 2 +- libact/models/multilabel/dummy_clf.py | 2 +- .../active_learning_by_learning.py | 9 +- .../query_strategies/density_weighted_meta.py | 2 +- .../density_weighted_uncertainty_sampling.py | 2 +- libact/query_strategies/hintsvm.py | 5 +- .../active_learning_with_cost_embedding.py | 4 +- .../multiclass/expected_error_reduction.py | 6 +- .../multiclass/hierarchical_sampling.py | 2 +- .../multilabel/adaptive_active_learning.py | 4 +- .../multilabel/binary_minimization.py | 4 +- .../cost_sensitive_reference_pair_encoding.py | 2 +- .../multilabel/maximum_margin_reduction.py | 7 +- .../multilabel_with_auxiliary_learner.py | 4 +- libact/query_strategies/query_by_committee.py | 12 +-- libact/query_strategies/quire.py | 16 ++-- libact/query_strategies/random_sampling.py | 2 +- .../query_strategies/uncertainty_sampling.py | 4 +- libact/query_strategies/variance_reduction.py | 6 +- 21 files changed, 124 insertions(+), 102 deletions(-) diff --git a/libact/base/dataset.py b/libact/base/dataset.py index e984c80..a88538d 100644 --- a/libact/base/dataset.py +++ b/libact/base/dataset.py @@ -8,6 +8,7 @@ import random import numpy as np +import scipy.sparse as sp from libact.utils import zip @@ -33,9 +34,16 @@ class Dataset(object): """ def __init__(self, X=None, y=None): - if X is None: X = [] + if X is None: X = np.array([]) + elif not isinstance(X, sp.csr_matrix): + X = np.array( X ) + if y is None: y = [] - self.data = list(zip(X, y)) + y = np.array( y ) + + # self.data = list(zip(X, y)) + self._X = X + self._y = y self.modified = True self._update_callback = set() @@ -47,9 +55,26 @@ def __len__(self): ------- n_samples : int """ - return len(self.data) + return self._X.shape[0] + + def __getitem__(self, idx): + # still provide the interface to direct access the data by index + return self._X[idx], self._y[idx] + + @property + def data(self): return self + + def get_labeled_mask(self): + """ + Get the mask of labeled entries. + + Returns + ------- + mask: numpy array of bool, shape = (n_sample, ) + """ + return ~np.fromiter( ( e is None for e in self._y), dtype=bool ) - def len_labeled(self): + def len_labeled(self): """ Number of labeled data entries in this object. @@ -57,7 +82,7 @@ def len_labeled(self): ------- n_samples : int """ - return len(self.get_labeled_entries()) + return self.get_labeled_mask().sum() def len_unlabeled(self): """ @@ -67,7 +92,7 @@ def len_unlabeled(self): ------- n_samples : int """ - return len(list(filter(lambda entry: entry[1] is None, self.data))) + return (~self.get_labeled_mask()).sum() def get_num_of_labels(self): """ @@ -77,7 +102,7 @@ def get_num_of_labels(self): ------- n_labels : int """ - return len({entry[1] for entry in self.data if entry[1] is not None}) + return np.unique( self._y[ self.get_labeled_mask() ] ).size def append(self, feature, label=None): """ @@ -97,9 +122,14 @@ def append(self, feature, label=None): entry_id : {int} entry_id for the appened sample. 
""" - self.data.append((feature, label)) + if isinstance( self._X, np.ndarray ): + self._X = np.vstack([ self._X, feature ]) + else: # sp.csr_matrix + self._X = sp.vstack([ self._X, feature ]) + self._y = np.append( self._y, label ) + self.modified = True - return len(self.data) - 1 + return len(self) - 1 def update(self, entry_id, new_label): """ @@ -113,7 +143,7 @@ def update(self, entry_id, new_label): label : {int, None} Label of the sample to be update. """ - self.data[entry_id] = (self.data[entry_id][0], new_label) + self._y[ entry_id ] = new_label self.modified = True for callback in self._update_callback: callback(entry_id, new_label) @@ -142,8 +172,9 @@ def format_sklearn(self): y : numpy array, shape = (n_samples) Sample labels. """ - X, y = zip(*self.get_labeled_entries()) - return np.array(X), np.array(y) + # becomes the same as get_labled_entries + X, y = self.get_labeled_entries() + return X, np.array(y) def get_entries(self): """ @@ -151,10 +182,10 @@ def get_entries(self): Returns ------- - data : list, shape = (n_samples) - List of all sample feature and label tuple. + X: numpy array or scipy matrix, shape = ( n_sample, n_features ) + y: numpy array, shape = (n_samples) """ - return self.data + return self._X, self._y def get_labeled_entries(self): """ @@ -162,24 +193,21 @@ def get_labeled_entries(self): Returns ------- - labeled_entries : list of (feature, label) tuple - Labeled entries + X: numpy array or scipy matrix, shape = ( n_sample labeled, n_features ) + y: list, shape = (n_samples lebaled) """ - return list(filter(lambda entry: entry[1] is not None, self.data)) + return self._X[ self.get_labeled_mask() ], self._y[ self.get_labeled_mask() ].tolist() - def get_unlabeled_entries(self): + def get_unlabeled_entries(self): # TODO: change interface """ Returns list of unlabeled features, along with their entry_ids Returns ------- - unlabeled_entries : list of (entry_id, feature) tuple - Labeled entries + idx: numpy array, shape = (n_samples unlebaled) + X: numpy array or scipy matrix, shape = ( n_sample unlabeled, n_features ) """ - return [ - (idx, entry[0]) for idx, entry in enumerate(self.data) - if entry[1] is None - ] + return np.where( ~self.get_labeled_mask() )[0], self._X[ ~self.get_labeled_mask() ] def labeled_uniform_sample(self, sample_size, replace=True): """Returns a Dataset object with labeled data only, which is @@ -190,21 +218,16 @@ def labeled_uniform_sample(self, sample_size, replace=True): ---------- sample_size """ - if replace: - samples = [ - random.choice(self.get_labeled_entries()) - for _ in range(sample_size) - ] - else: - samples = random.sample(self.get_labeled_entries(), sample_size) - return Dataset(*zip(*samples)) + idx = np.random.choice( np.where( self.get_labeled_mask() )[0], + size=sample_size, replace=replace ) + return Dataset( self._X[idx], self._y[idx] ) def import_libsvm_sparse(filename): """Imports dataset file in libsvm sparse format""" from sklearn.datasets import load_svmlight_file X, y = load_svmlight_file(filename) - return Dataset(X.toarray().tolist(), y.tolist()) + return Dataset(X.toarray(), y) def import_scipy_mat(filename): diff --git a/libact/base/tests/test_dataset.py b/libact/base/tests/test_dataset.py index f137738..5e6c717 100644 --- a/libact/base/tests/test_dataset.py +++ b/libact/base/tests/test_dataset.py @@ -37,12 +37,12 @@ def test_append(self): dataset = self.setup_dataset() # labeled dataset.append(np.array([9, 8, 7]), 2) - last_labeled_entry = dataset.get_labeled_entries()[-1] + last_labeled_entry = [ e[-1] 
for e in dataset.get_labeled_entries() ] self.assertEqual(last_labeled_entry[0], np.array([9, 8, 7])) self.assertEqual(last_labeled_entry[1], 2) # unlabeled idx = dataset.append(np.array([8, 7, 6])) - last_unlabeled_entry = dataset.get_unlabeled_entries()[-1] + last_unlabeled_entry = [ e[-1] for e in dataset.get_unlabeled_entries() ] self.assertEqual(last_unlabeled_entry[0], idx) self.assertEqual(last_unlabeled_entry[1], np.array([8, 7, 6])) @@ -53,7 +53,7 @@ def test_update(self): dataset.update(idx, 2) self.assertEqual(self.cb_index, idx) self.assertEqual(self.cb_label, 2) - last_labeled_entry = dataset.get_labeled_entries()[-1] + last_labeled_entry = [ e[-1] for e in dataset.get_labeled_entries() ] self.assertEqual(last_labeled_entry[0], np.array([8, 7, 6])) self.assertEqual(last_labeled_entry[1], 2) @@ -65,38 +65,38 @@ def test_format_sklearn(self): def test_get_labeled_entries(self): dataset = self.setup_dataset() - entries = dataset.get_labeled_entries() - self.assertEqual(entries[0][0], np.array([0, 1, 2])) - self.assertEqual(entries[1][0], np.array([3, 4, 5])) - self.assertEqual(entries[2][0], np.array([9, 10, 11])) - self.assertEqual(entries[0][1], 1) - self.assertEqual(entries[1][1], 2) - self.assertEqual(entries[2][1], 1) + X, y = dataset.get_labeled_entries() + self.assertEqual(X[0], np.array([0, 1, 2])) + self.assertEqual(X[1], np.array([3, 4, 5])) + self.assertEqual(X[2], np.array([9, 10, 11])) + self.assertEqual(y[0], 1) + self.assertEqual(y[1], 2) + self.assertEqual(y[2], 1) def test_get_unlabeled_entries(self): dataset = self.setup_dataset() - entries = dataset.get_unlabeled_entries() - self.assertTrue(np.array_equal(entries[0][1], np.array([6, 7, 8]))) - self.assertTrue(np.array_equal(entries[1][1], np.array([12, 13, 14]))) + idx, X = dataset.get_unlabeled_entries() + self.assertTrue(np.array_equal(X[0], np.array([6, 7, 8]))) + self.assertTrue(np.array_equal(X[1], np.array([12, 13, 14]))) def test_labeled_uniform_sample(self): dataset = self.setup_dataset() - pool = dataset.get_labeled_entries() + pool_X, pool_y = dataset.get_labeled_entries() # with replacement dataset_s = dataset.labeled_uniform_sample(10) - for entry_s in dataset_s.get_labeled_entries(): - for entry in pool: - if entry_s[0] is entry[0] and entry_s[1] == entry[1]: + for entry_s in zip(*dataset_s.get_labeled_entries()): + for entry in zip( pool_X, pool_y ): + if np.array_equal( entry_s[0], entry[0]) and entry_s[1] == entry[1]: break else: self.fail() # without replacement dataset_s = dataset.labeled_uniform_sample(3, replace=False) used_indexes = set() - for entry_s in dataset_s.get_labeled_entries(): - for idx, entry in enumerate(pool): + for entry_s in zip(*dataset_s.get_labeled_entries()): + for idx, entry in enumerate( zip( pool_X, pool_y ) ): if ( - entry_s[0] is entry[0] and entry_s[1] == entry[1] + np.array_equal( entry_s[0], entry[0]) and entry_s[1] == entry[1] and idx not in used_indexes ): used_indexes.add(idx) diff --git a/libact/labelers/ideal_labeler.py b/libact/labelers/ideal_labeler.py index 574f4e6..9dbe391 100644 --- a/libact/labelers/ideal_labeler.py +++ b/libact/labelers/ideal_labeler.py @@ -21,7 +21,7 @@ class IdealLabeler(Labeler): """ def __init__(self, dataset, **kwargs): - X, y = zip(*dataset.get_entries()) + X, y = dataset.get_entries() # make sure the input dataset is fully labeled assert (np.array(y) != np.array(None)).all() self.X = X diff --git a/libact/models/multilabel/dummy_clf.py b/libact/models/multilabel/dummy_clf.py index 6ed5250..ea17658 100644 --- 
a/libact/models/multilabel/dummy_clf.py +++ b/libact/models/multilabel/dummy_clf.py @@ -17,7 +17,7 @@ def fit(self, X, y): self.cls = int(y[0]) # 1 or 0 def train(self, dataset): - _, y = zip(*dataset.get_labeled_entries()) + _, y = dataset.get_labeled_entries() self.cls = int(y[0]) def predict(self, X): diff --git a/libact/query_strategies/active_learning_by_learning.py b/libact/query_strategies/active_learning_by_learning.py index 380adf2..22b7bfa 100644 --- a/libact/query_strategies/active_learning_by_learning.py +++ b/libact/query_strategies/active_learning_by_learning.py @@ -134,11 +134,10 @@ def __init__(self, *args, **kwargs): "__init__() missing required keyword-only argument: 'T'" ) - self.unlabeled_entry_ids, _ = \ - zip(*self.dataset.get_unlabeled_entries()) + self.unlabeled_entry_ids, _ = self.dataset.get_unlabeled_entries() self.unlabeled_invert_id_idx = {} - for i, entry in enumerate(self.dataset.get_unlabeled_entries()): - self.unlabeled_invert_id_idx[entry[0]] = i + for i, idx in enumerate(self.dataset.get_unlabeled_entries()[0]): + self.unlabeled_invert_id_idx[idx] = i self.uniform_sampler = kwargs.pop('uniform_sampler', True) if not isinstance(self.uniform_sampler, bool): @@ -219,7 +218,7 @@ def update(self, entry_id, label): def make_query(self): dataset = self.dataset try: - unlabeled_entry_ids, _ = zip(*dataset.get_unlabeled_entries()) + unlabeled_entry_ids, _ = dataset.get_unlabeled_entries() except ValueError: # might be no more unlabeled data left return diff --git a/libact/query_strategies/density_weighted_meta.py b/libact/query_strategies/density_weighted_meta.py index e4770b8..b89f517 100644 --- a/libact/query_strategies/density_weighted_meta.py +++ b/libact/query_strategies/density_weighted_meta.py @@ -102,7 +102,7 @@ def _get_scores(self): dataset = self.dataset X, _ = zip(*dataset.data) scores = self.base_query_strategy._get_scores() - _, X_pool = zip(*dataset.get_unlabeled_entries()) + _, X_pool = dataset.get_unlabeled_entries() unlabeled_entry_ids, base_scores = zip(*scores) self.clustering_method.fit(X) diff --git a/libact/query_strategies/density_weighted_uncertainty_sampling.py b/libact/query_strategies/density_weighted_uncertainty_sampling.py index 8cce0af..f2986c5 100644 --- a/libact/query_strategies/density_weighted_uncertainty_sampling.py +++ b/libact/query_strategies/density_weighted_uncertainty_sampling.py @@ -124,7 +124,7 @@ def __init__(self, *args, **kwargs): @inherit_docstring_from(QueryStrategy) def make_query(self): - unlabeled_entry_ids, _ = zip(*self.dataset.get_unlabeled_entries()) + unlabeled_entry_ids, _ = self.dataset.get_unlabeled_entries() labeled_entry_ids = np.array([eid for eid, x in enumerate(self.dataset.data) if x[1] != None]) diff --git a/libact/query_strategies/hintsvm.py b/libact/query_strategies/hintsvm.py index 115cedd..f6252cc 100644 --- a/libact/query_strategies/hintsvm.py +++ b/libact/query_strategies/hintsvm.py @@ -132,9 +132,8 @@ def __init__(self, *args, **kwargs): @inherit_docstring_from(QueryStrategy) def make_query(self): dataset = self.dataset - unlabeled_entry_ids, unlabeled_pool = zip( - *dataset.get_unlabeled_entries()) - labeled_pool, y = zip(*dataset.get_labeled_entries()) + unlabeled_entry_ids, unlabeled_pool = dataset.get_unlabeled_entries() + labeled_pool, y = dataset.get_labeled_entries() if len(np.unique(y)) > 2: raise ValueError("HintSVM query strategy support binary class " "active learning only. 
Found %s classes" % len(np.unique(y))) diff --git a/libact/query_strategies/multiclass/active_learning_with_cost_embedding.py b/libact/query_strategies/multiclass/active_learning_with_cost_embedding.py index 0073b9f..4653552 100644 --- a/libact/query_strategies/multiclass/active_learning_with_cost_embedding.py +++ b/libact/query_strategies/multiclass/active_learning_with_cost_embedding.py @@ -119,9 +119,9 @@ def __init__(self, @inherit_docstring_from(QueryStrategy) def make_query(self): dataset = self.dataset - unlabeled_entry_ids, pool_X = zip(*dataset.get_unlabeled_entries()) + unlabeled_entry_ids, pool_X = dataset.get_unlabeled_entries() # The input class should be 0-n_classes - X, y = zip(*dataset.get_labeled_entries()) + X, y = dataset.get_labeled_entries() pred_embed = np.zeros((len(pool_X), self.embed_dim)) for i in range(self.embed_dim): diff --git a/libact/query_strategies/multiclass/expected_error_reduction.py b/libact/query_strategies/multiclass/expected_error_reduction.py index 365d7bf..8c12bdf 100644 --- a/libact/query_strategies/multiclass/expected_error_reduction.py +++ b/libact/query_strategies/multiclass/expected_error_reduction.py @@ -77,8 +77,8 @@ def __init__(self, dataset, model=None, loss='log', random_state=None): @inherit_docstring_from(QueryStrategy) def make_query(self): dataset = self.dataset - X, y = zip(*dataset.get_labeled_entries()) - unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries()) + X, y = dataset.get_labeled_entries() + unlabeled_entry_ids, X_pool = dataset.get_unlabeled_entries() classes = np.unique(y) n_classes = len(classes) @@ -91,7 +91,7 @@ def make_query(self): score = [] for yi in range(n_classes): m = copy.deepcopy(self.model) - m.train(Dataset(np.vstack((X, [x])), y + (yi, ))) + m.train(Dataset(np.vstack((X, [x])), y + [yi ] )) p = m.predict_proba(X_pool) if self.loss == '01': # 0/1 loss diff --git a/libact/query_strategies/multiclass/hierarchical_sampling.py b/libact/query_strategies/multiclass/hierarchical_sampling.py index f7823b4..1c88853 100644 --- a/libact/query_strategies/multiclass/hierarchical_sampling.py +++ b/libact/query_strategies/multiclass/hierarchical_sampling.py @@ -126,7 +126,7 @@ class HierarchicalSampling(QueryStrategy): def __init__(self, dataset, classes, active_selecting=True, subsample_qs=None, random_state=None): super(HierarchicalSampling, self).__init__(dataset) - X = np.array(next(zip(*self.dataset.get_entries()))) + X, _ = self.dataset.get_entries() cluster = AgglomerativeClustering() cluster.fit(X) childrens = cluster.children_ diff --git a/libact/query_strategies/multilabel/adaptive_active_learning.py b/libact/query_strategies/multilabel/adaptive_active_learning.py index 44aeb4f..8436eac 100644 --- a/libact/query_strategies/multilabel/adaptive_active_learning.py +++ b/libact/query_strategies/multilabel/adaptive_active_learning.py @@ -94,8 +94,8 @@ def __init__(self, dataset, base_clf, betas=None, n_jobs=1, @inherit_docstring_from(QueryStrategy) def make_query(self): dataset = self.dataset - X, Y = zip(*dataset.get_labeled_entries()) - unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries()) + X, Y = dataset.get_labeled_entries() + unlabeled_entry_ids, X_pool = dataset.get_unlabeled_entries() Y = np.array(Y) X, X_pool = np.array(X), np.array(X_pool) diff --git a/libact/query_strategies/multilabel/binary_minimization.py b/libact/query_strategies/multilabel/binary_minimization.py index 5eccd3c..463df5d 100644 --- a/libact/query_strategies/multilabel/binary_minimization.py +++ 
b/libact/query_strategies/multilabel/binary_minimization.py @@ -60,9 +60,9 @@ def __init__(self, dataset, base_clf, random_state=None): @inherit_docstring_from(QueryStrategy) def make_query(self): dataset = self.dataset - X, Y = zip(*dataset.get_labeled_entries()) + X, Y = dataset.get_labeled_entries() Y = np.array(Y) - unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries()) + unlabeled_entry_ids, X_pool = dataset.get_unlabeled_entries() X_pool = np.array(X_pool) clfs = [] diff --git a/libact/query_strategies/multilabel/cost_sensitive_reference_pair_encoding.py b/libact/query_strategies/multilabel/cost_sensitive_reference_pair_encoding.py index 6a7e710..142078b 100644 --- a/libact/query_strategies/multilabel/cost_sensitive_reference_pair_encoding.py +++ b/libact/query_strategies/multilabel/cost_sensitive_reference_pair_encoding.py @@ -84,7 +84,7 @@ def __init__(self, dataset, scoring_fn, model, base_model, n_models=100, def make_query(self): dataset = self.dataset - unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries()) + unlabeled_entry_ids, X_pool = dataset.get_unlabeled_entries() X_pool = np.asarray(X_pool) self.csrpe_.train(dataset) diff --git a/libact/query_strategies/multilabel/maximum_margin_reduction.py b/libact/query_strategies/multilabel/maximum_margin_reduction.py index 91213bd..c658d75 100644 --- a/libact/query_strategies/multilabel/maximum_margin_reduction.py +++ b/libact/query_strategies/multilabel/maximum_margin_reduction.py @@ -67,7 +67,8 @@ class MaximumLossReductionMaximalConfidence(QueryStrategy): def __init__(self, *args, **kwargs): super(MaximumLossReductionMaximalConfidence, self).__init__(*args, **kwargs) - self.n_labels = len(self.dataset.get_labeled_entries()[0][1]) + # self.n_labels = len(self.dataset.get_labeled_entries()[0][1]) + self.n_labels = len(self.dataset.get_labeled_entries()[1][0]) random_state = kwargs.pop('random_state', None) self.random_state_ = seed_random_state(random_state) @@ -87,8 +88,8 @@ def __init__(self, *args, **kwargs): @inherit_docstring_from(QueryStrategy) def make_query(self): dataset = self.dataset - labeled_pool, Y = zip(*dataset.get_labeled_entries()) - unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries()) + labeled_pool, Y = dataset.get_labeled_entries() + unlabeled_entry_ids, X_pool = dataset.get_unlabeled_entries() labeled_pool = np.array(labeled_pool) Y = np.array(Y) X_pool = np.array(X_pool) diff --git a/libact/query_strategies/multilabel/multilabel_with_auxiliary_learner.py b/libact/query_strategies/multilabel/multilabel_with_auxiliary_learner.py index 1cdaa2e..1d9ed4b 100644 --- a/libact/query_strategies/multilabel/multilabel_with_auxiliary_learner.py +++ b/libact/query_strategies/multilabel/multilabel_with_auxiliary_learner.py @@ -91,8 +91,8 @@ def __init__(self, dataset, major_learner, auxiliary_learner, @inherit_docstring_from(QueryStrategy) def make_query(self): dataset = self.dataset - labeled_pool, Y = zip(*dataset.get_labeled_entries()) - unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries()) + labeled_pool, Y = dataset.get_labeled_entries() + unlabeled_entry_ids, X_pool = dataset.get_unlabeled_entries() major_clf = copy.deepcopy(self.major_learner) major_clf.train(dataset) diff --git a/libact/query_strategies/query_by_committee.py b/libact/query_strategies/query_by_committee.py index 0335035..527c38b 100644 --- a/libact/query_strategies/query_by_committee.py +++ b/libact/query_strategies/query_by_committee.py @@ -158,11 +158,11 @@ def 
_kl_divergence_disagreement(self, proba): def _labeled_uniform_sample(self, sample_size): """sample labeled entries uniformly""" - labeled_entries = self.dataset.get_labeled_entries() - samples = [labeled_entries[ - self.random_state_.randint(0, len(labeled_entries)) - ]for _ in range(sample_size)] - return Dataset(*zip(*samples)) + X, y = self.dataset.get_labeled_entries() + samples_idx = [ + self.random_state_.randint(0, X.shape[0]) for _ in range(sample_size)] + return Dataset( X[samples_idx], np.array(y)[samples_idx] ) + # return self.dataset.labeled_uniform_sample(sample_size, replace=True) def teach_students(self): """ @@ -186,7 +186,7 @@ def update(self, entry_id, label): @inherit_docstring_from(QueryStrategy) def make_query(self): dataset = self.dataset - unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries()) + unlabeled_entry_ids, X_pool = dataset.get_unlabeled_entries() if self.disagreement == 'vote': # Let the trained students vote for unlabeled data diff --git a/libact/query_strategies/quire.py b/libact/query_strategies/quire.py index f92ed17..7002a26 100644 --- a/libact/query_strategies/quire.py +++ b/libact/query_strategies/quire.py @@ -69,14 +69,16 @@ class QUIRE(QueryStrategy): def __init__(self, *args, **kwargs): super(QUIRE, self).__init__(*args, **kwargs) - self.Uindex = [ - idx for idx, _ in self.dataset.get_unlabeled_entries() - ] - self.Lindex = [ - idx for idx in range(len(self.dataset)) if idx not in self.Uindex - ] + self.Uindex = self.dataset.get_unlabeled_entries()[0].tolist() + self.Lindex = np.where( self.dataset.get_labeled_mask() )[0].tolist() + # self.Uindex = [ + # idx for idx, _ in self.dataset.get_unlabeled_entries() + # ] + # self.Lindex = [ + # idx for idx in range(len(self.dataset)) if idx not in self.Uindex + # ] self.lmbda = kwargs.pop('lambda', 1.) 
- X, self.y = zip(*self.dataset.get_entries()) + X, self.y = self.dataset.get_entries() self.y = list(self.y) self.kernel = kwargs.pop('kernel', 'rbf') if self.kernel == 'rbf': diff --git a/libact/query_strategies/random_sampling.py b/libact/query_strategies/random_sampling.py index 6e33f1e..8feab8e 100644 --- a/libact/query_strategies/random_sampling.py +++ b/libact/query_strategies/random_sampling.py @@ -45,7 +45,7 @@ def __init__(self, dataset, **kwargs): @inherit_docstring_from(QueryStrategy) def make_query(self): dataset = self.dataset - unlabeled_entry_ids, _ = zip(*dataset.get_unlabeled_entries()) + unlabeled_entry_ids, _ = dataset.get_unlabeled_entries() entry_id = unlabeled_entry_ids[ self.random_state_.randint(0, len(unlabeled_entry_ids))] return entry_id diff --git a/libact/query_strategies/uncertainty_sampling.py b/libact/query_strategies/uncertainty_sampling.py index 9543455..e04b393 100644 --- a/libact/query_strategies/uncertainty_sampling.py +++ b/libact/query_strategies/uncertainty_sampling.py @@ -98,7 +98,7 @@ def __init__(self, *args, **kwargs): def _get_scores(self): dataset = self.dataset self.model.train(dataset) - unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries()) + unlabeled_entry_ids, X_pool = dataset.get_unlabeled_entries() if isinstance(self.model, ProbabilisticModel): dvalue = self.model.predict_proba(X_pool) @@ -135,7 +135,7 @@ def make_query(self, return_score=False): """ dataset = self.dataset - unlabeled_entry_ids, _ = zip(*dataset.get_unlabeled_entries()) + # unlabeled_entry_ids, _ = dataset.get_unlabeled_entries() unlabeled_entry_ids, scores = zip(*self._get_scores()) ask_id = np.argmax(scores) diff --git a/libact/query_strategies/variance_reduction.py b/libact/query_strategies/variance_reduction.py index 3789e9e..2e83dbf 100644 --- a/libact/query_strategies/variance_reduction.py +++ b/libact/query_strategies/variance_reduction.py @@ -62,13 +62,11 @@ def __init__(self, *args, **kwargs): @inherit_docstring_from(QueryStrategy) def make_query(self): - labeled_entries = self.dataset.get_labeled_entries() - Xlabeled, y = zip(*labeled_entries) + Xlabeled, y = self.dataset.get_labeled_entries() Xlabeled = np.array(Xlabeled) y = list(y) - unlabeled_entries = self.dataset.get_unlabeled_entries() - unlabeled_entry_ids, X_pool = zip(*unlabeled_entries) + unlabeled_entry_ids, X_pool = self.dataset.get_unlabeled_entries() label_count = self.dataset.get_num_of_labels() From 12eb065a55b5cd9b9da75f2fce91e1335a721852 Mon Sep 17 00:00:00 2001 From: Eugene Yang Date: Sun, 30 Jun 2019 17:22:13 -0500 Subject: [PATCH 2/7] update examples --- examples/albl_plot.py | 3 +-- examples/alce_plot.py | 3 +-- examples/plot.py | 5 ++--- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/examples/albl_plot.py b/examples/albl_plot.py index 5ebc09b..7fdbdaf 100755 --- a/examples/albl_plot.py +++ b/examples/albl_plot.py @@ -27,8 +27,7 @@ def run(trn_ds, tst_ds, lbr, model, qs, quota): for _ in range(quota): ask_id = qs.make_query() - X, _ = zip(*trn_ds.data) - lb = lbr.label(X[ask_id]) + lb = lbr.label( trn_ds.data[ask_id][0] ) trn_ds.update(ask_id, lb) model.train(trn_ds) diff --git a/examples/alce_plot.py b/examples/alce_plot.py index bc06a25..b6e5b63 100644 --- a/examples/alce_plot.py +++ b/examples/alce_plot.py @@ -34,8 +34,7 @@ def run(trn_ds, tst_ds, lbr, model, qs, quota, cost_matrix): # Standard usage of libact objects if i > 0: ask_id = qs.make_query() - X, _ = zip(*trn_ds.data) - lb = lbr.label(X[ask_id]) + lb = lbr.label(trn_ds.data[ask_id][0]) 
trn_ds.update(ask_id, lb) model.train(trn_ds) diff --git a/examples/plot.py b/examples/plot.py index e21b40c..d072d0e 100755 --- a/examples/plot.py +++ b/examples/plot.py @@ -27,9 +27,8 @@ def run(trn_ds, tst_ds, lbr, model, qs, quota): for _ in range(quota): # Standard usage of libact objects - ask_id = qs.make_query() - X, _ = zip(*trn_ds.data) - lb = lbr.label(X[ask_id]) + ask_id = qs.make_query() + lb = lbr.label( trn_ds.data[ask_id][0] ) trn_ds.update(ask_id, lb) model.train(trn_ds) From 37da9659c194d07273a2158db9105d19b77aaec7 Mon Sep 17 00:00:00 2001 From: Eugene Yang Date: Mon, 1 Jul 2019 09:51:53 -0500 Subject: [PATCH 3/7] remove redundant whitespace --- libact/base/dataset.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/libact/base/dataset.py b/libact/base/dataset.py index a88538d..5aab7d0 100644 --- a/libact/base/dataset.py +++ b/libact/base/dataset.py @@ -40,7 +40,7 @@ def __init__(self, X=None, y=None): if y is None: y = [] y = np.array( y ) - + # self.data = list(zip(X, y)) self._X = X self._y = y @@ -56,11 +56,11 @@ def __len__(self): n_samples : int """ return self._X.shape[0] - + def __getitem__(self, idx): # still provide the interface to direct access the data by index return self._X[idx], self._y[idx] - + @property def data(self): return self @@ -74,7 +74,7 @@ def get_labeled_mask(self): """ return ~np.fromiter( ( e is None for e in self._y), dtype=bool ) - def len_labeled(self): + def len_labeled(self): """ Number of labeled data entries in this object. @@ -127,7 +127,7 @@ def append(self, feature, label=None): else: # sp.csr_matrix self._X = sp.vstack([ self._X, feature ]) self._y = np.append( self._y, label ) - + self.modified = True return len(self) - 1 @@ -218,7 +218,7 @@ def labeled_uniform_sample(self, sample_size, replace=True): ---------- sample_size """ - idx = np.random.choice( np.where( self.get_labeled_mask() )[0], + idx = np.random.choice( np.where( self.get_labeled_mask() )[0], size=sample_size, replace=replace ) return Dataset( self._X[idx], self._y[idx] ) From acf7136e9e7de6ef5d49b555933ff8284a51c0dc Mon Sep 17 00:00:00 2001 From: Eugene Yang Date: Tue, 2 Jul 2019 10:37:11 -0500 Subject: [PATCH 4/7] update coding style --- libact/base/dataset.py | 25 +++++++++---------- libact/base/tests/test_dataset.py | 14 +++++------ .../multiclass/expected_error_reduction.py | 2 +- libact/query_strategies/query_by_committee.py | 6 ++--- 4 files changed, 22 insertions(+), 25 deletions(-) diff --git a/libact/base/dataset.py b/libact/base/dataset.py index 5aab7d0..a284886 100644 --- a/libact/base/dataset.py +++ b/libact/base/dataset.py @@ -36,12 +36,11 @@ class Dataset(object): def __init__(self, X=None, y=None): if X is None: X = np.array([]) elif not isinstance(X, sp.csr_matrix): - X = np.array( X ) + X = np.array(X) if y is None: y = [] - y = np.array( y ) + y = np.array(y) - # self.data = list(zip(X, y)) self._X = X self._y = y self.modified = True @@ -102,7 +101,7 @@ def get_num_of_labels(self): ------- n_labels : int """ - return np.unique( self._y[ self.get_labeled_mask() ] ).size + return np.unique( self._y[self.get_labeled_mask()] ).size def append(self, feature, label=None): """ @@ -123,10 +122,10 @@ def append(self, feature, label=None): entry_id for the appened sample. 
""" if isinstance( self._X, np.ndarray ): - self._X = np.vstack([ self._X, feature ]) + self._X = np.vstack([self._X, feature]) else: # sp.csr_matrix - self._X = sp.vstack([ self._X, feature ]) - self._y = np.append( self._y, label ) + self._X = sp.vstack([self._X, feature]) + self._y = np.append(self._y, label) self.modified = True return len(self) - 1 @@ -196,9 +195,9 @@ def get_labeled_entries(self): X: numpy array or scipy matrix, shape = ( n_sample labeled, n_features ) y: list, shape = (n_samples lebaled) """ - return self._X[ self.get_labeled_mask() ], self._y[ self.get_labeled_mask() ].tolist() + return self._X[self.get_labeled_mask()], self._y[self.get_labeled_mask()].tolist() - def get_unlabeled_entries(self): # TODO: change interface + def get_unlabeled_entries(self): """ Returns list of unlabeled features, along with their entry_ids @@ -207,7 +206,7 @@ def get_unlabeled_entries(self): # TODO: change interface idx: numpy array, shape = (n_samples unlebaled) X: numpy array or scipy matrix, shape = ( n_sample unlabeled, n_features ) """ - return np.where( ~self.get_labeled_mask() )[0], self._X[ ~self.get_labeled_mask() ] + return np.where(~self.get_labeled_mask())[0], self._X[~self.get_labeled_mask()] def labeled_uniform_sample(self, sample_size, replace=True): """Returns a Dataset object with labeled data only, which is @@ -218,9 +217,9 @@ def labeled_uniform_sample(self, sample_size, replace=True): ---------- sample_size """ - idx = np.random.choice( np.where( self.get_labeled_mask() )[0], - size=sample_size, replace=replace ) - return Dataset( self._X[idx], self._y[idx] ) + idx = np.random.choice(np.where(self.get_labeled_mask())[0], + size=sample_size, replace=replace ) + return Dataset(self._X[idx], self._y[idx]) def import_libsvm_sparse(filename): diff --git a/libact/base/tests/test_dataset.py b/libact/base/tests/test_dataset.py index 5e6c717..e538696 100644 --- a/libact/base/tests/test_dataset.py +++ b/libact/base/tests/test_dataset.py @@ -37,12 +37,12 @@ def test_append(self): dataset = self.setup_dataset() # labeled dataset.append(np.array([9, 8, 7]), 2) - last_labeled_entry = [ e[-1] for e in dataset.get_labeled_entries() ] + last_labeled_entry = [e[-1] for e in dataset.get_labeled_entries()] self.assertEqual(last_labeled_entry[0], np.array([9, 8, 7])) self.assertEqual(last_labeled_entry[1], 2) # unlabeled idx = dataset.append(np.array([8, 7, 6])) - last_unlabeled_entry = [ e[-1] for e in dataset.get_unlabeled_entries() ] + last_unlabeled_entry = [e[-1] for e in dataset.get_unlabeled_entries()] self.assertEqual(last_unlabeled_entry[0], idx) self.assertEqual(last_unlabeled_entry[1], np.array([8, 7, 6])) @@ -53,7 +53,7 @@ def test_update(self): dataset.update(idx, 2) self.assertEqual(self.cb_index, idx) self.assertEqual(self.cb_label, 2) - last_labeled_entry = [ e[-1] for e in dataset.get_labeled_entries() ] + last_labeled_entry = [e[-1] for e in dataset.get_labeled_entries()] self.assertEqual(last_labeled_entry[0], np.array([8, 7, 6])) self.assertEqual(last_labeled_entry[1], 2) @@ -85,8 +85,8 @@ def test_labeled_uniform_sample(self): # with replacement dataset_s = dataset.labeled_uniform_sample(10) for entry_s in zip(*dataset_s.get_labeled_entries()): - for entry in zip( pool_X, pool_y ): - if np.array_equal( entry_s[0], entry[0]) and entry_s[1] == entry[1]: + for entry in zip(pool_X, pool_y): + if np.array_equal(entry_s[0], entry[0]) and entry_s[1] == entry[1]: break else: self.fail() @@ -94,9 +94,9 @@ def test_labeled_uniform_sample(self): dataset_s = 
dataset.labeled_uniform_sample(3, replace=False) used_indexes = set() for entry_s in zip(*dataset_s.get_labeled_entries()): - for idx, entry in enumerate( zip( pool_X, pool_y ) ): + for idx, entry in enumerate(zip(pool_X, pool_y)): if ( - np.array_equal( entry_s[0], entry[0]) and entry_s[1] == entry[1] + np.array_equal(entry_s[0], entry[0]) and entry_s[1] == entry[1] and idx not in used_indexes ): used_indexes.add(idx) diff --git a/libact/query_strategies/multiclass/expected_error_reduction.py b/libact/query_strategies/multiclass/expected_error_reduction.py index 8c12bdf..7ddc238 100644 --- a/libact/query_strategies/multiclass/expected_error_reduction.py +++ b/libact/query_strategies/multiclass/expected_error_reduction.py @@ -91,7 +91,7 @@ def make_query(self): score = [] for yi in range(n_classes): m = copy.deepcopy(self.model) - m.train(Dataset(np.vstack((X, [x])), y + [yi ] )) + m.train(Dataset(np.vstack((X, [x])), y + [yi])) p = m.predict_proba(X_pool) if self.loss == '01': # 0/1 loss diff --git a/libact/query_strategies/query_by_committee.py b/libact/query_strategies/query_by_committee.py index 527c38b..73cc6cd 100644 --- a/libact/query_strategies/query_by_committee.py +++ b/libact/query_strategies/query_by_committee.py @@ -159,10 +159,8 @@ def _kl_divergence_disagreement(self, proba): def _labeled_uniform_sample(self, sample_size): """sample labeled entries uniformly""" X, y = self.dataset.get_labeled_entries() - samples_idx = [ - self.random_state_.randint(0, X.shape[0]) for _ in range(sample_size)] - return Dataset( X[samples_idx], np.array(y)[samples_idx] ) - # return self.dataset.labeled_uniform_sample(sample_size, replace=True) + samples_idx = [self.random_state_.randint(0, X.shape[0]) for _ in range(sample_size)] + return Dataset(X[samples_idx], np.array(y)[samples_idx]) def teach_students(self): """ From 3e326565a2ac18ffb49be8afefc36a93c21d393a Mon Sep 17 00:00:00 2001 From: Eugene Yang Date: Tue, 2 Jul 2019 10:52:50 -0500 Subject: [PATCH 5/7] oops, miss 2 places --- examples/albl_plot.py | 2 +- examples/plot.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/albl_plot.py b/examples/albl_plot.py index 7fdbdaf..35fbddd 100755 --- a/examples/albl_plot.py +++ b/examples/albl_plot.py @@ -27,7 +27,7 @@ def run(trn_ds, tst_ds, lbr, model, qs, quota): for _ in range(quota): ask_id = qs.make_query() - lb = lbr.label( trn_ds.data[ask_id][0] ) + lb = lbr.label(trn_ds.data[ask_id][0]) trn_ds.update(ask_id, lb) model.train(trn_ds) diff --git a/examples/plot.py b/examples/plot.py index d072d0e..ba77c91 100755 --- a/examples/plot.py +++ b/examples/plot.py @@ -28,7 +28,7 @@ def run(trn_ds, tst_ds, lbr, model, qs, quota): for _ in range(quota): # Standard usage of libact objects ask_id = qs.make_query() - lb = lbr.label( trn_ds.data[ask_id][0] ) + lb = lbr.label(trn_ds.data[ask_id][0]) trn_ds.update(ask_id, lb) model.train(trn_ds) From f629d3a22d5708263d02cd5be47f84981d3d559c Mon Sep 17 00:00:00 2001 From: Eugene Yang Date: Tue, 2 Jul 2019 11:01:48 -0500 Subject: [PATCH 6/7] should be all... 
--- libact/base/dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libact/base/dataset.py b/libact/base/dataset.py index a284886..8684116 100644 --- a/libact/base/dataset.py +++ b/libact/base/dataset.py @@ -71,7 +71,7 @@ def get_labeled_mask(self): ------- mask: numpy array of bool, shape = (n_sample, ) """ - return ~np.fromiter( ( e is None for e in self._y), dtype=bool ) + return ~np.fromiter((e is None for e in self._y), dtype=bool) def len_labeled(self): """ @@ -101,7 +101,7 @@ def get_num_of_labels(self): ------- n_labels : int """ - return np.unique( self._y[self.get_labeled_mask()] ).size + return np.unique(self._y[self.get_labeled_mask()]).size def append(self, feature, label=None): """ @@ -121,7 +121,7 @@ def append(self, feature, label=None): entry_id : {int} entry_id for the appened sample. """ - if isinstance( self._X, np.ndarray ): + if isinstance(self._X, np.ndarray): self._X = np.vstack([self._X, feature]) else: # sp.csr_matrix self._X = sp.vstack([self._X, feature]) @@ -142,7 +142,7 @@ def update(self, entry_id, new_label): label : {int, None} Label of the sample to be update. """ - self._y[ entry_id ] = new_label + self._y[entry_id] = new_label self.modified = True for callback in self._update_callback: callback(entry_id, new_label) From 776ee7ba14322bc226fe3719ce7075fa28db67e2 Mon Sep 17 00:00:00 2001 From: Eugene Yang Date: Wed, 3 Jul 2019 16:49:32 -0500 Subject: [PATCH 7/7] whitespace --- libact/query_strategies/quire.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libact/query_strategies/quire.py b/libact/query_strategies/quire.py index 7002a26..544baf1 100644 --- a/libact/query_strategies/quire.py +++ b/libact/query_strategies/quire.py @@ -70,7 +70,7 @@ class QUIRE(QueryStrategy): def __init__(self, *args, **kwargs): super(QUIRE, self).__init__(*args, **kwargs) self.Uindex = self.dataset.get_unlabeled_entries()[0].tolist() - self.Lindex = np.where( self.dataset.get_labeled_mask() )[0].tolist() + self.Lindex = np.where(self.dataset.get_labeled_mask())[0].tolist() # self.Uindex = [ # idx for idx, _ in self.dataset.get_unlabeled_entries() # ]
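
For reference, a minimal usage sketch of the Dataset interface as it stands
once this series is applied (the toy feature and label values are
illustrative, not taken from the library):

    import numpy as np
    from libact.base.dataset import Dataset

    X = np.array([[0., 1.], [2., 3.], [4., 5.], [6., 7.]])
    y = [0, 1, None, None]                  # None marks an unlabeled entry
    ds = Dataset(X, y)

    ds.get_labeled_mask()                   # array([ True,  True, False, False])
    ds.len_labeled(), ds.len_unlabeled()    # (2, 2)

    # Both getters now return array pairs instead of lists of
    # (feature, label) tuples:
    X_lab, y_lab = ds.get_labeled_entries()     # (2, 2) ndarray and [0, 1]
    ids, X_pool = ds.get_unlabeled_entries()    # array([2, 3]) and (2, 2) ndarray

    # Tuple-style access survives through the data property and __getitem__,
    # which is what the updated examples rely on:
    feature, label = ds.data[2]                 # label is None here

    entry_id = ds.append(np.array([8., 9.]))    # appended unlabeled, id 4
    ds.update(entry_id, 1)                      # attach a label
    X_train, y_train = ds.format_sklearn()      # arrays ready for model.fit()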
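
The same sketch on the sparse path the series adds: __init__ passes a scipy
csr_matrix through unconverted, and append() dispatches to sp.vstack instead
of np.vstack, so the pool stays sparse throughout (values again illustrative):

    import numpy as np
    import scipy.sparse as sp
    from libact.base.dataset import Dataset

    X = sp.csr_matrix(np.array([[1., 0., 0.],
                                [0., 2., 0.],
                                [0., 0., 3.]]))
    ds = Dataset(X, [0, 1, None])

    # self._X is not an np.ndarray here, so append() takes the
    # sp.vstack branch:
    ds.append(sp.csr_matrix([[4., 0., 0.]]), label=1)

    ids, X_pool = ds.get_unlabeled_entries()    # X_pool is still sparse
    X_train, y_train = ds.format_sklearn()      # X_train sparse, y_train ndarray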