Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

add data partition for libsvm iter #7027

Merged
merged 1 commit into from
Jul 21, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions src/io/iter_libsvm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ struct LibSVMIterParam : public dmlc::Parameter<LibSVMIterParam> {
std::string label_libsvm;
/*! \brief label shape */
TShape label_shape;
/*! \brief partition the data into multiple parts */
int num_parts;
/*! \brief the index of the part to read */
int part_index;
// declare parameters
DMLC_DECLARE_PARAMETER(LibSVMIterParam) {
DMLC_DECLARE_FIELD(data_libsvm)
Expand All @@ -35,6 +39,10 @@ struct LibSVMIterParam : public dmlc::Parameter<LibSVMIterParam> {
index_t shape1[] = {1};
DMLC_DECLARE_FIELD(label_shape).set_default(TShape(shape1, shape1 + 1))
.describe("The shape of one label.");
DMLC_DECLARE_FIELD(num_parts).set_default(1)
.describe("partition the data into multiple parts");
DMLC_DECLARE_FIELD(part_index).set_default(0)
.describe("the index of the part will read");
}
};

Expand All @@ -47,11 +55,15 @@ class LibSVMIter: public SparseIIterator<DataInst> {
virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
param_.InitAllowUnknown(kwargs);
CHECK_EQ(param_.data_shape.ndim(), 1) << "dimension of data_shape is expected to be 1";
CHECK_GT(param_.num_parts, 0) << "number of parts should be positive";
CHECK_GE(param_.part_index, 0) << "part index should be non-negative";
data_parser_.reset(dmlc::Parser<uint64_t>::Create(param_.data_libsvm.c_str(),
0, 1, "libsvm"));
param_.part_index,
param_.num_parts, "libsvm"));
if (param_.label_libsvm != "NULL") {
label_parser_.reset(dmlc::Parser<uint64_t>::Create(param_.label_libsvm.c_str(),
0, 1, "libsvm"));
param_.part_index,
param_.num_parts, "libsvm"));
CHECK_GT(param_.label_shape.Size(), 1)
<< "label_shape is not expected to be (1,) when param_.label_libsvm is set.";
} else {
Expand Down
20 changes: 14 additions & 6 deletions tests/python/unittest/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,29 +161,37 @@ def check_libSVMIter_synthetic():
assert_almost_equal(data_train.getdata().asnumpy(), expected)
i += 1

def check_libSVMIter_news_metadata():
def check_libSVMIter_news_data():
news_metadata = {
'name': 'news20.t',
'origin_name': 'news20.t.bz2',
'url': "http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/news20.t.bz2",
'shape': 62060,
'feature_dim': 62060,
'num_classes': 20,
'num_examples': 3993,
}
num_parts = 3
batch_size = 128
num_examples = news_metadata['num_examples']
data_dir = os.path.join(os.getcwd(), 'data')
get_data(data_dir, news_metadata['name'], news_metadata['url'],
news_metadata['origin_name'])
path = os.path.join(data_dir, news_metadata['name'])
data_train = mx.io.LibSVMIter(data_libsvm=path,
data_shape=(news_metadata['shape'], ),
batch_size=512)
data_train = mx.io.LibSVMIter(data_libsvm=path, data_shape=(news_metadata['feature_dim'],),
batch_size=batch_size, num_parts=num_parts, part_index=0)
num_batches = 0
iterator = iter(data_train)
for batch in iterator:
# check the range of labels
assert(np.sum(batch.label[0].asnumpy() > 20) == 0)
assert(np.sum(batch.label[0].asnumpy() <= 0) == 0)
num_batches += 1
import math
expected_num_batches = math.ceil(num_examples * 1.0 / batch_size / num_parts)
assert(num_batches == int(expected_num_batches)), (num_batches, expected_num_batches)

check_libSVMIter_synthetic()
check_libSVMIter_news_metadata()
check_libSVMIter_news_data()

if __name__ == "__main__":
test_NDArrayIter()
Expand Down