Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SvmTrain #10

Merged
merged 6 commits into from
Sep 6, 2017
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions datasets/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ covtype*
news20*
rcv1*
mushrooms
splice*
33 changes: 31 additions & 2 deletions datasets/Makefile
Original file line number Diff line number Diff line change
@@ -1,13 +1,35 @@
all: covtype.libsvm.binary news20.binary rcv1_train.binary rcv1_test.binary mushrooms
all: covtype news20 rcv1 mushrooms splice

.PHONY: help

# Print every target that is annotated with a double-hash marker, followed by
# the text written after that marker. The marker is deliberately not spelled
# out in this comment: the recipe greps the Makefile for it and would
# otherwise list this comment as if it were a target.
# Each matching line is split on '#': field 0 is the target name (printed
# left-aligned in a 30-character column), field 2 is the description.
help:
@IFS=$$'\n' ; \
help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//'`); \
for help_line in $${help_lines[@]}; do \
IFS=$$'#' ; \
help_split=($$help_line) ; \
help_command=`echo $${help_split[0]} | sed -e 's/^ *//' -e 's/ *$$//'` ; \
help_info=`echo $${help_split[2]} | sed -e 's/^ *//' -e 's/ *$$//'` ; \
printf "%-30s %s\n" $$help_command $$help_info ; \
done

covtype: ## covtype
covtype: covtype.libsvm.binary

covtype.libsvm.binary:
wget https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/covtype.libsvm.binary.bz2
bzip2 -df covtype.libsvm.binary.bz2

news20: ## news20
news20: news20.binary

news20.binary:
wget https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/news20.binary.bz2
bzip2 -df news20.binary.bz2

rcv1: ## rcv1
rcv1: rcv1_train.binary rcv1_test.binary

rcv1_train.binary:
wget https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/rcv1_train.binary.bz2
bzip2 -df rcv1_train.binary.bz2
Expand All @@ -16,5 +38,12 @@ rcv1_test.binary:
wget https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/rcv1_test.binary.bz2
bzip2 -df rcv1_test.binary.bz2

mushrooms:
mushrooms: ## mushrooms
wget https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/mushrooms

# Training split of the splice dataset; also pulls in the test split.
# Both files are fetched over https, like every other dataset in this file.
splice: ## splice
splice: splice.t
	wget https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/splice

# Test split of the splice dataset.
splice.t:
	wget https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/splice.t
8 changes: 7 additions & 1 deletion datasets/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,13 @@ datasets by executing `make` in this directory. You can also download
specific datasets by name, e.g.

```bash
make covtype.libsvm.binary
make covtype
```

To see the available datasets, use:

```bash
make help
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I love this Makefile!

```


Expand Down
5 changes: 5 additions & 0 deletions src/psvm/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
SvmPredict
SvmTrain
PredictResult
psvm

14 changes: 0 additions & 14 deletions src/psvm/Document.chpl

This file was deleted.

168 changes: 168 additions & 0 deletions src/psvm/Documents.chpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
/*
  One document sample: its id, its class label, the squared two-norm of its
  feature weights, and the features themselves.
*/
record Sample {
  var id: int;
  var classLabel: int;
  var twoNormSq: real;
  // Index set over which `features` is declared.
  var featureDom: domain(1);
  var features: [featureDom] Feature;
}

/* A single feature entry: the feature's id paired with its weight. */
record Feature {
  var id: int;       // feature id
  var weight: real;  // feature weight
}

/*
  Holds the samples parsed from a text file of labelled feature vectors,
  together with counts of the total, positive and negative samples.

  The intended design is for the samples to be dealt out across processors:
  with N processors, the first would hold the 0th, Nth, 2Nth, ... samples,
  the second the 1st, (N+1)th, (2N+1)th, ... samples, and so forth. That
  distribution is not implemented yet; `read` currently loads every line of
  the file into this instance.

  Sample usage:
    var document = new Document();
    document.read("sample.dat");
    writeln(document.numTotal, " samples, ", document.numPos, " positive");
*/
class Document {

  // Index set of `samples`.
  var samplesDom: domain(1);
  // The samples held by this instance.
  var samples: [samplesDom] Sample;

  // numTotal: total number of samples read.
  // numPos:   number of samples whose class label is 1.
  // numNeg:   number of samples whose class label is -1.
  var numTotal, numPos, numNeg: int;

  // TODO: distribute the samples across processors.
  /*
    Reads samples from the file named by `filename` and appends them to
    `samples`, replacing whatever an earlier call had loaded.

    The file format should strictly be:
      label word-id:word-weight word-id:word-weight ...
      label word-id:word-weight word-id:word-weight ...
      ...
    Each non-blank line corresponds to one sample. Blank lines are skipped.

    Halts if `filename` is empty or if a label is neither 1 nor -1.

    Planned behaviour (not implemented yet): the samples are to be evenly
    distributed across all the processors. With processor ids 0, 1, ...,
    (N-1), processor 0 would read the 0th, Nth, 2Nth, ... samples from the
    file, processor 1 the 1st, (N+1)th, (2N+1)th, ... samples, and so forth.
  */
  proc read(filename: string) {
    // Start from a clean slate: the counters restart at zero, so the sample
    // array must be emptied too or the two would disagree after a re-read.
    this.samples.clear();
    this.numTotal = 0;
    this.numPos = 0;
    this.numNeg = 0;

    if filename.length == 0 then
      halt('Name required');

    var f = open(filename, iomode.r);

    // TODO: Parallelize IO
    for line in f.lines() {
      // A blank line (e.g. a trailing newline at the end of the file) has no
      // label field to parse, so skip it instead of failing on it.
      if line.strip().length == 0 then
        continue;

      const fields = line.split();
      var classLabel = fields[1]: int;

      // Increment positive/negative samples
      if classLabel == 1 then
        this.numPos += 1;
      else if classLabel == -1 then
        this.numNeg += 1;
      else
        halt('Unknown classLabel in this line: ', numTotal + 1, ' label: ', classLabel);

      var sample = new Sample(id=this.numTotal, classLabel=classLabel);

      // Extract the sample's features: every field after the label is an
      // "id:weight" pair.
      const kvPairs = fields[2..];
      for kvPair in kvPairs {
        const kv = kvPair.split(':');
        var feature = new Feature();
        feature.id = kv[1]: int;
        feature.weight = kv[2]: real;
        sample.features.push_back(feature);
        // Accumulate the squared two-norm as the weights are read.
        sample.twoNormSq += feature.weight * feature.weight;
      }
      samples.push_back(sample);
      this.numTotal += 1;
    }
    f.close();
  }

  // TODO (maybe not)
  /*
    Intended to give access to the localRowIndex'th sample held by this
    instance. Not implemented yet: the body is empty and nothing is returned.
    The design notes call for an "absent" result when localRowIndex is less
    than 0 or points to a non-existent position; how to express that is still
    open.

    Only one declaration is kept here: two procedures with this exact
    signature cannot be told apart at a call site.
  */
  proc getLocalSample(localRowIndex: int) {

  }

  // TODO:
  /*
    Intended to give access to the globalRowIndex'th sample. Not implemented
    yet: the body is empty. Per the design notes, an "absent" result is
    expected when any of the following holds:
      1. globalRowIndex is less than 0 or points to a non-existent position.
      2. The globalRowIndex'th sample is not assigned to this processor.
         (See the comment of method 'read'.)
  */
  proc getGlobalSample(globalRowIndex: int) { }

  // Removes all the samples held by this instance.
  proc destroy() {
    this.samples.clear();
  }

  // Returns the number of samples held by this instance.
  proc getLocalNumberRows {
    // `size` is a parenless method on arrays, so it is used without ().
    return this.samples.size;
  }

  // TODO
  // Intended to copy the class labels of the samples held by this instance
  // into the output argument 'labels', in the same order as the samples.
  // The caller is expected to supply an array large enough to hold them.
  // Not implemented yet.
  proc GetLocalLabels(labels: [] int) {

  }

  // TODO
  // The following methods are meant to encode a Sample to, or decode a Sample
  // from, a memory block used to transfer samples over the network. None of
  // them is implemented yet.

  // Intended to compute the size of the memory block needed to encode
  // 'sample'.
  proc getPackSize(sample: Sample) {

  }

  // Intended to pack 'sample' into 'buffer' and return the number of bytes
  // used. Per the design notes, a caller-supplied buffer must already have
  // the proper size; otherwise getPackSize determines how much memory is
  // needed and the method allocates it, leaving the caller responsible for
  // freeing it.
  proc packSample(buffer: string, sample: Sample) {

  }

  // Intended to decode a sample from 'buffer' into the output argument
  // 'sample' and return how many bytes were decoded. Per the design notes, a
  // new Sample is allocated when none is supplied, and it is the caller's
  // responsibility to free it.
  proc unpackSample(sample: Sample, buffer: string) {

  }

}
8 changes: 8 additions & 0 deletions src/psvm/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Default target: build both programs.
all: svmTrain svmPredict

# Each target below runs the Chapel compiler (chpl) on one source file.
# NOTE(review): neither target lists its .chpl source as a prerequisite, so
# make has no way to tell when a rebuild is needed - consider adding it.
svmTrain:
chpl svmTrain.chpl

svmPredict:
chpl svmPredict.chpl

13 changes: 5 additions & 8 deletions src/psvm/Model.chpl
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
use Document;
use Documents;
use Kernel;

class Model {
/* kernel stores kernel type, kernel parameter information,
and calculates kernel function accordingly.*/
/*
kernel stores kernel type, kernel parameter information,
and calculates kernel function accordingly.
*/
var kernel: Kernel; // TODO
/* The number of support vectors in all. */
var numTotalSV: int,
Expand All @@ -19,11 +21,6 @@ class Model {
their information. */
proc checkSupportVector(alpha, doc, ipmParameter) { }

proc supportVector() const ref { return this.supportVector; };

/* Setter/getter to kernel */
proc kernel() ref { return this.kernel; }

// TODO
/* Saves the model to the directory specified by str_directory. */
proc save(strDirectory, modelName) { }
Expand Down
Loading