pgrew · ben-albrecht · Sep 6, 2017 · Aug 19, 2017 · Aug 19, 2017 · Aug 21, 2017
diff --git a/datasets/.gitignore b/datasets/.gitignore
@@ -2,3 +2,4 @@ covtype*
 news20*
 rcv1*
 mushrooms
+splice*
diff --git a/datasets/Makefile b/datasets/Makefile
@@ -1,13 +1,35 @@
-all: covtype.libsvm.binary news20.binary rcv1_train.binary rcv1_test.binary mushrooms
+all: covtype news20 rcv1 mushrooms splice
+
+# Targets that are command names or aliases rather than files to be created.
+# (mushrooms and splice are real downloaded files, so they are not listed.)
+.PHONY: all help covtype news20 rcv1
+
+# Prints every target that carries a trailing double-hash description,
+# together with that description. Written for plain /bin/sh (make's default
+# shell), so it does not depend on bash arrays.
+help:
+	@grep -Fh "##" $(MAKEFILE_LIST) | grep -Fv "grep -F" | \
+	    awk -F'#+' '{ sub(/\\$$/, ""); gsub(/^ +| +$$/, "", $$1); gsub(/^ +| +$$/, "", $$2); printf "%-30s %s\n", $$1, $$2 }'
+
+covtype: ## covtype
+covtype: covtype.libsvm.binary
 
 covtype.libsvm.binary:
 	wget https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/covtype.libsvm.binary.bz2
 	bzip2 -df covtype.libsvm.binary.bz2
 
+news20: ## news20
+news20: news20.binary
+
 news20.binary:
 	wget https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/news20.binary.bz2
 	bzip2 -df news20.binary.bz2
 
+rcv1: ## rcv1
+rcv1: rcv1_train.binary rcv1_test.binary
+
 rcv1_train.binary:
 	wget https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/rcv1_train.binary.bz2
 	bzip2 -df rcv1_train.binary.bz2
@@ -16,5 +38,12 @@ rcv1_test.binary:
 	wget https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/rcv1_test.binary.bz2
 	bzip2 -df rcv1_test.binary.bz2
 
-mushrooms:
+mushrooms: ## mushrooms
 	wget https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/mushrooms
+
+splice: ## splice
+splice: splice.t
+	wget https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/splice
+
+splice.t:
+	wget https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/splice.t
diff --git a/datasets/README.md b/datasets/README.md
@@ -5,7 +5,13 @@ datasets by executing `make` in this directory. You can also download
 specific datasets by name, e.g.
 
 ```bash
-make covtype.libsvm.binary
+make covtype
+```
+
+To see the available datasets, use:
+
+```bash
+make help
 ```
 
 

diff --git a/src/psvm/.gitignore b/src/psvm/.gitignore
@@ -0,0 +1,5 @@
+SvmPredict
+SvmTrain
+PredictResult
+psvm
+
diff --git a/src/psvm/Document.chpl b/src/psvm/Document.chpl
diff --git a/src/psvm/Documents.chpl b/src/psvm/Documents.chpl
@@ -0,0 +1,168 @@
+/* A document sample: its document id, its class label, the square of its
+   two norm, and all of its features. */
+record Sample {
+  // Document id.
+  var id: int;
+  // Class label.
+  var classLabel: int;
+  // Square of the two norm.
+  var twoNormSq: real;
+  // Index set of the features, followed by the features themselves.
+  var featureDom: domain(1);
+  var features: [featureDom] Feature;
+}
+
+/* A single feature of a sample: the feature's id and its weight. */
+record Feature {
+  var id: int;       // feature id
+  var weight: real;  // feature weight
+}
+
+/*
+  Reads samples according to processor id and provides methods for accessing
+  them. Suppose there are N processors, the first processor will read the 0th,
+  Nth, 2Nth, ... samples, the second processor will read the first, (N+1)th,
+  (2N+1)th, ... samples, and so forth. Sample usage:
+     Document document();
+     document.Read("sample.dat");
+     const Sample* sample = document.GetLocalSample(0);
+     const Feature& feature = sample.features[0];
+
+ */
+class Document {
+
+  // Sample domain
+  var samplesDom: domain(1);
+  // Stores the samples assigned to this processor.
+  var samples: [samplesDom] Sample;
+
+  // keeps track of the total number of samples.
+  // keeps track of the total number of positive samples.
+  // keeps track of the total number of negative samples.
+  var numTotal, numPos, numNeg: int;
+
+  /*
+     Reads samples from the file specified by filename. Any samples loaded by
+     an earlier call are discarded first, so samples, numTotal, numPos and
+     numNeg always describe the same file. The file format should strictly be:
+        label word-id:word-weight word-id:word-weight ...
+        label word-id:word-weight word-id:word-weight ...
+        ...
+     where label is 1 or -1. Each non-blank line in the file corresponds to
+     one sample; blank lines are skipped.
+
+     Nothing is returned. The program halts if filename is empty, if a label
+     is neither 1 nor -1, or if a feature is not a single
+     word-id:word-weight pair.
+
+     TODO: The samples should be evenly distributed across all the
+     processors. Suppose there are N processors, with processor ids
+     0, 1, ..., (N-1). Then processor 0 should read the 0th, Nth, 2Nth, ...
+     samples from the file, processor 1 should read the first, (N+1)th,
+     (2N+1)th, ... samples from the file, and so forth. For now, every line
+     of the file is read by this one loop.
+   */
+  proc read(filename: string) {
+    if filename.length == 0 then
+      halt('Name required');
+
+    // Reset the samples together with the counters; otherwise a second call
+    // would append to the old samples while restarting the ids at 0.
+    this.samples.clear();
+    this.numTotal = 0;
+    this.numPos = 0;
+    this.numNeg = 0;
+
+    var f = open(filename, iomode.r);
+
+    // Position in the file, used only for error messages.
+    var lineNumber = 0;
+
+    // TODO: Parallelize IO
+    for line in f.lines() {
+      lineNumber += 1;
+
+      const fields = line.split();
+      // A blank (or whitespace-only) line carries no sample; indexing
+      // fields[1] on it would be out of bounds.
+      if fields.size == 0 then
+        continue;
+
+      const classLabel = fields[1]: int;
+
+      // Increment positive/negative samples
+      if classLabel == 1 then
+        this.numPos += 1;
+      else if classLabel == -1 then
+        this.numNeg += 1;
+      else
+        halt('Unknown classLabel in this line: ', lineNumber, ' label: ', classLabel);
+
+      var sample = new Sample(id=this.numTotal, classLabel=classLabel);
+
+      // Extract sample's features; everything after the label is an
+      // id:weight pair.
+      for kvPair in fields[2..] {
+        const kv = kvPair.split(':');
+        // Reject anything that is not exactly id:weight instead of indexing
+        // past the end of kv.
+        if kv.size != 2 then
+          halt('Malformed feature in this line: ', lineNumber, ' feature: ', kvPair);
+
+        const featureId = kv[1]: int;
+        const featureWeight = kv[2]: real;
+        sample.features.push_back(new Feature(id=featureId, weight=featureWeight));
+        sample.twoNormSq += featureWeight * featureWeight;
+      }
+      samples.push_back(sample);
+      this.numTotal += 1;
+    }
+    f.close();
+  }
+
+  // TODO (maybe not)
+  /*
+     Not yet implemented: the body is empty. Intended behavior:
+
+     Returns a const pointer to the local_row_index'th sample. But if
+     local_row_index is less than 0 or points to a non-existent position, NULL
+     will be returned.
+
+     Only one definition is kept: two procedures with this exact signature
+     would make every call to getLocalSample ambiguous.
+   */
+  proc getLocalSample(localRowIndex: int) {
+
+  }
+
+  // TODO: not yet implemented (the body is empty).
+  /*
+     Returns a const pointer to the global_row_index'th sample. But when any of
+     the following conditions is satisfied, NULL will be returned:
+        1. global_row_index is less than 0 or points to a non-existent
+           position.
+        2. The global_row_index'th sample is not assigned to this processor.
+           (See comment of method 'read')
+   */
+  proc getGlobalSample(globalRowIndex: int) { }
+
+  // Discards the samples assigned to this processor by clearing the array.
+  proc destroy() {
+    samples.clear();
+  }
+
+  // Returns the number of the samples assigned to this processor.
+  // 'size' on an array is a paren-less method, so it is read without ().
+  proc getLocalNumberRows {
+    return this.samples.size;
+  }
+
+  // TODO: not yet implemented (the body is empty).
+  // Copies the labels of the samples assigned to this processor to the array
+  // specified by the output parameter 'labels'. The class labels will be
+  // stored in the same order as the samples. It is the caller's responsibility
+  // to allocate enough memory for the labels.
+  proc GetLocalLabels(labels: [] int) {
+
+  }
+
+  // TODO: getPackSize, packSample and unpackSample have empty bodies.
+  // They are meant to encode a Sample to, or decode a Sample from, a memory
+  // block, which is used to transfer the Sample over the network.
+
+  // Computes the size of the memory block needed to encode 'sample'.
+  proc getPackSize(sample: Sample) {
+
+  }
+
+  // Not yet implemented (empty body). Intended to pack a Sample into 'buffer'.
+  // If buffer != NULL, it should be a pre-allocated memory block with the
+  // proper block size. Otherwise, this method will use getPackSize to
+  // determine how much memory is needed and then allocate enough memory to
+  // hold it. It is the caller's responsibility to free the memory. The return
+  // value is the number of bytes used in buffer.
+  proc packSample(buffer: string, sample: Sample) {
+
+  }
+
+  // Not yet implemented (empty body). Intended to decode a sample from the
+  // memory block pointed to by 'buffer'. If 'sample' is NULL, the method will
+  // allocate a new Sample. On return, the decoded Sample is put in the output
+  // parameter 'sample'. It's the caller's responsibility to free the memory.
+  // The method returns how many bytes are decoded from 'buffer'.
+  proc unpackSample(sample: Sample, buffer: string) {
+
+  }
+
+
+}
diff --git a/src/psvm/Makefile b/src/psvm/Makefile
@@ -0,0 +1,8 @@
+# 'all' names no file; it only groups the two executables.
+.PHONY: all
+all: svmTrain svmPredict
+
+# Each executable depends on every Chapel source in this directory, so an
+# edit to its main file, or to any module it may use, triggers a rebuild.
+# -o pins the executable name to the target name.
+svmTrain: $(wildcard *.chpl)
+	chpl -o $@ svmTrain.chpl
+
+svmPredict: $(wildcard *.chpl)
+	chpl -o $@ svmPredict.chpl
+
diff --git a/src/psvm/Model.chpl b/src/psvm/Model.chpl
@@ -1,9 +1,11 @@
-use Document;
+use Documents;
 use Kernel;
 
 class Model {
-  /* kernel stores kernel type, kernel parameter information,
-     and calculates kernel function accordingly.*/
+  /*
+     kernel stores kernel type, kernel parameter information,
+     and calculates kernel function accordingly.
+   */
   var kernel: Kernel; // TODO
   /* The number of support vectors in all. */
   var numTotalSV: int,
@@ -19,11 +21,6 @@ class Model {
      their information. */
   proc checkSupportVector(alpha, doc, ipmParameter) { }
 
-  proc supportVector() const ref { return this.supportVector; };
-
-  /* Setter/getter to kernel */
-  proc kernel() ref { return this.kernel; }
-
   // TODO
   /* Saves the model to the directory specified by str_directory. */
   proc save(strDirectory, modelName) { }
-Original file line number
+Diff line change
@@ Expand Up / @@ -2,3 +2,4 @@ covtype* @@
     news20*
     rcv1*
     mushrooms
+    splice*