Skip to content

Commit

Permalink
corrected Imputer
Browse files Browse the repository at this point in the history
  • Loading branch information
pascal committed May 31, 2018
1 parent a86798b commit c4fed38
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 20 deletions.
3 changes: 3 additions & 0 deletions preprocessing/data.go
Original file line number Diff line number Diff line change
Expand Up @@ -771,6 +771,9 @@ func (m *Shuffler) InverseTransform(X, Y *mat.Dense) (Xout, Yout *mat.Dense) {
// Binarizer binarizes data: every feature value is set to 0 or 1 according
// to Threshold.
type Binarizer struct {
	// Threshold is the cut-off used when binarizing. Its zero value is 0.
	Threshold float64
}

// NewBinarizer returns a pointer to a Binarizer whose Threshold is left at
// its zero value; callers may set Threshold afterwards.
func NewBinarizer() *Binarizer {
	return new(Binarizer)
}

// Fit for binarizer does nothing
func (m *Binarizer) Fit(X, Y *mat.Dense) Transformer {
return m
Expand Down
22 changes: 22 additions & 0 deletions preprocessing/data_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -252,3 +252,25 @@ func ExampleMaxAbsScaler() {
// ⎣ 3 -4 0⎦

}

func ExampleBinarizer() {
	// Data adapted from http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Binarizer.html#sklearn.preprocessing.Binarizer
	samples := mat.NewDense(3, 3, []float64{
		1, -1, 2,
		2, 0, 0,
		0, 1, -1,
	})
	b := NewBinarizer()
	b.Fit(samples, nil) // Fit is a no-op for a Binarizer.

	// First pass: Threshold is still the zero value set by NewBinarizer.
	out, _ := b.Transform(samples, nil)
	fmt.Println(mat.Formatted(out))

	// Second pass: the threshold can be changed between calls to Transform.
	b.Threshold = 1.1
	out, _ = b.Transform(samples, nil)
	fmt.Println(mat.Formatted(out))
	// Output:
	// ⎡1 0 1⎤
	// ⎢1 0 0⎥
	// ⎣0 1 0⎦
	// ⎡0 0 1⎤
	// ⎢1 0 0⎥
	// ⎣0 0 0⎦
}
46 changes: 31 additions & 15 deletions preprocessing/imputation.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,20 @@ import (
"gonum.org/v1/gonum/stat"
)

// Inputer ...
// Imputer replaces NaN entries of each column with a per-column statistic computed by Fit.
// Strategy is mean|median|most_frequent; it defaults to mean.
type Inputer struct{ Strategy string }

// NewInputer ...
func NewInputer() *Inputer { return &Inputer{} }
type Imputer struct {
// Strategy names the statistic used as the replacement value:
// "mean", "median" or "most_frequent". Fit falls back to the mean in its
// default branch, so an empty Strategy presumably means "mean".
Strategy string
// MissingValues holds one replacement value per column. Fit allocates it
// with the column count of its input and stores each column's statistic;
// Transform reads MissingValues[i] for column i.
MissingValues []float64
}

// Fit for Inputer ...
func (m *Inputer) Fit(X, Y *mat.Dense) base.Transformer { return m }
// NewImputer returns a pointer to a zero-valued Imputer: Strategy is empty
// and MissingValues is nil until Fit populates it.
func NewImputer() *Imputer {
	return new(Imputer)
}

// Transform for Inputer ...
func (m *Inputer) Transform(X, Y *mat.Dense) (Xout, Yout *mat.Dense) {
// Fit for Imputer ...
func (m *Imputer) Fit(X, Y *mat.Dense) base.Transformer {
Xmat := X.RawMatrix()
Xout, Yout = mat.NewDense(Xmat.Rows, Xmat.Cols, nil), Y
Xmat, Xoutmat := X.RawMatrix(), Xout.RawMatrix()
m.MissingValues = make([]float64, Xmat.Cols, Xmat.Cols)
base.Parallelize(-1, Xmat.Cols, func(th, start, end int) {
tmp := make([]float64, Xmat.Rows, Xmat.Rows)
var def, v float64
Expand All @@ -45,6 +44,23 @@ func (m *Inputer) Transform(X, Y *mat.Dense) (Xout, Yout *mat.Dense) {
default:
def = stat.Mean(tmp, nil)
}
m.MissingValues[i] = def
}

})
return m
}

// Transform for Imputer ...
func (m *Imputer) Transform(X, Y *mat.Dense) (Xout, Yout *mat.Dense) {
Xmat := X.RawMatrix()
Xout, Yout = mat.NewDense(Xmat.Rows, Xmat.Cols, nil), Y
Xmat, Xoutmat := X.RawMatrix(), Xout.RawMatrix()
base.Parallelize(-1, Xmat.Cols, func(th, start, end int) {
var v, def float64

for i := start; i < end; i++ {
def = m.MissingValues[i]
for jX, jXout := 0, 0; jX < Xmat.Rows*Xmat.Stride; jX, jXout = jX+Xmat.Stride, jXout+Xoutmat.Stride {
v = Xmat.Data[jX+i]
if math.IsNaN(v) {
Expand All @@ -59,14 +75,14 @@ func (m *Inputer) Transform(X, Y *mat.Dense) (Xout, Yout *mat.Dense) {
return
}

// FitTransform for Inputer ...
func (m *Inputer) FitTransform(X, Y *mat.Dense) (Xout, Yout *mat.Dense) {
// FitTransform fits the Imputer on X and Y, then applies Transform to the
// same X and Y and returns the result of that call.
func (m *Imputer) FitTransform(X, Y *mat.Dense) (Xout, Yout *mat.Dense) {
	fitted := m.Fit(X, Y)
	return fitted.Transform(X, Y)
}

// InverseTransform for Inputer ...
func (m *Inputer) InverseTransform(X, Y *mat.Dense) (Xout, Yout *mat.Dense) {
// InverseTransform returns X and Y unchanged: the imputation is not undone,
// and the returned matrices are the very pointers that were passed in.
func (m *Imputer) InverseTransform(X, Y *mat.Dense) (Xout, Yout *mat.Dense) {
	return X, Y
}
21 changes: 16 additions & 5 deletions preprocessing/imputation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,24 @@ import (
"gonum.org/v1/gonum/mat"
)

func ExampleInputer() {
X := mat.NewDense(5, 2, []float64{1, 2, 3, 4, math.NaN(), 6, 7, 8, 7, 10})
func ExampleImputer() {
var nan = math.NaN()
X := mat.NewDense(5, 2, []float64{1, 2, 3, 4, nan, 6, 7, 8, 7, 10})
fmt.Println("replacing X.At(2,0) with...")
for _, s := range []string{"mean", "median", "most_frequent"} {
X1, _ := (&Inputer{Strategy: s}).Transform(X, nil)

X1, _ := (&Imputer{Strategy: s}).FitTransform(X, nil)
fmt.Printf("%s\n%g\n", s, mat.Formatted(X1))

}
// additional example adapted from http://scikit-learn.org/stable/modules/preprocessing.html#imputation-of-missing-values
imp := NewImputer()
imp.Fit(mat.NewDense(3, 2, []float64{1, 2, nan, 3, 7, 6}), nil)
X = mat.NewDense(3, 2, []float64{nan, 2, 6, nan, 7, 6})
X1, _ := imp.Transform(X, nil)
fmt.Printf("imputation-of-missing-values:\n%g\n", mat.Formatted(X1))
// Output:
// replacing X.At(2,0) with...
// replacing X.At(2,0) with...
// mean
// ⎡ 1 2⎤
// ⎢ 3 4⎥
Expand All @@ -35,5 +43,8 @@ func ExampleInputer() {
// ⎢ 7 6⎥
// ⎢ 7 8⎥
// ⎣ 7 10⎦

// imputation-of-missing-values:
// ⎡ 4 2⎤
// ⎢ 6 3.6666666666666665⎥
// ⎣ 7 6⎦
}

0 comments on commit c4fed38

Please sign in to comment.