decisionTree.dml
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# This script implements decision trees for recoded and binned categorical and
# numerical input features. We train a single CART (classification and
# regression tree) decision tree; depending on the provided labels y, it
# performs either classification (majority vote per leaf) or regression
# (average per leaf).
#
# .. code-block::
#
# For example, given a feature matrix with features [a,b,c,d]
# and the following tree, M would look as follows:
#
# (L1) |d<5|
# / \
# (L2) P1:2 |a<7|
# / \
# (L3) P2:2 P3:1
#
# --> M :=
# [[4, 5, 0, 2, 1, 7, 0, 0, 0, 0, 0, 2, 0, 1]]
# |(L1)| | (L2) | | (L3) |
#
#
#
# INPUT:
# ------------------------------------------------------------------------------
# X Feature matrix in recoded/binned representation
# y Label matrix in recoded/binned representation
# ctypes Row-Vector of column types [1 scale/ordinal, 2 categorical]
# of shape 1-by-(ncol(X)+1), where the last entry is the y type
# max_depth Maximum depth of the learned tree (stopping criterion)
# min_leaf Minimum number of samples in leaf nodes (stopping criterion),
# odd number recommended to avoid 50/50 leaf label decisions
# min_split Minimum number of samples in leaf for attempting a split
# max_features Parameter controlling the number of features used as split
# candidates at tree nodes: m = ceil(num_features^max_features)
# max_values Parameter controlling the number of values per feature used
# as split candidates: nb = ceil(num_values^max_values)
# max_dataratio Parameter in [0,1] controlling when to materialize data
# subsets of X and y on node splits. When set to 0, we always
# scan the original X and y, which has the benefit of avoiding
# the allocation and maintenance of data for all active nodes.
# When set to 0.01, we rematerialize whenever the sub-tree data
# would be less than 1% of the parent's last materialized data size.
# impurity Impurity measure: entropy, gini (default), rss (regression)
# seed Fixed seed for randomization of samples and split candidates
# verbose Flag indicating verbose debug output
# ------------------------------------------------------------------------------
#
# OUTPUT:
# ------------------------------------------------------------------------------
# M Matrix M containing the learned trees, in linearized form
# ------------------------------------------------------------------------------
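#
# Example (a minimal usage sketch, not part of the original script; the
# random data below is only assumed to satisfy the recoded/binned input
# format, i.e., positive integer values):
#
# .. code-block::
#
#   X = round(rand(rows=100, cols=4, min=1, max=5));  # 4 recoded features
#   y = round(rand(rows=100, cols=1, min=1, max=2));  # 2 classes
#   ctypes = matrix(1, rows=1, cols=5);               # scale/ordinal features
#   ctypes[1,5] = 2;                                  # categorical labels
#   M = m_decisionTree(X=X, y=y, ctypes=ctypes, max_depth=5, verbose=TRUE);
#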
m_decisionTree = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] ctypes,
Int max_depth = 10, Int min_leaf = 20, Int min_split = 50,
Double max_features = 0.5, Double max_values = 1.0, Double max_dataratio = 0.25,
String impurity = "gini", Int seed = -1, Boolean verbose = FALSE)
return(Matrix[Double] M)
{
t1 = time();
# validation checks
if( max_depth > 32 )
stop("decisionTree: invalid max_depth > 32: "+max_depth);
if( sum(X<=0) != 0 )
stop("decisionTree: feature matrix X is not properly recoded/binned (values <= 0): "+sum(X<=0));
if( sum(abs(X-round(X))>1e-14) != 0 )
stop("decisionTree: feature matrix X is not properly recoded/binned (non-integer): "+sum(abs(X-round(X))>1e-14));
if( sum(y<=0) != 0 )
stop("decisionTree: label vector y is not properly recoded/binned: "+sum(y<=0));
# initialize input data and basic statistics
# (we keep y2 and the indicators I in transposed form for sparsity exploitation)
m = nrow(X); n = ncol(X);
classify = (as.scalar(ctypes[1,n+1]) == 2);
fdom = max(colMaxs(X),2); # num distinct per feature
foffb = t(cumsum(t(fdom))) - fdom; # feature begin
foffe = t(cumsum(t(fdom))); # feature end
rix = matrix(seq(1,m)%*%matrix(1,1,n), m*n, 1);
cix = matrix(X + foffb, m*n, 1);
X2 = table(rix, cix, 1, m, as.scalar(foffe[,n]), FALSE); #one-hot encoded
y2 = table(seq(1,m), y);
cnt = colSums(X2);
I = matrix(1, rows=1, cols=nrow(X));
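# Example of the one-hot encoding (a small sketch, not from the script):
# for X = [[1,2],[2,1]] we get fdom=[2,2], foffb=[0,2], foffe=[2,4], and
# X2 = [[1,0,0,1],[0,1,1,0]], i.e., one column per distinct feature value.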
if( verbose ) {
print("decisionTree: initialize with max_depth=" + max_depth + ", max_features="
+ max_features +", max_dataratio=" + max_dataratio + ", impurity="
+ impurity + ", seed=" + seed + ".");
print("decisionTree: basic statistics:");
print("-- impurity: " + as.scalar(computeImpurity(y2, I, impurity)));
print("-- minFeatureCount: " + min(cnt));
print("-- maxFeatureCount: " + max(cnt));
}
# queue-based node splitting
M = matrix(0, rows=1, cols=2*(2^max_depth-1))
queue = list(list(1,I,X2,y2)); # node IDs / data indicators
maxPath = 1;
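# (each queue entry carries the node ID, its row-indicator vector, and the,
# possibly compacted, X2/y2 of the corresponding subtree)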
while( length(queue) > 0 ) {
# pop next node from queue for splitting
[queue, node0] = remove(queue, 1);
node = as.list(node0);
nID = as.scalar(node[1]);
nI = as.matrix(node[2]);
X2 = as.matrix(node[3]);
y2 = as.matrix(node[4]);
if(verbose)
print("decisionTree: attempting split of node "+nID+" ("+sum(nI)+" rows)");
# optional rematerialization of data per node
if( sum(nI) < max_dataratio*ncol(nI) ) {
if(verbose)
print("-- compacting data: "+ncol(nI)+" --> "+sum(nI));
X2 = removeEmpty(target=X2, margin="rows", select=t(nI));
y2 = removeEmpty(target=y2, margin="rows", select=t(nI));
nI = matrix(1, rows=1, cols=nrow(X2));
}
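# (e.g., with max_dataratio=0.25, a node covering fewer than 25% of the
# currently materialized rows proceeds on compacted copies of X2 and y2)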
# find best split attribute
nSeed = ifelse(seed==-1, seed, seed*nID);
[f, v, IDleft, Ileft, IDright, Iright] = findBestSplit(
X2, y2, foffb, foffe, nID, nI, min_leaf, max_features, max_values, impurity, nSeed);
validSplit = sum(Ileft) >= min_leaf & sum(Iright) >= min_leaf;
if(verbose)
print("-- best split: f"+f+" <= "+v+" --> valid="+validSplit);
if( validSplit )
M[, 2*nID-1:2*nID] = t(as.matrix(list(f,v)));
else
M[, 2*nID] = computeLeafLabel(y2, nI, classify, verbose);
maxPath = max(maxPath, floor(log(nID,2)+1));
# split data, finalize or recurse
if( validSplit ) {
if( sum(Ileft) >= min_split & floor(log(IDleft,2))+2 < max_depth )
queue = append(queue, list(IDleft,Ileft,X2,y2));
else
M[,2*IDleft] = computeLeafLabel(y2, Ileft, classify, verbose)
if( sum(Iright) >= min_split & floor(log(IDright,2))+2 < max_depth )
queue = append(queue, list(IDright,Iright,X2,y2));
else
M[,2*IDright] = computeLeafLabel(y2, Iright, classify, verbose)
maxPath = max(maxPath, floor(log(IDleft,2)+1));
}
}
# summary and encoding
M = M[1, 1:2*(2^maxPath-1)];
if(verbose) {
print("decisionTree: final constructed tree (linearized):");
print("--" + toString(M));
}
}
findBestSplit = function(Matrix[Double] X2, Matrix[Double] y2, Matrix[Double] foffb, Matrix[Double] foffe,
Int ID, Matrix[Double] I, Int min_leaf, Double max_features, Double max_values, String impurity, Int seed)
return(Int f, Int v, Int IDleft, Matrix[Double] Ileft, Int IDright, Matrix[Double] Iright)
{
# sample features iff max_features < 1
n = ncol(foffb);
numI = sum(I);
feat = seq(1,n);
if( max_features < 1.0 ) {
rI = rand(rows=n, cols=1, seed=seed) <= (n^max_features/n);
feat = removeEmpty(target=feat, margin="rows", select=rI);
if( sum(feat) == 0 ) #sample at least one
feat[1,1] = round(rand(rows=1, cols=1, min=1, max=n));
}
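# (each feature is kept with probability n^max_features / n, i.e., about
# num_features^max_features features are selected in expectation)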
# evaluate features and feature splits
# (both categorical and numerical are treated similarly by
# finding a cutoff point in the recoded/binned representation)
R = matrix(0, rows=3, cols=nrow(feat));
parfor( i in 1:nrow(feat) ) {
f = as.scalar(feat[i]); # feature
beg = as.scalar(foffb[1,f])+1; # feature start in X2
end = as.scalar(foffe[1,f]); # feature end in X2
belen = end-beg; #numFeat - 1
while(FALSE){} # empty loop cuts the block so beg/end become known scalars at recompile time
# construct 0/1 predicate vectors with <= semantics
# find rows that match at least one value and appear in I
# (vectorized evaluation, each column in P is a split candidate)
fP = upper.tri(target=matrix(1,belen,belen), diag=TRUE);
vI = seq(1,belen);
if( max_values < 1.0 & ncol(fP)>10 ) {
rI2 = rand(rows=ncol(fP),cols=1,seed=seed) <= (ncol(fP)^max_values/ncol(fP));
fP = removeEmpty(target=fP, margin="cols", select=t(rI2));
vI = removeEmpty(target=vI, margin="rows", select=rI2);
}
P = matrix(0, ncol(X2), ncol(fP));
P[beg:end-1,1:ncol(fP)] = fP;
Ileft = (t(X2 %*% P) * I) != 0;
Iright = (Ileft==0) * I;
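# Example (sketch): for a feature with 4 distinct values, belen=3 and
# fP = [[1,1,1],[0,1,1],[0,0,1]]; column j encodes the predicate
# "recoded value <= j", so X2 %*% P evaluates all cut points at once.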
# compute information gain for all split candidates
ig = as.scalar(computeImpurity(y2, I, impurity))
- rowSums(Ileft)/numI * computeImpurity(y2, Ileft, impurity)
- rowSums(Iright)/numI * computeImpurity(y2, Iright, impurity);
ig = replace(target=ig, pattern=NaN, replacement=0);
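# (per candidate c: ig_c = imp(I) - |L_c|/|I| * imp(L_c) - |R_c|/|I| * imp(R_c))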
# track best split value and index, incl validity
valid = (rowSums(Ileft)>=min_leaf) & (rowSums(Iright)>=min_leaf);
bestig = max(valid*ig);
bestv = ifelse(bestig>0, nrow(valid)-as.scalar(rowIndexMax(t(rev(valid*ig))))+beg, -1);
if( bestv >= 0 )
bestv = as.scalar(vI[bestv-beg+1,1])+beg-1;
R[,i] = as.matrix(list(f, bestig, bestv));
}
ix = as.scalar(rowIndexMax(R[2,]));
# extract indicators and IDs
IDleft = 2 * ID;
IDright= 2 * ID + 1;
f = as.integer(as.scalar(feat[ix,1]));
beg = as.scalar(foffb[1,f]);
v = as.integer(as.scalar(R[3,ix])-beg);
if( max(R[2,]) > 0 ) {
p = table(seq(beg+1, beg+v), 1, ncol(X2), 1);
Ileft = (t(X2 %*% p) * I) != 0;
Iright = I * (Ileft==0);
}
else { # no information gain
Ileft = as.matrix(0);
Iright = as.matrix(0);
}
}
computeImpurity = function(Matrix[Double] y2, Matrix[Double] I, String impurity)
return(Matrix[Double] score)
{
f = (I %*% y2) / rowSums(I); # rel. freq. per category/bin
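# With f the row-wise relative class frequencies, the measures are:
#   gini:    1 - sum_c f_c^2
#   entropy: -sum_c f_c * log(f_c)   (natural log)
#   rss:     sum_{i in I} (yhat - y_i)^2 with yhat = sum_c c * f_c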
score = matrix(0, nrow(I), 1);
if( impurity == "gini" )
score = 1 - rowSums(f^2); # sum(f*(1-f));
else if( impurity == "entropy" )
score = rowSums(-f * log(f));
else if( impurity == "rss" ) { # residual sum of squares
yhat = f %*% seq(1,ncol(f)); # weighted mean label per row
res = outer(yhat, t(rowIndexMax(y2)), "-"); # yhat-y
score = rowSums((I * res)^2); # sum((yhat-y)^2)
}
else
stop("decisionTree: unsupported impurity measure: "+impurity);
}
computeLeafLabel = function(Matrix[Double] y2, Matrix[Double] I, Boolean classify, Boolean verbose)
return(Double label)
{
f = (I %*% y2) / sum(I);
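# Example (sketch): for class frequencies f = [0.25, 0.75], classification
# yields label 2 (majority vote), regression 1*0.25 + 2*0.75 = 1.75.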
label = as.scalar(ifelse(classify,
rowIndexMax(f), f %*% seq(1,ncol(f))));
if(verbose)
print("-- leaf node label: " + label +" ("+sum(I)*max(f)+"/"+sum(I)+")");
}