diff --git a/.travis.yml b/.travis.yml index f252f85f..a52606e5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -26,6 +26,7 @@ install: # grab latest embedpy - if [[ "x$QLIC_KC" != "x" ]]; then echo -n $QLIC_KC |base64 --decode > q/kc.lic; + pip install --upgrade pip; pip -q install -r requirements.txt; fi beforescript: @@ -40,7 +41,7 @@ script: - echo "Packaged as ml_$TRAVIS_OS_NAME-$TRAVIS_BRANCH.zip" - if [[ "x$QLIC_KC" != "x" ]]; then curl -fsSL -o test.q https://github.com/KxSystems/embedpy/raw/master/test.q; - q test.q fresh/tests/ util/tests/ xval/tests clust/tests/ graph/tests/ timeseries/tests/ optimize/tests/ -q; + bash tests/testFiles.bat; else echo No kdb+, no tests; diff --git a/build/test.bat b/build/test.bat index 661d593d..8a9d1615 100644 --- a/build/test.bat +++ b/build/test.bat @@ -2,5 +2,5 @@ if defined QLIC_KC ( pip -q install -r requirements.txt echo getting test.q from embedpy curl -fsSL -o test.q https://github.com/KxSystems/embedpy/raw/master/test.q - q test.q fresh/tests/ util/tests/ xval/tests/ clust/tests/ graph/tests/ timeseries/tests/ optimize/tests/ -q + call "tests\testFiles.bat" ) diff --git a/clust/README.md b/clust/README.md index db4e1a39..b1c2fd9e 100644 --- a/clust/README.md +++ b/clust/README.md @@ -43,6 +43,6 @@ Documentation is available on the [clustering](https://code.kx.com/v2/ml/toolkit ## Status -The clustering library is still in development and is available here as a beta release. Further functionality and improvements will be made to the library in the coming months. +The clustering library is still in development. Further functionality and improvements will be made to the library on an ongoing basis. If you have any issues, questions or suggestions, please write to ai@kx.com. 
diff --git a/clust/aprop.q b/clust/aprop.q index 1fc46dce..4b4c1f78 100644 --- a/clust/aprop.q +++ b/clust/aprop.q @@ -1,197 +1,61 @@ -\d .ml +// clust/aprop.q - Affinity propagation +// Copyright (c) 2021 Kx Systems Inc +// +// Clustering using affinity propagation. +// Affinity Propagation groups data based on the similarity +// between points and subsequently finds exemplars, which best +// represent the points in each cluster. The algorithm does +// not require the number of clusters be provided at run time, +// but determines the optimum solution by exchanging real-valued +// messages between points until a high-valued set of exemplars +// is produced. -// Affinity Propagation +\d .ml // @kind function // @category clust -// @fileoverview Fit affinity propagation algorithm -// @param data {float[][]} Data in matrix format, each column is an individual datapoint -// @param df {symbol} Distance function name within '.ml.clust.df' -// @param dmp {float} Damping coefficient -// @param diag {func} Function applied to the similarity matrix diagonal -// @param iter {dict} Max number of overall iterations and iterations -// without a change in clusters. (::) can be passed in which case the defaults -// of (`total`nochange!200 15) will be used -// @return {dict} Data, input variables, clusters and exemplars -// (`data`inputs`clt`exemplars) required for the predict method -clust.ap.fit:{[data;df;dmp;diag;iter] +// @desc Fit affinity propagation algorithm +// @param data {float[][]} Each column of the data is an individual datapoint +// @param df {symbol} Distance function name within '.ml.clust.df' +// @param damp {float} Damping coefficient +// @param diag {fn} Function applied to the similarity matrix diagonal +// @param iter {dictionary} Max number of overall iterations and iterations +// without a change in clusters. 
(::) can be passed in which case the +// defaults of (`total`noChange!200 15) will be used +// @return {dictionary} Data, input variables, clusters and exemplars +// (`data`inputs`clust`exemplars) required, along with a projection of the +// predict function +clust.ap.fit:{[data;df;damp;diag;iter] data:clust.i.floatConversion[data]; - defaultDict:`run`total`nochange!0 200 15; + defaultDict:`run`total`noChange!0 200 15; if[iter~(::);iter:()!()]; if[99h<>type iter;'"iter must be (::) or a dictionary"]; - // update iteration dictionary with user changes + // Update iteration dictionary with user changes updDict:defaultDict,iter; - // cluster data using AP algo - clust.i.runap[data;df;dmp;diag;til count data 0;updDict] + // Cluster data using AP algo + modelInfo:clust.i.runAp[data;df;damp;diag;til count data 0;updDict]; + returnInfo:enlist[`modelInfo]!enlist modelInfo; + predictFunc:clust.ap.predict returnInfo; + returnInfo,enlist[`predict]!enlist predictFunc } // @kind function // @category clust -// @fileoverview Predict clusters using AP config -// @param data {float[][]} Data in matrix format, each column is an individual datapoint -// @param cfg {dict} `data`inputs`clt`exemplars returned by clust.ap.fit -// @return {long[]} List of predicted clusters -clust.ap.predict:{[data;cfg] +// @desc Predict clusters using AP config +// @param config {dictionary} `data`inputs`clust`exemplars returned by the +// modelInfo key from the return of clust.ap.fit +// @param data {float[][]} Each column of the data is an individual datapoint +// @return {long[]} Predicted clusters +clust.ap.predict:{[config;data] + config:config`modelInfo; data:clust.i.floatConversion[data]; - if[-1~first cfg`clt; - '"'.ml.clust.ap.fit' did not converge, all clusters returned -1. 
Cannot predict new data."]; - // retrieve cluster centres from training data - ex:cfg[`data][;distinct cfg`exemplars]; - // predict testing data clusters - clust.i.appreddist[ex;cfg[`inputs]`df]each$[0h=type data;flip;enlist]data - } - - -// Utilities - -// @kind function -// @category private -// @fileoverview Run affinity propagation algorithm -// @param data {float[][]} Data in matrix format, each column is an individual datapoint -// @param df {symbol} Distance function name within '.ml.clust.df' -// @param dmp {float} Damping coefficient -// @param diag {func} Function applied to the similarity matrix diagonal -// @param idxs {long[]} List of indicies to find distances for -// @param iter {dict} Max number of overall iterations and iterations -// without a change in clusters. (::) can be passed in where the defaults -// of (`total`nochange!200 15) will be used -// @return {long[]} List of clusters -clust.i.runap:{[data;df;dmp;diag;idxs;iter] - // check negative euclidean distance has been given - if[df<>`nege2dist;clust.i.err.ap[]]; - // calculate distances, availability and responsibility - info0:clust.i.apinit[data;df;diag;idxs]; - // initialize exemplar matrix and convergence boolean - info0,:`emat`conv`iter!((count data 0;iter`nochange)#0b;0b;iter); - // run ap algo until maximum number of iterations completed or convergence - info1:clust.i.apstop clust.i.apalgo[dmp]/info0; - // return data, inputs, clusters and exemplars - inputs:`df`dmp`diag`iter!(df;dmp;diag;iter); - exemplars:info1`exemplars; - clt:$[info1`conv;clust.i.reindex exemplars;count[data 0]#-1]; - `data`inputs`clt`exemplars!(data;inputs;clt;exemplars) - } - -// @kind function -// @category private -// @fileoverview Initialize matrices -// @param data {float[][]} Data in matrix format, each column is an individual datapoint -// @param df {symbol} Distance function name within '.ml.clust.df' -// @param diag {func} Function applied to the similarity matrix diagonal -// @param idxs {long[]} List 
of point indices -// @return {dict} Similarity, availability and responsibility matrices -// and keys for matches and exemplars to be filled during further iterations -clust.i.apinit:{[data;df;diag;idxs] - // calculate similarity matrix values - s:clust.i.dists[data;df;data]each idxs; - // update diagonal - s:@[;;:;diag raze s]'[s;k:til n:count data 0]; - // create lists/matrices of zeros for other variables - `matches`exemplars`s`a`r!(0;0#0;s),(2;n;n)#0f - } - -// @kind function -// @category private -// @fileoverview Run affinity propagation algorithm -// @param dmp {float} Damping coefficient -// @param info {dict} Similarity, availability, responsibility, exemplars, -// matches, iter dictionary, no_conv boolean and iter dict -// @return {dict} Updated inputs -clust.i.apalgo:{[dmp;info] - // update responsibility matrix - info[`r]:clust.i.updr[dmp;info]; - // update availability matrix - info[`a]:clust.i.upda[dmp;info]; - // find new exemplars - ex:imax each sum info`a`r; - // update `info` with new exemplars/matches - info:update exemplars:ex,matches:?[exemplars~ex;matches+1;0]from info; - // update iter dictionary - .[clust.i.apconv info;(`iter;`run);+[1]] - } - -// @kind function -// @category private -// @fileoverview Check affinity propagation algorithm for convergence -// @param info {dict} Similarity, availability, responsibility, exemplars, -// matches, iter dictionary, no_conv boolean and iter dict -// @return {dict} Updated info dictionary -clust.i.apconv:{[info] - // iteration dictionary - iter:info`iter; - // exemplar matrix - emat:info`emat; - // existing exemplars - ediag:0sum(se=iter`nochange)+0=se:sum each emat; - conv:$[(iter[`total]=iter`run)|not[unconv]&sum[ediag]>0;1b;0b]]; - // return updated info - info,`emat`conv!(emat;conv) - } - -// @kind function -// @category private -// @fileoverview Retrieve diagonal from a square matrix -// @param m {any[][]} Square matrix -// @return {any[]} Matrix diagonal -clust.i.diag:{[m] - {x y}'[m;til count 
m] - } - -// @kind function -// @category private -// @fileoverview Update responsibility matrix -// @param dmp {float} Damping coefficient -// @param info {dict} Similarity, availability, responsibility, exemplars, -// matches, iter dictionary, no_conv boolean and iter dict -// @return {float[][]} Updated responsibility matrix -clust.i.updr:{[dmp;info] - // create matrix with every points max responsibility - // diagonal becomes -inf, current max is becomes second max - mxresp:{[x;i]@[count[x]#mx;j;:;]max@[x;i,j:x?mx:max x;:;-0w]}; - mx:mxresp'[sum info`s`a;til count info`r]; - // calculate new responsibility - (dmp*info`r)+(1-dmp)*info[`s]-mx - } - -// @kind function -// @category private -// @fileoverview Update availability matrix -// @param dmp {float} Damping coefficient -// @param info {dict} Similarity, availability, responsibility, exemplars, -// matches, iter dictionary, no_conv boolean and iter dict -// @return {float[][]} Returns updated availability matrix -clust.i.upda:{[dmp;info] - // sum values in positive availability matrix - s:sum@[;;:;0f]'[pv:0|info`r;k:til n:count info`a]; - // create a matrix using the negative values produced by the availability sum - // + responsibility diagonal - positive availability values - a:@[;;:;]'[0&(s+info[`r]@'k)-/:pv;k;s]; - // calculate new availability - (dmp*info`a)+a*1-dmp - } - -// @kind function -// @category private -// @fileoverview Stopping condition for affinity propagation algorithm -// @param info {dict} Similarity, availability, responsibility, exemplars, -// matches, iter dictionary, no_conv boolean and iter dict -// @return {bool} Indicates whether to continue or stop running AP (1/0b) -clust.i.apstop:{[info] - (info[`iter;`total]>info[`iter]`run)&not 1b~info`conv - } - -// @kind function -// @category private -// @fileoverview Predict clusters using AP training exemplars -// @param ex {float[][]} Training cluster centres in matrix format, -// each column is an individual datapoint -// @param df 
{symbol} Distance function name within '.ml.clust.df' -// @param pt {float[]} Current data point -// @return {long[]} Predicted clusters -clust.i.appreddist:{[ex;df;pt] - d?max d:clust.i.dists[ex;df;pt]each til count ex 0 + if[-1~first config`clust; + '"'.ml.clust.ap.fit' did not converge, all clusters returned -1.", + " Cannot predict new data." + ]; + // Retrieve cluster centres from training data + exemp:config[`data][;distinct config`exemplars]; + // Predict testing data clusters + data:$[0h=type data;flip;enlist]data; + clust.i.apPredDist[exemp;config[`inputs]`df]each data } diff --git a/clust/dbscan.q b/clust/dbscan.q index 83fce63c..8a401e00 100644 --- a/clust/dbscan.q +++ b/clust/dbscan.q @@ -1,138 +1,107 @@ +// clust/dbscan.q - DBSCAN clustering +// Copyright (c) 2021 Kx Systems Inc +// +// DBSCAN clustering. +// The Density-Based Spatial Clustering of Applications with Noise +// (DBSCAN) algorithm groups points that are closely packed in areas +// of high density. Any points in low-density regions are seen as outliers + \d .ml // Density-Based Spatial Clustering of Applications with Noise (DBSCAN) // @kind function // @category clust -// @fileoverview Fit DBSCAN algorithm to data -// @param data {float[][]} Data in matrix format, each column is an individual datapoint -// @param df {symbol} Distance function name within '.ml.clust.df' -// @param minpts {long} Minimum number of points with the epsilon radius -// @param eps {float} Epsilon radius to search -// @return {dict} Data, inputs, clusters and cluster table -// (`data`inputs`clt`t) required for predict and update methods -clust.dbscan.fit:{[data;df;minpts;eps] +// @desc Fit DBSCAN algorithm to data +// @param data {float[][]} Each column of the data is an individual datapoint +// @param df {symbol} Distance function name within '.ml.clust.df' +// @param minPts {long} Minimum number of points with the epsilon radius +// @param eps {float} Epsilon radius to search +// @return {dictionary} A dictionary 
containing: +// modelInfo - Encapsulates all relevant information needed to fit +// the model `data`inputs`clust`tab, where data is the original data, +// inputs are the user defined minPts and eps, clust are the cluster +// assignments and tab is the neighbourhood table defining items in the +// clusters. +// predict - A projection allowing for prediction on new input data +// update - A projection allowing new data to be used to update +// cluster centers such that the model can react to new data +clust.dbscan.fit:{[data;df;minPts;eps] data:clust.i.floatConversion[data]; - // check distance function + // Check distance function if[not df in key clust.i.df;clust.i.err.df[]]; - // create neighbourhood table - t:clust.i.nbhoodtab[data;df;minpts;eps;til count data 0]; - // apply the density based clustering algorithm over the neighbourhood table - t:{[t]any t`corepoint}clust.i.dbalgo/t; - // find cluster for remaining points and return list of clusters - clt:-1^exec cluster from t; - // return config dict - `data`inputs`clt`t!(data;`df`minpts`eps!(df;minpts;eps);clt;t) + // Create neighbourhood table + tab:clust.i.nbhoodTab[data;df;minPts;eps;til count data 0]; + // Apply the density based clustering algorithm over the neighbourhood table + tab:{[t]any t`corePoint}clust.i.dbAlgo/tab; + // Find cluster for remaining points and return list of clusters + clust:-1^exec cluster from tab; + // Return config dict + inputDict:`df`minPts`eps!(df;minPts;eps); + modelInfo:`data`inputs`clust`tab!(data;inputDict;clust;tab); + returnInfo:enlist[`modelInfo]!enlist modelInfo; + predictFunc:clust.dbscan.predict returnInfo; + updFunc:clust.dbscan.update returnInfo; + returnInfo,`predict`update!(predictFunc;updFunc) } // @kind function // @category clust -// @fileoverview Predict clusters using DBSCAN config -// @param data {float[][]} Data in matrix format, each column is an individual datapoint -// @param cfg {dict} `data`df`minpts`eps`clt returned from DBSCAN -// clustered training 
data -// @return {long[]} List of predicted clusters -clust.dbscan.predict:{[data;cfg] +// @desc Predict clusters using DBSCAN config +// @param config {dictionary} A dictionary returned from '.ml.clust.dbscan.fit' +// containing: +// modelInfo - Encapsulates all relevant information needed to fit +// the model `data`inputs`clust`tab, where data is the original data, +// inputs are the user defined minPts and eps, clust are the cluster +// assignments and tab is the neighbourhood table defining items in the +// clusters. +// predict - A projection allowing for prediction on new input data +// update - A projection allowing new data to be used to update +// cluster centers such that the model can react to new data +// @param data {float[][]} Each column of the data is an individual datapoint +// @return {long[]} Predicted clusters +clust.dbscan.predict:{[config;data] + config:config[`modelInfo]; data:clust.i.floatConversion[data]; - // predict new clusters - -1^exec cluster from clust.i.dbscanpredict[data;cfg] + // Predict new clusters + -1^exec cluster from clust.i.dbscanPredict[data;config] } // @kind function // @category clust -// @fileoverview Update DBSCAN config including new data points -// @param data {float[][]} Data in matrix format, each column is an individual datapoint -// @param cfg {dict} `data`inputs`clt`nbh returned from DBSCAN clustered training data -// @return {dict} Updated model config -clust.dbscan.update:{[data;cfg] +// @desc Update DBSCAN config including new data points +// @param config {dictionary} A dictionary returned from '.ml.clust.dbscan.fit' +// containing: +// modelInfo - Encapsulates all relevant information needed to fit +// the model `data`inputs`clust`tab, where data is the original data, +// inputs are the user defined minPts and eps, clust are the cluster +// assignments and tab is the neighbourhood table defining items in the +// clusters. 
+// predict - A projection allowing for prediction on new input data +// update - A projection allowing new data to be used to update +// cluster centers such that the model can react to new data +// @param data {float[][]} Each column of the data is an individual datapoint +// @return {dictionary} Updated model configuration (config), including predict +// and update functions +clust.dbscan.update:{[config;data] + modelConfig:config[`modelInfo]; data:clust.i.floatConversion[data]; - // original data prior to addition of new points, with core points set - orig:update corepoint:1b from cfg[`t]where cluster<>0N; - // predict new clusters - new:clust.i.dbscanpredict[data;cfg]; - // include new data points in training neighbourhood - orig:clust.i.updnbhood/[orig;new;count[orig]+til count new]; - // fit model with new data included to update model - t:{[t]any t`corepoint}.ml.clust.i.dbalgo/orig,new; - // reindex the clusters - t:update{(d!til count d:distinct x)x}cluster from t where cluster<>0N; + // Original data prior to addition of new points, with core points set + orig:update corePoint:1b from modelConfig[`tab]where cluster<>0N; + // Predict new clusters + new:clust.i.dbscanPredict[data;modelConfig]; + // Include new data points in training neighbourhood + orig:clust.i.updNbhood/[orig;new;count[orig]+til count new]; + // Fit model with new data included to update model + tab:{[t]any t`corePoint}.ml.clust.i.dbAlgo/orig,new; + // Reindex the clusters + tab:update{(d!til count d:distinct x)x}cluster from tab where cluster<>0N; // return updated config - cfg,`data`t`clt!(cfg[`data],'data;t;-1^exec cluster from t) - } - - -// Utilities - -// @kind function -// @category private -// @fileoverview Update the neighbourhood of a previously fit original dbscan model based on new data -// @param orig {tab} Original table of data with all points set as core points -// @param new {tab} Table generated from new data with the previously generated model -// @param idx {long[]} 
Indices used to update the neighbourhood of the original table -// @return {tab} Table with neighbourhood updated appropriately for the newly introduced data -clust.i.updnbhood:{[orig;new;idx] - update nbhood:{x,'y}[nbhood;idx]from orig where i in new`nbhood - } - -// @kind function -// @category private -// @fileoverview Predict clusters using DBSCAN config -// @param data {float[][]} Data in matrix format, each column is an individual datapoint -// @param cfg {dict} `data`inputs`clt returned from DBSCAN clustered training data -// @return {tab} Cluster table -clust.i.dbscanpredict:{[data;cfg] - idx:count[cfg[`data]0]+til count data 0; - // create neighbourhood table - t:clust.i.nbhoodtab[cfg[`data],'data;;;;idx]. cfg[`inputs;`df`minpts`eps]; - // find which existing clusters new data belongs to - update cluster:{x[`clt]first y}[cfg]each nbhood from t where corepoint - } - -// @kind function -// @category private -// @fileoverview Create neighbourhood table for points at indices provided -// @param data {float[][]} Data in matrix format, each column is an individual datapoint -// @param df {symbol} Distance function name within '.ml.clust.df' -// @param minpts {long} Minimum number of points with the epsilon radius -// @param eps {float} Epsilon radius to search -// @param idx {long[]} Data indices to find neighbourhood for -// @return {table} Neighbourhood table with columns `nbhood`cluster`corepoint -clust.i.nbhoodtab:{[data;df;minpts;eps;idx] - // calculate distances and find all points which are not outliers - nbhood:clust.i.nbhood[data;df;eps]each idx; - // update outlier cluster to null - update cluster:0N,corepoint:minpts<=1+count each nbhood from([]nbhood) - } - -// @kind function -// @category private -// @fileoverview Find all points which are not outliers -// @param data {float[][]} Data in matrix format, each column is an individual datapoint -// @param df {symbol} Distance function name within '.ml.clust.df' -// @param eps {float} Epsilon radius to 
search -// @param idx {long} Index of current point -// @return {long[]} Indices of points within the epsilon radius -clust.i.nbhood:{[data;df;eps;idx] - where eps>@[;idx;:;0w]clust.i.df[df]data-data[;idx] - } - -// @kind function -// @category private -// @fileoverview Run DBSCAN algorithm and update cluster of each point -// @param t {table} Cluster info table -// @return {table} Updated cluster table with old clusters merged -clust.i.dbalgo:{[t] - nbh:.ml.clust.i.nbhoodidxs[t]/[first where t`corepoint]; - update cluster:0|1+max t`cluster,corepoint:0b from t where i in nbh - } - -// @kind function -// @category private -// @fileoverview Find indices in each points neighborhood -// @param t {table} Cluster info table -// @param idxs {long[]} Indices to search the neighborhood of -// @return {long[]} Indices in neighborhood -clust.i.nbhoodidxs:{[t;idxs] - nbh:exec nbhood from t[distinct idxs,raze t[idxs]`nbhood]where corepoint; - asc distinct idxs,raze nbh + clusts:-1^exec cluster from tab; + modelConfig,:`data`tab`clust!(modelConfig[`data],'data;tab;clusts); + returnInfo:enlist[`modelInfo]!enlist modelConfig; + returnKeys:`predict`update; + returnVals:(clust.dbscan.predict returnInfo; + clust.dbscan.update returnInfo); + returnInfo,returnKeys!returnVals } diff --git a/clust/hierarchical.q b/clust/hierarchical.q index 1df3d122..303613f9 100644 --- a/clust/hierarchical.q +++ b/clust/hierarchical.q @@ -1,545 +1,210 @@ +// clust/hierarchical.q - Hierarchical and CURE clustering +// Copyright (c) 2021 Kx Systems Inc +// +// Hierarchical clustering. +// Agglomerative hierarchical clustering iteratively groups data, +// using a bottom-up approach that initially treats all data +// points as individual clusters. +// +// CURE clustering. +// Clustering Using REpresentatives (CURE) is a technique used to deal +// with datasets containing outliers and clusters of varying sizes and +// shapes. Each cluster is represented by a specified number of +// representative points. 
These points are chosen by taking the most +// scattered points in each cluster and shrinking them towards the +// cluster center using a compression ratio. + \d .ml // Clustering Using REpresentatives (CURE) and Hierarchical Clustering // @kind function // @category clust -// @fileoverview Fit CURE algorithm to data -// @param data {float[][]} Data in matrix format, each column is an individual datapoint -// @param df {symbol} Distance function name within '.ml.clust.df' -// @param n {long} Number of representative points per cluster -// @param c {float} Compression factor for representative points -// @return {dict} Data, input variables and dendrogram -// (`data`inputs`dgram) required for predict method +// @desc Fit CURE algorithm to data +// @param data {float[][]} Each column of the data is an individual datapoint +// @param df {symbol} Distance function name within '.ml.clust.i.df' +// @param n {long} Number of representative points per cluster +// @param c {float} Compression factor for representative points +// @return {dictionary} A dictionary containing: +// modelInfo - Encapsulates all relevant information needed to fit +// the model `data`inputs`dgram, where data is the original data, inputs +// are the user defined linkage and distance functions while dgram +// is the generated dendrogram +// predict - A projection allowing for prediction on new input data clust.cure.fit:{[data;df;n;c] data:clust.i.floatConversion[data]; if[not df in key clust.i.df;clust.i.err.df[]]; - dgram:clust.i.hcscc[data;df;`cure;1;n;c;1b]; - `data`inputs`dgram!(data;`df`n`c!(df;n;c);dgram) + dgram:clust.i.hcSCC[data;df;`cure;1;n;c;1b]; + modelInfo:`data`inputs`dgram!(data;`df`n`c!(df;n;c);dgram); + returnInfo:enlist[`modelInfo]!enlist modelInfo; + predictFunc:clust.cure.predict returnInfo; + returnInfo,enlist[`predict]!enlist predictFunc } // @kind function // @category clust -// @fileoverview Fit Hierarchical algorithm to data -// @param data {float[][]} Data in matrix format, 
each column is an individual datapoint -// @param df {symbol} Distance function name within '.ml.clust.df' -// @param lf {symbol} Linkage function name within '.ml.clust.lf' -// @return {dict} Data, input variables and dendrogram -// (`data`inputs`dgram) required for predict method +// @desc Fit Hierarchical algorithm to data +// @param data {float[][]} Each column of the data is an individual datapoint +// @param df {symbol} Distance function name within '.ml.clust.i.df' +// @param lf {symbol} Linkage function name within '.ml.clust.i.lf' +// @return {dictionary} A dictionary containing: +// modelInfo - Encapsulates all relevant information needed to fit +// the model `data`inputs`dgram, where data is the original data, inputs +// are the user defined linkage and distance functions while dgram +// is the generated dendrogram +// predict - A projection allowing for prediction on new input data clust.hc.fit:{[data;df;lf] - // check distance and linkage functions + // Check distance and linkage functions data:clust.i.floatConversion[data]; if[not df in key clust.i.df;clust.i.err.df[]]; dgram:$[lf in`complete`average`ward; - clust.i.hccaw[data;df;lf;2;1b]; + clust.i.hcCAW[data;df;lf;2;1b]; lf in`single`centroid; - clust.i.hcscc[data;df;lf;1;::;::;1b]; + clust.i.hcSCC[data;df;lf;1;::;::;1b]; clust.i.err.lf[] ]; - `data`inputs`dgram!(data;`df`lf!(df;lf);dgram) + modelInfo:`data`inputs`dgram!(data;`df`lf!(df;lf);dgram); + returnInfo:enlist[`modelInfo]!enlist modelInfo; + predictFunc:clust.hc.predict returnInfo; + returnInfo,enlist[`predict]!enlist predictFunc } // @kind function // @category clust -// @fileoverview Convert CURE cfg to k clusters -// @param cfg {dict} Output of .ml.clust.cure.fit -// @param k {long} Number of clusters -// @return {dict} Updated config with clusters labels added -clust.cure.cutk:{[cfg;k] - cfg,enlist[`clt]!enlist clust.i.cutdgram[cfg`dgram;k-1] +// @desc Convert CURE config to k clusters +// @param config {dictionary} A dictionary returned 
from '.ml.clust.cure.fit' +// containing: +// modelInfo - Encapsulates all relevant information needed to fit +// the model `data`inputs`dgram, where data is the original data, inputs +// are the user defined linkage and distance functions while dgram +// is the generated dendrogram +// predict - A projection allowing for prediction on new input data +// @param k {long} Number of clusters +// @return {dictionary} Updated config with clusters labels added +clust.cure.cutK:{[config;k] + clust.i.checkK[k]; + clustVal:clust.i.cutDgram[config[`modelInfo;`dgram];k-1]; + clusts:enlist[`clust]!enlist clustVal; + config,clusts } // @kind function // @category clust -// @fileoverview Convert hierarchical cfg to k clusters -// @param cfg {dict} Output of .ml.clust.hc.fit -// @param k {long} Number of clusters -// @return {dict} Updated config with clusters added -clust.hc.cutk:clust.cure.cutk +// @desc Convert hierarchical config to k clusters +// @param config {dictionary} A dictionary returned from '.ml.clust.hc.fit' +// containing: +// modelInfo - Encapsulates all relevant information needed to fit +// the model `data`inputs`dgram, where data is the original data, inputs +// are the user defined linkage and distance functions while dgram +// is the generated dendrogram +// predict - A projection allowing for prediction on new input data +// @param k {long} Number of clusters +// @return {dictionary} Updated config with clusters added +clust.hc.cutK:clust.cure.cutK // @kind function // @category clust -// @fileoverview Convert CURE dendrogram to clusters based on distance +// @desc Convert CURE dendrogram to clusters based on distance // threshold -// @param cfg {dict} Output of .ml.clust.cure.fit -// @param dthresh {float} Cutting distance threshold -// @return {dict} Updated config with clusters added -clust.cure.cutdist:{[cfg;dthresh] - dgram:cfg`dgram; - k:0|count[dgram]-exec first i from dgram where dist>dthresh; - cfg,enlist[`clt]!enlist clust.i.cutdgram[dgram;k] +// 
@param config {dictionary} A dictionary returned from '.ml.clust.cure.fit' +// containing: +// modelInfo - Encapsulates all relevant information needed to fit +// the model `data`inputs`dgram, where data is the original data, inputs +// are the user defined linkage and distance functions while dgram +// is the generated dendrogram +// predict - A projection allowing for prediction on new input data +// @param distThresh {float} Cutting distance threshold +// @return {dictionary} Updated config with clusters added +clust.cure.cutDist:{[config;distThresh] + clust.i.checkDist[distThresh]; + dgram:config[`modelInfo;`dgram]; + k:0|count[dgram]-exec first i from dgram where dist>distThresh; + config,enlist[`clust]!enlist clust.i.cutDgram[dgram;k] } // @kind function // @category clust -// @fileoverview Convert hierarchical dendrogram to clusters based on distance +// @desc Convert hierarchical dendrogram to clusters based on distance // threshold -// @param cfg {dict} Output of .ml.clust.hc.fit -// @param dthresh {float} Cutting distance threshold -// @return {dict} Updated config with clusters added -clust.hc.cutdist:clust.cure.cutdist +// @param config {dictionary} A dictionary returned from '.ml.clust.hc.fit' +// containing: +// modelInfo - Encapsulates all relevant information needed to fit +// the model `data`inputs`dgram, where data is the original data, inputs +// are the user defined linkage and distance functions while dgram +// is the generated dendrogram +// predict - A projection allowing for prediction on new input data +// @param distThresh {float} Cutting distance threshold +// @return {dictionary} Updated config with clusters added +clust.hc.cutDist:clust.cure.cutDist // @kind function // @category clust -// @fileoverview Predict clusters using CURE config -// @param data {float[][]} Data in matrix format, each column is an individual datapoint -// @param cfg {dict} `data`df`n`c`clt returned from .ml.clust.(cutk/cutdist) -// @return {long[]} List of 
predicted clusters -clust.cure.predict:{[data;cfg] - clust.i.hccpred[`cure;data;cfg] +// @desc Predict clusters using CURE config +// @param config {dictionary} A dictionary returned from '.ml.clust.cure.fit' +// containing: +// modelInfo - Encapsulates all relevant information needed to fit +// the model `data`inputs`dgram, where data is the original data, inputs +// are the user defined linkage and distance functions while dgram +// is the generated dendrogram +// predict - A projection allowing for prediction on new input data +// @param data {float[][]} Each column of the data is an individual datapoint +// @param cutDict {dictionary} The key defines what cutting algo to use when +// splitting the data into clusters (`k/`dist) and the value defines the +// cutting threshold +// @return {long[]} Predicted clusters +clust.cure.predict:{[config;data;cutDict] + updConfig:clust.i.prepPred[config;cutDict]; + clust.i.hCCpred[`cure;data;updConfig] } // @kind function // @category clust -// @fileoverview Predict clusters using hierarchical config -// @param data {float[][]} Data in matrix format, each column is an individual datapoint -// @param cfg {dict} `data`df`lf`clt returned from .ml.clust.(cutk/cutdist) -// @return {long[]} List of predicted clusters -clust.hc.predict:{[data;cfg] - clust.i.hccpred[`hc;data;cfg] - } - - -// Utilities - -// @kind function -// @category private -// @fileoverview Complete, Average, Ward (CAW) Linkage -// @param data {float[][]} Data in matrix format, each column is an individual datapoint -// @param df {symbol} Distance function name within '.ml.clust.df' -// @param lf {symbol} Linkage function name within '.ml.clust.lf' -// @param k {long} Number of clusters -// @param dgram {bool} Generate dendrogram or not (1b/0b) -// @return {table/long[]} Dendrogram or list of clusters -clust.i.hccaw:{[data;df;lf;k;dgram] - // check distance function for ward - if[(not df~`e2dist)&lf=`ward;clust.i.err.ward[]]; - // create initial cluster table - 
t0:clust.i.initcaw[data;df]; - // create linkage matrix - m:([]i1:`int$();i2:`int$();dist:`float$();n:`int$()); - // merge clusters based on chosen algorithm - r:{[k;r]ki+1;cl[where[cl=cl i]except i]:1+max cl;i+:1]; - // update dendrogram with new indices - ![dgram;();0b;`i1`i2!n cut cl] - } - -// @kind function -// @category private -// @fileoverview Convert dendrogram table to clusters -// @param t {table} Dendrogram table -// @param k {long} Define splitting value in dendrogram table -// @return {long[]} List of clusters -clust.i.cutdgram:{[t;k] - // get index of cluster made at cutting point k - idx:(2*cntt:count t)-k-1; - // exclude any clusters made after point k - exclt:i where idx>i:raze neg[k]#'allclt:t`i1`i2; - // extract indices within clusters made until k, excluding any outliers - nout:exclt except outliers:exclt where exclt<=cntt; - clt:{last{count x 0}clust.i.extractclt[x;y]/(z;())}[allclt;cntt+1]each nout; - // update points to the cluster they belong to - @[;;:;]/[(1+cntt)#0N;clt,enlist each outliers;til k+1] - } - -// @kind function -// @category private -// @fileoverview Extract points within merged cluster -// @param clts {long[]} List of cluster indices -// @param cntt {long} Count of dend table -// @param inds {long[]} Index in list to search and indices points found within -// that cluster -// @return {long[]} Next index to search, and additional points found -// within cluster -clust.i.extractclt:{[clts;cntt;inds] - // extract the points that were merged at this point - mrgclt:raze clts[;inds[0]-cntt]; - // Store any single clts, break down clts more than single point - (mrgclt where inext;inds[1],mrgclt where not inext:mrgclt>=cntt) +// @category clust +// @desc Fit CURE algorithm to data and convert dendrogram to clusters +// @param data {float[][]} Each column of the data is an individual datapoint +// @param df {symbol} Distance function name within '.ml.clust.i.df' +// @param n {long} Number of representative points per cluster +// 
@param c {float} Compression factor for representative points +// @param cutDict {dictionary} The key defines what cutting algo to use when +// splitting the data into clusters (`k/`dist) and the value defines the +// cutting threshold +// @return {dictionary} Updated config with clusters added +clust.cure.fitPredict:{[data;df;n;c;cutDict] + fitModel:clust.cure.fit[data;df;n;c]; + clust.i.prepPred[fitModel;cutDict] } // @kind function -// @category private -// @fileoverview SCC algo -// @param data {float[][]} Data in matrix format, each column is -// an individual datapoint -// @param df {symbol} Distance function name within '.ml.clust.df' -// @param lf {symbol} Linkage function name within '.ml.clust.lf' -// @param params {dict} Parameters - k (no. clusts), n (no. reppts per clust), reppts, kdtree -// @param clusts {table} Cluster table -// @param reppts {float[][]} Representative points and associated info -// @param kdtree {table} k-dimensional tree storing points and distances -// @return {(dict;long[];float[][];table)} Parameters dict, clusters, -// representative points and kdtree tables -clust.i.algoscc:{[data;df;lf;params;clusts;reppts;kdtree;lnkmat] - // merge closest clusters - clust0:exec clust{x?min x}closestDist from clusts where valid; - newmrg:clusts clust0,clust1:clusts[clust0]`closestClust; - newmrg:update valid:10b,reppts:(raze reppts;0#0),points:(raze points;0#0)from newmrg; - // make dendrogram if required - if[lnkmat 1; - m:lnkmat 0; - m,:newmrg[`clusti],fnew[`closestDist],count(fnew:first newmrg)`points; - lnkmat[0]:m - ]; - // keep track of old reppts - oldrep:reppts newmrg[0]`reppts; - // find reps in new cluster - $[sgl:lf~`single; - // for single new reps=old reps -> no new points calculated - newrep:select reppt,clust:clust0 from oldrep; - [ - // generate new representative points table (centroid -> reps=avg; cure -> calc reps) - newrepfunc:$[lf~`centroid;clust.i.centrep;clust.i.curerep[df;params`n;params`c]]; - 
newrepkeys:params[`rpcols]; - newrepvals:flip newrepfunc[data[;newmrg[0]`points]]; - newrep:flip newrepkeys!newrepvals; - newrep:update clust:clust0,reppt:count[i]#newmrg[0]`reppts from newrep; - // new rep leaves - newrep[`leaf]:(clust.kd.findleaf[kdtree;;kdtree 0]each flip newrep params`rpcols)`self; - newmrg[0;`reppts]:newrep`reppt; - // delete old points from leaf and update new point to new rep leaf - kdtree:.[kdtree;(oldrep`leaf;`idxs);except;oldrep`reppt]; - kdtree:.[kdtree;(newrep`leaf;`idxs);union ;newrep`reppt] - ] - ]; - // update clusters and reppts - clusts:@[clusts;newmrg`clust;,;delete clust from newmrg]; - reppts:@[reppts;newrep`reppt;,;delete reppt from newrep]; - updrep:reppts newrep`reppt; - // nneighbour to clust - if[sgl;updrep:select from updrep where closestClust in newmrg`clust]; - // calculate and append to representative point table the nearest neighbours - // of columns containing representative points - updrepdata:flip updrep params`rpcols; - updrepdatann:clust.kd.nn[kdtree;reppts params`rpcols;df;newmrg[0]`points] each updrepdata; - updrep:updrep,'updrepdatann; - updrep:update closestClust:reppts[closestPoint;`clust]from updrep; - if[sgl; - reppts:@[reppts;updrep`reppt;,;select closestDist,closestClust from updrep]; - updrep:reppts newrep`reppt]; - // update nneighbour of new clust - updrep@:raze imin updrep`closestDist; - clusts:@[clusts;updrep`clust;,;`closestDist`closestClust#updrep]; - $[sgl; - // single - nneighbour=new clust - [clusts:update closestClust:clust0 from clusts where valid,closestClust=clust1; - reppts:update closestClust:clust0 from reppts where closestClust=clust1]; - // else do nneighbour search - if[count updcls:select from clusts where valid,closestClust in(clust0;clust1); - updcls:updcls,'{x imin x`closestDist}each clust.kd.nn[kdtree;reppts params`rpcols;df]/:' - [updcls`reppts;flip each reppts[updcls`reppts]@\:params`rpcols]; - updcls[`closestClust]:reppts[updcls`closestPoint]`clust; - 
clusts:@[clusts;updcls`clust;,;select closestDist,closestClust from updcls] - ] - ]; - (params;clusts;reppts;kdtree;lnkmat) +// @category clust +// @desc Fit hierarchical algorithm to data and convert dendrogram +// to clusters +// @param data {float[][]} Each column of the data is an individual datapoint +// @param df {symbol} Distance function name within '.ml.clust.i.df' +// @param lf {symbol} Linkage function name within '.ml.clust.i.lf' +// @param cutDict {dictionary} The key defines what cutting algo to use when +// splitting the data into clusters (`k/`dist) and the value defines the +// cutting threshold +// @return {dictionary} Updated config with clusters added +clust.hc.fitPredict:{[data;df;lf;cutDict] + fitModel:clust.hc.fit[data;df;lf]; + clust.i.prepPred[fitModel;cutDict] } diff --git a/clust/init.q b/clust/init.q index e8e69e36..15804bfa 100644 --- a/clust/init.q +++ b/clust/init.q @@ -1,13 +1,21 @@ +// clust/init.q - Load clustering library +// Copyright (c) 2021 Kx Systems Inc +// +// Clustering algorithms including affinity propagation, +// cure, dbscan, hierarchical, and k-means clustering + \d .ml // required for use of .ml.confmat in score.q loadfile`:util/init.q // load clustering files -loadfile`:clust/util.q +loadfile`:clust/utils.q loadfile`:clust/kdtree.q loadfile`:clust/kmeans.q loadfile`:clust/aprop.q loadfile`:clust/dbscan.q loadfile`:clust/hierarchical.q loadfile`:clust/score.q + +.ml.i.deprecWarning`clust diff --git a/clust/kdtree.q b/clust/kdtree.q index 97a50918..bead5ac7 100644 --- a/clust/kdtree.q +++ b/clust/kdtree.q @@ -1,130 +1,104 @@ +// clust/kdtree.q - K dimensional tree +// Copyright (c) 2021 Kx Systems Inc +// +// A k-dimensional tree (k-d tree) is a special case of the +// binary search tree data structure, commonly used in computer +// science to organize data points in k-dimensional space.
+// Each leaf node in the tree contains a set of k-dimensional points, +// while each non-leaf node generates a splitting hyperplane +// which divides the surrounding space. + \d .ml // K-Dimensional (k-d) Tree // @kind function // @category clust -// @fileoverview Create new k-d tree -// @param data {float[][]} Data in matrix format, each column is an individual datapoint -// @param leafsz {long} Number of points per leaf (<2*number of reppts) -// @return {table} k-d tree -clust.kd.newtree:{[data;leafsz] +// @desc Create new k-d tree +// @param data {float[][]} Each column of the data is an individual datapoint +// @param leafSize {long} Number of points per leaf (<2*number of reppts) +// @return {table} k-d tree +clust.kd.newTree:{[data;leafSize] args:`leaf`left`parent`self`idxs!(0b;0b;0N;0;til count data 0); - clust.kd.i.tree[data;leafsz]args + clust.kd.i.tree[data;leafSize]args } // @kind function // @category clust -// @fileoverview Find nearest neighhbors in k-d tree -// @param tree {table} k-d tree -// @param data {float[][]} Data in matrix format, each column is an individual datapoint -// @param df {symbol} Distance function name within '.ml.clust.df' -// @param xidxs {long[][]} Points to exclude in search -// @param pt {long[]} Point to find nearest neighbor for -// @return {dict} Nearest neighbor dictionary with closest point, +// @desc Find nearest neighbors in k-d tree +// @param tree {table} k-d tree +// @param data {float[][]} Each column of the data is an individual datapoint +// @param df {symbol} Distance function name within '.ml.clust.df' +// @param xIdxs {long[][]} Points to exclude in search +// @param pt {long[]} Point to find nearest neighbor for +// @return {dictionary} Nearest neighbor dictionary with closest point, // distance, points searched and points to search -clust.kd.q.nn:clust.kd.nn:{[tree;data;df;xidxs;pt] - nninit:(0N;0w;0#0;clust.kd.findleaf[tree;pt;tree 0]); - start:`closestPoint`closestDist`xnodes`node!nninit; -
stop:{[nninfo]not null nninfo[`node;`self]}; - 2#stop clust.kd.i.nncheck[tree;data;df;xidxs;pt]/start +clust.kd.q.nn:clust.kd.nn:{[tree;data;df;xIdxs;pt] + nnInit:(0N;0w;0#0;clust.kd.findLeaf[tree;pt;tree 0]); + start:`closestPoint`closestDist`xNodes`node!nnInit; + stop:{[nnInfo]not null nnInfo[`node;`self]}; + 2#stop clust.kd.i.nnCheck[tree;data;df;xIdxs;pt]/start } // @kind function -// @category private -// @fileoverview Create tree table where each row represents a node -// @param data {float[][]} Points in `value flip` format -// @param leafsz {long} Points per leaf (<2*number of representatives) -// @param node {dict} Info for a given node in the tree -// @return {table} k-d tree table -clust.kd.i.tree:{[data;leafsz;node] - if[leafsz<=.5*count node`idxs; - chk:xdatatype cfg;'"cfg must be (::) or a dictionary"]; - // update iteration dictionary with user changes - updDict:defaultDict,cfg; - // fit algo to data - r:clust.i.kmeans[data;df;k;updDict]; - // return config with new clusters - r,`data`inputs!(data;`df`k`iter`kpp!(df;k;updDict`iter;updDict`init)) + if[config~(::);config:()!()]; + if[99h<>type config;'"config must be (::) or a dictionary"]; + // Update iteration dictionary with user changes + updDict:defaultDict,config; + // Fit algo to data + r:clust.i.kMeans[data;df;k;updDict]; + // Return config with new clusters + inputDict:`df`k`iter`kpp!(df;k;updDict`iter;updDict`init); + modelInfo:r,`data`inputs!(data;inputDict); + returnInfo:enlist[`modelInfo]!enlist modelInfo; + predictFunc:clust.kmeans.predict returnInfo; + updFunc:clust.kmeans.update returnInfo; + returnInfo,`predict`update!(predictFunc;updFunc) } // @kind function // @category clust -// @fileoverview Predict clusters using k-means config -// @param data {float[][]} Data in matrix format, each column is an individual datapoint -// @param df {symbol} Distance function name within '.ml.clust.df' -// @param cfg {dict} `data`df`reppts`clt returned from kmeans clustered training data -// @return 
{long[]} List of predicted clusters -clust.kmeans.predict:{[data;cfg] +// @desc Predict clusters using k-means config +// @param config {dictionary} A dictionary returned from '.ml.clust.kmeans.fit' +// containing: +// modelInfo - Encapsulates all relevant information needed to fit +// the model `data`df`repPts`clust, where data and df are the inputs, +// repPts are the calculated k centers and clust are clusters associated +// with each of the datapoints +// predict - A projection allowing for prediction on new input data +// update - A projection allowing new data to be used to update +// cluster centers such that the model can react to new data +// @param data {float[][]} Each column of the data is an individual datapoint +// @return {long[]} Predicted clusters +clust.kmeans.predict:{[config;data] + config:config[`modelInfo]; data:clust.i.floatConversion[data]; - // get new clusters based on latest config - clust.i.getclust[data;cfg[`inputs]`df;cfg`reppts] + // Get new clusters based on latest config + clust.i.getClust[data;config[`inputs]`df;config`repPts] } // @kind function // @category clust -// @fileoverview Update kmeans config including new data points -// @param data {float[][]} Data in matrix format, each column is an individual datapoint -// @param cfg {dict} `data`df`reppts`clt returned from kmeans clustered on training data -// @return {dict} Updated model config -clust.kmeans.update:{[data;cfg] +// @desc Update kmeans config including new data points +// @param config {dictionary} A dictionary returned from '.ml.clust.kmeans.fit' +// containing: +// modelInfo - Encapsulates all relevant information needed to fit +// the model `data`df`repPts`clust, where data and df are the inputs, +// repPts are the calculated k centers and clust are clusters associated +// with each of the datapoints +// predict - A projection allowing for prediction on new input data +// update - A projection allowing new data to be used to update +// cluster centers such that 
the model can react to new data +// @param data {float[][]} Each column of the data is an individual datapoint +// @return {dictionary} Updated model configuration (config), including predict +// and update functions +clust.kmeans.update:{[config;data] + modelConfig:config[`modelInfo]; data:clust.i.floatConversion[data]; - // update data to include new points - cfg[`data]:cfg[`data],'data; - // update k means - cfg[`reppts]:clust.i.updcenters[cfg`data;cfg[`inputs]`df;()!();cfg`reppts]; - // get updated clusters based on new means - cfg[`clt]:clust.i.getclust[cfg`data;cfg[`inputs]`df;cfg`reppts]; - // return updated config - cfg - } - - -// Utilities - -// @kind function -// @category private -// @fileoverview K-Means algorithm -// @param data {float[][]} Data in matrix format, each column is an individual datapoint -// @param df {symbol} Distance function name within '.ml.clust.df' -// @param k {long} Number of clusters -// @param cfg {dict} Configuration information containing the maximum iterations `iter, -// initialisation type `init and threshold for smallest distance -// to move between the previous and new run `thresh -// @return {dict} Clusters or reppts depending on rep -clust.i.kmeans:{[data;df;k;cfg] - // check distance function - if[not df in`e2dist`edist;clust.i.err.kmeans[]]; - // initialize representative points - initreppts:$[cfg`init;clust.i.initkpp df;clust.i.initrdm][data;k]; - // run algo until maximum number of iterations reached or convergence - reppts0:`idx`reppts`notconv!(0;initreppts;1b); - reppts1:clust.i.kmeansConverge[cfg] clust.i.updcenters[data;df;cfg]/reppts0; - // return representative points and clusters - `reppts`clt!(reppts1`reppts;clust.i.getclust[data;df;reppts1`reppts]) - } - -// @kind function -// @category private -// @fileoverview Check to see if cluster centers are stable or -// if the maximum number of iterations allowable have been reached -// @param cfg {dict} Configuration information containing the maximum iterations 
`iter, -// initialisation type `init and threshold for smallest distance -// to move between the previous and new run `thresh -// @param algorun {dict} Information about the current run of the algorithm which can have an -// impact on early or on time stopping i.e. have the maximum number of iterations been exceeded -// or have the cluster centers not moved more than the threshold i.e. 'stationary' -// @return {bool} 0b indicates number of iterations has exceeded maximum and -clust.i.kmeansConverge:{[cfg;algorun] - check1:cfg[`iter]>algorun`idx; - check2:algorun`notconv; - check1 & check2 - } - -// @kind function -// @category private -// @fileoverview Update cluster centers -// @param data {float[][]} Data in matrix format, each column is an individual datapoint -// @param df {symbol} Distance function name within '.ml.clust.df' -// @param cfg {dict} Configuration information containing the maximum iterations `iter, -// initialisation type `init and threshold for smallest distance -// to move between the previous and new run `thresh -// @param reppts {float[][]/dict} Information relating to the representative points, in the case of -// fitting the model this is a dictionary containing the current iteration index and if the data -// has converged in addition to the representative points. In an individual update this is just -// the representative points for the k means centers. 
-// @return {float[][]} Updated representative points -clust.i.updcenters:{[data;df;cfg;reppts] - // projection used for calculation of representative points - repptFunc:clust.i.newreppts[data;df;]; - if[99h=type reppts; - reppts[`idx]+:1; - prevpoint:reppts`reppts; - reppts[`reppts]:repptFunc reppts`reppts; - reppts[`notconv]:cfg[`thresh]n:count true; - '`$"pred and true must have equal lengths"]; - if[not e:clust.i.entropy true;:1.]; - cm:value confmat[pred;true]; - nm:(*\:/:).((count each group@)each(pred;true))@\:til count cm; - mi:(sum/)0^cm*.[-;log(n*cm;nm)]%n; - mi%e + '"pred and true must have equal lengths" + ]; + if[not ent:clust.i.entropy true;:1.]; + confMat:value confMatrix[pred;true]; + nm:(*\:/:).((count each group@)each(pred;true))@\:til count confMat; + mi:(sum/)0^confMat*.[-;log(n*confMat;nm)]%n; + mi%ent } // Optimum number of clusters // @kind function // @category clust -// @fileoverview Elbow method -// @param data {float[][]} Data in matrix format, each column is an individual datapoint -// @param df {symbol} Distance function name within '.ml.clust.df' -// @param k {long} Max number of clusters -// @return {float[]} Score for each k value - plot to find elbow +// @desc Elbow method +// @param data {float[][]} Each column of the data is an individual datapoint +// @param df {symbol} Distance function name within '.ml.clust.i.df' +// @param k {long} Max number of clusters +// @return {float[]} Score for each k value - plot to find elbow clust.elbow:{[data;df;k] - {[data;df;k] - clt:clust.kmeans.fit[data;df;k;::]`clt; - sum raze clust.i.dists[;df;;::]'[p;a:avg@''p:{x[;y]}[data]each group clt] - }[data;df]each 2+til k-1 - } - -// Utilities - -// @kind function -// @category private -// @fileoverview Entropy -// @param d {long[]} distribution -// @return {float} Entropy for d -clust.i.entropy:{[d] - neg sum(p%n)*(-). 
log(p;n:sum p:count each group d) - } - -// @kind function -// @category private -// @fileoverview Maximum intra-cluster distance -// @param df {symbol} Distance function name within '.ml.clust.df' -// @param data {float[][]} Data in matrix format, each column is an individual datapoint -// @return {float} Max intra-cluster distance -clust.i.maxintra:{[df;data] - max raze{[df;data;x;y] - clust.i.dists[data;df;data[;y];x except til 1+y] - }[df;data;n]each n:til count first data - } - -// @kind function -// @category private -// @fileoverview Minimum inter-cluster distance -// @param df {symbol} Distance function name within '.ml.clust.df' -// @param data {float[][]} Data in matrix format, each column is an individual datapoint -// @param idxs {long[]} Cluster indices -// @return {float} Min inter-cluster distance -clust.i.mininter:{[df;data;idxs] - {[df;data;i;j] - (min/)clust.i.dists[data[i];df;data[j]]each til count data[i]0 - }[df;data;first idxs]each 1_idxs - } - -// @kind function -// @category private -// @fileoverview Silhouette coefficient -// @param data {float[][]} Data in matrix format, each column is an individual datapoint -// @param df {symbol} Distance function name within '.ml.clust.df' -// @param idxs {dict} Point indices grouped by cluster -// @param k {float} Coefficient to multiply by -// @param clt {long} Cluster of current point -// @param pt {float} Current point -// @return {float} Silhouette coefficent for pt -clust.i.sil:{[data;df;idxs;k;clt;pt] - d:clust.i.dists[data;df;pt]each idxs; - (%).((-).;max)@\:(min avg each;k[clt]*sum@)@'d@/:(key[idxs]except clt;clt) + clust.i.elbow[data;df]each 2+til k-1 } diff --git a/clust/tests/clt.t b/clust/tests/clt.t index 37e9fdff..3741d14a 100644 --- a/clust/tests/clt.t +++ b/clust/tests/clt.t @@ -13,15 +13,18 @@ fclust:.p.import[`scipy.cluster.hierarchy]`:fcluster // q Utilities mat :{"f"$flip value flip x} -clusterIdxs:{value group(x . y)`clt} -clusterKeys:{key group(x . y)`clt} -clusterAdd1:{1+(x . 
y)`clt} -qDendrogram:{asc each x(y . z)`dgram} +clusterIdxs:{value group(x . y)[`modelInfo;`clust]} +clusterKeys:{key group(x . y)[`modelInfo;`clust]} +clusterIdxsDendro:{value group(x . y)`clust} +clusterIdxsUpd:{value group(x . y)[`modelInfo;`clust]} +clusterAdd1:{1+(x . y)`clust} +qDendrogram:{asc each x(y . z)[`modelInfo;`dgram]} algoOutputs:{asc key x . y} -countOutput:{count x . y} -pythonRes :{[fclust;mat;t;clt;param]value group fclust[mat t`dgram;clt;param]`}[fclust;mat] +algoOutputsFit:{asc key first x . y} +countOutput:{count x y} +pythonRes :{[fclust;mat;t;clust;param]value group fclust[mat t[`modelInfo;`dgram];clust;param]`}[fclust;mat] pythonDgram:{[lnk;d;lf;df]asc each lnk[flip d;lf;df]`}[lnk] -qDgramDists:{(x . y)[`dgram]`dist} +qDgramDists:{(x . y)[`modelInfo;`dgram]`dist} // Datasets d1:flip(60#"F";",")0:`:clust/tests/data/ss5.csv @@ -44,17 +47,19 @@ passingTest[clusterIdxs[.ml.clust.ap.fit];(d2;`nege2dist;0.01;{[x] -10.};(::));1 passingTest[clusterIdxs[.ml.clust.ap.fit];(d1tts 0;`nege2dist;0.3;min;`maxrun`maxmatch!100 10);1b;enlist til 45] passingTest[clusterKeys[.ml.clust.ap.fit];(d2;`nege2dist;0.95;{[x] -20000.};enlist[`maxsame]!enlist 150);1b;til 5] passingTest[clusterKeys[.ml.clust.ap.fit];(d2;`nege2dist;0.5;min;(::));1b;til 5] -passingTest[algoOutputs[.ml.clust.ap.fit];(d2;`nege2dist;0.5;min;(::));1b;`clt`data`exemplars`inputs] +passingTest[algoOutputsFit[.ml.clust.ap.fit];(d2;`nege2dist;0.5;min;(::));1b;`clust`data`exemplars`inputs] failingTest[.ml.clust.ap.fit;(d1;`e2dist;0.7;min;(::));0b;"AP must be used with nege2dist"] failingTest[.ml.clust.ap.fit;(d1;`nege2dist;0.7;min;100);0b;"iter must be (::) or a dictionary"] failingTest[.ml.clust.ap.fit;(d1;`nege2dist;0.7;min;([]total:10,();nochange:5,()));0b;"iter must be (::) or a dictionary"] failingTest[.ml.clust.ap.fit;(100?`8;`nege2dist;0.7;min;(::));0b;"Dataset not suitable for clustering. 
Must be convertible to floats."] + // Predict -passingTest[.ml.clust.ap.predict;(d1tts 1;.ml.clust.ap.fit[d1tts 0;`nege2dist;0.7;min;(::)]);0b;APclt] -passingTest[.ml.clust.ap.predict;(d1tts 1;.ml.clust.ap.fit[d1tts 0;`nege2dist;0.7;med;`maxrun`maxmatch!100 10]);0b;APclt] -failingTest[.ml.clust.ap.predict;(100?`7;enlist[`clt]!enlist -1);0b;"Dataset not suitable for clustering. Must be convertible to floats."] -failingTest[.ml.clust.ap.predict;(d1tts 1;enlist[`clt]!enlist -1);0b;"'.ml.clust.ap.fit' did not converge, all clusters returned -1. Cannot predict new data."] +passingTest[.ml.clust.ap.fit[d1tts 0;`nege2dist;0.7;min;(::)]`predict;d1tts 1;1b;APclt] +passingTest[.ml.clust.ap.fit[d1tts 0;`nege2dist;0.7;med;`maxrun`maxmatch!100 10]`predict;d1tts 1;1b;APclt] +failingTest[.ml.clust.ap.fit[d1tts 0;`nege2dist;0.7;min;(::)]`predict;100?`7;1b;"Dataset not suitable for clustering. Must be convertible to floats."] +failingTest[.ml.clust.ap.predict;(enlist[`modelInfo]!enlist enlist[`clust]!enlist -1;d1tts 1); + 0b;"'.ml.clust.ap.fit' did not converge, all clusters returned -1. 
Cannot predict new data."] // K-Means @@ -66,21 +71,22 @@ passingTest[clusterIdxs[.ml.clust.kmeans.fit];(d1;`e2dist;4;kMeansCfg,enlist[`th passingTest[clusterIdxs[.ml.clust.kmeans.fit];(d1;`edist;4;kMeansCfg);1b;d1clt] passingTest[clusterKeys[.ml.clust.kmeans.fit];(d1;`edist;4;kMeansCfg);1b;til 4] passingTest[clusterKeys[.ml.clust.kmeans.fit];(d1;`e2dist;7;kMeansCfg);1b;til 7] -passingTest[algoOutputs[.ml.clust.kmeans.fit];(d2;`edist;4;kMeansCfg);1b;`clt`data`inputs`reppts] +passingTest[algoOutputsFit[.ml.clust.kmeans.fit];(d2;`edist;4;kMeansCfg);1b;`clust`data`inputs`repPts] failingTest[.ml.clust.kmeans.fit;(d1;`mdist;4;kMeansCfg);0b;"kmeans must be used with edist/e2dist"] -failingTest[.ml.clust.kmeans.fit;(d1;`nege2dist;4;74);0b;"cfg must be (::) or a dictionary"] -failingTest[.ml.clust.kmeans.fit;(d1;`nege2dist;4;([]total:28,();nochange:100,()));0b;"cfg must be (::) or a dictionary"] +failingTest[.ml.clust.kmeans.fit;(d1;`nege2dist;4;74);0b;"config must be (::) or a dictionary"] +failingTest[.ml.clust.kmeans.fit;(d1;`nege2dist;4;([]total:28,();nochange:100,()));0b;"config must be (::) or a dictionary"] failingTest[.ml.clust.kmeans.fit;(1000?`a`b`c;`edist;4;kMeansCfg);0b;"Dataset not suitable for clustering. Must be convertible to floats."] // Predict -passingTest[countOutput[.ml.clust.kmeans.predict];(d1tts 1;.ml.clust.kmeans.fit[d1tts 0;`e2dist;4;kMeansCfg]);1b;15] -passingTest[countOutput[.ml.clust.kmeans.predict];(d1tts 1;.ml.clust.kmeans.fit[d1tts 0;`edist;4;kMeansCfg]);1b;15] -failingTest[.ml.clust.kmeans.predict;(100?`4;()!());0b;"Dataset not suitable for clustering. Must be convertible to floats."] +passingTest[countOutput[.ml.clust.kmeans.fit[d1tts 0;`e2dist;4;kMeansCfg]`predict];d1tts 1;1b;15] +passingTest[countOutput[.ml.clust.kmeans.fit[d1tts 0;`edist;4;kMeansCfg]`predict];d1tts 1;1b;15] +failingTest[.ml.clust.kmeans.fit[d1tts 0;`e2dist;4;kMeansCfg]`predict;100?`4;1b;"Dataset not suitable for clustering. 
Must be convertible to floats."] // Update -passingTest[clusterIdxs[.ml.clust.kmeans.update];(d1tts 1;.ml.clust.kmeans.fit[d1tts 0;`e2dist;4;kMeansCfg]);1b;d1clt] -passingTest[algoOutputs[.ml.clust.kmeans.update];(d1tts 1;.ml.clust.kmeans.fit[d1tts 0;`edist;4;kMeansCfg]);1b;`clt`data`inputs`reppts] -failingTest[.ml.clust.kmeans.update;(1000?`2;()!());0b;"Dataset not suitable for clustering. Must be convertible to floats."] +passingTest[algoOutputs[.ml.clust.kmeans.fit[d1tts 0;`edist;4;kMeansCfg]`update];enlist d1tts 1;1b;`modelInfo`predict`update] +passingTest[clusterIdxsUpd[.ml.clust.kmeans.fit[d1tts 0;`e2dist;4;kMeansCfg]`update];enlist d1tts 1;1b;d1clt] +failingTest[.ml.clust.kmeans.update;(()!();1000?`2);0b;"Dataset not suitable for clustering. Must be convertible to floats."] + // DBSCAN @@ -95,17 +101,17 @@ failingTest[.ml.clust.dbscan.fit;(50?`x`y;`edist;4;300);0b;"Dataset not suitable failingTest[.ml.clust.dbscan.fit;(d1;`euclidean;5;5);0b;"invalid distance metric"] // Predict -passingTest[.ml.clust.dbscan.predict;(d1tts 1;.ml.clust.dbscan.fit[d1tts 0;`e2dist;5;5]);0b;15#-1] -passingTest[.ml.clust.dbscan.predict;(d1tts 1;.ml.clust.dbscan.fit[d1tts 0;`edist;5;5]);0b;15#-1] -passingTest[.ml.clust.dbscan.predict;(d1tts 1;.ml.clust.dbscan.fit[d1tts 0;`mdist;5;5]);0b;15#-1] -failingTest[.ml.clust.dbscan.predict;(50?`x`y;());0b;"Dataset not suitable for clustering. Must be convertible to floats."] +passingTest[.ml.clust.dbscan.fit[d1tts 0;`e2dist;5;5]`predict;d1tts 1;1b;15#-1] +passingTest[.ml.clust.dbscan.fit[d1tts 0;`edist;5;5]`predict;d1tts 1;1b;15#-1] +passingTest[.ml.clust.dbscan.fit[d1tts 0;`mdist;5;5]`predict;d1tts 1;1b;15#-1] +failingTest[.ml.clust.dbscan.fit[d1tts 0;`e2dist;5;5]`predict;(50?`x`y);1b;"Dataset not suitable for clustering. 
Must be convertible to floats."] // Update -passingTest[clusterIdxs[.ml.clust.dbscan.update];(d1tts 1;.ml.clust.dbscan.fit[d1tts 0;`e2dist;5;5]);1b;d1clt] -passingTest[clusterIdxs[.ml.clust.dbscan.update];(d1tts 1;.ml.clust.dbscan.fit[d1tts 0;`edist;5;5]);1b;d1clt] -passingTest[clusterIdxs[.ml.clust.dbscan.update];(d1tts 1;.ml.clust.dbscan.fit[d1tts 0;`mdist;5;5]);1b;d1clt] -passingTest[algoOutputs[.ml.clust.dbscan.update];(d1tts 1;.ml.clust.dbscan.fit[d1tts 0;`mdist;5;5]);1b;`clt`data`inputs`t] -failingTest[.ml.clust.dbscan.update;(50?`x`y;());0b;"Dataset not suitable for clustering. Must be convertible to floats."] +passingTest[clusterIdxsUpd[.ml.clust.dbscan.fit[d1tts 0;`e2dist;5;5]`update];enlist d1tts 1;1b;d1clt] +passingTest[clusterIdxsUpd[.ml.clust.dbscan.fit[d1tts 0;`edist;5;5]`update];enlist d1tts 1;1b;d1clt] +passingTest[clusterIdxsUpd[.ml.clust.dbscan.fit[d1tts 0;`mdist;5;5]`update];enlist d1tts 1;1b;d1clt] +passingTest[algoOutputs[.ml.clust.dbscan.fit[d1tts 0;`mdist;5;5]`update];enlist d1tts 1;1b;`modelInfo`predict`update] +failingTest[.ml.clust.dbscan.update;(()!();50?`x`y);0b;"Dataset not suitable for clustering. 
Must be convertible to floats."] // CURE @@ -118,25 +124,29 @@ cured1pred2:0 3 0 0 3 3 0 0 0 0 0 3 0 3 3 cured1pred3:1 3 1 3 3 3 1 1 1 1 1 3 1 3 3 // Fit -passingTest[clusterIdxs[.ml.clust.cure.cutk];(.ml.clust.cure.fit[d1;`e2dist;5;0];4);1b;d1clt] -passingTest[clusterIdxs[.ml.clust.cure.cutk];(.ml.clust.cure.fit[d1;`edist;10;0.2];4);1b;d1clt] -passingTest[clusterIdxs[.ml.clust.cure.cutk];(.ml.clust.cure.fit[d1;`mdist;3;0.15];4);1b;d1clt] -passingTest[clusterIdxs[.ml.clust.cure.cutk];(.ml.clust.cure.fit[d2;`e2dist;20;0];4);1b;cured2clt1] -passingTest[clusterIdxs[.ml.clust.cure.cutk];(.ml.clust.cure.fit[d2;`edist;20;0.2];4);1b;cured2clt2] -passingTest[clusterIdxs[.ml.clust.cure.cutk];(.ml.clust.cure.fit[d2;`mdist;10;0.1];4);1b;cured2clt3] -passingTest[clusterIdxs[.ml.clust.cure.cutdist];(.ml.clust.cure.fit[d1;`e2dist;5;0];2.);1b;d1clt] -passingTest[clusterIdxs[.ml.clust.cure.cutdist];(.ml.clust.cure.fit[d1;`edist;10;0.2];2.);1b;d1clt] -passingTest[clusterIdxs[.ml.clust.cure.cutdist];(.ml.clust.cure.fit[d1;`mdist;3;0.15];2.);1b;d1clt] -passingTest[algoOutputs[.ml.clust.cure.fit];(d1;`e2dist;5;0);1b;`data`dgram`inputs] +passingTest[clusterIdxsDendro[.ml.clust.cure.cutK];(.ml.clust.cure.fit[d1;`e2dist;5;0];4);1b;d1clt] +passingTest[clusterIdxsDendro[.ml.clust.cure.cutK];(.ml.clust.cure.fit[d1;`edist;10;0.2];4);1b;d1clt] +passingTest[clusterIdxsDendro[.ml.clust.cure.cutK];(.ml.clust.cure.fit[d1;`mdist;3;0.15];4);1b;d1clt] +passingTest[clusterIdxsDendro[.ml.clust.cure.cutK];(.ml.clust.cure.fit[d2;`e2dist;20;0];4);1b;cured2clt1] +passingTest[clusterIdxsDendro[.ml.clust.cure.cutK];(.ml.clust.cure.fit[d2;`edist;20;0.2];4);1b;cured2clt2] +passingTest[clusterIdxsDendro[.ml.clust.cure.cutK];(.ml.clust.cure.fit[d2;`mdist;10;0.1];4);1b;cured2clt3] +passingTest[clusterIdxsDendro[.ml.clust.cure.cutDist];(.ml.clust.cure.fit[d1;`e2dist;5;0];2.);1b;d1clt] +passingTest[clusterIdxsDendro[.ml.clust.cure.cutDist];(.ml.clust.cure.fit[d1;`edist;10;0.2];2.);1b;d1clt] 
+passingTest[clusterIdxsDendro[.ml.clust.cure.cutDist];(.ml.clust.cure.fit[d1;`mdist;3;0.15];2.);1b;d1clt] +passingTest[algoOutputsFit[.ml.clust.cure.fit];(d1;`e2dist;5;0);1b;`data`dgram`inputs] failingTest[.ml.clust.cure.fit;(821?`2;`e2dist;5;0);0b;"Dataset not suitable for clustering. Must be convertible to floats."] failingTest[.ml.clust.cure.fit;(d1;`newmetric;5;0);0b;"invalid distance metric"] +// FitPredict +passingTest[clusterIdxsDendro[.ml.clust.cure.fitPredict];(d1;`e2dist;5;0;enlist[`k]!enlist 4);1b;d1clt] +passingTest[clusterIdxsDendro[.ml.clust.cure.fitPredict];(d1;`edist;10;0.2;enlist[`k]!enlist 4);1b;d1clt] +passingTest[clusterIdxsDendro[.ml.clust.cure.fitPredict];(d1;`mdist;3;0.15;enlist[`k]!enlist 4);1b;d1clt] + // Predict -passingTest[.ml.clust.cure.predict;(d1tts 1;.ml.clust.cure.cutk[.ml.clust.cure.fit[d1tts 0;`e2dist;5;0];4]);0b;cured1pred1] -passingTest[.ml.clust.cure.predict;(d1tts 1;.ml.clust.cure.cutk[.ml.clust.cure.fit[d1tts 0;`edist;10;0.2];4]);0b;cured1pred2] -passingTest[.ml.clust.cure.predict;(d1tts 1;.ml.clust.cure.cutk[.ml.clust.cure.fit[d1tts 0;`mdist;3;0.15];4]);0b;cured1pred3] -failingTest[.ml.clust.cure.predict;(182?`5;());0b;"Dataset not suitable for clustering. Must be convertible to floats."] -failingTest[.ml.clust.cure.predict;(2 10#20?5.;()!());0b;"Clusters must be contained within cfg - please run .ml.clust.cure.(cutk/cutdist)"] +passingTest[.ml.clust.cure.fit[d1tts 0;`e2dist;5;0]`predict;(d1tts 1;enlist[`k]!enlist 4);0b;cured1pred1] +passingTest[.ml.clust.cure.fit[d1tts 0;`edist;10;0.2]`predict;(d1tts 1;enlist[`k]!enlist 4);0b;cured1pred2] +passingTest[.ml.clust.cure.fit[d1tts 0;`mdist;3;0.15]`predict;(d1tts 1;enlist[`k]!enlist 4);0b;cured1pred3] +failingTest[.ml.clust.cure.fit[d1tts 0;`e2dist;5;0]`predict;(182?`5;enlist[`k]!enlist 3);0b;"Dataset not suitable for clustering. 
Must be convertible to floats."] // Hierarchical @@ -150,37 +160,42 @@ tab1:.ml.clust.hc.fit[d1;`mdist ;`single] tab2:.ml.clust.hc.fit[d1;`e2dist;`average] tab3:.ml.clust.hc.fit[d2;`e2dist;`centroid] tab4:.ml.clust.hc.fit[d2;`edist ;`complete] -hct1fit:"j"$fclust[mat tab1`dgram;4;`maxclust]` +hct1fit:"j"$fclust[mat tab1[`modelInfo;`dgram];4;`maxclust]` hcd1pred1:1 2 1 1 2 2 1 1 1 1 1 2 1 2 2 hcd1pred2:1 3 1 1 3 3 1 1 1 1 1 3 1 3 3 hcd1pred3:1 3 1 1 3 3 1 1 1 1 1 3 1 3 3 pyDgramDists:(lnk[flip d2;`single;`sqeuclidean]`)[;2] // Fit -passingTest[clusterAdd1[.ml.clust.hc.cutk ];(tab1;4);1b;hct1fit] -passingTest[clusterIdxs[.ml.clust.hc.cutk ];(.ml.clust.hc.fit[d2;`e2dist;`single];4);1b;hcResSingle] -passingTest[clusterIdxs[.ml.clust.hc.cutk ];(.ml.clust.hc.fit[d2;`e2dist;`ward];4);1b;hcResWard] -passingTest[clusterIdxs[.ml.clust.hc.cutk ];(.ml.clust.hc.fit[d2;`edist;`centroid];4);1b;hcResCentroid] -passingTest[clusterIdxs[.ml.clust.hc.cutk ];(.ml.clust.hc.fit[d2;`edist;`complete];4);1b;hcResComplete] -passingTest[clusterIdxs[.ml.clust.hc.cutk ];(.ml.clust.hc.fit[d2;`mdist;`average];4);1b;hcResAverage] -passingTest[clusterIdxs[.ml.clust.hc.cutk ];(tab2;4);1b;pythonRes[tab2;4;`maxclust]] -passingTest[clusterIdxs[.ml.clust.hc.cutk ];(tab3;4);1b;pythonRes[tab3;4;`maxclust]] -passingTest[clusterIdxs[.ml.clust.hc.cutk ];(tab4;4);1b;pythonRes[tab4;4;`maxclust]] -passingTest[clusterIdxs[.ml.clust.hc.cutdist];(tab1;.45);1b;pythonRes[tab1;.45;`distance]] -passingTest[clusterIdxs[.ml.clust.hc.cutdist];(tab2;4);1b;pythonRes[tab2;34;`distance]] -passingTest[clusterIdxs[.ml.clust.hc.cutdist];(tab3;500);1b;pythonRes[tab3;500;`distance]] -passingTest[clusterIdxs[.ml.clust.hc.cutdist];(tab4;30);1b;pythonRes[tab4;30;`distance]] +passingTest[clusterAdd1[.ml.clust.hc.cutK ];(tab1;4);1b;hct1fit] +passingTest[clusterIdxsDendro[.ml.clust.hc.cutK];(.ml.clust.hc.fit[d2;`e2dist;`single];4);1b;hcResSingle] 
+passingTest[clusterIdxsDendro[.ml.clust.hc.cutK];(.ml.clust.hc.fit[d2;`e2dist;`ward];4);1b;hcResWard] +passingTest[clusterIdxsDendro[.ml.clust.hc.cutK];(.ml.clust.hc.fit[d2;`edist;`centroid];4);1b;hcResCentroid] +passingTest[clusterIdxsDendro[.ml.clust.hc.cutK];(.ml.clust.hc.fit[d2;`edist;`complete];4);1b;hcResComplete] +passingTest[clusterIdxsDendro[.ml.clust.hc.cutK];(.ml.clust.hc.fit[d2;`mdist;`average];4);1b;hcResAverage] +passingTest[clusterIdxsDendro[.ml.clust.hc.cutK];(tab2;4);1b;pythonRes[tab2;4;`maxclust]] +passingTest[clusterIdxsDendro[.ml.clust.hc.cutK];(tab3;4);1b;pythonRes[tab3;4;`maxclust]] +passingTest[clusterIdxsDendro[.ml.clust.hc.cutK];(tab4;4);1b;pythonRes[tab4;4;`maxclust]] +passingTest[clusterIdxsDendro[.ml.clust.hc.cutDist];(tab1;.45);1b;pythonRes[tab1;.45;`distance]] +passingTest[clusterIdxsDendro[.ml.clust.hc.cutDist];(tab2;4);1b;pythonRes[tab2;34;`distance]] +passingTest[clusterIdxsDendro[.ml.clust.hc.cutDist];(tab3;500);1b;pythonRes[tab3;500;`distance]] +passingTest[clusterIdxsDendro[.ml.clust.hc.cutDist];(tab4;30);1b;pythonRes[tab4;30;`distance]] passingTest[qDendrogram[mat;.ml.clust.hc.fit];(d1;`e2dist;`single);1b;pythonDgram[d1;`single;`sqeuclidean]] passingTest[qDendrogram[mat;.ml.clust.hc.fit];(d1;`mdist;`complete);1b;pythonDgram[d1;`complete;`cityblock]] passingTest[qDendrogram[mat;.ml.clust.hc.fit];(d1;`edist;`centroid);1b;pythonDgram[d1;`centroid;`euclidean]] passingTest[qDendrogram[mat;.ml.clust.hc.fit];(d1;`mdist;`average);1b;pythonDgram[d1;`average;`cityblock]] -passingTest[qDgramDists[.ml.clust.hc.fit ];(d2;`e2dist;`single);1b;pyDgramDists] +passingTest[qDgramDists[.ml.clust.hc.fit];(d2;`e2dist;`single);1b;pyDgramDists] failingTest[.ml.clust.hc.fit;(821?`2;`e2dist;`ward);0b;"Dataset not suitable for clustering. 
Must be convertible to floats."] failingTest[.ml.clust.hc.fit;(d1;`mdist;`ward);0b;"ward must be used with e2dist"] failingTest[.ml.clust.hc.fit;(d1;`mdist;`linkage);0b;"invalid linkage"] +// FitPredict +passingTest[clusterIdxsDendro[.ml.clust.hc.fitPredict];(d2;`e2dist;`single;enlist[`k]!enlist 4);1b;hcResSingle] +passingTest[clusterIdxsDendro[.ml.clust.hc.fitPredict];(d2;`e2dist;`ward;enlist[`k]!enlist 4);1b;hcResWard] +passingTest[clusterIdxsDendro[.ml.clust.hc.fitPredict];(d2;`edist;`centroid;enlist[`k]!enlist 4);1b;hcResCentroid] + // Predict -passingTest[.ml.clust.hc.predict;(d1tts 1;.ml.clust.hc.cutk[.ml.clust.hc.fit[d1tts 0;`e2dist;`single];4]);0b;hcd1pred1] -passingTest[.ml.clust.hc.predict;(d1tts 1;.ml.clust.hc.cutk[.ml.clust.hc.fit[d1tts 0;`e2dist;`ward];4]);0b;hcd1pred2] -passingTest[.ml.clust.hc.predict;(d1tts 1;.ml.clust.hc.cutk[.ml.clust.hc.fit[d1tts 0;`edist;`centroid];4]);0b;hcd1pred3] -failingTest[.ml.clust.hc.predict;(2 10#20?5.;()!());0b;"Clusters must be contained within cfg - please run .ml.clust.hc.(cutk/cutdist)"] +passingTest[.ml.clust.hc.fit[d1tts 0;`e2dist;`single]`predict;(d1tts 1;enlist[`k]!enlist 4);0b;hcd1pred1] +passingTest[.ml.clust.hc.fit[d1tts 0;`e2dist;`ward]`predict;(d1tts 1;enlist[`k]!enlist 4);0b;hcd1pred2] +passingTest[.ml.clust.hc.fit[d1tts 0;`edist;`centroid]`predict;(d1tts 1;enlist[`k]!enlist 4);0b;hcd1pred3] + diff --git a/clust/tests/score.t b/clust/tests/score.t index 500d5616..7de0bf10 100644 --- a/clust/tests/score.t +++ b/clust/tests/score.t @@ -20,26 +20,26 @@ d1:flip(60#"F";",")0:`:clust/tests/data/ss5.csv d2:@[;`AnnualIncome`SpendingScore]("SSIII";(),",")0:`:clust/tests/data/Mall_Customers.csv // Expected Results -clt1:.ml.clust.hc.cutk[.ml.clust.hc.fit[d1;`edist;`single];4] -clt2:.ml.clust.hc.cutk[.ml.clust.hc.fit[d2;`e2dist;`ward];4] -clt3:.ml.clust.hc.cutk[.ml.clust.cure.fit[d2;`edist;20;0.2];4] +clt1:.ml.clust.hc.cutK[.ml.clust.hc.fit[d1;`edist;`single];4] 
+clt2:.ml.clust.hc.cutK[.ml.clust.hc.fit[d2;`e2dist;`ward];4] +clt3:.ml.clust.hc.cutK[.ml.clust.cure.fit[d2;`edist;20;0.2];4] rnd1:count[flip d1]?4 rnd2:count[flip d2]?4 // Dave Bouldin Score -passingTest[.ml.clust.daviesbouldin;(d1;clt1`clt);0b;pydb[flip d1;clt1`clt]`] -passingTest[.ml.clust.daviesbouldin;(d2;clt2`clt);0b;pydb[flip d2;clt2`clt]`] -passingTest[.ml.clust.daviesbouldin;(d2;clt3`clt);0b;pydb[flip d2;clt3`clt]`] +passingTest[.ml.clust.daviesBouldin;(d1;clt1`clust);0b;pydb[flip d1;clt1`clust]`] +passingTest[.ml.clust.daviesBouldin;(d2;clt2`clust);0b;pydb[flip d2;clt2`clust]`] +passingTest[.ml.clust.daviesBouldin;(d2;clt3`clust);0b;pydb[flip d2;clt3`clust]`] // Silhouette Score -passingTest[.ml.clust.silhouette;(d1;`edist;clt1`clt;1b);0b;pysil[flip d1;clt1`clt]`] -passingTest[.ml.clust.silhouette;(d2;`edist;clt2`clt;1b);0b;pysil[flip d2;clt2`clt]`] -passingTest[.ml.clust.silhouette;(d2;`edist;clt3`clt;1b);0b;pysil[flip d2;clt3`clt]`] +passingTest[.ml.clust.silhouette;(d1;`edist;clt1`clust;1b);0b;pysil[flip d1;clt1`clust]`] +passingTest[.ml.clust.silhouette;(d2;`edist;clt2`clust;1b);0b;pysil[flip d2;clt2`clust]`] +passingTest[.ml.clust.silhouette;(d2;`edist;clt3`clust;1b);0b;pysil[flip d2;clt3`clust]`] // Dunn Score -passingTest[applyScoring[.ml.clust.dunn;1 ];(d1;`e2dist;clt1`clt);1b;20] -passingTest[applyScoring[.ml.clust.dunn;100];(d2;`edist;clt2`clt);1b;13] -passingTest[applyScoring[.ml.clust.dunn;100];(d2;`mdist;clt3`clt);1b;10] +passingTest[applyScoring[.ml.clust.dunn;1 ];(d1;`e2dist;clt1`clust);1b;20] +passingTest[applyScoring[.ml.clust.dunn;100];(d2;`edist;clt2`clust);1b;13] +passingTest[applyScoring[.ml.clust.dunn;100];(d2;`mdist;clt3`clust);1b;10] // Elbow Scoring passingTest[applyScoring[.ml.clust.elbow;1];(d1;`e2dist;2);1b;enlist 548] @@ -48,7 +48,7 @@ passingTest[applyScoring[.ml.clust.elbow;1];(d2;`e2dist;2);1b;enlist 186363] failingTest[.ml.clust.elbow;(d2;`mdist;3);0b;"kmeans must be used with edist/e2dist"] // Homogeneity Score 
-passingTest[.ml.clust.homogeneity;(clt1`clt;rnd1);0b;hscore[rnd1;clt1`clt]`] -passingTest[.ml.clust.homogeneity;(clt2`clt;rnd2);0b;hscore[rnd2;clt2`clt]`] -passingTest[.ml.clust.homogeneity;(clt3`clt;rnd2);0b;hscore[rnd2;clt3`clt]`] +passingTest[.ml.clust.homogeneity;(clt1`clust;rnd1);0b;hscore[rnd1;clt1`clust]`] +passingTest[.ml.clust.homogeneity;(clt2`clust;rnd2);0b;hscore[rnd2;clt2`clust]`] +passingTest[.ml.clust.homogeneity;(clt3`clust;rnd2);0b;hscore[rnd2;clt3`clust]`] failingTest[.ml.clust.homogeneity;(100?0b;10?0b);0b;"pred and true must have equal lengths"] diff --git a/clust/tests/util.t b/clust/tests/util.t index 0b078fe7..96515d02 100644 --- a/clust/tests/util.t +++ b/clust/tests/util.t @@ -16,24 +16,24 @@ idxs1:til count d1 0 idxs2:til count d2 0 // K-D trees -tree:.ml.clust.kd.newtree[d1;1] -tree2:.ml.clust.kd.newtree[d2;2] +tree:.ml.clust.kd.newTree[d1;1] +tree2:.ml.clust.kd.newTree[d2;2] // Configurations -iter:`run`total`nochange!0 200 15 -info:.ml.clust.i.apinit[d1;`e2dist;max;idxs1] -info,:`emat`conv`iter!((count d1 0;iter`nochange)#0b;0b;iter) +iter:`run`total`noChange!0 200 15 +info:.ml.clust.i.apInit[d1;`e2dist;max;idxs1] +info,:`exemMat`conv`iter!((count d1 0;iter`noChange)#0b;0b;iter) // q Utilities specificRes :{(x . 
z)y} closestPoint:specificRes[.ml.clust.i.closest;`point] -newTreeRes :specificRes[.ml.clust.kd.newtree] +newTreeRes :specificRes[.ml.clust.kd.newTree] nnRes :specificRes[.ml.clust.kd.nn] // K-D Tree using C // Expected Results -kdKey:`leaf`left`self`parent`children`axis`midval`idxs +kdKey:`leaf`left`self`parent`children`axis`midVal`idxs kdRes1:kdKey!(1b;0b;3;1;0#0;0N;0n;enlist 1) kdRes2:kdKey!(1b;1b;2;1;0#0;0N;0n;enlist 0) kdRes3:kdKey!(1b;0b;3;1;0#0;0N;0n;1 3 4) @@ -45,7 +45,7 @@ passingTest[.ml.clust.i.closest;(d1;`e2dist;1 2;til 5);0b;`point`distance!(1;0)] passingTest[closestPoint ;(d2;`e2dist;3 6;reverse til 5);1b;2] passingTest[newTreeRes`left ;(d1;2);1b;010b] passingTest[newTreeRes`leaf ;(d1;2);1b;011b] -passingTest[newTreeRes`midval;(d1;2);1b;2 0n 0n] +passingTest[newTreeRes`midVal;(d1;2);1b;2 0n 0n] passingTest[newTreeRes`parent;(d1;2);1b;0N 0 0] passingTest[newTreeRes`idxs ;(d1;2);1b;(0#0;0 1;2 3 4)] passingTest[newTreeRes`axis ;(d1;2);1b;0 0N 0N] @@ -59,10 +59,10 @@ passingTest[nnRes`closestPoint;(tree2;d2;`edist;1 2 3;d1[;1]);1b;0] passingTest[nnRes`closestPoint;(tree2;d2;`edist;1 5 2;d1[;3]);1b;3] passingTest[nnRes`closestPoint`closestDist;(tree;d1;`mdist;1;7 9f);1b;(4;8f)] passingTest[nnRes`closestPoint`closestDist;(tree2;d2;`edist;0;d2[;2]);1b;(2;0f)] -passingTest[.ml.clust.kd.findleaf;(tree;d1[;1];tree 0);0b;kdRes1] -passingTest[.ml.clust.kd.findleaf;(tree;d2[;4];tree 2);0b;kdRes2] -passingTest[.ml.clust.kd.findleaf;(tree2;d2[;1];tree2 1);0b;kdRes3] -passingTest[.ml.clust.kd.findleaf;(tree2;d1[;0];tree2 2);0b;kdRes4] +passingTest[.ml.clust.kd.findLeaf;(tree;d1[;1];tree 0);0b;kdRes1] +passingTest[.ml.clust.kd.findLeaf;(tree;d2[;4];tree 2);0b;kdRes2] +passingTest[.ml.clust.kd.findLeaf;(tree2;d2[;1];tree2 1);0b;kdRes3] +passingTest[.ml.clust.kd.findLeaf;(tree2;d1[;0];tree2 2);0b;kdRes4] // K-D Tree using q @@ -80,17 +80,17 @@ passingTest[nnRes`closestPoint;(tree2;d2;`edist;1 2 3;d1[;1]);1b;0] passingTest[nnRes`closestPoint;(tree2;d2;`edist;1 5 
2;d1[;3]);1b;3] passingTest[nnRes`closestPoint`closestDist;(tree;d1;`mdist;1;7 9f);1b;(4;8f)] passingTest[nnRes`closestPoint`closestDist;(tree2;d2;`edist;0;d2[;2]);1b;(2;0f)] -passingTest[.ml.clust.kd.findleaf;(tree;d1[;1];tree 0);0b;kdRes5] -passingTest[.ml.clust.kd.findleaf;(tree;d2[;4];tree 2);0b;kdRes6] -passingTest[.ml.clust.kd.findleaf;(tree2;d2[;1];tree2 1);0b;kdRes7] -passingTest[.ml.clust.kd.findleaf;(tree2;d1[;0];tree2 2);0b;kdRes8] +passingTest[.ml.clust.kd.findLeaf;(tree;d1[;1];tree 0);0b;kdRes5] +passingTest[.ml.clust.kd.findLeaf;(tree;d2[;4];tree 2);0b;kdRes6] +passingTest[.ml.clust.kd.findLeaf;(tree2;d2[;1];tree2 1);0b;kdRes7] +passingTest[.ml.clust.kd.findLeaf;(tree2;d1[;0];tree2 2);0b;kdRes8] // K-Means -passingTest[.ml.clust.i.getclust;(d2;`e2dist;flip d2[;1 2]);0b;1 0 1 0 0 0 0 0 0 0] -passingTest[.ml.clust.i.getclust;(d2;`e2dist;flip d2[;1 2 3]);0b;1 0 1 2 2 2 2 2 2 2] -passingTest[.ml.clust.i.getclust;(d1;`e2dist;flip d1[;2 3]);0b;0 1 0 1 0] -passingTest[.ml.clust.i.getclust;(d1;`edist;flip d1[;3 4]);0b;0 0 1 0 1] +passingTest[.ml.clust.i.getClust;(d2;`e2dist;flip d2[;1 2]);0b;1 0 1 0 0 0 0 0 0 0] +passingTest[.ml.clust.i.getClust;(d2;`e2dist;flip d2[;1 2 3]);0b;1 0 1 2 2 2 2 2 2 2] +passingTest[.ml.clust.i.getClust;(d1;`e2dist;flip d1[;2 3]);0b;0 1 0 1 0] +passingTest[.ml.clust.i.getClust;(d1;`edist;flip d1[;3 4]);0b;0 0 1 0 1] // DBSCAN @@ -107,9 +107,9 @@ a01:"f"$(3.24 0 0 0 0;0 0 0 0 0;0 0 3.24 0 0;0 0 0 0 0;0 0 0 0 0) AP1:(0 -12 -7.2 -3.2 0;-12 3.2 -5.6 -9.6 -3.2;-7.2 -5.6 0 0 -9.6;-3.2 -9.6 3.2 0 -5.6;3.2 -3.2 -9.6 -5.6 0) AP2:(0 -13.5 -8.1 -3.6 0;-13.5 3.6 -6.3 -10.8 -3.6;-8.1 -6.3 0 0 -10.8;-3.6 -10.8 3.6 0 -6.3;3.6 -3.6 -10.8 -6.3 0) -passingTest[specificRes[.ml.clust.i.apinit;`s`a`r`matches];(d1;`e2dist;min;idxs1);1b;(d1S;5 5#0f;5 5#0f;0)] -passingTest[specificRes[.ml.clust.i.apalgo;`exemplars`s`a];(.1;info);1b;(0 1 2 2 0;s01;a01)] -passingTest[.ml.clust.i.updr;(.2;info);0b;AP1] -passingTest[.ml.clust.i.updr;(.1;info);0b;AP2] 
-passingTest[.ml.clust.i.upda;(.5;info);0b;5 5#0f] -passingTest[.ml.clust.i.upda;(.9;info);0b;5 5#0f] +passingTest[specificRes[.ml.clust.i.apInit;`similar`avail`r`matches];(d1;`e2dist;min;idxs1);1b;(d1S;5 5#0f;5 5#0f;0)] +passingTest[specificRes[.ml.clust.i.apAlgo;`exemplars`similar`avail];(.1;info);1b;(0 1 2 2 0;s01;a01)] +passingTest[.ml.clust.i.updR;(.2;info);0b;AP1] +passingTest[.ml.clust.i.updR;(.1;info);0b;AP2] +passingTest[.ml.clust.i.updAvail;(.5;info);0b;5 5#0f] +passingTest[.ml.clust.i.updAvail;(.9;info);0b;5 5#0f] diff --git a/clust/util.q b/clust/util.q deleted file mode 100644 index cc2cc552..00000000 --- a/clust/util.q +++ /dev/null @@ -1,106 +0,0 @@ -\d .ml - -// Clustering Utilities - -// Distance metric dictionary - -// @kind function -// @category private -// @fileoverview Euclidean distance calculation -// @param data {float[][]} Points -// @return {float[]} Euclidean distances for data -clust.i.df.edist:{[data] - sqrt data wsum data - } - -// @kind function -// @category private -// @fileoverview distance calculation -// @param data {float[][]} Points -// @return {float[]} Euclidean squared distances for data -clust.i.df.e2dist:{[data] - data wsum data - } - -// @kind function -// @category private -// @fileoverview Manhattan distance calculation -// @param data {float[][]} Points -// @return {float[]} Manhattan distances for data -clust.i.df.mdist:{[data] - sum abs data - } - -// @kind function -// @category private -// @fileoverview Chebyshev distance calculation -// @param data {float[][]} Points -// @return {float[]} Chebyshev distances for data -clust.i.df.cshev:{[data] - min abs data - } - -// @kind function -// @category private -// @fileoverview Negative euclidean squared distance calculation -// @param data {float[][]} Points -// @return {float[]} Negative euclidean squared distances for data -clust.i.df.nege2dist:{[data] - neg data wsum data - } - -// @kind dictionary -// @category private -// @fileoverview Linkage dictionary 
-clust.i.lf.single:min -clust.i.lf.complete:max -clust.i.lf.average:avg -clust.i.lf.centroid:raze -clust.i.lf.ward:{z*x*y%x+y} - -// Distance calculations - -// @kind function -// @category private -// @param data {float[][]} Points in `value flip` format -// @param df {fn} Distance function -// @param pt {float[]} Current point -// @param idxs {long[]} Indices from data -// @return {float[]} Distances for data and pt -clust.i.dists:{[data;df;pt;idxs] - clust.i.df[df]pt-data[;idxs] - } - -// @kind function -// @category private -// @param data {float[][]} Points in `value flip` format -// @param df {fn} Distance function -// @param pt {float[]} Current point -// @param idxs {long[]} Indices from data -// @return {float[]} Distances for data and pt -clust.i.closest:{[data;df;pt;idxs] - `point`distance!(idxs dists?md;md:min dists:clust.i.dists[data;df;pt;idxs]) - } - -// @kind function -// @category private -// @fileoverview Reindex exemplars -// @param data {#any[]} Data points -// @return {long[]} List of indices -clust.i.reindex:{[data] - distinct[data]?data - } - -clust.i.floatConversion:{[data] - @[{"f"$x};data;{'"Dataset not suitable for clustering. 
Must be convertible to floats."}] - } - -// @kind dictionary -// @category private -// @fileoverview Error dictionary -clust.i.err.df:{'`$"invalid distance metric"} -clust.i.err.lf:{'`$"invalid linkage"} -clust.i.err.ward:{'`$"ward must be used with e2dist"} -clust.i.err.centroid:{'`$"centroid must be used with edist/e2dist"} -clust.i.err.kmeans:{'`$"kmeans must be used with edist/e2dist"} -clust.i.err.ap:{'`$"AP must be used with nege2dist"} diff --git a/clust/utils.q b/clust/utils.q new file mode 100644 index 00000000..aebc6e20 --- /dev/null +++ b/clust/utils.q @@ -0,0 +1,1327 @@ +// clust/utils.q - Clustering Utilities +// Copyright (c) 2021 Kx Systems Inc +// +// Collection of utility functions for +// implementation of clustering algos + +\d .ml + + +// Distance metric dictionary + +// @private +// @kind function +// @category clustUtility +// @desc Euclidean distance calculation +// @param data {float[][]} Points +// @return {float[]} Euclidean distances for data +clust.i.df.edist:{[data] + sqrt data wsum data + } + +// @private +// @kind function +// @category clustUtility +// @desc Euclidean squared distance calculation +// @param data {float[][]} Points +// @return {float[]} Euclidean squared distances for data +clust.i.df.e2dist:{[data] + data wsum data + } + +// @private +// @kind function +// @category clustUtility +// @desc Manhattan distance calculation +// @param data {float[][]} Points +// @return {float[]} Manhattan distances for data +clust.i.df.mdist:{[data] + sum abs data + } + +// @private +// @kind function +// @category clustUtility +// @desc Chebyshev distance calculation (largest absolute difference) +// @param data {float[][]} Points +// @return {float[]} Chebyshev distances for data +clust.i.df.cshev:{[data] + max abs data + } + +// @private +// @kind function +// @category clustUtility +// @desc Negative euclidean squared distance calculation +// @param data {float[][]} Points +// @return {float[]} Negative euclidean squared distances for data +clust.i.df.nege2dist:{[data] + neg data
wsum data + } + +// @private +// @kind dictionary +// @category clustUtility +// @desc Linkage dictionary +// @type dictionary +clust.i.lf.single:min +clust.i.lf.complete:max +clust.i.lf.average:avg +clust.i.lf.centroid:raze +clust.i.lf.ward:{z*x*y%x+y} + +// Distance calculations + +// @private +// @kind function +// @category clustUtility +// @param data {float[][]} Points in `value flip` format +// @param df {symbol} Distance function name within '.ml.clust.i.df' +// @param pt {float[]} Current point +// @param idxs {long[]} Indices from data +// @return {float[]} Distances for data and pt +clust.i.dists:{[data;df;pt;idxs] + clust.i.df[df]pt-data[;idxs] + } + +// @private +// @kind function +// @category clustUtility +// @desc Get the closest point and distance from the current point +// @param data {float[][]} Points in `value flip` format +// @param df {symbol} Distance function name within '.ml.clust.i.df' +// @param pt {float[]} Current point +// @param idxs {long[]} Indices from data +// @return {dictionary} Index of the closest point and the distance between +// that point and current point +clust.i.closest:{[data;df;pt;idxs] + dists:clust.i.dists[data;df;pt;idxs]; + minIdx:idxs dists?minDist:min dists; + `point`distance!(minIdx;minDist) + } + +// @private +// @kind function +// @desc Reindex the data +// @category clustUtility +// @desc Reindex exemplars +// @param data {any[]} Data points +// @return {long[]} List of indices +clust.i.reIndex:{[data] + distinct[data]?data + } + +// @private +// @kind function +// @category clustUtility +// @desc Convert data to floating value +// @param data {any[]} Data points +// @return {err|float[]} Data converted to floating point values or +// error if not possible +clust.i.floatConversion:{[data] + @[{"f"$x};data;{'"Dataset not suitable for clustering. 
", + "Must be convertible to floats."}] + } + +// @private +// @kind dictionary +// @category clustUtility +// @desc Error dictionary +// @type dictionary +clust.i.err.df:{'`$"invalid distance metric"} +clust.i.err.lf:{'`$"invalid linkage"} +clust.i.err.ward:{'`$"ward must be used with e2dist"} +clust.i.err.centroid:{'`$"centroid must be used with edist/e2dist"} +clust.i.err.kMeans:{'`$"kmeans must be used with edist/e2dist"} +clust.i.err.ap:{'`$"AP must be used with nege2dist"} + +// Hierarchical Utilities + +// @private +// @kind function +// @category clustUtility +// @desc Check validity of inputs for cutting dendrograms +// at position K when using .ml.clust.cutK >1 +// @param cutK {int} The user provided number of clusters to be +// retrieved when cutting the dendrogram +// @return {::|err} Returns nothing on successful invocation, will error +// if a user provides an unsupported value +clust.i.checkK:{[cutK] + if[cutK<=1;'"Number of requested clusters must be > 1."]; + } + +// @private +// @kind function +// @category clustUtility +// @desc Check validity of inputs for cutting dendrograms +// at a distance.
In order to be valid this must be > 0 +// @param cutDist {float} The user provided cutting distance for +// the dendrogram +// @return {::|err} Returns nothing on successful invocation, will error +// if a user provides an unsupported value +clust.i.checkDist:{[cutDist] + if[cutDist<=0;'"Cutting distance must be > 0."]; + } + +// @private +// @kind function +// @category clustUtility +// @desc Prepare the config for prediction functionality +// @param config {dictionary} Clustering information returned from `fit` +// @param cutDict {dictionary} The key defines what cutting algo to use when +// splitting the data into clusters (`k/`dist) and the value defines the +// cutting threshold +// @return {dictionary} `data`df`n`c`clust returned from +// .ml.clust.(cutK/cutDist) +clust.i.prepPred:{[config;cutDict] + cutType:first key cutDict; + if[not cutType in`k`dist;'"Cutting distance has to be 'k' or 'dist'"]; + $[cutType=`k; + clust.cure.cutK; + clust.cure.cutDist + ][config;first value cutDict] + } + + +// @private +// @kind function +// @category clustUtility +// @desc Complete, Average, Ward (CAW) Linkage +// @param data {float[][]} Each column of the data is an individual datapoint +// @param df {symbol} Distance function name within '.ml.clust.i.df' +// @param lf {symbol} Linkage function name within '.ml.clust.i.lf' +// @param k {long} Number of clusters +// @param dgram {boolean} Generate dendrogram or not (1b/0b) +// @return {table|long[]} Dendrogram or list of clusters +clust.i.hcCAW:{[data;df;lf;k;dgram] + // Check distance function for ward + if[(not df~`e2dist)&lf=`ward;clust.i.err.ward[]]; + // Create initial cluster table + t0:clust.i.initCAW[data;df]; + // Create linkage matrix + m:([]idx1:`int$();idx2:`int$();dist:`float$();n:`int$()); + // Merge clusters based on chosen algorithm + r:{[k;r]ki+1; + clustIdx:where[clusts=clusts i]except i; + clusts[clustIdx]:1+max clusts;i+:1 + ]; + // Update dendrogram with new indices + ![dgram;();0b;`idx1`idx2!n cut
clusts] + } + +// @private +// @kind function +// @category clustUtility +// @desc Convert dendrogram table to clusters +// @param tab {table} Dendrogram table +// @param k {long} Define splitting value in dendrogram table +// @return {long[]} List of clusters +clust.i.cutDgram:{[tab;k] + if[k=0; + '"User provided input encapsultes all datapoints, please ", + "increase `k or reduce `cut to an appropriate value." + ]; + // Get index of cluster made at cutting point k + idx:(2*cntTab:count tab)-k-1; + // Exclude any clusters made after point k + i:raze neg[k]#'allClusts:tab`idx1`idx2; + exClust:i where idx>i; + // Extract indices within clusters made until k, excluding any outliers + outliers:exClust where exClust<=cntTab; + cutOff:exClust except outliers; + clust:{last{count x 0}clust.i.extractClust[x;y]/(z;())} + [allClusts;cntTab+1]each cutOff; + // Update points to the cluster they belong to + @[;;:;]/[(1+cntTab)#0N;clust,enlist each outliers;til k+1] + } + +// @private +// @kind function +// @category clustUtility +// @desc Extract points within merged cluster +// @param clusts {long[]} Cluster indices +// @param cntTab {long} Count of dendrogram table +// @param idxs {long[]} Index in list to search and indices points found within +// that cluster +// @return {long[]} Next index to search, and additional points found +// within cluster +clust.i.extractClust:{[clusts;cntTab;idxs] + // Extract the points that were merged at this point + mrgClust:raze clusts[;idxs[0]-cntTab]; + // Store any single clusts, break down clusts more than single point + nextIdx:mrgClust>=cntTab; + otherIdxs:idxs[1],mrgClust where not nextIdx; + (mrgClust where nextIdx;otherIdxs) + } + +// @private +// @kind function +// @category clustUtility +// @desc SCC algo +// @param data {float[][]} Each column of the data is an individual datapoint +// @param df {symbol} Distance function name within '.ml.clust.i.df' +// @param lf {symbol} Linkage function name within '.ml.clust.i.lf' +// @param 
params {dictionary} Parameters - k (no. clusts), +// n (no. repPts per clust), repPts, kdTree +// @param clustTab {table} Cluster table +// @param repPts {float[][]} Representative points and associated info +// @param kdTree {table} k-dimensional tree storing points and distances +// @param linkMatrix {float[][]} Linkage matrix +// @return {(dictionary|long[]|float[][]|table)} Parameters dict, clusters, +// representative points and kdTree tables +clust.i.algoSCC:{[data;df;lf;params;clustTab;repPts;kdTree;linkMatrix] + // Merge closest clusters + clust0:exec clust{x?min x}closestDist from clustTab where valid; + newMerge:clustTab clust0,clust1:clustTab[clust0]`closestClust; + newMerge:update valid:10b,repPts:(raze repPts;0#0),points:(raze points;0#0) + from newMerge; + // Make dendrogram if required + if[linkMatrix 1; + matrix:linkMatrix 0; + merge0:first newMerge; + matrix,:newMerge[`clustIdx],merge0[`closestDist],count merge0`points; + linkMatrix[0]:matrix + ]; + // Keep track of old repPts + oldRep:repPts newMerge[0]`repPts; + // Find reps in new cluster + $[single:lf~`single; + // For single new reps=old reps -> no new points calculated + newRep:select repPt,clust:clust0 from oldRep; + // Generate new representative points table + // (centroid -> reps=avg; cure -> calc reps) + [newRepFunc:$[lf~`centroid; + clust.i.centRep; + clust.i.cureRep[df;params`n;params`c] + ]; + newRepKeys:params`repCols; + newRepVals:flip newRepFunc data[;newMerge[0]`points]; + newRep:flip newRepKeys!newRepVals; + newRep:update clust:clust0,repPt:count[i]#newMerge[0]`repPts from newRep; + // New rep leaves + updLeaf:clust.kd.findLeaf[kdTree;;kdTree 0]each flip newRep params`repCols; + newRep[`leaf]:updLeaf`self; + newMerge[0;`repPts]:newRep`repPt; + // Delete old points from leaf and update new point to new rep leaf + kdTree:.[kdTree;(oldRep`leaf;`idxs);except;oldRep`repPt]; + kdTree:.[kdTree;(newRep`leaf;`idxs);union ;newRep`repPt] + ] + ]; + // Update clusters and repPts +
clustTab:@[clustTab;newMerge`clust;,;delete clust from newMerge]; + repPts:@[repPts;newRep`repPt;,;delete repPt from newRep]; + updRep:repPts newRep`repPt; + // Nearest neighbour to clust + if[single;updRep:select from updRep where closestClust in newMerge`clust]; + // Calculate and append to representative point table the nearest neighbours + // of columns containing representative points + updRepData:flip updRep params`repCols; + updRepDataNN:clust.kd.nn + [kdTree;repPts params`repCols;df;newMerge[0]`points] each updRepData; + updRep:updRep,'updRepDataNN; + updRep:update closestClust:repPts[closestPoint;`clust]from updRep; + if[single; + repPt:@[repPts;updRep`repPt;,;select closestDist,closestClust from updRep]; + updRep:repPt newRep`repPt + ]; + // Update nearest neighbour of new clust + updRep@:raze iMin updRep`closestDist; + clustTab:@[clustTab;updRep`clust;,;`closestDist`closestClust#updRep]; + $[single; + // Single - nearest neighbour=new clust + [clustTab:update closestClust:clust0 from clustTab where valid, + closestClust=clust1; + repPts:update closestClust:clust0 from repPts where closestClust=clust1 + ]; + // Else do nearest neighbour search + if[count updClusts:select from clustTab where valid,closestClust in + (clust0;clust1); + nnClust:clust.kd.nn[kdTree;repPts params`repCols;df]/:' + [updClusts`repPts;flip each repPts[updClusts`repPts]@\:params`repCols]; + updClusts:updClusts,'{x iMin x`closestDist}each nnClust; + updClusts[`closestClust]:repPts[updClusts`closestPoint]`clust; + clustTab:@[clustTab;updClusts`clust;,;select closestDist,closestClust + from updClusts] + ] + ]; + (params;clustTab;repPts;kdTree;linkMatrix) + } + + +// Kmeans utilities + +// @private +// @kind function +// @category clustUtility +// @desc K-Means algorithm +// @param data {float[][]} Each column of the data is an individual datapoint +// @param df {symbol} Distance function name within '.ml.clust.i.df' +// @param k {long} Number of clusters +// @param config {dictionary} 
Configuration information containing the maximum +// iterations `iter, initialisation type `init and threshold for smallest +// distance to move between the previous and new run `thresh +// @return {dictionary} Clusters or repPts depending on rep +clust.i.kMeans:{[data;df;k;config] + // Check distance function + if[not df in`e2dist`edist;clust.i.err.kMeans[]]; + // Initialize representative points + initRepPts:$[config`init; + clust.i.initKpp df; + clust.i.initRandom + ][data;k]; + // Run algo until maximum number of iterations reached or convergence + repPts0:`idx`repPts`notConv!(0;initRepPts;1b); + repPts1:clust.i.kMeansConverge[config] + clust.i.updCenters[data;df;config]/repPts0; + // Return representative points and clusters + clust:clust.i.getClust[data;df;repPts1`repPts]; + `repPts`clust!(repPts1`repPts;clust) + } + +// @private +// @kind function +// @category clustUtility +// @desc Check to see if cluster centers are stable or +// if the maximum number of iterations allowable have been reached +// @param config {dictionary} Configuration information containing the maximum +// iterations `iter, initialisation type `init and threshold for smallest +// distance to move between the previous and new run `thresh +// @param algoRun {dictionary} Information about the current run of the +// algorithm which can have an impact on early or on time stopping i.e. have +// the maximum number of iterations been exceeded or have the cluster centers +// not moved more than the threshold i.e. 
'stationary' +// @return {boolean} 1b to keep iterating; 0b once max iterations hit or converged +clust.i.kMeansConverge:{[config;algoRun] + check1:config[`iter]>algoRun`idx; + check2:algoRun`notConv; + check1&check2 + } + +// @private +// @kind function +// @category clustUtility +// @desc Update cluster centers +// @param data {float[][]} Each column of the data is an individual datapoint +// @param df {symbol} Distance function name within '.ml.clust.i.df' +// @param config {dictionary} Configuration information containing the maximum +// iterations `iter, initialisation type `init and threshold for smallest +// distance to move between the previous and new run `thresh +// @param repPts {float[][]|dictionary} Information relating to the +// representative points, in the case of fitting the model this is a +// dictionary containing the current iteration index and if the data has +// converged in addition to the representative points. In an individual +// update this is just the representative points for the k means centers.
+// @return {float[][]} Updated representative points +clust.i.updCenters:{[data;df;config;repPts] + // Projection used for calculation of representative points + repPtFunc:clust.i.newRepPts[data;df;]; + if[99h=type repPts; + repPts[`idx]+:1; + prevPoint:repPts`repPts; + repPts[`repPts]:repPtFunc repPts`repPts; + repPts[`notConv]:config[`thresh]@[;idx;:;0w]clust.i.df[df]data-data[;idx] + } + +// @private +// @kind function +// @category clustUtility +// @desc Run DBSCAN algorithm and update cluster of each point +// @param tab {table} Cluster info table +// @return {table} Updated cluster table with old clusters merged +clust.i.dbAlgo:{[tab] + nbIdxs:.ml.clust.i.nbhoodIdxs[tab]/[first where tab`corePoint]; + update cluster:0|1+max tab`cluster,corePoint:0b from tab where i in nbIdxs + } + +// @private +// @kind function +// @category clustUtility +// @desc Find indices in each point's neighborhood +// @param tab {table} Cluster info table +// @param idxs {long[]} Indices to search the neighborhood of +// @return {long[]} Indices in neighborhood +clust.i.nbhoodIdxs:{[tab;idxs] + nbh:exec nbhood from tab[distinct idxs,raze tab[idxs]`nbhood]where corePoint; + asc distinct idxs,raze nbh + } + +// Aprop utilities + +// @private +// @kind function +// @category clustUtility +// @desc Run affinity propagation algorithm +// @param data {float[][]} Each column of the data is an individual datapoint +// @param df {symbol} Distance function name within '.ml.clust.i.df' +// @param damp {float} Damping coefficient +// @param diag {fn} Function applied to the similarity matrix diagonal +// @param idxs {long[]} Indices to find distances for +// @param iter {dictionary} Max number of overall iterations and iterations +// without a change in clusters.
(::) can be passed in where the defaults +// of (`total`noChange!200 15) will be used +// @return {dictionary} Data, input variables, clusters and exemplars +clust.i.runAp:{[data;df;damp;diag;idxs;iter] + // Check negative euclidean distance has been given + if[df<>`nege2dist;clust.i.err.ap[]]; + // Calculate distances, availability and responsibility + info0:clust.i.apInit[data;df;diag;idxs]; + // Initialize exemplar matrix and convergence boolean + info0,:`exemMat`conv`iter!((count data 0;iter`noChange)#0b;0b;iter); + // Run ap algo until maximum number of iterations completed or convergence + info1:clust.i.apStop clust.i.apAlgo[damp]/info0; + // Return data, inputs, clusters and exemplars + inputs:`df`damp`diag`iter!(df;damp;diag;iter); + exemplars:info1`exemplars; + clust:$[info1`conv;clust.i.reIndex exemplars;count[data 0]#-1]; + `data`inputs`clust`exemplars!(data;inputs;clust;exemplars) + } + +// @private +// @kind function +// @category clustUtility +// @desc Initialize matrices +// @param data {float[][]} Each column of the data is an individual datapoint +// @param df {symbol} Distance function name within '.ml.clust.i.df' +// @param diag {fn} Function applied to the similarity matrix diagonal +// @param idxs {long[]} Point indices +// @return {dictionary} Similarity, availability and responsibility matrices +// and keys for matches and exemplars to be filled during further iterations +clust.i.apInit:{[data;df;diag;idxs] + // Calculate similarity matrix values + dists:clust.i.dists[data;df;data]each idxs; + // Update diagonal + dists:@[;;:;diag raze dists]'[dists;k:til n:count data 0]; + // Create lists/matrices of zeros for other variables + `matches`exemplars`similar`avail`r!(0;0#0;dists),(2;n;n)#0f + } + +// @private +// @kind function +// @category clustUtility +// @desc Run affinity propagation algorithm +// @param damp {float} Damping coefficient +// @param info {dictionary} Similarity, availability, responsibility, +// exemplars, matches, iter 
dictionary, no_conv boolean and iter dict +// @return {dictionary} Updated inputs +clust.i.apAlgo:{[damp;info] + // Update responsibility matrix + info[`r]:clust.i.updR[damp;info]; + // Update availability matrix + info[`avail]:clust.i.updAvail[damp;info]; + // Find new exemplars + ex:iMax each sum info`avail`r; + // Update `info` with new exemplars/matches + info:update exemplars:ex,matches:?[exemplars~ex;matches+1;0]from info; + // Update iter dictionary + .[clust.i.apConv info;(`iter;`run);+[1]] + } + +// @private +// @kind function +// @category clustUtility +// @desc Check affinity propagation algorithm for convergence +// @param info {dictionary} Similarity, availability, responsibility, +// exemplars, matches, iter dictionary, no_conv boolean and iter dict +// @return {dictionary} Updated info dictionary +clust.i.apConv:{[info] + // Iteration dictionary + iter:info`iter; + // Exemplar matrix + exemMat:info`exemMat; + // Existing exemplars + exemDiag:0sum(se=iter`noChange)+0=se:sum each exemMat; + conv:$[(iter[`total]=iter`run)|not[unConv]&sum[exemDiag]>0;1b;0b]]; + // Return updated info + info,`exemMat`conv!(exemMat;conv) + } + +// @private +// @kind function +// @category clustUtility +// @desc Retrieve diagonal from a square matrix +// @param matrix {any[][]} Square matrix +// @return {any[]} Matrix diagonal +clust.i.diag:{[matrix] + {x y}'[matrix;til count matrix] + } + +// @private +// @kind function +// @category clustUtility +// @desc Update responsibility matrix +// @param damp {float} Damping coefficient +// @param info {dictionary} Similarity, availability, responsibility, +// exemplars, matches, iter dictionary, no_conv boolean and iter dict +// @return {float[][]} Updated responsibility matrix +clust.i.updR:{[damp;info] + mx:clust.i.maxResp'[sum info`similar`avail;til count info`r]; + // Calculate new responsibility + (damp*info`r)+(1-damp)*info[`similar]-mx + } + +// @private +// @kind function +// @category clustUtility +// @desc Create matrix 
with every points max responsibility +// diagonal becomes -inf, current max becomes second max +// @param data {float[]} Sum of similarity and availability matrices +// @param i {long} Index of responsibility matrix +// @return {float[][]} Responsibility matrix +clust.i.maxResp:{[data;i] + maxData:max data; + maxI:data?maxData; + @[count[data]#maxData;maxI;:;]max@[data;i,maxI;:;-0w] + } + +// @private +// @kind function +// @category clustUtility +// @desc Update availability matrix +// @param damp {float} Damping coefficient +// @param info {dictionary} Similarity, availability, responsibility, +// exemplars, matches, iter dictionary, no_conv boolean and iter dict +// @return {float[][]} Returns updated availability matrix +clust.i.updAvail:{[damp;info] + // Sum values in positive availability matrix + resp:0|info`r; + k:til count info`avail; + sumR:sum@[;;:;0f]'[resp;k]; + // Create a matrix using the negative values produced by the availability sum + // + responsibility diagonal - positive availability values + avail:@[;;:;]'[0&(sumR+info[`r]@'k)-/:resp;k;sumR]; + // Calculate new availability + (damp*info`avail)+avail*1-damp + } + +// @private +// @kind function +// @category clustUtility +// @desc Stopping condition for affinity propagation algorithm +// @param info {dictionary} Similarity, availability, responsibility, exemplars, +// matches, iter dictionary, no_conv boolean and iter dict +// @return {boolean} Indicates whether to continue or stop running AP (1/0b) +clust.i.apStop:{[info] + (info[`iter;`total]>info[`iter]`run)¬ 1b~info`conv + } + +// @private +// @kind function +// @category clustUtility +// @desc Predict clusters using AP training exemplars +// @param centre {float[][]} Training cluster centres in matrix format, +// each column is an individual datapoint +// @param df {symbol} Distance function name within '.ml.clust.i.df' +// @param pt {float[]} Current data point +// @return {long[]} Predicted clusters +clust.i.apPredDist:{[centre;df;pt] + 
dists:clust.i.dists[centre;df;pt]each til count centre 0; + iMax dists + } + +// KD Tree utilities + +// @private +// @kind function +// @category kdtree +// @desc Create tree table where each row represents a node +// @param data {float[][]} Each column of the data is an individual datapoint +// @param leafSize {long} Points per leaf (<2*number of representatives) +// @param node {dictionary} Info for a given node in the tree +// @return {table} k-d tree table +clust.kd.i.tree:{[data;leafSize;node] + if[leafSize<=.5*count node`idxs; + xData:data[;node`idxs]; + varData:var each xData; + split:xData1250]`]; - `mean`variance`median`dev!(avg;var;med;dev)@\:a} -fresh.feat.agglintrend:{ - t:fresh.feat.lintrend each(max;min;var;avg)@/:\:y cut x; - (`$"_"sv'string cols[t]cross`max`min`var`avg)!raze value flip t} -fresh.feat.augfuller:{`teststat`pvalue`usedlag!3#"f"$@[{fresh.i.adfuller[x]`};x;0n]} -fresh.feat.autocorr:{$[y=0;1f;(avg(x-m)*xprev[y;x]-m:avg x)%var x]} -fresh.feat.binnedentropy:{neg sum p*log p:(count each group(y-1)&floor y*x%max x-:min x)%count x} -/ t-series non-linearity - Schreiber, T. and Schmitz, A. (1997). 
PHYSICAL REVIEW E, VOLUME 55, NUMBER 5 -fresh.feat.c3:{avg x*/xprev\:[-1 -2*y]x} -fresh.feat.changequant:{[x;ql;qh;isabs] - k:($[isabs;abs;]1_deltas x)where 1_&':[x within fresh.feat.quantile[x]ql,qh]; - `max`min`mean`variance`median`stdev!(max;min;avg;var;med;dev)@\:k} -/ time series complexity - http://www.cs.ucr.edu/~eamonn/Complexity-Invariant%20Distance%20Measure.pdf -fresh.feat.cidce:{sqrt k$k:"f"$1_deltas$[not y;x;0=s:dev x;:0.;(x-avg x)%s]} -fresh.feat.count:{count x} -fresh.feat.countabovemean:{sum x>avg x} -fresh.feat.countbelowmean:{sum xcount distinct x} -fresh.feat.hasdupmax:{1y*max[x]-min x} -fresh.feat.lastmax:{(last where x=max x)%count x} -fresh.feat.lastmin:{(last where x=min x)%count x} -fresh.feat.lintrend:{`rval`intercept`slope!0^(xk%sqrt vk*var x;avg[x]-b*avg k;b:(xk:x cov k)%vk:var k:til count x)} -fresh.feat.longstrikegtmean:{max 0,fresh.i.getlenseqwhere x>avg x} -fresh.feat.longstrikeltmean:{max 0,fresh.i.getlenseqwhere xy} -fresh.feat.numcwtpeaks:{count fresh.i.findpeak[x;1+til y]`} -fresh.feat.numpeaks:{sum all fresh.i.peakfind[x;y;]each 1+til y} -fresh.feat.partautocorrelation:{ - (`$"lag_",/:string 1+til y)!y#$[1>mx:y&count[x]-1;();1_fresh.i.pacf[x;`nlags pykw mx;`method pykw`ld]`],y#0n} -fresh.feat.perrecurtoalldata:{sum[1=y)&xy*dev x} -fresh.feat.ratiovalnumtserieslength:{count[distinct x]%count x} -fresh.feat.skewness:{n*sum[m*m*m:x-avg x]%(s*s*s:sdev x)*(n-1)*-2+n:count x} -fresh.feat.spktwelch:{fresh.i.welch[x;`nperseg pykw 256&count x][@;1][`]y} -fresh.feat.stddev:{dev x} -fresh.feat.sumrecurringdatapoint:{sum k*g k:where 1/:xprev\:[-1 1*z]x} +// @kind function +// @category fresh +// @desc Add hyperparameter values to .ml.fresh.params +fresh.loadparams"/fresh/hyperparameters.json"; -/ params -fresh.params:update pnum:{count 1_get[fresh.feat x]1}each f,pnames:count[i]#(),pvals:count[i]#()from([]f:1_key fresh.feat) -fresh.params:1!`pnum xasc update valid:pnum=count each pnames from fresh.params -fresh.loadparams:{ - pp:{(raze 
value@)each(!).("S=;")0:x}each(!).("S*";"|")0:x; - fresh.params[([]f:key pp);`pvals]:value each value pp:inter[key pp;exec f from fresh.params]#pp; - fresh.params[([]f:key pp);`pnames]:key each value pp; - fresh.params:update valid:pnum=count each pnames from fresh.params where f in key pp;} -fresh.loadparams hsym`$path,"/fresh/hyperparam.txt"; / default params +// @kind function +// @category fresh +// @desc Extract features using FRESH +// @param data {table} Input data +// @param idCol {symbol[]} ID column(s) name +// @param cols2Extract {symbol[]} Columns on which extracted features will +// be calculated (these columns must be numerical) +// @param params {table} Functions/parameters to be applied to cols2Extract. +// This should be a modified version of .ml.fresh.params +// @return {table} Table keyed by ID column and containing the features +// extracted from the subset of the data identified by the ID column. +fresh.createFeatures:{[data;idCol;cols2Extract;params] + param0:exec f from params where valid,pnum=0; + param1:exec f,pnames,pvals from params where valid,pnum>0; + allParams:(cross/)each param1`pvals; + calcs:param0,raze param1[`f]cross'param1[`pnames],'/:'allParams; + cols2Extract:$[n:"j"$abs system"s"; + $[n0; - calcs:p0,raze p1[`f]cross'p1[`pnames],'/:'(cross/)each p1`pvals; - calcs:(cnames:$[n:"j"$abs system"s";$[nsystem"s";mproc.init[abs system"s"]enlist".ml.loadfile`:fresh/init.q"]; +if[0>system"s";multiProc.init[abs system"s"]enlist".ml.loadfile`:fresh/init.q"]; diff --git a/fresh/feat.q b/fresh/feat.q new file mode 100644 index 00000000..367e31f6 --- /dev/null +++ b/fresh/feat.q @@ -0,0 +1,643 @@ +// fresh/feat.q - Features +// Copyright (c) 2021 Kx Systems Inc +// +// Features to be used in FRESH + +\d .ml + +// @kind function +// @category freshFeat +// @desc Calculate the absolute energy of data (sum of squares) +// @param data {number[]} Numerical data points +// @return {float} Sum of squares +fresh.feat.absEnergy:{[data] + data wsum 
data + } + +// @kind function +// @category freshFeat +// @desc Calculate the absolute sum of the differences between +// successive data points +// @param data {number[]} Numerical data points +// @return {float} Absolute sum of differences +fresh.feat.absSumChange:{[data] + sum abs 1_deltas data + } + +// @kind function +// @category freshFeat +// @desc Calculate the aggregation of an auto-correlation over all +// possible lags (1 - count[x]) +// @param data {number[]} Numerical data points +// @return {dictionary} Aggregation (mean, median, variance +// and standard deviation) of an auto-correlation +fresh.feat.aggAutoCorr:{[data] + n:count data; + autoCorrFunc:$[(abs[var data]<1e-10)|1=n; + 0; + 1_fresh.i.acf[data;`unbiased pykw 1b;`fft pykw n>1250]` + ]; + `mean`variance`median`dev!(avg;var;med;dev)@\:autoCorrFunc + } + +// @kind function +// @category freshFeat +// @desc Calculate a linear least-squares regression for aggregated +// values +// @param data {number[]} Numerical data points +// @param chunkLen {long} Size of chunk to apply +// @return {dictionary} Slope, intercept and rvalue for the series +// over aggregated max, min, variance or average for chunks of size chunklen +fresh.feat.aggLinTrend:{[data;chunkLen] + chunkData:chunkLen cut data; + stats:(max;min;var;avg)@/:\:chunkData; + trend:fresh.feat.linTrend each stats; + statCols:`$"_"sv'string cols[trend]cross`max`min`var`avg; + statCols!raze value flip trend + } + +// @kind function +// @category freshFeat +// @desc Hypothesis test to check for a unit root in series +// (Augmented Dickey Fuller tests) +// @param data {number[]} Numerical data points +// @return {dictionary} Test statistic, p-value and used lag +fresh.feat.augFuller:{[data] + `teststat`pvalue`usedlag!3#"f"$@[{fresh.i.adFuller[x]`};data;0n] + } + +// @kind function +// @category freshFeat +// @desc Apply auto-correlation over a user-specified lag +// @param data {number[]} Numerical data points +// @param lag {long} Lag to apply to 
data +// @return {float} Auto-correlation over specified lag +fresh.feat.autoCorr:{[data;lag] + mean:avg data; + $[lag=0;1f;(avg(data-mean)*xprev[lag;data]-mean)%var data] + } + +// @kind function +// @category freshFeat +// @desc Calculate entropy for data binned into n equi-distant bins +// @param data {number[]} Numerical data points +// @params numBins {long} Number of bins to apply to data +// @return {float} Entropy of the series binned into numBins equidistant bins +fresh.feat.binnedEntropy:{[data;numBins] + n:count data; + data-:min data; + p:(count each group(numBins-1)&floor numBins*data%max data)%n; + neg sum p*log p + } + +// @kind function +// @category freshFeat +// @desc Calculate non-linearity of a time series with lag applied +// @param data {number[]} Numerical data points +// @param lag {long} Lag to apply to data +// @return {float} Measure of the non-linearity of the series lagged by lag +// Time series non-linearity: Schreiber, T. and Schmitz, A. (1997). PHYSICAL +// REVIEW E, VOLUME 55, NUMBER 5 +fresh.feat.c3:{[data;lag] + avg data*/xprev\:[-1 -2*lag]data + } + +// @kind function +// @category freshFeat +// @desc Calculate aggregate value of successive changes within +// corridor +// @param data {number[]} Numerical data points +// @param lowerQuant {float} Lower quartile +// @param upperQuant {float} Upper quartile +// @param isAbs {boolean} Whether absolute values should be considered +// @return {dictionary} Aggregated value of successive changes within corridor +// specified by lower/upperQuant +fresh.feat.changeQuant:{[data;lowerQuant;upperQuant;isAbs] + quants:fresh.feat.quantile[data]lowerQuant,upperQuant; + k:($[isAbs;abs;]1_deltas data)where 1_&':[data within quants]; + statCols:`max`min`mean`variance`median`stdev; + statCols!(max;min;avg;var;med;dev)@\:k + } + +// @kind function +// @category freshFeat +// @desc Calculated complexity of time series based on peaks and +// troughs in the dataset +// @param data {number[]} Numerical 
data points +// @param isAbs {boolean} Whether absolute values should be considered +// @return {float} Measure of series complexity +// Time series complexity: +// http://www.cs.ucr.edu/~eamonn/Complexity-Invariant%20Distance%20Measure.pdf +fresh.feat.cidCe:{[data;isAbs] + comp:$[not isAbs; + data; + 0=s:dev data; + :0.; + (data-avg data)%s + ]; + sqrt k$k:"f"$1_deltas comp + } + +// @kind function +// @category freshFeat +// @desc Count of values in data +// @param data {number[]} Numerical data points +// @return {long} Number of values within the series +fresh.feat.count:{[data] + count data + } + +// @kind function +// @category freshFeat +// @desc Values greater than the average value +// @param data {number[]} Numerical data points +// @return {int} Number of values in series with a value greater than the mean +fresh.feat.countAboveMean:{[data] + sum data>avg data + } + +// @kind function +// @category freshFeat +// @desc Values less than the average value +// @param data {number[]} Numerical data points +// @return {int} Number of values in series with a value less than the mean +fresh.feat.countBelowMean:{[data] + sum datacount distinct data + } + +// @kind function +// @category freshFeat +// @desc Check for duplicate of maximum value within a series +// @param data {number[]} Numerical data points +// @return {boolean} Does data contain a duplicate of the maximum value +fresh.feat.hasDupMax:{[data] + 1ratio*max[data]-min data + } + +// @kind function +// @category freshFeat +// @desc Find the position of the last occurrence of the maximum value +// in the series relative to the series length +// @param data {number[]} Numerical data points +// @return {float} Last max relative to number of data points +fresh.feat.lastMax:{[data] + (last where data=max data)%count data + } + +// @kind function +// @category freshFeat +// @desc Find the position of the last occurrence of the minimum value +// in the series relative to the series length +// @param data 
{number[]} Numerical data points +// @return {float} Last min relative to number of data points +fresh.feat.lastMin:{[data] + (last where data=min data)%count data + } + +// @kind function +// @category freshFeat +// @desc Calculate the slope/intercept/r-value associated of a series +// @param data {number[]} Numerical data points +// @return {dictionary} Slope, intercept and r-value +fresh.feat.linTrend:{[data] + k:til count data; + slope:(xk:data cov k)%vk:var k; + intercept:avg[data]-slope*avg k; + rval:xk%sqrt vk*var data; + `rval`intercept`slope!0^(rval;intercept;slope) + } + +// @kind function +// @category freshFeat +// @desc Longest sequence of consecutive data points within the series with +// a value greater than the mean +// @param data {number[]} Numerical data points +// @return {boolean} Is longest subsequence greater than the mean +fresh.feat.longStrikeAboveMean:{[data] + max 0,fresh.i.getLenSeqWhere data>avg data + } + +// @kind function +// @category freshFeat +// @desc Longest sequence of consecutive data points within the series with +// a value lower than the mean +// @param data {number[]} Numerical data points +// @return {boolean} Is longest subsequence less than the mean +fresh.feat.longStrikeBelowMean:{[data] + max 0,fresh.i.getLenSeqWhere datacrossVal + } + +// @kind function +// @category freshFeat +// @desc Number of peaks in a series following data smoothing via +// application of a Ricker wavelet of defined width +// @param data {number[]} Numerical data points +// @param width {long} Width of wavelet +// @return {long} Number of peaks +fresh.feat.numCwtPeaks:{[data;width] + count fresh.i.findPeak[data;1+til width]` + } + +// @kind function +// @category freshFeat +// @desc Number of peaks in the series with a specified support +// @param data {number[]} Numerical data points +// @param support {long} Support of the peak +// @return {int} Number of peaks +fresh.feat.numPeaks:{[data;support] + sum all fresh.i.peakFind[data;support]each 
1+til support + } + +// @kind function +// @category freshFeat +// @desc Partial auto-correlation of a series with a specified lag +// @param data {number[]} Numerical data points +// @param lag {long} Lag to apply to data +// @return {dictionary} Partial auto-correlation +fresh.feat.partAutoCorrelation:{[data;lag] + corrKeys:`$"lag_",/:string 1+til lag; + corrVals:lag#$[1>mx:lag&count[data]-1; + (); + 1_fresh.i.pacf[data;`nlags pykw mx;`method pykw`ld]` + ],lag#0n; + corrKeys!corrVals + } + +// @kind function +// @category freshFeat +// @desc Ratio of the number of non-distinct values to the number of +// possible values +// @param data {number[]} Numerical data points +// @return {float} Calculated ratio +fresh.feat.perRecurToAllData:{[data] + g:count each group data; + sum[1=minVal)&datar*dev data + } + +// @kind function +// @category freshFeat +// @desc Ratio of the number of unique values to total number of values +// in a series +// @param data {number[]} Numerical data points +// @return {float} Calculated ratio +fresh.feat.ratioValNumToSeriesLength:{[data] + count[distinct data]%count data + } + +// @kind function +// @category freshFeat +// @desc Skew of a time series indicating asymmetry within the series +// @param data {number[]} Numerical data points +// @return {float} Skew of data +fresh.feat.skewness:{[data] + n:count data; + s:sdev data; + m:data-avg data; + n*sum[m*m*m]%(s*s*s)*(n-1)*-2+n + } + +// @kind function +// @category freshFeat +// @desc Calculate the cross power spectral density of a time series +// @param data {number[]} Numerical data points +// @param coeff {int} Frequency at which calculation is performed +// @return {float} Cross power spectral density of data at given coeff +fresh.feat.spktWelch:{[data;coeff] + fresh.i.welch[data;`nperseg pykw 256&count data][@;1][`]coeff + } + +// @kind function +// @category freshFeat +// @desc Standard deviation +// @param data {number[]} Numerical data points +// @return {float} Standard 
deviation of series +fresh.feat.stdDev:{[data] + dev data + } + +// @kind function +// @category freshFeat +// @desc Sum points that appear more than once in a series +// @param data {number[]} Numerical data points +// @return {number} Sum of all points present more than once +fresh.feat.sumRecurringDataPoint:{[data] + g:count each group data; + k:where 11 +// @return {boolean} Measure of symmetry +fresh.feat.symmetricLooking:{[data;ratio] + abs[avg[data]-med data]= min) & (x < max)) p)def< variance_larger_than_standard_deviation(x):return np.var(x) > np.std(x) p)def< number_cwt_peaks(x,n):return len(find_peaks_cwt(vector=x, widths=np.array(list(range(1, n + 1))), wavelet=ricker)) p)def< quantile_py(x, q):x = pd.Series(x);return pd.Series.quantile(x, q) -p)def< quantile_py(x, q):x = pd.Series(x);return pd.Series.quantile(x, q) p)def< value_count(x, value): if np.isnan(value): return np.isnan(x) diff --git a/fresh/utils.q b/fresh/utils.q new file mode 100644 index 00000000..85a1f87d --- /dev/null +++ b/fresh/utils.q @@ -0,0 +1,202 @@ +// fresh/utils.q - Utility functions +// Copyright (c) 2021 Kx Systems Inc +// +// Unitily functions used in the implimentation of FRESH + +\d .ml + +// Python imports +sci_ver :1.5<="F"$3#.p.import[`scipy][`:__version__]` +numpy :.p.import`numpy +pyStats :.p.import`scipy.stats +signal :.p.import`scipy.signal +stattools:.p.import`statsmodels.tsa.stattools + +// @private +// @kind function +// @category freshPythonUtility +// @desc Compute the one-dimensional +// discrete Fourier Transform for real input +fresh.i.rfft:numpy`:fft.rfft + +// @private +// @kind function +// @category freshPythonUtility +// @desc Return the real part of the complex argument +fresh.i.real:numpy`:real + +// @private +// @kind function +// @category freshPythonUtility +// @desc Return the angle of the complex argument +fresh.i.angle:numpy`:angle + +// @private +// @kind function +// @category freshPythonUtility +// @desc Return the imaginary part of the 
complex argument +fresh.i.imag:numpy`:imag + +// @private +// @kind function +// @category freshPythonUtility +// @desc Calculate the absolute value element-wise +fresh.i.abso:numpy`:abs + +// @private +// @kind function +// @category freshPythonUtility +// @desc Kolmogorov-Smirnov two-sided test statistic distribution +fresh.i.ksDistrib:pyStats[$[sci_ver;`:kstwo.sf;`:kstwobign.sf];<] + +// @private +// @kind function +// @category freshPythonUtility +// @desc Calculate Kendall’s tau, a correlation measure for +// ordinal data +fresh.i.kendallTau:pyStats`:kendalltau + +// @private +// @kind function +// @category freshPythonUtility +// @desc Perform a Fisher exact test on a 2x2 contingency table +fresh.i.fisherExact:pyStats`:fisher_exact + +// @private +// @kind function +// @category freshPythonUtility +// @desc Estimate power spectral density using Welch’s method +fresh.i.welch:signal`:welch + +// @private +// @kind function +// @category freshPythonUtility +// @desc Find peaks in a 1-D array with wavelet transformation +fresh.i.findPeak:signal`:find_peaks_cwt + +// @private +// @kind function +// @category freshPythonUtility +// @desc Calculate the autocorrelation function +fresh.i.acf:stattools`:acf + +// @private +// @kind function +// @category freshPythonUtility +// @desc Partial autocorrelation estimate +fresh.i.pacf:stattools`:pacf + +// @private +// @kind function +// @category freshPythonUtility +// @desc Augmented Dickey-Fuller unit root test +fresh.i.adFuller:stattools`:adfuller + +// Python features +fresh.i.pyFeat:`aggAutoCorr`augFuller`fftAggReg`fftCoeff`numCwtPeaks, + `partAutoCorrelation`spktWelch + +// Extract utilities + +// @private +// @kind function +// @category freshUtility +// @desc Create a mapping between the functions and columns on which +// they are to be applied +// @param map {symbol[][]} Two element list where first element is the +// columns to which functions are to be applied and the second element is +// the name of the 
function in the .ml.fresh.feat namespace to be applied +// @return {symbol[]} A mapping of the functions to be applied to each column +fresh.i.colMap:{[map] + updFunc:flip (` sv'`.ml.fresh.feat,'map[;1];map[;0]); + updFunc,'last@''2_'map + } + +// @private +// @kind function +// @category freshUtility +// @desc Returns the length of each sequence +// @param condition {boolean} Executed condition, e.g. data>avg data +// @return {long[]} Sequence length based on condition +fresh.i.getLenSeqWhere:{[condition] + idx:where differ condition; + (1_deltas idx,count condition)where condition idx + } + +// @private +// @kind function +// @category freshUtility +// @desc Find peaks within the data +// @param data {number[]} Numerical data points +// @param support {long} Support of the peak +// @param idx {long} Current index +// @return {boolean[]} 1 where peak exists +fresh.i.peakFind:{[data;support;idx] + neg[support]_support _min data>/:xprev\:[-1 1*idx]data + } + +// @private +// @kind function +// @category freshUtility +// @desc Expand results produced by FRESH +// @param results {table} Table of resulting features +// @param column {symbol} Column of interest +// @return {table} Expanded results table +fresh.i.expandResults:{[results;column] + t:(`$"_"sv'string column,'cols t)xcol t:results column; + ![results;();0b;enlist column],'t + } + +// Select utilities + +// @private +// @kind function +// @category freshUtility +// @desc Apply python function for Kendall’s tau +// @param target {number[]} Target vector +// @param feature {number[]} Feature table column +// @return {float} Kendall’s tau - Close to 1 shows strong agreement, close to +// -1 shows strong disagreement +fresh.i.kTau:{[target;feature] + fresh.i.kendallTau[<;target;feature]1 + } + +// @private +// @kind function +// @category freshUtility +// @desc Perform a Fisher exact test +// @param target {number[]} Target vector +// @param feature {number[]} Feature table column +// @return {float} Results of 
Fisher exact test +fresh.i.fisher:{[target;feature] + g:group@'target value group feature; + fresh.i.fisherExact[<;count@''@\:[g]distinct target]1 + } + +// @private +// @kind function +// @category freshUtility +// @desc Calculate the Kolmogorov-Smirnov two-sided test statistic +// distribution +// @param feature {number[]} Feature table column +// @param target {number[]} Target vector +// @return {float} Kolmogorov-Smirnov two-sided test statistic distribution +fresh.i.ks:{[feature;target] + d:asc each target group feature; + n:count each d; + k:max abs(-). value(1+d bin\:raze d)%n; + en:prd[n]%sum n; + fresh.i.ksDistrib .$[sci_ver;(k;ceiling en);enlist k*sqrt en] + } + +// @private +// @kind function +// @category freshUtility +// @desc Pass data correctly to .ml.fresh.i.ks allowing for projection +// in main function +// @param target {number[]} Target vector +// @param feature {number[]} Feature table column +// @return {float} Kolmogorov-Smirnov two-sided test statistic distribution +fresh.i.ksYX:{[target;feature] + fresh.i.ks[feature;target] + } diff --git a/graph/README.md b/graph/README.md index 16478d12..1a1362e6 100644 --- a/graph/README.md +++ b/graph/README.md @@ -31,6 +31,6 @@ Documentation is available on the [Graph](https://code.kx.com/q/ml/toolkit/graph ## Status -The graph-pipeline library is still in development and is available here as a beta release. Further functionality and improvements will be made to the library in the coming months. +The graph-pipeline library is still in development. Further functionality and improvements will be made to the library on an ongoing basis. If you have any issues, questions or suggestions, please write to ai@kx.com. diff --git a/graph/graph.q b/graph/graph.q index c212fb64..426df44b 100644 --- a/graph/graph.q +++ b/graph/graph.q @@ -1,10 +1,32 @@ +// graph/graph.q - Graph tools +// Copyright (c) 2021 Kx Systems Inc +// +// Create, update, and delete functionality for a graph. 
+ \d .ml +// @kind function +// @category graph +// @desc Generate an empty graph +// @return {dictionary} Structure required for the generation of a connected +// graph. This includes a key for information on the nodes present within the +// graph and edges outlining how the nodes within the graph are connected. createGraph:{[] - nodes:1!enlist`nodeId``function`inputs`outputs!(`;::;::;::;::); - edges:2!enlist`dstNode`dstName`srcNode`srcName`valid!(`;`;`;`;0b); - `nodes`edges!(nodes;edges)} + nodeKeys:`nodeId``function`inputs`outputs; + nodes:1!enlist nodeKeys!(`;::;::;::;::); + edgeKeys:`destNode`destName`sourceNode`sourceName`valid; + edges:2!enlist edgeKeys!(`;`;`;`;0b); + `nodes`edges!(nodes;edges) + } +// @kind function +// @category graph +// @desc Add a functional node to a graph +// @param graph {dictionary} Graph originally generated using .ml.createGraph +// @param nodeId {symbol} Denotes the name associated with the functional node +// @param node {fn} A functional node +// @return {dictionary} The graph with the new node added to the graph +// structure addNode:{[graph;nodeId;node] node,:(1#`)!1#(::); if[nodeId in exec nodeId from graph`nodes;'"invalid nodeId"]; @@ -15,72 +37,155 @@ addNode:{[graph;nodeId;node] if[-10h=type node`outputs; node[`outputs]:(1#`output)!enlist node`outputs; node[`function]:((1#`output)!enlist@)node[`function]::; - ]; + ]; if[99h<>type node`outputs;'"invalid outputs"]; graph:@[graph;`nodes;,;update nodeId from node]; - edges:flip`dstNode`dstName`srcNode`srcName`valid!(nodeId;key node`inputs;`;`;0b); + edgeKeys:`destNode`destName`sourceNode`sourceName`valid; + edges:flip edgeKeys!(nodeId;key node`inputs;`;`;0b); graph:@[graph;`edges;,;edges]; - graph} + graph + } +// @kind function +// @category graph +// @desc Update the contents of a functional node +// @param graph {dictionary} Graph originally generated using .ml.createGraph +// @param nodeId {symbol} Denotes the name of a functional node to be updated +// @param node 
{fn} A functional node +// @return {dictionary} The graph with the named functional node contents +// overwritten updNode:{[graph;nodeId;node] node,:(1#`)!1#(::); if[not nodeId in 1_exec nodeId from graph`nodes;'"invalid nodeId"]; if[count key[node]except``function`inputs`outputs;'"invalid node"]; - oldnode:graph[`nodes]nodeId; + oldNode:graph[`nodes]nodeId; if[`inputs in key node; if[(::)~node`inputs;node[`inputs]:(0#`)!""]; if[-10h=type node`inputs;node[`inputs]:(1#`input)!enlist node`inputs]; if[99h<>type node`inputs;'"invalid inputs"]; - inputEdges:select from graph[`edges]where dstNode=nodeId,dstName in key oldnode`inputs; + inputEdges:select from graph[`edges]where destNode=nodeId, + destName in key oldNode`inputs; graph:@[graph;`edges;key[inputEdges]_]; - inputEdges:flip[`dstNode`dstName!(nodeId;key node`inputs)]#inputEdges; + inputEdges:flip[`destNode`destName!(nodeId;key node`inputs)]#inputEdges; graph:@[graph;`edges;,;inputEdges]; - inputEdges:select from inputEdges where not null srcNode; - graph:{[graph;edge]connectEdge[graph]. edge`srcNode`srcName`dstNode`dstName}/[graph;0!inputEdges]; - ]; + inputEdges:select from inputEdges where not null sourceNode; + graph:i.connectGraph/[graph;0!inputEdges]; + ]; if[`outputs in key node; if[-10h=type node`outputs; - node[`outputs]:(1#`output)!enlist node`outputs; - ]; + node[`outputs]:(1#`output)!enlist node`outputs]; if[99h<>type node`outputs;'"invalid outputs"]; - outputEdges:select from graph[`edges]where srcNode=nodeId,srcName in key oldnode`outputs; + outputEdges:select from graph[`edges]where sourceNode=nodeId, + sourceName in key oldNode`outputs; graph:@[graph;`edges;key[outputEdges]_]; - outputEdges:select from outputEdges where srcName in key node`outputs; + outputEdges:select from outputEdges where sourceName in key node`outputs; graph:@[graph;`edges;,;outputEdges]; - outputEdges:select srcNode,srcName,dstNode,dstName from outputEdges; - graph:{[graph;edge]connectEdge[graph]. 
edge`srcNode`srcName`dstNode`dstName}/[graph;0!outputEdges]; - ]; + outputEdge:select sourceNode,sourceName,destNode,destName from outputEdges; + graph:i.connectGraph/[graph;0!outputEdge]; + ]; if[`function in key node; - if[(1#`output)~key graph[`nodes;nodeId]`outputs;node[`function]:((1#`output)!enlist@)node[`function]::]; - ]; + if[(1#`output)~key graph[`nodes;nodeId]`outputs; + node[`function]:((1#`output)!enlist@)node[`function]::]; + ]; graph:@[graph;`nodes;,;update nodeId from node]; - graph} + graph + } +// @kind function +// @category graph +// @desc Delete a named functional node +// @param graph {dictionary} Graph originally generated using .ml.createGraph +// @param nodeId {symbol} Denotes the name of a functional node to be deleted +// @return {dictionary} The graph with the named functional node removed delNode:{[graph;nodeId] if[not nodeId in 1_exec nodeId from graph`nodes;'"invalid nodeId"]; graph:@[graph;`nodes;_;nodeId]; - inputEdges:select from graph[`edges]where dstNode=nodeId; + inputEdges:select from graph[`edges]where destNode=nodeId; graph:@[graph;`edges;key[inputEdges]_]; - outputEdges:select from graph[`edges]where srcNode=nodeId; - graph:@[graph;`edges;,;update srcNode:`,srcName:`,valid:0b from outputEdges]; - graph} + outputEdges:select from graph[`edges]where sourceNode=nodeId; + graph:@[graph;`edges;,;update sourceNode:`,sourceName:`, + valid:0b from outputEdges]; + graph + } + +// @kind function +// @category graph +// @desc Add a configuration node to a graph +// @param graph {dictionary} Graph originally generated using .ml.createGraph +// @param nodeId {symbol} Denotes the name associated with the configuration +// node +// @param config {fn} Any configuration information to be supplied to other +// nodes in the graph +// @return {dictionary} A graph with the new configuration added to the +// graph structure +addCfg:{[graph;nodeId;config] + nodeKeys:``function`inputs`outputs; + 
addNode[graph;nodeId]nodeKeys!(::;@[;config];::;"!") + } -addCfg:{[graph;nodeId;cfg]addNode[graph;nodeId]``function`inputs`outputs!(::;@[;cfg];::;"!")} -updCfg:{[graph;nodeId;cfg]updNode[graph;nodeId](1#`function)!enlist cfg} +// @kind function +// @category graph +// @desc Update the contents of a configuration node +// @param graph {dictionary} Graph originally generated using .ml.createGraph +// @param nodeId {symbol} Denotes the name of a configuration node to be +// updated +// @param config {fn} Any configuration information to be supplied to other +// nodes in the graph +// @return {dictionary} The graph with the named configuration node contents +// overwritten +updCfg:{[graph;nodeId;config] + updNode[graph;nodeId](1#`function)!enlist config + } + +// @kind function +// @category graph +// @desc Delete a named configuration node +// @param graph {dictionary} Graph originally generated using .ml.createGraph +// @param nodeId {symbol} Denotes the name of a configuration node to be +// deleted +// @return {dictionary} The graph with the named configuration node removed delCfg:delNode -connectEdge:{[graph;srcNode;srcName;dstNode;dstName] - if[99h<>type srcOutputs:graph[`nodes;srcNode;`outputs];'"invalid srcNode"]; - if[99h<>type dstInputs:graph[`nodes;dstNode;`inputs];'"invalid dstNode"]; - if[not srcName in key srcOutputs;'"invalid srcName"]; - if[not dstName in key dstInputs;'"invalid dstName"]; - edge:(1#`valid)!1#srcOutputs[srcName]~dstInputs[dstName]; - graph:@[graph;`edges;,;update dstNode,dstName,srcNode,srcName from edge]; - graph} +// @kind function +// @category graph +// @desc Connect the output of one node to the input to another +// @param graph {dictionary} Graph originally generated using .ml.createGraph +// @param sourceNode {symbol} Denotes the name of a node in the graph which +// contains the relevant output +// @param sourceName {symbol} Denotes the name of the output to be connected to +// an associated input node +// @param destNode {symbol} 
Name of a node in the graph which contains the +// relevant input to be connected to +// @param destName {symbol} Name of the input which is connected to the output +// defined by sourceNode and sourceName +// @return {dictionary} The graph with the relevant connection made between the +// inputs and outputs of two nodes +connectEdge:{[graph;sourceNode;sourceName;destNode;destName] + srcOutputs:graph[`nodes;sourceNode;`outputs]; + dstInputs:graph[`nodes;destNode;`inputs]; + if[99h<>type srcOutputs;'"invalid sourceNode"]; + if[99h<>type dstInputs;'"invalid destNode"]; + if[not sourceName in key srcOutputs;'"invalid sourceName"]; + if[not destName in key dstInputs;'"invalid destName"]; + edge:(1#`valid)!1#srcOutputs[sourceName]~dstInputs[destName]; + graph:@[graph;`edges;,;update destNode,destName,sourceNode, + sourceName from edge]; + graph + } -disconnectEdge:{[graph;dstNode;dstName] - if[not(dstNode;dstName)in key graph`edges;'"invalid edge"]; +// @kind function +// @category graph +// @desc Disconnect an edge from the input of a node +// @param graph {dictionary} Graph originally generated using .ml.createGraph +// @param destNode {symbol} Name of the node containing the edge to be deleted +// @param destName {symbol} Name of the edge associated with a specific input +// to be disconnected +// @return {dictionary} The graph with the edge connected to the destination +// input removed from the graph. 
+disconnectEdge:{[graph;destNode;destName] + if[not(destNode;destName)in key graph`edges;'"invalid edge"]; edge:(1#`valid)!1#0b; - graph:@[graph;`edges;,;update dstNode,dstName,srcName:`,srcNode:` from edge]; - graph} - + graph:@[graph;`edges;,;update destNode,destName,sourceName:`, + sourceNode:` from edge]; + graph + } diff --git a/graph/init.q b/graph/init.q index 3f7568d9..e401db10 100644 --- a/graph/init.q +++ b/graph/init.q @@ -1,4 +1,14 @@ +// graph/init.q - Load graph library +// Copyright (c) 2021 Kx Systems Inc +// +// Graph and Pipeline is a structural framework for developing +// q/kdb+ solutions, based on a directed acyclic graph. + +.ml.loadfile`:graph/utils.q .ml.loadfile`:graph/graph.q .ml.loadfile`:graph/pipeline.q .ml.loadfile`:graph/modules/saving.q .ml.loadfile`:graph/modules/loading.q + +.ml.loadfile`:util/utils.q +.ml.i.deprecWarning`graph diff --git a/graph/modules/loading.q b/graph/modules/loading.q index b1528ef9..750adaea 100644 --- a/graph/modules/loading.q +++ b/graph/modules/loading.q @@ -1,27 +1,113 @@ -\d .ml - -i.loadfname:{[cfg] - file:hsym`$$[(not ""~cfg`directory)&`directory in key cfg;cfg`directory;"."],"/",cfg`fileName; +\d .ml + +// Utility Functions for loading data + +// @private +// @kind function +// @category loadingUtility +// @fileoverview Construct path to a data file +// @param config {dict} Any configuration information about the dataset being +// loaded in +// @return {str} Path to the data file +i.loadFileName:{[config] + file:hsym`$$[(not ""~config`directory)&`directory in key config; + config`directory; + "."],"/",config`fileName; if[()~key file;'"file does not exist"]; - file} + file + } + +// @private +// @kind function +// @category loadingUtility +// @fileoverview Load splayed table or binary file +// @param config {dict} Any configuration information about the dataset being +// loaded in +// @return {tab} Data obtained from splayed table or binary file +i.loadFunc.splay:i.loadFunc.binary:{[config] + get 
i.loadFileName config + } + +// @private +// @kind function +// @category loadingUtility +// @fileoverview Load data from csv file +// @param config {dict} Any configuration information about the dataset being +// loaded in +// @return {tab} Data obtained from csv +i.loadFunc.csv:{[config] + (config`schema;config`separator)0: i.loadFileName config + } -i.loadfunc.splay:i.loadfunc.binary:{[cfg]get i.loadfname cfg} -i.loadfunc.csv:{[cfg](cfg`schema;cfg`separator)0: i.loadfname cfg} -i.loadfunc.json:{[cfg].j.k first read0 i.loadfname cfg} -i.loadfunc.hdf5:{[cfg] +// @private +// @kind function +// @category loadingUtility +// @fileoverview Load data from json file +// @param config {dict} Any configuration information about the dataset being +// loaded in +// @return {tab} Data obtained from json file +i.loadFunc.json:{[config] + .j.k first read0 i.loadFileName config + } + +// @private +// @kind function +// @category loadingUtility +// @fileoverview Load data from HDF5 file +// @param config {dict} Any configuration information about the dataset being +// loaded in +// @return {tab} Data obtained from HDF5 file +i.loadFunc.hdf5:{[config] if[not`hdf5 in key`;@[system;"l hdf5.q";{'"unable to load hdf5 lib"}]]; - if[not .hdf5.ishdf5 fname:i.loadfname cfg;'"file is not an hdf5 file"]; - if[not .hdf5.isObject[fpath;cfg`dname];'"hdf5 dataset does not exist"]; - .hdf5.readData[fpath;cfg`dname]} -i.loadfunc.ipc:{[cfg] - h:@[hopen;cfg`port;{'"error opening connection"}]; - ret:@[h;cfg`select;{'"error executing query"}]; + if[not .hdf5.ishdf5 filePath:i.loadFileName config; + '"file is not an hdf5 file" + ]; + if[not .hdf5.isObject[filePath;config`dname];'"hdf5 dataset does not exist"]; + .hdf5.readData[fpath;config`dname] + } + +// @private +// @kind function +// @category loadingUtility +// @fileoverview Load data from ipc +// @param config {dict} Any configuration information about the dataset being +// loaded in +// @return {tab} Data obtained via IPC 
+i.loadFunc.ipc:{[config] + h:@[hopen;config`port;{'"error opening connection"}]; + ret:@[h;config`select;{'"error executing query"}]; @[hclose;h;{}]; - ret} -i.loadfunc.process:{[cfg]if[not `data in key cfg;'"Data to be used must be defined"];cfg[`data]} + ret + } + +// @private +// @kind function +// @category loadingUtility +// @fileoverview Load data from config dictionary +// @param config {dict} Any configuration information about the dataset being +// loaded in +// @return {dict} Data obtained from config dictionary +i.loadFunc.process:{[config] + if[not `data in key config;'"Data to be used must be defined"]; + config`data + } + +// @private +// @kind function +// @category loadingUtility +// @fileoverview Load data from a defined source +// @param config {dict} Any configuration information about the dataset being +// loaded in +// @return {dict} Data obtained from a defined source +i.loadDataset:{[config] + if[null func:i.loadFunc config`typ;'"dataset type not supported"]; + func config + } -i.loaddset:{[cfg] - if[null func:i.loadfunc cfg`typ;'"dataset type not supported"]; - func cfg} +// Loading functionality -loaddset:`function`inputs`outputs!(i.loaddset;"!";"+") +// @kind function +// @category loading +// @fileoverview Node to load data from a defined source +// @return {dict} Node in graph to be used for loading data +loadDataSet:`function`inputs`outputs!(i.loadDataset;"!";"+") diff --git a/graph/modules/saving.q b/graph/modules/saving.q index 56159792..2f7605bc 100644 --- a/graph/modules/saving.q +++ b/graph/modules/saving.q @@ -1,29 +1,112 @@ \d .ml -i.savefname:{[cfg] +// Utility Functions for saving data + +// @private +// @kind function +// @category savingUtility +// @fileoverview Construct path to location where data is to be saved +// @param config {dict} Any configuration information about the dataset being +// saved +// @return {str} Path to a file location +i.saveFileName:{[cfg] file:hsym`$$[`dir in key cfg;cfg`key;"."],"/",cfg fname; 
if[not ()~key file;'"file exists"]; file} -i.savedset.txt:{[cfg;dset]i.savefname[cfg]0:.h.tx[cfg`typ;dset];} -i.savedset[`csv`xml`xls]:i.savedset.txt -i.savedset.binary:{[cfg;dset]i.savefname[cfg]set dset;} -i.savedset.json:{[cfg;dset] - h:hopen i.savefname cfg; - h @[.j.j;dset;{'"error converting to json"}]; - hclose h;} -i.savedset.hdf5:{[cfg;dset] +// @private +// @kind function +// @category savingUtility +// @fileoverview Save data as a text file +// @param config {dict} Any configuration information about the dataset being +// saved +// @param data {tab} Data which is to be saved +// @return {null} Data is saved as a text file +i.saveFunc.txt:{[config;data] + i.saveFileName[config]0:.h.tx[config`typ;data]; + } + +// @private +// @kind function +// @category savingUtility +// @fileoverview Save data as a text file +// @param config {dict} Any configuration information about the dataset being +// saved +// @param data {tab} Data which is to be saved +// @return {null} Data is saved as a text file +i.saveFunc[`csv`xml`xls]:i.saveFunc.txt + +// @private +// @kind function +// @category savingUtility +// @fileoverview Save data as a binary file +// @param config {dict} Any configuration information about the dataset being +// saved +// @param data {tab} Data which is to be saved +// @return {null} Data is saved as a binary file +i.saveFunc.binary:{[config;data] + i.saveFileName[config]set data; + } + +// @private +// @kind function +// @category savingUtility +// @fileoverview Save data as a json file +// @param config {dict} Any configuration information about the dataset being +// saved +// @param data {tab} Data which is to be saved +// @return {null} Data is saved as a json file +i.saveFunc.json:{[config;data] + h:hopen i.saveFileName config; + h @[.j.j;data;{'"error converting to json"}]; + hclose h; + } + +// @private +// @kind function +// @category savingUtility +// @fileoverview Save data as a HDF5 file +// @param config {dict} Any configuration 
information about the dataset being +// saved +// @param data {tab} Data which is to be saved +// @return {null} Data is saved as a HDF5 file +i.saveFunc.hdf5:{[config;data] if[not`hdf5 in key`;@[system;"l hdf5.q";{'"unable to load hdf5 lib"}]]; - .hdf5.createFile fname:i.savefname cfg; - .hdf5.writeData[fname;cfg`dname;dset]; + .hdf5.createFile filePath:i.saveFilename config; + .hdf5.writeData[filePath;config`dname;data]; + } + +// @private +// @kind function +// @category savingUtility +// @fileoverview Save data as a splayed table +// @param config {dict} Any configuration information about the dataset being +// saved +// @param data {tab} Data which is to be saved +// @return {null} Data is saved as a splayed table +i.saveFunc.splay:{[config;data] + dataName:first` vs filePath:i.saveFileName config; + filePath:` sv filePath,`; + filePath set .Q.en[dataName]data; + } + +// @private +// @kind function +// @category savingUtility +// @fileoverview Save data in a defined format +// @param config {dict} Any configuration information about the dataset being +// saved +// @param data {tab} Data which is to be saved +// @return {null} Data is saved in the defined format +i.saveDataset:{[config;data] + if[null func:i.saveFunc cfg`typ;'"dataset type not supported"]; + func data } -i.savedset.splay:{[cfg;dset] - dname:first` vs fname:i.savefname cfg; - fname:` sv fname,`; - fname set .Q.en[dname]dset;} -i.savefunc:{[cfg;dset] - if[null func:i.savedset cfg`typ;'"dataset type not supported"]; - func dset} +// Saving functionality -savedset:`function`inputs`outputs!(i.savefunc;`cfg`dset!"!+";" ") +// @kind function +// @category saving +// @fileoverview Node to save data from a defined source +// @return {dict} Node in graph to be used for saving data +saveDataset:`function`inputs`outputs!(i.saveDataset;`cfg`dset!"!+";" ") diff --git a/graph/pipeline.q b/graph/pipeline.q index 4cea2893..70ecd458 100644 --- a/graph/pipeline.q +++ b/graph/pipeline.q @@ -1,51 +1,54 @@ +// 
graph/pipeline.q - Build and execute a pipeline +// Copyright (c) 2021 Kx Systems Inc +// +// Contains createPipeline and execPipeline for +// the creation and execution of pipelines. + \d .ml -// Execution of a pipeline will not default to enter q debug mode but should be possible to overwrite +// Execution of a pipeline will not default to enter q debug mode but should +// be possible to overwrite graphDebug:0b -updDebug:{[x]graphDebug::not graphDebug} +// @kind function +// @category pipeline +// @desc Update debugging mode +// @return {::} Debugging is updated +updDebug:{[] + graphDebug::not graphDebug + } + +// @kind function +// @category pipeline +// @desc Generate a execution pipeline based on a valid graph +// @param graph {dictionary} Graph originally generated by .ml.createGraph, +// which has all relevant input edges connected validly +// @return {dictionary} An optimal execution pipeline populated with all +// information required to allow its successful execution createPipeline:{[graph] if[not all exec 1_valid from graph`edges;'"disconnected edges"]; - outputs:ungroup select srcNode:nodeId,srcName:key each outputs from 1_graph`nodes; - endpoints:exec distinct srcNode from outputs except select srcNode,srcName from graph`edges; - optimalpath:distinct raze paths idesc count each paths:i.getOptimalPath[graph]each endpoints; - pipeline:([]nodeId:optimalpath)#graph`nodes; - nodeinputs:key each exec inputs from pipeline; - pipeline:update inputs:count[i]#enlist(1#`)!1#(::),outputtypes:outputs,inputorder:nodeinputs from pipeline; - pipeline:select nodeId,complete:0b,error:`,function,inputs,outputs:inputs,outputtypes,inputorder from pipeline; - pipeline:pipeline lj select outputmap:([]srcName;dstNode;dstName)by nodeId:srcNode from graph`edges; + outputs:ungroup select sourceNode:nodeId,sourceName:key each outputs + from 1_graph`nodes; + srcInfo:select sourceNode,sourceName from graph`edges; + endPoints:exec distinct sourceNode from outputs except srcInfo; + 
paths:i.getOptimalPath[graph]each endPoints; + optimalPath:distinct raze paths idesc count each paths; + pipeline:([]nodeId:optimalPath)#graph`nodes; + nodeInputs:key each exec inputs from pipeline; + pipeline:update inputs:count[i]#enlist(1#`)!1#(::),outputTypes:outputs, + inputOrder:nodeInputs from pipeline; + pipeline:select nodeId,complete:0b,error:`,function,inputs,outputs:inputs, + outputTypes,inputOrder from pipeline; + pipeline:pipeline lj select outputMap:([]sourceName;destNode;destName)by + nodeId:sourceNode from graph`edges; 1!pipeline} -execPipeline:{[pipeline]i.execCheck i.execNext/pipeline} - - -// Pipeline creation utilities -i.getDeps:{[graph;node]exec distinct srcNode from graph[`edges]where dstNode=node} -i.getAllDeps:{[graph;node]$[count depNodes:i.getDeps[graph]node;distinct node,raze .z.s[graph]each depNodes;node]} -i.getAllPaths:{[graph;node]$[count depNodes:i.getDeps[graph]node;node,/:raze .z.s[graph]each depNodes;raze node]} -i.getLongestPath:{[graph;node]paths first idesc count each paths:reverse each i.getAllPaths[graph;node]} -i.getOptimalPath:{[graph;node]distinct raze reverse each i.getAllDeps[graph]each i.getLongestPath[graph;node]} - -i.execNext:{[pipeline] - node:first 0!select from pipeline where not complete; - -1"Executing node: ",string node`nodeId; - if[not count inputs:node[`inputs]node[`inputorder];inputs:1#(::)]; - res:`complete`error`outputs!$[graphDebug; - .[(1b;`;)node[`function]::;inputs]; - .[(1b;`;)node[`function]::;inputs;{[err](0b;`$err;::)}] - ]; - / compare outputs to outputtypes ? 
- if[not null res`error;-2"Error: ",string res`error]; - if[res`complete; - res[`inputs]:(1#`)!1#(::); - outputmap:update data:res[`outputs]srcName from node`outputmap; - res[`outputs]:((1#`)!1#(::)),(exec distinct srcName from outputmap)_ res`outputs; - pipeline:{[pipeline;map]pipeline[map`dstNode;`inputs;map`dstName]:map`data;pipeline}/[pipeline;outputmap]; - ]; - pipeline,:update nodeId:node`nodeId from res; - pipeline} - -i.execCheck:{[pipeline] - if[any not null exec error from pipeline;:0b]; - if[all exec complete from pipeline;:0b]; - 1b} +// @kind function +// @category pipeline +// @desc Execute a generated pipeline +// @param pipeline {dictionary} Pipeline created by .ml.createPipeline +// @return {dictionary} The pipeline with each node executed and appropriate +// outputs populated. +execPipeline:{[pipeline] + i.execCheck i.execNext/pipeline + } diff --git a/graph/tests/graph.t b/graph/tests/graph.t index 51a03ba6..852f4398 100644 --- a/graph/tests/graph.t +++ b/graph/tests/graph.t @@ -4,6 +4,7 @@ \l p.q \l ml.q +\l graph/utils.q \l graph/graph.q \l graph/pipeline.q @@ -79,7 +80,7 @@ failingTest[.ml.updNode;(g;`node1;outputType);0b;"invalid outputs"] // Connect an invalid edge between 2 nodes and check that this is not valid g:.ml.connectEdge[g;`cfg1;`output;`node1;`input] -0b~first exec valid from g[`edges] where dstNode=`node1,dstName=`input +0b~first exec valid from g[`edges] where destNode=`node1,destName=`input g:.ml.disconnectEdge[g;`node1;`input] // Attempt to disconnect a node that doesn't exist @@ -89,16 +90,16 @@ failingTest[.ml.disconnectEdge;(g;`node;`input);0b;"invalid edge"] failingTest[.ml.disconnectEdge;(g;`node1;`test);0b;"invalid edge"] // Attempt to connect an edge with a non existent source node -failingTest[.ml.connectEdge;(g;`nocfg;`output;`node1;`input);0b;"invalid srcNode"] +failingTest[.ml.connectEdge;(g;`nocfg;`output;`node1;`input);0b;"invalid sourceNode"] // Attempt to connect an edge from an existent source node but non 
existent source name -failingTest[.ml.connectEdge;(g;`cfg1;`nosrcName;`node1;`input);0b;"invalid srcName"] +failingTest[.ml.connectEdge;(g;`cfg1;`nosourceName;`node1;`input);0b;"invalid sourceName"] // Attempt to connect an edge from an non existent destination node -failingTest[.ml.connectEdge;(g;`cfg1;`output;`nosrcnode;`input);0b;"invalid dstNode"] +failingTest[.ml.connectEdge;(g;`cfg1;`output;`nosrcnode;`input);0b;"invalid destNode"] // Attempt to connect an edge from an existent destination node but non existent destination name -failingTest[.ml.connectEdge;(g;`cfg1;`output;`node1;`noinput);0b;"invalid dstName"] +failingTest[.ml.connectEdge;(g;`cfg1;`output;`node1;`noinput);0b;"invalid destName"] -1"\nTesting delNode"; @@ -116,7 +117,7 @@ not `tempNode in exec nodeId from g[`nodes] // but function errors on execution (for pipeline testing) g:.ml.updNode[g;`node1]`function`inputs`outputs!({`e+1};"!";"!") g:.ml.connectEdge[g;`cfg1;`output;`node1;`input] -1b~first exec valid from g[`edges] where dstNode=`node1,dstName=`input +1b~first exec valid from g[`edges] where destNode=`node1,destName=`input -1"\nTesting failing pipeline execution without debug mode active"; diff --git a/graph/utils.q b/graph/utils.q new file mode 100644 index 00000000..ee3fc613 --- /dev/null +++ b/graph/utils.q @@ -0,0 +1,153 @@ +// graph/utils.q - Utility functions for graphs +// Copyright (c) 2021 Kx Systems Inc +// +// Utility functions for implementation of graph library + +\d .ml + +// Graphing creation utilities + +// @private +// @kind function +// @category pipelineUtility +// @desc Connect the output of one node to the input to another +// @param graph {dictionary} Graph originally generated by .ml.createGraph, +// which has all relevant input edges connected validly +// @param edge {dictionary} Contains information about the edge node +// @return {dictionary} The graph with the relevant connection made between the +// inputs and outputs of two nodes. 
+i.connectGraph:{[graph;edge] + edgeKeys:`sourceNode`sourceName`destNode`destName; + connectEdge[graph]. edge edgeKeys + } + +// Pipeline creation utilities + +// @private +// @kind function +// @category pipelineUtility +// @desc Extract the source of a specific node +// @param graph {dictionary} Graph originally generated by .ml.createGraph, +// which has all relevant input edges connected validly +// @param node {symbol} Name associated with the functional node +// @return {symbol} Source of the given node +i.getDeps:{[graph;node] + exec distinct sourceNode from graph[`edges]where destNode=node + } + +// @private +// @kind function +// @category pipelineUtility +// @desc Extract all dependent source nodes needed to run the node +// @param graph {dictionary} Graph originally generated by .ml.createGraph, +// which has all relevant input edges connected validly +// @param node {symbol} Denoting the name to be associated with the functional +// node +// @return {symbol[]} All sources required for the given node +i.getAllDeps:{[graph;node] + depNodes:i.getDeps[graph]node; + $[count depNodes; + distinct node,raze .z.s[graph]each depNodes; + node + ] + } + +// @private +// @kind function +// @category pipelineUtility +// @desc Extract all the paths needed to run the node +// @param graph {dictionary} Graph originally generated by .ml.createGraph, +// which has all relevant input edges connected validly +// @param node {symbol} Denoting the name to be associated with the functional +// node +// @return {symbol} All paths required for the given node +i.getAllPaths:{[graph;node] + depNodes:i.getDeps[graph]node; + $[count depNodes; + node,/:raze .z.s[graph]each depNodes; + raze node + ] + } + +// @private +// @kind function +// @category pipelineUtility +// @desc Get the longest path +// @param graph {dictionary} Graph originally generated by .ml.createGraph, +// which has all relevant input edges connected validly +// @param node {symbol} Denoting the name to be 
associated with the functional +// node +// @return {symbol} The longest path available +i.getLongestPath:{[graph;node] + paths:reverse each i.getAllPaths[graph;node]; + paths first idesc count each paths + } + +// @private +// @kind function +// @category pipelineUtility +// @desc Extract the optimal path to run the node +// @param graph {dictionary} Graph originally generated by .ml.createGraph, +// which has all relevant input edges connected validly +// @param node {symbol} Denoting the name to be associated with the functional +// node +// @return {symbol} The optimal path to run the node +i.getOptimalPath:{[graph;node] + longestPath:i.getLongestPath[graph;node]; + distinct raze reverse each i.getAllDeps[graph]each longestPath + } + +// @private +// @kind function +// @category pipelineUtility +// @desc Update input data information within the pipeline +// @param pipeline {dictionary} Pipeline created by .ml.createPipeline +// @param map {dictionary} Contains information needed to run the node +// @return {dictionary} Pipeline updated with input information +i.updateInputData:{[pipeline;map] + pipeline[map`destNode;`inputs;map`destName]:map`data; + pipeline + } + +// @private +// @kind function +// @category pipelineUtility +// @desc Execute the first non completed node in the pipeline +// @param pipeline {dictionary} Pipeline created by .ml.createPipeline +// @return {dictionary} Pipeline with executed node marked as complete +i.execNext:{[pipeline] + node:first 0!select from pipeline where not complete; + -1"Executing node: ",string node`nodeId; + inputs:node[`inputs]node`inputOrder; + if[not count inputs;inputs:1#(::)]; + resKeys:`complete`error`outputs; + resVals:$[graphDebug; + .[(1b;`;)node[`function]::;inputs]; + .[(1b;`;)node[`function]::;inputs;{[err](0b;`$err;::)}] + ]; + res:resKeys!resVals; + if[not null res`error;-2"Error: ",string res`error]; + if[res`complete; + res[`inputs]:(1#`)!1#(::); + outputMap:update data:res[`outputs]sourceName from 
node`outputMap; + uniqueSource:(exec distinct sourceName from outputMap)_ res`outputs; + res[`outputs]:((1#`)!1#(::)),uniqueSource; + pipeline:i.updateInputData/[pipeline;outputMap]; + ]; + pipeline,:update nodeId:node`nodeId from res; + pipeline + } + +// @private +// @kind function +// @category pipelineUtility +// @desc Check if any nodes are left to be executed or if any +// errors have occurred +// @param pipeline {dictionary} Pipeline created by .ml.createPipeline +// @return {boolean} Return 0b if all nodes have been completed or if any +// errors have occurred. Otherwise return 1b +i.execCheck:{[pipeline] + if[any not null exec error from pipeline;:0b]; + if[all exec complete from pipeline;:0b]; + 1b + } diff --git a/init.q b/init.q index 15341349..c17e0f22 100644 --- a/init.q +++ b/init.q @@ -1,7 +1,16 @@ -.ml.loadfile`:util/init.q -.ml.loadfile`:fresh/init.q -.ml.loadfile`:clust/init.q -.ml.loadfile`:xval/init.q -.ml.loadfile`:graph/init.q -.ml.loadfile`:optimize/init.q -.ml.loadfile`:timeseries/init.q +// init.q - Load ml libraries +// Copyright (c) 2021 Kx Systems Inc + +\d .ml + +path:{string`ml^`$@[{"/"sv -1_"/"vs ssr[;"\\";"/"](-3#get .z.s)0};`;""]}` +system"l ",path,"/","ml.q" + +loadfile`:util/init.q +loadfile`:stats/init.q +loadfile`:fresh/init.q +loadfile`:clust/init.q +loadfile`:xval/init.q +loadfile`:graph/init.q +loadfile`:optimize/init.q +loadfile`:timeseries/init.q diff --git a/ml.q b/ml.q index ac6e6816..a5272582 100644 --- a/ml.q +++ b/ml.q @@ -1,5 +1,25 @@ +// ml.q - Setup for ml namespace +// Copyright (c) 2021 Kx Systems Inc +// +// Define version, path, and loadfile + + \l p.q /embedPy \d .ml version:@[{TOOLKITVERSION};`;`development] path:{string`ml^`$@[{"/"sv -1_"/"vs ssr[;"\\";"/"](-3#get .z.s)0};`;""]}` loadfile:{$[.z.q;;-1]"Loading ",x:_[":"=x 0]x:$[10=type x;;string]x;system"l ",path,"/",x;} + +// The following functionality should be available for all initialized sections of the library + +// @private +// @kind function +// 
@category utility +// @fileoverview If set to `1b` deprecation warnings are ignored +i.ignoreWarning:0b + +// @private +// @kind function +// @category utilities +// @fileoverview Change ignoreWarnings +updateIgnoreWarning:{[]i.ignoreWarning::not i.ignoreWarning} diff --git a/optimize/README.md b/optimize/README.md new file mode 100644 index 00000000..1d2c79a5 --- /dev/null +++ b/optimize/README.md @@ -0,0 +1,35 @@ +# Numerical optimization + +The functionality contained within this folder provides a number of implementations of numerical optimization techniques. Such techniques are used to find the local or global minima of user-provided objective functions and are central to many statistical models. + +## Functionality + +At present, the optimization folder contains an implementation of the Broyden-Fletcher-Goldfarb-Shanno algorithm. + +The Broyden-Fletcher-Goldfarb-Shanno(BFGS) algorithm is a quasi-Newton iterative method for solving unconstrained non-linear optimization problems. This is a class of hill-climbing optimization that seeks a stationary, preferably twice-differentiable, solution to the objective function. + +## Requirements + +- kdb+ > 3.5 + +## Installation + +Place the `ml` library in `$QHOME` and load into a q instance using `ml/ml.q` + +### Load + +The following will load the optimization functionality into the `.ml` namespace +```q +q)\l ml/ml.q +q).ml.loadfile`:optimize/init.q +``` + +## Documentation + +Documentation is available on the [Optimization](https://code.kx.com/q/ml/toolkit/optimize/) homepage. + +## Status + +The optimization library is still in development. Further functionality and improvements will be made to the library on an ongoing basis. + +If you have any issues, questions or suggestions, please write to ai@kx.com. 
diff --git a/optimize/init.q b/optimize/init.q index bfca8622..8ca0a58c 100644 --- a/optimize/init.q +++ b/optimize/init.q @@ -1,3 +1,15 @@ +// optimize/init.q - Load optimize library +// Copyright (c) 2021 Kx Systems Inc +// +// The .ml.optimize namespace contains functions that relate to +// the application of numerical optimization techniques. Such +// techniques are used to find local or global minima of user-provided +// objective functions and are central to many statistical models. + \d .ml -loadfile`:util/util.q -loadfile`:optimize/optim.q +loadfile`:util/utils.q +loadfile`:util/utilities.q +loadfile`:optimize/utils.q +loadfile`:optimize/optimize.q + +.ml.i.deprecWarning`optimize diff --git a/optimize/optim.q b/optimize/optim.q deleted file mode 100644 index d832e790..00000000 --- a/optimize/optim.q +++ /dev/null @@ -1,659 +0,0 @@ -// Namespace appropriately -\d .ml - -// @kind function -// @category optimization -// @fileoverview Optimize a function using the -// Broyden-Fletcher-Goldfarb-Shanno (BFGS) algorithm. This implementation -// is based on https://github.com/scipy/scipy/blob/v1.5.0/scipy/optimize/optimize.py#L1058 -// and is a quasi-Newton hill-climbing optimization technique used to find -// a preferebly twice continuously differentiable stationary point of a function. -// An outline of the algorithm mathematically is provided here: -// https://en.wikipedia.org/wiki/Broyden-Fletcher-Goldfarb-Shanno_algorithm#Algorithm -// @param func {lambda} the function to be optimized. 
This function should take -// as its arguments a list/dictionary of parameters to be optimized and a list/dictionary -// of additional unchanging arguments -// @param x0 {num[]/dict} the first guess at the parameters to be optimized as -// a list or dictionary of numeric values -// @param args {list/dict/(::)} any unchanging parameters to required for evaluation -// of the function, these should be in the order that they are to be applied -// to the function -// @param params {dict} any modifications to be applied to the optimization procedure e.g. -// - display {bool} are the results at each optimization iteration to be printed -// - optimIter {integer} maximum number of iterations in optimization procedure -// - zoomIter {integer} maximum number of iterations when finding optimal zoom -// - wolfeIter {integer} maximum number of iterations in -// - norm {integer} order of norm (0W = max; -0W = min), otherwise calculated via -// sum[abs[vec]xexp norm]xexp 1%norm -// - gtol {float} gradient norm must be less than gtol before successful termination -// - geps {float} the absolute step size used for numerical approximation -// of the jacobian via forward differences. -// - stepSize {float} maximum allowable 'alpha' step size between calculations -// - c1 {float} armijo rule condition -// - c2 {integer} curvature conditions rule -// @returns {dict} a dictionary containing the estimated optimal parameters, number of iterations -// and the evaluated return of the function being optimized. -optimize.BFGS:{[func;x0;args;params] - // update the default behaviour of the parameters - params:i.updDefault[params]; - // format x0 based on input type - x0:i.dataFormat[x0]; - // Evaluate the function at the starting point - f0:i.funcEval[func;x0;args]; - // Calculate the starting gradient - gk:i.grad[func;x0;args;params`geps]; - // Initialize Hessian matrix as identity matrix - hess:.ml.eye count x0; - // set initial step guess i.e. 
the step before f0 - prev_fk:f0+sqrt[sum gk*gk]%2; - gradNorm:i.vecNorm[gk;params`norm]; - optimKeys:`xk`fk`prev_fk`gk`prev_xk`hess`gnorm`I`idx; - optimVals:(x0;f0;prev_fk;gk;0n;hess;gradNorm;hess;0); - optimDict:optimKeys!optimVals; - // Run optimization until one of the stopping conditions is met - optimDict:i.stopOptimize[;params]i.BFGSFunction[func;;args;params]/optimDict; - returnKeys:`xVals`funcRet`numIter; - // if function returned due to a null xVal or the new value being worse than the previous - // value then return the k-1 value - returnVals:$[(optimDict[`fk] x(k) - sk:alpha*pk; - // update values of x at the new position k - optimDict[`xk]:optimDict[`prev_xk]+sk; - // if null gnew, then get gradient of new x value - if[any null gnew;gnew:i.grad[func;optimDict`xk;args;params`geps]]; - // subtract new gradients - yk:gnew-optimDict`gk;; - optimDict[`gk]:gnew; - // get new norm of gradient - optimDict[`gnorm]:i.vecNorm[optimDict`gk;params`norm]; - // calculate new hessian matrix for next iteration - rhok:1%mmu[yk;sk]; - if[0w=rhok; - rhok:1000f; - -1"Division by zero in calculation of rhok, assuming rhok large";]; - A1:optimDict[`I] - sk*\:yk*rhok; - A2:optimDict[`I] - yk*\:sk*rhok; - optimDict[`hess]:mmu[A1;mmu[optimDict`hess;A2]]+rhok*(sk*/:sk); - // if x(k) returns infinite value update gnorm and fk - if[0w in abs optimDict`xk;optimDict[`gnorm`fk]:(0n;0w)]; - optimDict[`idx]+:1; - if[params`display;show optimDict;-1"";]; - optimDict - } - -// @private -// @kind function -// @category optimization -// @fileoverview complete a line search across an unconstrained minimization problem making -// use of wolfe conditions to constrain the search the naming convention for dictionary keys -// in this implementation is based on the python implementation of the same functionality here -// https://github.com/scipy/scipy/blob/v1.5.0/scipy/optimize/linesearch.py#L193 -// @param fk {float} function return evaluated at position k -// @param prev_fk {float} function 
return evaluated at position k-1 -// @param gk {float} gradient at position k -// @param pk {float} search direction -// @param func {lambda} function being optimized -// @param xk {num[]} parameter values at position k -// @param args {dict/num[]} function arguments that do not change per iteration -// @param params {dict} parameters controlling non default optimization behaviour -// @return {num[]} new alpha, fk and derivative values -i.wolfeSearch:{[fk;prev_fk;gk;pk;func;xk;args;params] - phiFunc :i.phi[func;pk;;xk;args]; - derphiFunc:i.derphi[func;params`geps;pk;;xk;args]; - // initial Wolfe conditions - wolfeDict:`idx`alpha0`phi0`phi_a0!(0;0;fk;fk); - // calculate the derivative at that phi0 - derphi0:gk mmu pk; - wolfeDict[`derphi_a0`derphi0]:2#derphi0; - // calculate step size this should be 0 < x < 1 - // with min(x;maxstepsize) or 1f otherwise - alpha:1.01*2*(fk - prev_fk)%derphi0; - alphaVal:$[alpha within 0 1f;min(alpha;params`stepSize);1f]; - wolfeDict[`alpha1]:alphaVal; - // function value at alpha1 - wolfeDict[`phi_a1]:phiFunc wolfeDict`alpha1; - // repeat until wolfe criteria is reached or max iterations have been done - // to get new alpha, phi and derphi values - wolfeDict:i.stopWolfe[;params]i.scalarWolfe[derphiFunc;phiFunc;pk;params]/wolfeDict; - // if the line search did not converge, use last alpha , phi and derphi - $[not any null raze wolfeDict`alpha_star`phi_star`derphi_star; - wolfeDict`alpha_star`phi_star`derphi_star; - wolfeDict`alpha1`phi_a1`derphi_a0_fin - ] - } - -// @private -// @kind function -// @category optimization -// @fileoverview apply a scalar search to find an alpha value that satisfies -// strong Wolfe conditions, a python implementation of this is outlined here -// https://github.com/scipy/scipy/blob/v1.5.0/scipy/optimize/linesearch.py#L338 -// This functions defines the bounds between which the step function can be found. 
-// When the optimal bound is found, the area is zoomed in on and optimal value find -// @param derphiFunc {proj} function to calculate the value of the objective function -// derivative at alpha -// @param phiFunc {proj} function to calculate the value of the objective function at alpha -// @param pk {float} search direction -// @param params {dict} parameters controlling non default optimization behaviour -// @param wolfeDict {dict} all data relevant to the calculation of the optimal -// alpha values -// @returns {dict} new alpha, fk and derivative values -i.scalarWolfe:{[derphiFunc;phiFunc;pk;params;wolfeDict] - // set up zoom function constant params - zoomSetup:i.zoomFunc[derphiFunc;phiFunc;;;params]. wolfeDict`phi0`derphi0; - // if criteria 1, zoom and break loop - if[i.wolfeCriteria1[wolfeDict;params]; - wolfeDict[`idx]:0w; - wolfeDict[i.zoomReturn]:zoomSetup wolfeDict`alpha0`alpha1`phi_a0`phi_a1`derphi_a0; - :wolfeDict - ]; - // calculate the derivative of the function at the new position - derphiCalc:derphiFunc wolfeDict`alpha1; - // update the new derivative fnc - wolfeDict[`derphi_a1]:derphiCalc`derval; - $[i.wolfeCriteria2[wolfeDict;params]; - [wolfeDict[`alpha_star] :wolfeDict`alpha1; - wolfeDict[`phi_star] :wolfeDict`phi_a1; - wolfeDict[`derphi_star]:derphiCalc`grad; - wolfeDict[`idx]:0w; - wolfeDict - ]; - 0<=wolfeDict`derphi_a1; - [wolfeDict[`idx]:0w; - wolfeDict[i.zoomReturn]:zoomSetup wolfeDict`alpha1`alpha0`phi_a1`phi_a0`derphi_a1 - ]; - // update dictionary and repeat process until criteria is met - [wolfeDict[`alpha0]:wolfeDict`alpha1; - wolfeDict[`alpha1]:2*wolfeDict`alpha1; - wolfeDict[`phi_a0]:wolfeDict`phi_a1; - wolfeDict[`phi_a1]:phiFunc wolfeDict`alpha1; - wolfeDict[`derphi_a0]:wolfeDict`derphi_a1; - wolfeDict[`derphi_a0_fin]:derphiCalc`grad; - wolfeDict[`idx]+:1 - ] - ]; - wolfeDict - } - -// @private -// @kind function -// @category optimize -// @fileoverview function to apply 'zoom' iteratively during linesearch to find optimal alpha 
-// value satisfying strong Wolfe conditions -// @param derphiFunc {proj} function to calculate the value of the objective function -// derivative at alpha -// @param phiFunc {proj} function to calculate the value of the objective function at alpha -// @param phi0 {float} value of function evaluation at x(k-1) -// @param derphi0 {float} value of objective function derivative at x(k-1) -// @param params {dict} parameters controlling non default optimization behaviour -// @param lst {num[]} bounding conditions for alpha, phi and derphi used in zoom algorithm -// @returns {num[]} new alpha, fk and derivative values -i.zoomFunc:{[derphiFunc;phiFunc;phi0;derphi0;params;lst] - zoomDict:i.zoomKeys!lst,phi0; - zoomDict[`idx`a_rec]:2#0f; - zoomDict:i.stopZoom[;params]i.zoom[derphiFunc;phiFunc;phi0;derphi0;params]/zoomDict; - // if zoom did not converge, set to null - $[count star:zoomDict[i.zoomReturn];star;3#0N] - } - -// @private -// @kind function -// @category optimize -// @fileoverview function to apply an individual step in 'zoom' during linesearch -// to find optimal alpha value satisfying strong Wolfe conditions. 
An outline of -// the python implementation of this section of the algorithm can be found here -// https://github.com/scipy/scipy/blob/v1.5.0/scipy/optimize/linesearch.py#L556 -// @param derphiFunc {proj} function to calculate the value of the objective function -// derivative at alpha -// @param phiFunc {proj} function to calculate the value of the objective function at alpha -// @param phi0 {float} value of function evaluation at x(k-1) -// @param derphi0 {float} value of objective function derivative at x(k-1) -// @param params {dict} parameters controlling non default optimization behaviour -// @param zoomDict {dict} parameters to be updated as 'zoom' procedure is applied to find -// the optimal value of alpha -// @returns {dict} parameters calculated for an individual step in line search procedure -// to find optimal alpha value satisfying strong Wolfe conditions -i.zoom:{[derphiFunc;phiFunc;phi0;derphi0;params;zoomDict] - // define high and low values - dalpha:zoomDict[`a_hi]-zoomDict`a_lo; - // These should probably be named a and b since mapping doesn't work properly? - highLow:`high`low!$[dalpha>0;zoomDict`a_hi`a_lo;zoomDict`a_lo`a_hi]; - if["i"$zoomDict`idx; - cubicCheck:dalpha*0.2; - findMin:i.cubicMin . zoomDict`a_lo`phi_lo`derphi_lo`a_hi`phi_hi`a_rec`phi_rec - ]; - if[i.quadCriteria[findMin;highLow;cubicCheck;zoomDict]; - quadCheck:0.1*dalpha; - findMin:i.quadMin . 
zoomDict`a_lo`phi_lo`derphi_lo`a_hi`phi_hi; - if[(findMin > highLow[`low]-quadCheck) | findMin < highLow[`high]+quadCheck; - findMin:zoomDict[`a_lo]+0.5*dalpha - ] - ]; - // update new values depending on fnd_min - phiMin:phiFunc[findMin]; - //first condition, update and continue loop - if[i.zoomCriteria1[phi0;derphi0;phiMin;findMin;zoomDict;params]; - zoomDict[`idx]+:1; - zoomDict[i.zoomKeys1]:zoomDict[`phi_hi`a_hi],findMin,phiMin; - :zoomDict - ]; - // calculate the derivative at the cubic minimum - derphiMin:derphiFunc findMin; - // second scenario, create new features and end the loop - $[i.zoomCriteria2[derphi0;derphiMin;params]; - [zoomDict[`idx]:0w; - zoomDict:zoomDict,i.zoomReturn!findMin,phiMin,enlist derphiMin`grad]; - i.zoomCriteria3[derphiMin;dalpha]; - [zoomDict[`idx]+:1; - zoomDict[i.zoomKeys1,i.zoomKeys2]:zoomDict[`phi_hi`a_hi`a_lo`phi_lo], - findMin,phiMin,derphiMin`derval]; - [zoomDict[`idx]+:1; - zoomDict[i.zoomKeys3,i.zoomKeys2]:zoomDict[`phi_lo`a_lo], - findMin,phiMin,derphiMin`derval] - ]; - zoomDict - } - - -// Vector norm calculation - -// @private -// @kind function -// @category optimization -// @fileoverview calculate the vector norm, used in calculation of the gradient norm at position k. -// Default behaviour is to use the maximum value of the gradient, this can be overwritten by -// a user, this is in line with the default python implementation. -// @param vec {num[]} calculated gradient values -// @param ord {long} order of norm (0W = max; -0W = min) -// @return the gradient norm based on the input gradient -i.vecNorm:{[vec;ord] - if[-7h<>type ord;'"ord must be +/- infinity or a long atom"]; - $[ 0W~ord;max abs vec; - -0W~ord;min abs vec; - sum[abs[vec]xexp ord]xexp 1%ord - ] - } - - -// Stopping conditions - -// @private -// @kind function -// @category optimization -// @fileoverview evaluate if the optimization function has reached a condition which is -// should result in the optimization algorithm being stopped. 
-// @param dict {dict} optimization function returns -// @param params {dict} parameters controlling non default optimization behaviour -// @return {bool} indication as to if the optimization has met one of it's stopping conditions -i.stopOptimize:{[dict;params] - // is the function evaluation at k an improvement on k-1? - check1:dict[`fk] < dict`prev_fk; - // has x[k] returned a non valid return? - check2:not any null dict`xk; - // have the maximum number of iterations been met? - check3:params[`optimIter] > dict`idx; - // is the gradient at position k below the accepted tolerance - check4:params[`gtol] < dict`gnorm; - check1 & check2 & check3 & check4 - } - -// @private -// @kind function -// @category optimization -// @fileoverview evaluate if the wolfe condition search has reached a condition which is -// should result in the optimization algorithm being stopped. -// @param dict {dict} optimization function returns -// @param params {dict} parameters controlling non default optimization behaviour -// @return {bool} indication as to if the optimization has met one of it's stopping conditions -i.stopWolfe:{[dict;params] - dict[`idx] < params`wolfeIter - } - -// @private -// @kind function -// @category optimization -// @fileoverview evaluate if the alpha condition 'zoom' has reached a condition which is -// should result in the optimization algorithm being stopped. 
-// @param dict {dict} optimization function returns -// @param params {dict} parameters controlling non default optimization behaviour -// @return {bool} indication as to if the optimization has met one of it's stopping conditions -i.stopZoom:{[dict;params] - dict[`idx] < params`zoomIter - } - - -// Function + derivative evaluation at x[k]+ p[k]*alpha[k] - -// @private -// @kind function -// @category optimization -// @fileoverview evaluate the objective function at the position x[k] + step size -// @param func {lambda} the objective function to be minimized -// @param pk {float} step direction -// @param alpha {float} size of the step to be applied -// @param xk {num[]} parameter values at position k -// @param args {dict/num[]} function arguments that do not change per iteration -// @param xk {num[]} -// @returns {float} function evaluated at at the position x[k] + step size -i.phi:{[func;pk;alpha;xk;args] - xk+:alpha*pk; - i.funcEval[func;xk;args] - } - -// @private -// @kind function -// @category optimization -// @fileoverview evaluate the derivative of the objective function at -// the position x[k] + step size -// @param func {lambda} the objective function to be minimized -// @param eps {float} the absolute step size used for numerical approximation -// of the jacobian via forward differences. 
-// @param pk {float} step direction -// @param alpha {float} size of the step to be applied -// @param xk {num[]} parameter values at position k -// @param args {dict/num[]} function arguments that do not change per iteration -// @returns {dict} gradient and value of scalar derivative -i.derphi:{[func;eps;pk;alpha;xk;args] - // increment xk by a small step size - xk+:alpha*pk; - // get gradient at the new position - gval:i.grad[func;xk;args;eps]; - derval:gval mmu pk; - `grad`derval!(gval;derval) - } - - -// Minimization functions - -// @private -// @kind function -// @category optimization -// @fileoverview find the minimizing solution for a cubic polynomial which -// passes through the points (a,fa), (b,fb) and (c,fc) with a derivative of the -// objective function calculated as fpa. This follows the python implementation -// outlined here https://github.com/scipy/scipy/blob/v1.5.0/scipy/optimize/linesearch.py#L482 -// @param a {float} position a -// @param b {float} position b -// @param c {float} position c -// @param fa {float} objective function evaluated at a -// @param fb {float} objective function evaluated at b -// @param fc {float} objective function evaluated at c -// @param fpa {float} derivative of the objective function evaluated at a -// @returns {num[]} minimized parameter set as a solution for the cubic polynomial -i.cubicMin:{[a;fa;fpa;b;fb;c;fc] - db:b-a; - dc:c-a; - denom:(db*dc)xexp 2*(db-dc); - d1:2 2#0f; - d1[0]:(1 -1)*xexp[;2]each(db;dc); - d1[1]:(-1 1)*xexp[;3]each(dc;db); - AB:d1 mmu(fb-fa-fpa*db;fc-fa-fpa*dc); - AB%:denom; - radical:AB[1]*AB[1]-3*AB[0]*fpa; - a+(neg[AB[1]]+sqrt(radical))%(3*AB[0]) - } - -// @private -// @kind function -// @category optimization -// @fileoverview find the minimizing solution for a quadratic polynomial which -// passes through the points (a,fa) and (b,fb) with a derivative of the objective function -// calculated as fpa. 
This follows the python implementation outlined here -// https://github.com/scipy/scipy/blob/v1.5.0/scipy/optimize/linesearch.py#L516 -// @param a {float} position a -// @param b {float} position b -// @param fa {float} objective function evaluated at a -// @param fb {float} objective function evaluated at b -// @param fpa {float} derivative of the objective function evaluated at a -// @returns {num[]} minimized parameter set as a solution for the quadratic polynomial -i.quadMin:{[a;fa;fpa;b;fb] - db:b-a; - B:(fb-fa-fpa*db)%(db*db); - a-fpa%(2*B) - } - - -// Gradient + function evaluation - -// @private -// @kind function -// @category optimization -// @fileoverview calculation of the gradient of the objective function for all parameters of x -// incremented individually by epsilon -// @param func {lambda} the objective function to be minimized -// @param xk {num[]} parameter values at position k -// @param args {dict/num[]} function arguments that do not change per iteration -// @param eps {float} the absolute step size used for numerical approximation -// of the jacobian via forward differences. -// @returns {dict} gradient of function at position k -i.grad:{[func;xk;args;eps] - fk:i.funcEval[func;xk;args]; - i.gradEval[fk;func;xk;args;eps]each til count xk - } - -// @private -// @kind function -// @category optimization -// @fileoverview calculation of the gradient of the objective function for a single -// parameter set x where one of the indices has been incremented by epsilon -// @param func {lambda} the objective function to be minimized -// @param xk {num[]} parameter values at position k -// @param args {dict/num[]} function arguments that do not change per iteration -// @param eps {float} the absolute step size used for numerical approximation -// of the jacobian via forward differences. 
-// @returns {dict} gradient of function at position k with an individual -// variable x incremented by epsilon -i.gradEval:{[fk;func;xk;args;eps;idx] - if[(::)~fk;fk:i.funcEval[func;xk;args]]; - // increment function optimisation values by epsilon - xk[idx]+:eps; - // Evaluate the gradient - (i.funcEval[func;xk;args]-fk)%eps - } - -// @private -// @kind function -// @category optimization -// @fileoverview evaluate the objective function at position x[k] with relevant -// additional arguments accounted for -// @param {lambda} the objective function to be minimized -// @param xk {num[]} parameter values at position k -// @param args {dict/num[]} function arguments that do not change per iteration -// @returns {float} the objective function evaluated at the appropriate location -i.funcEval:{[func;xk;args] - $[any args~/:((::);());func xk;func[xk;args]] - } - - -// Paramter dictionary - -// @private -// @kind function -// @category -// @fileoverview update the default behaviour of the model optimization procedure -// to account for increased sensitivity to tolerance, the number of iterations, -// how the gradient norm is calculated and various numerical updates including changes -// to the Armijo rule and curvature for calculation of the strong Wolfe conditions. 
-// @param dict {dict/(::)/()} if a dictionary update the default dictionary to include -// the user defined updates, otherwise use the default dictionary -// @returns {dict} updated or default parameter set depending on user input -i.updDefault:{[dict] - returnKeys:`norm`optimIter`gtol`geps`stepSize`c1`c2`wolfeIter`zoomIter`display; - returnVals:(0W;0W;1e-4;1.49e-8;0w;1e-4;0.9;10;10;0b); - returnDict:returnKeys!returnVals; - if[99h<>type dict;dict:()!()]; - i.wolfeParamCheck[returnDict,dict] - } - -// @private -// @kind function -// @category optimization -// @fileoverview Ensure that the armijo and curvature parameters are consistent -// with the expected values for calculation of the strong Wolfe conditions. -// Return an error on unsuitable conditions otherwise return the input dictionary -// @param dict {dict} updated parameter dictionary containing default information and -// any updated parameter information -// @returns {dict/err} the original input dictionary or an error suggesting that the -// Armijo and curvature parameters are unsuitable -i.wolfeParamCheck:{[dict] - check1:dict[`c1]>dict`c2; - check2:any not dict[`c1`c2]within 0 1; - $[check1 or check2; - '"When evaluating Wolfe conditions the following must hold 0 < c1 < c2 < 1"; - dict - ] - } - - -// Data Formatting - -// @private -// @kind function -// @category optimization -// @fileoverview Ensure that the input parameter x at position 0 which will -// be updated is in a format that is suitable for use with this optimization -// procedure i.e. the data is a list of values. 
-// @param x0 {dict/num/num[]} initial values of x to be optimized -// @returns {num[]} the initial values of x converted into a suitable numerical list format -i.dataFormat:{[x0] - "f"$$[99h=type x0;raze value x0;0h >type x0;enlist x0; x0] - } - - -// Conditional checks for Wolfe, zoom and quadratic condition evaluation - -// @private -// @kind function -// @category optimization -// @fileoverview ensure new values lead to improvements over the older values -// @param wolfeDict {dict} the current iterations values for the objective function and the -// derivative of the objective function evaluated -// @param params {dict} parameter dictionary containing the updated/default information -// used to modify the behaviour of the system as a whole -// @returns {bool} indication as to if a further zoom is required -i.wolfeCriteria1:{[wolfeDict;params] - check1:wolfeDict[`phi_a1]>wolfeDict[`phi0]+params[`c1]*prd wolfeDict`alpha1`derphi0; - check2:(wolfeDict[`phi_a1]>=wolfeDict`phi_a0) and (1=abs wolfeDict`derphi_a1 - } - -// @private -// @kind function -// @category optimization -// @fileoverview check if there is need to apply quadratic minimum calculation -// @param findMin {num[]} the currently calculated minimum values -// @param highLow {dict} upper and lower bounds of the search space -// @param cubicCheck {float} interpolation check parameter -// @param zoomDict {dict} parameters to be updated as 'zoom' procedure is applied to find -// the optimal value of alpha -// @returns {bool} indication as to if the value of findMin needs to be updated -i.quadCriteria:{[findMin;highLow;cubicCheck;zoomDict] - // On initial iteration the minimum has not been calculated - // as such criteria should exit early to complete the quadratic calculation - if[findMin~();:1b]; - check1:0=zoomDict`idx; - check2:findMin>highLow[`low] -cubicCheck; - check3:findMin phi0+findMin*derphi0*params`c1; - check2:phiMin>=zoomDict`phi_lo; - check1 or check2 - } - -// @private -// @kind function -// 
@category optimization -// @fileoverview check if the zoom conditions are sufficient -// @param derphi0 {float} derivative of the objective function evaluated at index 0 -// @param derphiMin {float} derivative of the objective function evaluated at the current minimum -// @param params {dict} parameter dictionary containing the updated/default information -// used to modify the behaviour of the system as a whole -// @returns indication as to if further zooming is required -i.zoomCriteria2:{[derphi0;derphiMin;params] - abs[derphiMin`derval]<=neg derphi0*params`c2 - } - -// @private -// @kind function -// @category optimization -// @fileoverview check if the zoom conditions are sufficient -// @param derphiMin {float} derivative of the objective function evaluated at the current minimum -// @param dalpha {float} difference between the upper and lower bound of the zoom bracket -// @returns indication as to if further zooming is required -i.zoomCriteria3:{[derphiMin;dalpha] - 0<=derphiMin[`derval]*dalpha - } - - -// Zoom dictionary - -//input keys of zoom dictionary -i.zoomKeys:`a_lo`a_hi`phi_lo`phi_hi`derphi_lo`phi_rec; -// keys to be updated in zoom each iteration -i.zoomKeys1:`phi_rec`a_rec`a_hi`phi_hi; -// extra keys that have to be updated in some scenarios -i.zoomKeys2:`a_lo`phi_lo`derphi_lo; -i.zoomKeys3:`phi_rec`a_rec -// final updated keys to be used -i.zoomReturn:`alpha_star`phi_star`derphi_star; diff --git a/optimize/optimize.q b/optimize/optimize.q new file mode 100644 index 00000000..26dad8f5 --- /dev/null +++ b/optimize/optimize.q @@ -0,0 +1,77 @@ +// optimize/optimize.q - Optimization algorithms +// Copyright (c) 2021 Kx Systems Inc +// +// Contains an implementation of the BFGS algorithm. + +// Broyden-Fletcher-Goldfarb-Shanno (BFGS) algorithm.
This implementation +// is based on +// https://github.com/scipy/scipy/blob/v1.5.0/scipy/optimize/optimize.py#L1058 +// and is a quasi-Newton hill-climbing optimization technique used to find a +// preferably twice continuously differentiable stationary point of a +// function. + +// A mathematical outline of the algorithm is provided here: +// https://en.wikipedia.org/wiki/Broyden-Fletcher-Goldfarb-Shanno_algorithm + +\d .ml + +// @kind function +// @category optimization +// @desc Optimize a function using the Broyden-Fletcher-Goldfarb-Shanno +// (BFGS) algorithm +// @param func {fn} Function to be optimized. This function should take +// as its arguments a list/dictionary of parameters to be optimized and +// a list/dictionary of additional unchanging arguments +// @param x0 {number[]|dictionary} The first guess at the parameters to be +// optimized as a list or dictionary of numeric values +// @param args {list|dictionary|(::)} Any unchanging parameters required for +// evaluation of the function, these should be in the order that they are to +// be applied to the function +// @param params {dictionary} Any modifications to be applied to the +// optimization procedure e.g. +// - display {boolean} Results at each optimization iteration to be printed +// - optimIter {int} Maximum number of iterations in optimization procedure +// - zoomIter {int} Maximum number of iterations when finding optimal zoom +// - wolfeIter {int} Maximum number of iterations +// - norm {int} Order of norm (0W = max; -0W = min) otherwise calculated via +// sum[abs[vec]xexp norm]xexp 1%norm +// - gtol {float} Gradient norm must be less than gtol before successful +// termination +// - geps {float} The absolute step size used for numerical approximation of +// the Jacobian via forward differences.
+// - stepSize {float} Maximum allowable 'alpha' step size between +// calculations +// - c1 {float} Armijo rule condition +// - c2 {int} Curvature conditions rule +// @returns {dictionary} Contains the estimated optimal parameters, number of +// iterations and the evaluated return of the function being optimized +optimize.BFGS:{[func;x0;args;params] + // Update the default behaviour of the parameters + params:i.updDefault[params]; + // Format x0 based on input type + x0:i.dataFormat[x0]; + // Evaluate the function at the starting point + f0:i.funcEval[func;x0;args]; + // Calculate the starting gradient + gk:i.grad[func;x0;args;params`geps]; + // Initialize Hessian matrix as identity matrix + hess:.ml.eye count x0; + // Set initial step guess i.e. the step before f0 + fkPrev:f0+sqrt[sum gk*gk]%2; + gradNorm:i.vecNorm[gk;params`norm]; + optimKeys:`xk`fk`fkPrev`gk`xkPrev`hess`gnorm`I`idx; + optimVals:(x0;f0;fkPrev;gk;0n;hess;gradNorm;hess;0); + optimDict:optimKeys!optimVals; + // Run optimization until one of the stopping conditions is met + optimDict:i.stopOptimize[;params]i.BFGSFunction[func;;args;params]/optimDict; + returnKeys:`xVals`funcRet`numIter; + // If function returned due to a null xVal or the new value being worse than + // the previous value then return the k-1 value + nullOptim:not any null optimDict`xk; + fkCompare:optimDict[`fk] x(k) + sk:alpha*pk; + // Update values of x at the new position k + optimDict[`xk]:optimDict[`xkPrev]+sk; + // If null gNew, then get gradient of new x value + if[any null gNew;gNew:i.grad[func;optimDict`xk;args;params`geps]]; + // Subtract new gradients + yk:gNew-optimDict`gk; + optimDict[`gk]:gNew; + // Get new norm of gradient + optimDict[`gnorm]:i.vecNorm[optimDict`gk;params`norm]; + // Calculate new hessian matrix for next iteration + rhok:1%mmu[yk;sk]; + if[0w=rhok; + rhok:1000f; + -1"Division by zero in calculation of rhok, assuming rhok large"; + ]; + A1:optimDict[`I]-sk*\:yk*rhok; + A2:optimDict[`I]-yk*\:sk*rhok; + 
hessMul:mmu[A1;mmu[optimDict`hess;A2]]; + optimDict[`hess]:hessMul+rhok*(sk*/:sk); + // if x(k) returns infinite value update gnorm and fk + if[0w in abs optimDict`xk;optimDict[`gnorm`fk]:(0n;0w)]; + optimDict[`idx]+:1; + if[params`display;show optimDict;-1"";]; + optimDict + } + +// @private +// @kind function +// @category optimizationUtility +// @desc Complete a line search across an unconstrained minimization +// problem making use of wolfe conditions to constrain the search. The naming +// convention for dictionary keys in this implementation is based on the +// python implementation of the same functionality here +// https://github.com/scipy/scipy/blob/v1.5.0/scipy/optimize/linesearch.py#L193 +// @param fk {float} Function return evaluated at position k +// @param fkPrev {float} Function return evaluated at position k-1 +// @param gk {float} Gradient at position k +// @param pk {float} Search direction +// @param func {fn} Function being optimized +// @param xk {number[]} Parameter values at position k +// @param args {dictionary|number[]} Function arguments that do not change per +// iteration +// @param params {dictionary} Parameters controlling non default optimization +// behaviour +// @return {number[]} New alpha, fk and derivative values +i.wolfeSearch:{[fk;fkPrev;gk;pk;func;xk;args;params] + phiFunc :i.phi[func;pk;;xk;args]; + derPhiFunc:i.derPhi[func;params`geps;pk;;xk;args]; + // Initial Wolfe conditions + wolfeKeys:`idx`alpha0`phi0`phia0; + wolfeVals:(0;0;fk;fk); + wolfeDict:wolfeKeys!wolfeVals; + // Calculate the derivative at that phi0 + derPhi0:gk mmu pk; + wolfeDict[`derPhia0`derPhi0]:2#derPhi0; + // Calculate step size this should be 0 < x < 1 + // with min(x;maxstepsize) or 1f otherwise + alpha:1.01*2*(fk-fkPrev)%derPhi0; + alphaVal:$[alpha within 0 1f;min(alpha;params`stepSize);1f]; + wolfeDict[`alpha1]:alphaVal; + // function value at alpha1 + wolfeDict[`phia1]:phiFunc wolfeDict`alpha1; + // Repeat until wolfe criteria is reached or max 
iterations have been done + // to get new alpha, phi and derPhi values + wolfeDict:i.stopWolfe[;params] + i.scalarWolfe[derPhiFunc;phiFunc;pk;params]/wolfeDict; + // if the line search did not converge, use last alpha , phi and derPhi + $[not any null raze wolfeDict`alphaStar`phiStar`derPhiStar; + wolfeDict`alphaStar`phiStar`derPhiStar; + wolfeDict`alpha1`phia1`derPhia0Fin + ] + } + +// @private +// @kind function +// @category optimizationUtility +// @desc Apply a scalar search to find an alpha value that satisfies +// strong Wolfe conditions, a python implementation of this is outlined here +// https://github.com/scipy/scipy/blob/v1.5.0/scipy/optimize/linesearch.py#L338 +// This functions defines the bounds between which the step function can +// be found. When the optimal bound is found, the area is zoomed recursively +// until the optimal value is found +// @param derPhiFunc {fn} Function to calculate the value of the objective +// function derivative at alpha +// @param phiFunc {fn} Function to calculate the value of the objective +// function at alpha +// @param pk {float} Search direction +// @param params {dictionary} Parameters controlling non default optimization +// behaviour +// @param wolfeDict {dictionary} All data relevant to the calculation of the +// optimal alpha values +// @returns {dictionary} New alpha, fk and derivative values +i.scalarWolfe:{[derPhiFunc;phiFunc;pk;params;wolfeDict] + // Set up zoom function constant params + zoomSetup:i.zoomFunc[derPhiFunc;phiFunc;;;params]. 
wolfeDict`phi0`derPhi0; + // If criteria 1 is met, zoom and break loop + if[i.wolfeCriteria1[wolfeDict;params]; + wolfeDict[`idx]:0w; + wolfeVals:wolfeDict`alpha0`alpha1`phia0`phia1`derPhia0; + updZoom:zoomSetup wolfeVals; + wolfeDict[i.zoomReturn]:updZoom; + :wolfeDict + ]; + // Calculate the derivative of the function at the new position + derPhiCalc:derPhiFunc wolfeDict`alpha1; + // Update the new derivative function + wolfeDict[`derPhia1]:derPhiCalc`derval; + $[i.wolfeCriteria2[wolfeDict;params]; + [wolfeDict[`alphaStar]:wolfeDict`alpha1; + wolfeDict[`phiStar]:wolfeDict`phia1; + wolfeDict[`derPhiStar]:derPhiCalc`grad; + wolfeDict[`idx]:0w; + wolfeDict + ]; + 0<=wolfeDict`derPhia1; + [wolfeDict[`idx]:0w; + updZoom:zoomSetup wolfeDict`alpha1`alpha0`phia1`phia0`derPhia1; + wolfeDict[i.zoomReturn]:updZoom + ]; + // Update dictionary and repeat process until criteria is met + [wolfeDict[`alpha0]:wolfeDict`alpha1; + wolfeDict[`alpha1]:2*wolfeDict`alpha1; + wolfeDict[`phia0]:wolfeDict`phia1; + wolfeDict[`phia1]:phiFunc wolfeDict`alpha1; + wolfeDict[`derPhia0]:wolfeDict`derPhia1; + wolfeDict[`derPhia0Fin]:derPhiCalc`grad; + wolfeDict[`idx]+:1 + ] + ]; + wolfeDict + } + +// @private +// @kind function +// @category optimizeUtility +// @desc Function to apply 'zoom' iteratively during linesearch to find +// optimal alpha value satisfying strong Wolfe conditions +// @param derPhiFunc {fn} Function to calculate the value of the objective +// function derivative at alpha +// @param phiFunc {fn} Function to calculate the value of the objective +// function at alpha +// @param phi0 {float} Value of function evaluation at x(k-1) +// @param derPhi0 {float} Value of objective function derivative at x(k-1) +// @param params {dictionary} Parameters controlling non default optimization +// behaviour +// @param cond {number[]} Bounding conditions for alpha, phi and derPhi used in +// zoom algorithm +// @returns {number[]} New alpha, fk and derivative values 
+i.zoomFunc:{[derPhiFunc;phiFunc;phi0;derPhi0;params;cond] + zoomDict:i.zoomKeys!cond,phi0; + zoomDict[`idx`aRec]:2#0f; + zoomDict:i.stopZoom[;params] + i.zoom[derPhiFunc;phiFunc;phi0;derPhi0;params]/zoomDict; + // If zoom did not converge, set to null + $[count star:zoomDict[i.zoomReturn];star;3#0N] + } + +// @private +// @kind function +// @category optimizeUtility +// @desc Function to apply an individual step in 'zoom' during +// linesearch to find optimal alpha value satisfying strong Wolfe conditions. +// An outline of the python implementation of this section of the algorithm +// can be found here +// https://github.com/scipy/scipy/blob/v1.5.0/scipy/optimize/linesearch.py#L556 +// @param derPhiFunc {fn} Function to calculate the value of the objective +// function derivative at alpha +// @param phiFunc {fn} Function to calculate the value of the objective +// function at alpha +// @param phi0 {float} Value of function evaluation at x(k-1) +// @param derPhi0 {float} Value of objective function derivative at x(k-1) +// @param params {dictionary} Parameters controlling non default optimization +// behaviour +// @param zoomDict {dictionary} Parameters to be updated as 'zoom' procedure is +// applied to find the optimal value of alpha +// @returns {dictionary} Parameters calculated for an individual step in line +// search procedure to find optimal alpha value satisfying strong Wolfe +// conditions +i.zoom:{[derPhiFunc;phiFunc;phi0;derPhi0;params;zoomDict] + alphaDiff:zoomDict[`aHi]-zoomDict`aLo; + // define high and low values + highLowVal:$[alphaDiff>0;zoomDict`aHi`aLo;zoomDict`aLo`aHi]; + highLow:`high`low!highLowVal; + if["i"$zoomDict`idx; + cubicCheck:alphaDiff*0.2; + findMin:i.cubicMin . zoomDict`aLo`phiLo`derPhiLo`aHi`phiHi`aRec`phiRec + ]; + if[i.quadCriteria[findMin;highLow;cubicCheck;zoomDict]; + quadCheck:0.1*alphaDiff; + findMin:i.quadMin . 
zoomDict`aLo`phiLo`derPhiLo`aHi`phiHi; + lowerCheck:findMinhighLow[`low]-quadCheck; + if[upperCheck|lowerCheck; + findMin:zoomDict[`aLo]+0.5*alphaDiff + ] + ]; + // Update new values depending on findMin + phiMin:phiFunc[findMin]; + // First condition, update and continue loop + if[i.zoomCriteria1[phi0;derPhi0;phiMin;findMin;zoomDict;params]; + zoomDict[`idx]+:1; + zoomDict[i.zoomKeys1]:zoomDict[`phiHi`aHi],findMin,phiMin; + :zoomDict + ]; + // Calculate the derivative at the cubic minimum + derPhiMin:derPhiFunc findMin; + // Second scenario, create new features and end the loop + $[i.zoomCriteria2[derPhi0;derPhiMin;params]; + [zoomDict[`idx]:0w; + zoomDict:zoomDict,i.zoomReturn!findMin,phiMin,enlist derPhiMin`grad + ]; + i.zoomCriteria3[derPhiMin;alphaDiff]; + [zoomDict[`idx]+:1; + zoomDict[i.zoomKeys1,i.zoomKeys2]:zoomDict[`phiHi`aHi`aLo`phiLo], + findMin,phiMin,derPhiMin`derval + ]; + [zoomDict[`idx]+:1; + zoomDict[i.zoomKeys3,i.zoomKeys2]:zoomDict[`phiLo`aLo], + findMin,phiMin,derPhiMin`derval + ] + ]; + zoomDict + } + +// Vector norm calculation + +// @private +// @kind function +// @category optimizationUtility +// @desc Calculate the vector norm, used in calculation of the gradient +// norm at position k. Default behaviour is to use the maximum value of the +// gradient, this can be overwritten by a user, this is in line with the +// default python implementation. 
+// @param gradVals {number[]} Vector of calculated gradient values +// @param ord {long} Order of norm (0W = max; -0W = min) +// @return {float} Gradient norm based on the input gradient +i.vecNorm:{[gradVals;ord] + if[-7h<>type ord;'"ord must be +/- infinity or a long atom"]; + $[0W~ord;max abs gradVals; + -0W~ord;min abs gradVals; + sum[abs[gradVals]xexp ord]xexp 1%ord + ] + } + +// Stopping conditions + +// @private +// @kind function +// @category optimizationUtility +// @desc Evaluate if the optimization function has reached a condition +// which should result in the optimization algorithm being stopped +// @param dict {dictionary} Optimization function returns +// @param params {dictionary} Parameters controlling non default optimization +// behaviour +// @return {boolean} Indication as to if the optimization has met one of it's +// stopping conditions +i.stopOptimize:{[dict;params] + // Is the function evaluation at k an improvement on k-1? + check1:dict[`fk]dict`idx; + // Is the gradient at position k below the accepted tolerance + check4:params[`gtol]type dict;dict:()!()]; + i.wolfeParamCheck[returnDict,dict] + } + +// @private +// @kind function +// @category optimizationUtility +// @desc Ensure that the Armijo and curvature parameters are consistent +// with the expected values for calculation of the strong Wolfe conditions +// @param dict {dictionary} Updated parameter dictionary containing default +// information and any updated parameter information +// @returns {dictionary|err} The original input dictionary or an error +// suggesting that the Armijo and curvature parameters are unsuitable +i.wolfeParamCheck:{[dict] + check1:dict[`c1]>dict`c2; + check2:any not dict[`c1`c2]within 0 1; + $[check1 or check2; + '"When evaluating Wolfe conditions the following must hold 0 < c1 < c2 < 1"; + dict + ] + } + +// Data Formatting + +// @private +// @kind function +// @category optimizationUtility +// @desc Ensure that the input parameter x at position 0 which 
+// will be updated is in a format that is suitable for use with this +// optimization procedure i.e. the data is a list of values. +// @param x0 {dictionary|number|number[]} Initial values of x to be optimized +// @returns {number[]} The initial values of x converted into a suitable +// numerical list format +i.dataFormat:{[x0] + "f"$$[99h=type x0;raze value x0;0h>type x0;enlist x0;x0] + } + +// Conditional checks for Wolfe, zoom and quadratic condition evaluation + +// @private +// @kind function +// @category optimizationUtility +// @desc Ensure new values lead to improvements over the older values +// @param wolfeDict {dictionary} The current iterations values for the +// objective function and the derivative of the objective function evaluated +// @param params {dictionary} Parameter dictionary containing the updated/ +// default information used to modify the behaviour of the system as a whole +// @returns {boolean} Indication as to if a further zoom is required +i.wolfeCriteria1:{[wolfeDict;params] + prdVal:prd wolfeDict`alpha1`derPhi0; + check1:wolfeDict[`phia1]>wolfeDict[`phi0]+params[`c1]*prdVal; + prevPhi:wolfeDict[`phia1]>=wolfeDict`phia0; + wolfeIdx:1=abs wolfeDict`derPhia1 + } + +// @private +// @kind function +// @category optimizationUtility +// @desc Check if there is need to apply quadratic minimum calculation +// @param findMin {number[]} The currently calculated minimum values +// @param highLow {dictionary} Upper and lower bounds of the search space +// @param cubicCheck {float} Interpolation check parameter +// @param zoomDict {dictionary} Parameters to be updated as 'zoom' procedure is +// applied to find the optimal value of alpha +// @returns {boolean} Indication as to if the value of findMin needs to be +// updated +i.quadCriteria:{[findMin;highLow;cubicCheck;zoomDict] + // On first iteration the initial minimum has not been calculated + // as such criteria should exit early to complete the quadratic calculation + if[findMin~();:1b]; + 
check1:0=zoomDict`idx; + check2:findMin>highLow[`low] -cubicCheck; + check3:findMincalc; + check2:phiMin>=zoomDict`phiLo; + check1 or check2 + } + +// @private +// @kind function +// @category optimizationUtility +// @desc Check if the zoom conditions are sufficient +// @param derPhi0 {float} Derivative of the objective function evaluated at +// index 0 +// @param derPhiMin {float} Derivative of the objective function evaluated at +// the current minimum +// @param params {dictionary} Parameter dictionary containing the +// updated/default information used to modify the behaviour of the system +// as a whole +// @returns {boolean} Indication as to if further zooming is required +i.zoomCriteria2:{[derPhi0;derPhiMin;params] + abs[derPhiMin`derval]<=neg derPhi0*params`c2 + } + +// @private +// @kind function +// @category optimizationUtility +// @desc Check if the zoom conditions are sufficient +// @param derPhiMin {float} Derivative of the objective function evaluated at +// the current minimum +// @param alphaDiff {float} Difference between the upper and lower bound of the +// zoom bracket +// @returns {boolean} Indication as to if further zooming is required +i.zoomCriteria3:{[derPhiMin;alphaDiff] + 0<=derPhiMin[`derval]*alphaDiff + } + +// Zoom dictionary + +// @private +// @kind symbol +// @category optimizationUtility +// @desc Input keys of zoom dictionary +// @type symbol[] +i.zoomKeys:`aLo`aHi`phiLo`phiHi`derPhiLo`phiRec; + +// @private +// @kind symbol +// @category optimizationUtility +// @desc Keys to be updated in zoom each iteration +// @type symbol[] +i.zoomKeys1:`phiRec`aRec`aHi`phiHi; + +// @private +// @kind symbol +// @category optimizationUtility +// @desc Extra keys that have to be updated in some scenarios +// @type symbol[] +i.zoomKeys2:`aLo`phiLo`derPhiLo; + +// @private +// @kind symbol +// @category optimizationUtility +// @desc Extra keys that have to be updated in some scenarios +// @type symbol[] +i.zoomKeys3:`phiRec`aRec + +// @private 
+// @kind symbol +// @category optimizationUtility +// @desc Final updated keys to be used +// @type symbol[] +i.zoomReturn:`alphaStar`phiStar`derPhiStar; diff --git a/stats/README.md b/stats/README.md new file mode 100644 index 00000000..046b4bc6 --- /dev/null +++ b/stats/README.md @@ -0,0 +1,34 @@ +# Statistical Analysis + +This folder contains implementations of statistical methods for data exploration and estimation of model parameters. + +## Functionality + +The functionality contained within this section ranges from descriptive statistical methods to gain more insight into data, to linear regression estimation methods to investigate unknown parameters in a model. The linear regression implementations include `Ordinary Least Squares` and `Weighted Least Squares`. + + +## Requirements + +- kdb+ > 3.5 + +## Installation + +Place the `ml` library in `$QHOME` and load into a q instance using `ml/ml.q` + +### Load + +The following will load the statistics functionality into the `.ml` namespace +```q +q)\l ml/ml.q +q).ml.loadfile`:stats/init.q +``` + +## Documentation + +Documentation is available on the [Statistics](https://code.kx.com/q/ml/toolkit/statistics/) homepage. + +## Status + +The statistics library is still in development. Further functionality and improvements will be made to the library on an ongoing basis. + +If you have any issues, questions or suggestions, please write to ai@kx.com.
diff --git a/stats/describe.json b/stats/describe.json new file mode 100644 index 00000000..805c8255 --- /dev/null +++ b/stats/describe.json @@ -0,0 +1,74 @@ +{ + "count":{ + "func":"count", + "type":["num","temporal","other"] + }, + "type":{ + "func":"{.ml.stats.i.metaTypes .Q.ty x}", + "type":["num","temporal","other"] + }, + "mean":{ + "func":"avg", + "type":["num"] + }, + "std":{ + "func":"sdev", + "type":["num"] + }, + "min":{ + "func":"min", + "type":["num","temporal"] + }, + "max":{ + "func":"max", + "type":["num","temporal"] + }, + "q1":{ + "func":"{.ml.stats.percentile[x;0.25]}", + "type":["num"] + }, + "q2":{ + "func":"{.ml.stats.percentile[x;0.5]}", + "type":["num"] + }, + "q3":{ + "func":"{.ml.stats.percentile[x;0.75]}", + "type":["num"] + }, + "nulls":{ + "func":"{sum null x}", + "type":["num","temporal","other"] + }, + "inf":{ + "func":"{sum x=.ml.stats.i.infinity .ml.stats.i.metaTypes[.Q.ty x]}", + "type":["num"] + }, + "range":{ + "func":".ml.range", + "type":["num","temporal"] + }, + "skew":{ + "func":".ml.fresh.feat.skewness", + "type":["num"] + }, + "countDistinct":{ + "func":"{count distinct x}", + "type":["num","temporal","other"] + }, + "mode":{ + "func":"{first key desc count each group x}", + "type":["num","temporal","other"] + }, + "freq":{ + "func":"{first value asc count each group x}", + "type":["num","temporal","other"] + }, + "sampleDev":{ + "func":"sdev", + "type":["num"] + }, + "standardError":{ + "func":"{dev[x]%sqrt count x}", + "type":["num"] + } +} diff --git a/stats/init.q b/stats/init.q new file mode 100644 index 00000000..63053519 --- /dev/null +++ b/stats/init.q @@ -0,0 +1,5 @@ +// stats/init.q - Load stats library +// Copyright (c) 2021 Kx Systems Inc + +.ml.loadfile`:stats/utils.q +.ml.loadfile`:stats/stats.q diff --git a/stats/stats.q b/stats/stats.q new file mode 100644 index 00000000..e5eab3d3 --- /dev/null +++ b/stats/stats.q @@ -0,0 +1,159 @@ +// stats/stats.q - Statistical tools +// Copyright (c) 2021 Kx Systems Inc 
+// +// This statistical library contains functionality ranging from +// descriptive statistical methods to gain more insight into a +// users data, to linear regression estimation methods to investigate +// unknown parameters in a model. Includes OLS, WLS, describe, +// and percentile + +\d .ml + +// @kind function +// @category stats +// @desc Train an ordinary least squares model on data +// @param endog {number[][]|number[]} The endogenous variable +// @param exog {number[][]|number[]} A variables that predict the +// endog variable +// @param trend {boolean} Whether a trend is added to the model +// @returns {dictionary} Contains the following information: +// modelInfo - Coefficients and statistical values calculated during the +// fitting process +// predict - A projection allowing for prediction on new input data +stats.OLS.fit:{[endog;exog;trend] + stats.i.checkLen[endog;exog;"exog"]; + endog:"f"$endog; + exog:"f"$$[trend;1f,'exog;exog]; + if[1=count exog[0];exog:flip enlist exog]; + coef:first enlist[endog]lsq flip exog; + modelInfo:stats.i.OLSstats[coef;endog;exog;trend]; + returnInfo:enlist[`modelInfo]!enlist modelInfo; + predict:stats.OLS.predict returnInfo; + returnInfo,enlist[`predict]!enlist predict + } + +// @desc Predict values using coefficients calculated via OLS +// @param config {dictionary} Information returned from `OLS.fit` +// including: +// modelInfo - Coefficients and statistical values calculated during the +// fitting process +// predict - A projection allowing for prediction on new input data +// @param exog {table|number[][]|number[]} The exogenous variables +// @returns {number[]} The predicted values +stats.OLS.predict:{[config;exog] + modelInfo:config`modelInfo; + trend:`yIntercept in key modelInfo`variables; + exog:"f"$$[trend;1f,'exog;exog]; + coef:modelInfo`coef; + if[1=count exog[0];exog:flip enlist exog]; + sum coef*flip exog + } + +// @kind function +// @category stats +// @desc Train a weighted least squares model on data 
+// @param endog {number[][]|number[]} The endogenous variable +// @param exog {number[][]|number[]} A variables that predict the +// endog variable +// @param weights {float[]} The weights to be applied to the endog variable +// @param trend {boolean} Whether a trend is added to the model +// @returns {dictionary} Contains the following information: +// modelInfo - Coefficients and statistical values calculated during the +// fitting process +// predict - A projection allowing for prediction on new input data +stats.WLS.fit:{[endog;exog;weights;trend] + stats.i.checkLen[endog;exog;"exog"]; + if[weights~(::);weights:()]; + if[count weights;stats.i.checkLen[endog;weights;"weights"]]; + endog:"f"$endog; + // Calculate the weights if not given + // Must be inversely proportional to the error variance + if[not count weights; + trained:stats.OLS.fit[endog;exog;0b]; + residuals:endog-trained[`predict]exog; + trained:stats.OLS.fit[abs residuals;exog;0b]; + weights:1%{x*x}trained[`predict]exog + ]; + exog:"f"$$[trend;1f,'exog;exog]; + if[1=count exog[0];exog:flip enlist exog]; + updDependent:flip[exog]mmu weights*'endog; + updPredictor:flip[exog]mmu weights*'exog; + coef:raze inv[updPredictor]mmu updDependent; + modelInfo:stats.i.OLSstats[coef;endog;exog;trend]; + modelInfo,:enlist[`weights]!enlist weights; + returnInfo:enlist[`modelInfo]!enlist modelInfo; + predict:stats.WLS.predict returnInfo; + returnInfo,enlist[`predict]!enlist predict + } + +// @desc Predict values using coefficients calculated via WLS +// @param config {dictionary} Information returned from `WLS.fit` +// including: +// modelInfo - Coefficients and statistical values calculated during the +// fitting process +// predict - A projection allowing for prediction on new input data +// @param exog {table|number[][]|number[]} The exogenous variables +// @returns {number[]} The predicted values +stats.WLS.predict:stats.OLS.predict + +// @kind data +// @category stats +// @desc Load in functions defined within 
`describe.json` +// @type dictionary +stats.describeFuncs:.j.k raze read0`$path,"/stats/describe.json" + +// @kind function +// @category stats +// @desc Generates descriptive statistics of a table +// @param tab {table} A simple table +// @returns {dictionary} A tabular description of aggregate information +// of each column +stats.describe:{[tab] + funcTab:stats.describeFuncs; + if[not all `func`type in cols value funcTab; + '"Keyed table must contain a func and type attribute"]; + typeKeys:`num`temporal`other; + typeFunc:distinct raze value[funcTab][`type]; + typCheck:raze not enlist[typeFunc] in string each typeKeys; + if[any typCheck; + '"Invalid type given:",raze typeFunc where typCheck + ]; + descKeys:key funcTab; + funcs:get each value[funcTab]`func; + // Get indices of where each type of function is in the function list + typeDict:typeKeys!where@'(string each typeKeys) in/:\:value[funcTab]`type; + numTypes:"hijef"; + temporalTypes:"pmdznuvt"; + numCols:exec c from meta[tab]where t in numTypes; + temporalCols:exec c from meta[tab]where t in temporalTypes; + otherCols:cols[tab]except numCols,temporalCols; + colDict:typeKeys!(numCols;temporalCols;otherCols); + applyInd:where 0 list of predicted values -ts.ARMA.predict:{[mdl;exog;len] - ts.i.dictCheck[mdl;ts.i.ARMA.keyList;"mdl"]; - exog:ts.i.predDataCheck[mdl;exog]; - ts.i.predictFunction[mdl;exog;len;ts.i.ARMA.singlePredict] +// @desc Predictions based on an AutoRegressive Moving Average model +// (ARMA) +// @params config {dictionary} Information returned from `ml.ts.ARMA.fit` +// including: +// modelInfo - Model coefficients and data needed for future predictions +// predict - A projection allowing for prediction of future values +// @param exog {table|float[]|(::)} Exogenous variables are additional +// variables which may be accounted for to improve the model +// @param len {long} Number of future values to be predicted +// @return {float[]} Predicted values +ts.ARMA.predict:{[config;exog;len] + 
model:config`modelInfo; + exog:ts.i.predDataCheck[model;exog]; + ts.i.predictFunction[model;exog;len;ts.i.ARMA.singlePredict] } // @kind function // @category modelPredict -// @fileoverview Predictions based on an AutoRegressive Integrated Moving Average -// model (ARIMA) -// @param mdl {dict} model parameters returned from fitting of an appropriate model -// @param exog {tab/num[][]/(::)} Exogenous variables, are additional variables which -// required for application of model prediction -// @param len {integer} number of values to be predicted -// @return {float[]} list of predicted values -ts.ARIMA.predict:{[mdl;exog;len] - ts.i.dictCheck[mdl;ts.i.ARIMA.keyList;"mdl"]; - exog:ts.i.predDataCheck[mdl;exog]; +// @desc Predictions based on an AutoRegressive Integrated Moving +// Average model (ARIMA) +// @params config {dictionary} Information returned from `ml.ts.ARIMA.fit` +// including: +// modelInfo - Model coefficients and data needed for future predictions +// predict - A projection allowing for prediction of future values +// @param exog {table|float[]|(::)} Exogenous variables are additional +// variables which may be accounted for to improve the model +// @param len {long} Number of future values to be predicted +// @return {float[]} Predicted values +ts.ARIMA.predict:{[config;exog;len] + model:config`modelInfo; + exog:ts.i.predDataCheck[model;exog]; // Calculate predictions not accounting for differencing - pred:ts.i.predictFunction[mdl;exog;len;ts.i.ARMA.singlePredict]; - dval:count mdl`origd; + preds:ts.i.predictFunction[model;exog;len;ts.i.ARMA.singlePredict]; + dVal:count model`originalData; // Revert data to correct scale (remove differencing if previously applied) - $[dval;dval _dval{sums x}/mdl[`origd],pred;pred] + $[dVal;dVal _dVal{sums x}/model[`originalData],preds;preds] } // @kind function // @category modelPredict -// @fileoverview Predictions based on a Seasonal AutoRegressive Integrated Moving -// Average model (SARIMA) -// @param mdl {dict} 
model parameters returned from fitting of an appropriate model -// @param exog {tab/num[][]/(::)} Exogenous variables, are additional variables which -// required for application of model prediction -// @param len {integer} number of values to be predicted -// @return {float[]} list of predicted values -ts.SARIMA.predict:{[mdl;exog;len] - ts.i.dictCheck[mdl;ts.i.SARIMA.keyList;"mdl"]; - exog:ts.i.predDataCheck[mdl;exog]; +// @desc Predictions based on a Seasonal AutoRegressive Integrated +// Moving Average model (SARIMA) +// @params config {dictionary} Information returned from `ml.ts.SARIMA.fit` +// including: +// modelInfo - Model coefficients and data needed for future predictions +// predict - A projection allowing for prediction of future values +// @param exog {table|float[]|(::)} Exogenous variables are additional +// variables which may be accounted for to improve the model +// @param len {long} Number of future values to be predicted +// @return {float[]} Predicted values +ts.SARIMA.predict:{[config;exog;len] + model:config`modelInfo; + exog:ts.i.predDataCheck[model;exog]; // Calculate predictions not accounting for differencing - preds:$[count raze mdl[`pred_dict]; - ts.i.predictFunction[mdl;exog;len;ts.i.SARMA.singlePredict]; - ts.i.AR.predict[mdl;exog;len] + preds:$[count raze model`paramDict; + ts.i.predictFunction[model;exog;len;ts.i.SARMA.singlePredict]; + ts.i.AR.predict[model;exog;len] ]; // Order of seasonal differencing originally applied - sval:count mdl`origs; - // if seasonal differenced, revert to original - if[sval;preds:ts.i.reverseSeasonDiff[mdl[`origs];preds]]; + dSeasVal:count model`seasonData; + // If seasonal differenced, revert to original + if[dSeasVal;preds:ts.i.reverseSeasonDiff[model`seasonData;preds]]; // Order of differencing originally applied - dval:count mdl`origd; + dVal:count model`originalData; // Revert data to correct scale (remove differencing if previously applied) - $[dval;dval _dval{sums x}/mdl[`origd],preds;preds] + 
$[dVal;dVal _dVal{sums x}/model[`originalData],preds;preds] } - // @kind function // @category modelPredict -// @fileoverview Predictions based on an AutoRegressive Conditional Heteroskedasticity -// model (ARCH) -// @param mdl {dict} model parameters returned from fitting of an appropriate model -// @param len {integer} number of values to be predicted -// @return {float[]} list of predicted values -// Predict future volatility using an ARCH model -/. r > list of predicted values -ts.ARCH.predict:{[mdl;len] - ts.i.dictCheck[mdl;ts.i.ARCH.keyList;"mdl"]; - // predict and return future values - last{x>count y 1}[len;]ts.i.ARCH.singlePredict[mdl`params]/(mdl`resid;()) +// @desc Predictions based on an AutoRegressive Conditional +// Heteroskedasticity model (ARCH) +// @param config {dictionary} Information returned from `.ml.ts.ARCH.fit` +// including: +// modelInfo - Model coefficients and data needed for future predictions +// predict - A projection allowing for prediction of future values +// @param len {long} Number of future values to be predicted +// @return {float[]} Predicted values +ts.ARCH.predict:{[config;len] + model:config`modelInfo; + last{x>count y 1}[len;]ts.i.ARCH.singlePredict + [model`coefficients]/(model`residualVals;()) } diff --git a/timeseries/tests/data/linux/fit/AR1 b/timeseries/tests/data/linux/fit/AR1 index fb1f9935..79eba12d 100644 Binary files a/timeseries/tests/data/linux/fit/AR1 and b/timeseries/tests/data/linux/fit/AR1 differ diff --git a/timeseries/tests/data/linux/fit/AR2 b/timeseries/tests/data/linux/fit/AR2 index 81d32088..c8aa9d52 100644 Binary files a/timeseries/tests/data/linux/fit/AR2 and b/timeseries/tests/data/linux/fit/AR2 differ diff --git a/timeseries/tests/data/linux/fit/AR3 b/timeseries/tests/data/linux/fit/AR3 index 1642ecb5..ddc5e238 100644 Binary files a/timeseries/tests/data/linux/fit/AR3 and b/timeseries/tests/data/linux/fit/AR3 differ diff --git a/timeseries/tests/data/linux/fit/AR4
b/timeseries/tests/data/linux/fit/AR4 index 87bae7c7..85e62b7b 100644 Binary files a/timeseries/tests/data/linux/fit/AR4 and b/timeseries/tests/data/linux/fit/AR4 differ diff --git a/timeseries/tests/data/linux/fit/ARCH1 b/timeseries/tests/data/linux/fit/ARCH1 index 79bfe798..4f891318 100644 Binary files a/timeseries/tests/data/linux/fit/ARCH1 and b/timeseries/tests/data/linux/fit/ARCH1 differ diff --git a/timeseries/tests/data/linux/fit/ARCH2 b/timeseries/tests/data/linux/fit/ARCH2 index 7af3136f..15c59dc5 100644 Binary files a/timeseries/tests/data/linux/fit/ARCH2 and b/timeseries/tests/data/linux/fit/ARCH2 differ diff --git a/timeseries/tests/data/linux/fit/ARIMA1 b/timeseries/tests/data/linux/fit/ARIMA1 index cd34de86..694da18a 100644 Binary files a/timeseries/tests/data/linux/fit/ARIMA1 and b/timeseries/tests/data/linux/fit/ARIMA1 differ diff --git a/timeseries/tests/data/linux/fit/ARIMA2 b/timeseries/tests/data/linux/fit/ARIMA2 index 98e47b41..cce5792a 100644 Binary files a/timeseries/tests/data/linux/fit/ARIMA2 and b/timeseries/tests/data/linux/fit/ARIMA2 differ diff --git a/timeseries/tests/data/linux/fit/ARIMA3 b/timeseries/tests/data/linux/fit/ARIMA3 index 71b9e363..5cd55f86 100644 Binary files a/timeseries/tests/data/linux/fit/ARIMA3 and b/timeseries/tests/data/linux/fit/ARIMA3 differ diff --git a/timeseries/tests/data/linux/fit/ARIMA4 b/timeseries/tests/data/linux/fit/ARIMA4 index a5d9c743..5f07cc33 100644 Binary files a/timeseries/tests/data/linux/fit/ARIMA4 and b/timeseries/tests/data/linux/fit/ARIMA4 differ diff --git a/timeseries/tests/data/linux/fit/ARMA1 b/timeseries/tests/data/linux/fit/ARMA1 index 2a1590cc..d44763c4 100644 Binary files a/timeseries/tests/data/linux/fit/ARMA1 and b/timeseries/tests/data/linux/fit/ARMA1 differ diff --git a/timeseries/tests/data/linux/fit/ARMA2 b/timeseries/tests/data/linux/fit/ARMA2 index 53fdae4b..53233219 100644 Binary files a/timeseries/tests/data/linux/fit/ARMA2 and b/timeseries/tests/data/linux/fit/ARMA2 
differ diff --git a/timeseries/tests/data/linux/fit/ARMA3 b/timeseries/tests/data/linux/fit/ARMA3 index 17160976..d8ce2295 100644 Binary files a/timeseries/tests/data/linux/fit/ARMA3 and b/timeseries/tests/data/linux/fit/ARMA3 differ diff --git a/timeseries/tests/data/linux/fit/ARMA4 b/timeseries/tests/data/linux/fit/ARMA4 index 2e5f7ceb..eaccc354 100644 Binary files a/timeseries/tests/data/linux/fit/ARMA4 and b/timeseries/tests/data/linux/fit/ARMA4 differ diff --git a/timeseries/tests/data/linux/fit/SARIMA1 b/timeseries/tests/data/linux/fit/SARIMA1 index 34f4cd02..566a8b31 100644 Binary files a/timeseries/tests/data/linux/fit/SARIMA1 and b/timeseries/tests/data/linux/fit/SARIMA1 differ diff --git a/timeseries/tests/data/linux/fit/SARIMA2 b/timeseries/tests/data/linux/fit/SARIMA2 index 68924cba..bca1afaa 100644 Binary files a/timeseries/tests/data/linux/fit/SARIMA2 and b/timeseries/tests/data/linux/fit/SARIMA2 differ diff --git a/timeseries/tests/data/linux/fit/SARIMA3 b/timeseries/tests/data/linux/fit/SARIMA3 index a4cbc49a..f1654f87 100644 Binary files a/timeseries/tests/data/linux/fit/SARIMA3 and b/timeseries/tests/data/linux/fit/SARIMA3 differ diff --git a/timeseries/tests/data/linux/fit/SARIMA4 b/timeseries/tests/data/linux/fit/SARIMA4 index 3400a955..7e0bd5a3 100644 Binary files a/timeseries/tests/data/linux/fit/SARIMA4 and b/timeseries/tests/data/linux/fit/SARIMA4 differ diff --git a/timeseries/tests/data/misc/aicScore1 b/timeseries/tests/data/misc/aicScore1 index b8d2a1ae..8d18cc8d 100644 Binary files a/timeseries/tests/data/misc/aicScore1 and b/timeseries/tests/data/misc/aicScore1 differ diff --git a/timeseries/tests/data/misc/aicScore2 b/timeseries/tests/data/misc/aicScore2 index 90981b9a..d8334523 100644 Binary files a/timeseries/tests/data/misc/aicScore2 and b/timeseries/tests/data/misc/aicScore2 differ diff --git a/timeseries/tests/data/misc/aicScore3 b/timeseries/tests/data/misc/aicScore3 index ceeeac11..e314b912 100644 Binary files 
a/timeseries/tests/data/misc/aicScore3 and b/timeseries/tests/data/misc/aicScore3 differ diff --git a/timeseries/tests/data/misc/aicScore4 b/timeseries/tests/data/misc/aicScore4 index d3339401..b8d8868f 100644 Binary files a/timeseries/tests/data/misc/aicScore4 and b/timeseries/tests/data/misc/aicScore4 differ diff --git a/timeseries/tests/data/windows/fit/AR1 b/timeseries/tests/data/windows/fit/AR1 index 47e50d0e..5c37aca7 100644 Binary files a/timeseries/tests/data/windows/fit/AR1 and b/timeseries/tests/data/windows/fit/AR1 differ diff --git a/timeseries/tests/data/windows/fit/AR2 b/timeseries/tests/data/windows/fit/AR2 index bdd39803..a222bcc5 100644 Binary files a/timeseries/tests/data/windows/fit/AR2 and b/timeseries/tests/data/windows/fit/AR2 differ diff --git a/timeseries/tests/data/windows/fit/AR3 b/timeseries/tests/data/windows/fit/AR3 index 1e9da8eb..5b2a3ae0 100644 Binary files a/timeseries/tests/data/windows/fit/AR3 and b/timeseries/tests/data/windows/fit/AR3 differ diff --git a/timeseries/tests/data/windows/fit/AR4 b/timeseries/tests/data/windows/fit/AR4 index 51d34ef1..8b29b6a6 100644 Binary files a/timeseries/tests/data/windows/fit/AR4 and b/timeseries/tests/data/windows/fit/AR4 differ diff --git a/timeseries/tests/data/windows/fit/ARCH1 b/timeseries/tests/data/windows/fit/ARCH1 index 79bfe798..4f891318 100644 Binary files a/timeseries/tests/data/windows/fit/ARCH1 and b/timeseries/tests/data/windows/fit/ARCH1 differ diff --git a/timeseries/tests/data/windows/fit/ARCH2 b/timeseries/tests/data/windows/fit/ARCH2 index 27d88e56..316f25ba 100644 Binary files a/timeseries/tests/data/windows/fit/ARCH2 and b/timeseries/tests/data/windows/fit/ARCH2 differ diff --git a/timeseries/tests/data/windows/fit/ARIMA1 b/timeseries/tests/data/windows/fit/ARIMA1 index 9e5e7986..1d6761d0 100644 Binary files a/timeseries/tests/data/windows/fit/ARIMA1 and b/timeseries/tests/data/windows/fit/ARIMA1 differ diff --git a/timeseries/tests/data/windows/fit/ARIMA2 
b/timeseries/tests/data/windows/fit/ARIMA2 index 56af3814..a575e52e 100644 Binary files a/timeseries/tests/data/windows/fit/ARIMA2 and b/timeseries/tests/data/windows/fit/ARIMA2 differ diff --git a/timeseries/tests/data/windows/fit/ARIMA3 b/timeseries/tests/data/windows/fit/ARIMA3 index 7d5943fe..82830ebc 100644 Binary files a/timeseries/tests/data/windows/fit/ARIMA3 and b/timeseries/tests/data/windows/fit/ARIMA3 differ diff --git a/timeseries/tests/data/windows/fit/ARIMA4 b/timeseries/tests/data/windows/fit/ARIMA4 index e025ec69..8ed732a2 100644 Binary files a/timeseries/tests/data/windows/fit/ARIMA4 and b/timeseries/tests/data/windows/fit/ARIMA4 differ diff --git a/timeseries/tests/data/windows/fit/ARMA1 b/timeseries/tests/data/windows/fit/ARMA1 index 8e859e20..95b14c1f 100644 Binary files a/timeseries/tests/data/windows/fit/ARMA1 and b/timeseries/tests/data/windows/fit/ARMA1 differ diff --git a/timeseries/tests/data/windows/fit/ARMA2 b/timeseries/tests/data/windows/fit/ARMA2 index 17e52598..9e2511b3 100644 Binary files a/timeseries/tests/data/windows/fit/ARMA2 and b/timeseries/tests/data/windows/fit/ARMA2 differ diff --git a/timeseries/tests/data/windows/fit/ARMA3 b/timeseries/tests/data/windows/fit/ARMA3 index 63248b87..907229f0 100644 Binary files a/timeseries/tests/data/windows/fit/ARMA3 and b/timeseries/tests/data/windows/fit/ARMA3 differ diff --git a/timeseries/tests/data/windows/fit/ARMA4 b/timeseries/tests/data/windows/fit/ARMA4 index 99d9305c..60d682d9 100644 Binary files a/timeseries/tests/data/windows/fit/ARMA4 and b/timeseries/tests/data/windows/fit/ARMA4 differ diff --git a/timeseries/tests/data/windows/fit/SARIMA1 b/timeseries/tests/data/windows/fit/SARIMA1 index 2464585f..3234cc56 100644 Binary files a/timeseries/tests/data/windows/fit/SARIMA1 and b/timeseries/tests/data/windows/fit/SARIMA1 differ diff --git a/timeseries/tests/data/windows/fit/SARIMA2 b/timeseries/tests/data/windows/fit/SARIMA2 index 9551519a..670e426d 100644 Binary files 
a/timeseries/tests/data/windows/fit/SARIMA2 and b/timeseries/tests/data/windows/fit/SARIMA2 differ diff --git a/timeseries/tests/data/windows/fit/SARIMA3 b/timeseries/tests/data/windows/fit/SARIMA3 index 4640a6d7..0f4e26aa 100644 Binary files a/timeseries/tests/data/windows/fit/SARIMA3 and b/timeseries/tests/data/windows/fit/SARIMA3 differ diff --git a/timeseries/tests/data/windows/fit/SARIMA4 b/timeseries/tests/data/windows/fit/SARIMA4 index 7f596488..823c37f3 100644 Binary files a/timeseries/tests/data/windows/fit/SARIMA4 and b/timeseries/tests/data/windows/fit/SARIMA4 differ diff --git a/timeseries/tests/fit.t b/timeseries/tests/fit.t index 31eee812..8f0178bc 100644 --- a/timeseries/tests/fit.t +++ b/timeseries/tests/fit.t @@ -1,10 +1,13 @@ \l p.q \l ml.q -\l util/util.q -\l optimize/optim.q +\l util/utils.q +\l util/utilities.q +\l optimize/utils.q +\l optimize/optimize.q \l timeseries/utils.q \l timeseries/fit.q -\l fresh/extract.q +\l timeseries/predict.q +\l fresh/init.q \l timeseries/tests/failMessage.q -1"Warning: These tests may cause varying results for Linux vs Windows users"; @@ -26,35 +29,35 @@ fileList:`AR1`AR2`AR3`AR4`ARCH1`ARCH2`ARMA1`ARMA2`ARMA3`ARMA4`ARIMA1`ARIMA2, {load hsym`$":timeseries/tests/data/",y,"fit/",string x}[;os]each fileList; // AR tests -.ml.ts.AR.fit[endogInt ;() ;1;0b]~AR1 -.ml.ts.AR.fit[endogInt ;exogFloat;3;1b]~AR2 -.ml.ts.AR.fit[endogFloat;exogInt ;2;1b]~AR3 -.ml.ts.AR.fit[endogFloat;exogMixed;4;0b]~AR4 +.ml.ts.AR.fit[endogInt ;() ;1;0b][`modelInfo]~AR1`modelInfo +.ml.ts.AR.fit[endogInt ;exogFloat;3;1b][`modelInfo]~AR2`modelInfo +.ml.ts.AR.fit[endogFloat;exogInt ;2;1b][`modelInfo]~AR3`modelInfo +.ml.ts.AR.fit[endogFloat;exogMixed;4;0b][`modelInfo]~AR4`modelInfo failingTest[.ml.ts.AR.fit;(endogInt ;5000#exogInt ;1;1b);0b;"Endog length less than length"] failingTest[.ml.ts.AR.fit;(endogFloat;5000#exogFloat;1;1b);0b;"Endog length less than length"] // ARMA tests -.ml.ts.ARMA.fit[endogInt ;() ;1;2;1b]~ARMA1 
-.ml.ts.ARMA.fit[endogInt ;exogFloat;2;1;0b]~ARMA2 -.ml.ts.ARMA.fit[endogFloat;exogInt ;1;1;0b]~ARMA3 -.ml.ts.ARMA.fit[endogFloat;exogMixed;3;2;1b]~ARMA4 +.ml.ts.ARMA.fit[endogInt ;() ;1;2;1b][`modelInfo]~ARMA1`modelInfo +.ml.ts.ARMA.fit[endogInt ;exogFloat;2;1;0b][`modelInfo]~ARMA2`modelInfo +.ml.ts.ARMA.fit[endogFloat;exogInt ;1;1;0b][`modelInfo]~ARMA3`modelInfo +.ml.ts.ARMA.fit[endogFloat;exogMixed;3;2;1b][`modelInfo]~ARMA4`modelInfo failingTest[.ml.ts.ARMA.fit;(endogInt ;5000#exogInt ;2;1;0b);0b;"Endog length less than length"] failingTest[.ml.ts.ARMA.fit;(endogFloat;5000#exogFloat;2;1;0b);0b;"Endog length less than length"] // ARCH tests -.ml.ts.ARCH.fit[residInt ;3]~ARCH1 -.ml.ts.ARCH.fit[residFloat;1]~ARCH2 +.ml.ts.ARCH.fit[residInt ;3][`modelInfo]~ARCH1`modelInfo +.ml.ts.ARCH.fit[residFloat;1][`modelInfo]~ARCH2`modelInfo // ARIMA tests -.ml.ts.ARIMA.fit[endogInt ;() ;2;1;2;0b]~ARIMA1 -.ml.ts.ARIMA.fit[endogInt ;exogFloat;1;1;1;1b]~ARIMA2 -.ml.ts.ARIMA.fit[endogFloat;exogInt ;3;0;1;1b]~ARIMA3 -.ml.ts.ARIMA.fit[endogFloat;exogMixed;1;2;2;0b]~ARIMA4 +.ml.ts.ARIMA.fit[endogInt ;() ;2;1;2;0b][`modelInfo]~ARIMA1`modelInfo +.ml.ts.ARIMA.fit[endogInt ;exogFloat;1;1;1;1b][`modelInfo]~ARIMA2`modelInfo +.ml.ts.ARIMA.fit[endogFloat;exogInt ;3;0;1;1b][`modelInfo]~ARIMA3`modelInfo +.ml.ts.ARIMA.fit[endogFloat;exogMixed;1;2;2;0b][`modelInfo]~ARIMA4`modelInfo failingTest[.ml.ts.ARIMA.fit;(endogInt ;5000#exogInt ;1;1;1;1b);0b;"Endog length less than length"] failingTest[.ml.ts.ARIMA.fit;(endogFloat;5000#exogFloat;1;1;1;1b);0b;"Endog length less than length"] @@ -66,10 +69,10 @@ s2:`P`D`Q`m!2 1 0 10 s3:`P`D`Q`m!2 1 1 30 s4:`P`D`Q`m!0 1 1 20 -.ml.ts.SARIMA.fit[endogInt ;() ;1;1;1;0b;s1]~SARIMA1 -.ml.ts.SARIMA.fit[endogInt ;exogFloat;1;0;1;1b;s2]~SARIMA2 -.ml.ts.SARIMA.fit[endogFloat;exogInt ;1;2;0;0b;s3]~SARIMA3 -.ml.ts.SARIMA.fit[endogFloat;exogMixed;2;1;1;0b;s4]~SARIMA4 +.ml.ts.SARIMA.fit[endogInt ;() ;1;1;1;0b;s1][`modelInfo]~SARIMA1`modelInfo +.ml.ts.SARIMA.fit[endogInt 
;exogFloat;1;0;1;1b;s2][`modelInfo]~SARIMA2`modelInfo +.ml.ts.SARIMA.fit[endogFloat;exogInt ;1;2;0;0b;s3][`modelInfo]~SARIMA3`modelInfo +.ml.ts.SARIMA.fit[endogFloat;exogMixed;2;1;1;0b;s4][`modelInfo]~SARIMA4`modelInfo failingTest[.ml.ts.SARIMA.fit;(endogInt ;5000#exogInt ;2;0;1;1b;s1);0b;"Endog length less than length"] failingTest[.ml.ts.SARIMA.fit;(endogFloat;5000#exogFloat;2;0;1;1b;s1);0b;"Endog length less than length"] diff --git a/timeseries/tests/misc.t b/timeseries/tests/misc.t index 6629696b..a907000d 100644 --- a/timeseries/tests/misc.t +++ b/timeseries/tests/misc.t @@ -5,7 +5,7 @@ \l timeseries/fit.q \l timeseries/predict.q \l timeseries/tests/failMessage.q -\l fresh/extract.q +\l fresh/init.q \S 42 @@ -39,7 +39,7 @@ fileList:`stationarityTab1`stationarityTab2`aicScore1`aicScore2`aicScore3`aicSco // Set up parameters dictKeys :`endog`exog -paramKeys:`p`d`q`tr +paramKeys:`p`d`q`trend trainDict1:dictKeys!(endogInt ;() ) trainDict2:dictKeys!(endogInt ;exogFloat) diff --git a/timeseries/tests/pred.t b/timeseries/tests/pred.t index df2d2dad..303a9019 100644 --- a/timeseries/tests/pred.t +++ b/timeseries/tests/pred.t @@ -24,49 +24,48 @@ loadFunc[os;"pred/pred"]each fileList; // AR tests -.ml.ts.AR.predict[AR1;();1000]~predAR1 -.ml.ts.AR.predict[AR2;exogFloatFuture;1000]~predAR2 -.ml.ts.AR.predict[AR3;exogIntFuture;1000]~predAR3 -.ml.ts.AR.predict[AR4;exogMixedFuture;1000]~predAR4 +AR1.predict[() ;1000]~predAR1 +AR2.predict[exogFloatFuture;1000]~predAR2 +AR3.predict[exogIntFuture ;1000]~predAR3 +AR4.predict[exogMixedFuture;1000]~predAR4 -failingTest[.ml.ts.AR.predict;(AR2;-1_'exogFloatFuture;1000);0b;"Test exog length does not match train exog length"] -failingTest[.ml.ts.AR.predict;(AR3;-1_'exogIntFuture ;1000);0b;"Test exog length does not match train exog length"] +failingTest[AR2.predict;(-1_'exogFloatFuture;1000);0b;"Test exog length does not match train exog length"] +failingTest[AR3.predict;(-1_'exogIntFuture ;1000);0b;"Test exog length does not match 
train exog length"] // ARCH tests -.ml.ts.ARCH.predict[ARCH1;1000]~predARCH1 -.ml.ts.ARCH.predict[ARCH2;1000]~predARCH2 +ARCH1.predict[1000]~predARCH1 +ARCH2.predict[1000]~predARCH2 // ARMA tests -.ml.ts.ARMA.predict[ARMA1;();1000]~predARMA1 -.ml.ts.ARMA.predict[ARMA2;exogFloatFuture;1000]~predARMA2 -.ml.ts.ARMA.predict[ARMA3;exogIntFuture;1000]~predARMA3 -.ml.ts.ARMA.predict[ARMA4;exogMixedFuture;1000]~predARMA4 +ARMA1.predict[();1000]~predARMA1 +ARMA2.predict[exogFloatFuture;1000]~predARMA2 +ARMA3.predict[exogIntFuture;1000]~predARMA3 +ARMA4.predict[exogMixedFuture;1000]~predARMA4 + +failingTest[ARMA2.predict;(-1_'exogFloatFuture;1000);0b;"Test exog length does not match train exog length"] +failingTest[ARMA3.predict;(-1_'exogIntFuture ;1000);0b;"Test exog length does not match train exog length"] -failingTest[.ml.ts.ARMA.predict;(ARMA2;-1_'exogFloatFuture;1000);0b;"Test exog length does not match train exog length"] -failingTest[.ml.ts.ARMA.predict;(ARMA3;-1_'exogIntFuture ;1000);0b;"Test exog length does not match train exog length"] -failingTest[.ml.ts.ARMA.predict;(AR1 ;() ;1000);0b;"The following required dictionary keys for 'mdl' are not provided: q_param, resid, estresid, pred_dict"] // ARIMA tests -.ml.ts.ARIMA.predict[ARIMA1;();1000]~predARIMA1 -.ml.ts.ARIMA.predict[ARIMA2;exogFloatFuture;1000]~predARIMA2 -.ml.ts.ARIMA.predict[ARIMA3;exogIntFuture;1000]~predARIMA3 -.ml.ts.ARIMA.predict[ARIMA4;exogMixedFuture;1000]~predARIMA4 +ARIMA1.predict[() ;1000]~predARIMA1 +ARIMA2.predict[exogFloatFuture;1000]~predARIMA2 +ARIMA3.predict[exogIntFuture ;1000]~predARIMA3 +ARIMA4.predict[exogMixedFuture;1000]~predARIMA4 -failingTest[.ml.ts.ARIMA.predict;(ARIMA2;-1_'exogFloatFuture;1000);0b;"Test exog length does not match train exog length"] -failingTest[.ml.ts.ARIMA.predict;(ARIMA3;-1_'exogIntFuture ;1000);0b;"Test exog length does not match train exog length"] -failingTest[.ml.ts.ARIMA.predict;(ARMA4 ;exogMixedFuture ;1000);0b;"The following required dictionary keys 
for 'mdl' are not provided: origd"] +failingTest[ARIMA2.predict;(-1_'exogFloatFuture;1000);0b;"Test exog length does not match train exog length"] +failingTest[ARIMA3.predict;(-1_'exogIntFuture ;1000);0b;"Test exog length does not match train exog length"] // SARIMA tests -.ml.ts.SARIMA.predict[SARIMA1;();1000]~predSARIMA1 -.ml.ts.SARIMA.predict[SARIMA2;exogFloatFuture;1000]~predSARIMA2 -.ml.ts.SARIMA.predict[SARIMA3;exogIntFuture;1000]~predSARIMA3 -.ml.ts.SARIMA.predict[SARIMA4;exogMixedFuture;1000]~predSARIMA4 +SARIMA1.predict[() ;1000]~predSARIMA1 +SARIMA2.predict[exogFloatFuture;1000]~predSARIMA2 +SARIMA3.predict[exogIntFuture ;1000]~predSARIMA3 +SARIMA4.predict[exogMixedFuture;1000]~predSARIMA4 + +failingTest[SARIMA2.predict;(-1_'exogFloatFuture;1000);0b;"Test exog length does not match train exog length"] +failingTest[SARIMA3.predict;(-1_'exogIntFuture ;1000);0b;"Test exog length does not match train exog length"] -failingTest[.ml.ts.SARIMA.predict;(SARIMA2;-1_'exogFloatFuture;1000);0b;"Test exog length does not match train exog length"] -failingTest[.ml.ts.SARIMA.predict;(SARIMA3;-1_'exogIntFuture ;1000);0b;"Test exog length does not match train exog length"] -failingTest[.ml.ts.SARIMA.predict;(ARIMA2 ;exogFloatFuture ;1000);0b;"The following required dictionary keys for 'mdl' are not provided: origs, P_param, Q_param"] diff --git a/timeseries/utils.q b/timeseries/utils.q index b77152a9..1d3a881d 100644 --- a/timeseries/utils.q +++ b/timeseries/utils.q @@ -1,92 +1,109 @@ +// timeseries/utils.q - Timeseries Utilities +// Copyright (c) 2021 Kx Systems Inc +// +// AR/ARMA/SARMA model utilities + \d .ml -// AR/ARMA/SARMA model utilities // @private // @kind function // @category fitUtility -// @fileoverview ARMA model generation -// @param endog {num[]} Endogenous variable (time-series) from which to build a model -// this is the target variable from which a value is to be predicted -// @param exog {tab} Exogenous variables, are additional variables which -// 
may be accounted for to improve the model -// @param params {dict} parameter sets used to fit the ARMA model -// @return {dict} dictionary containing all information required to make predictions -// using an ARMA based model +// @desc ARMA model generation +// @param endog {number[]} Endogenous variable (time-series) from which to +// build a model. This is the target variable from which a value is to be +// predicted +// @param exog {float[]|(::)} Exogenous variables are additional variables +// which may be accounted for to improve the model, if (::)/() +// this will be ignored +// @param params {dictionary} Parameter sets used to fit the ARMA model +// @return {dictionary} Dictionary containing all information required to make +// predictions using an ARMA based model ts.i.ARMA.model:{[endog;exog;params] n:1+max params`p`q; errCoeff:ts.i.estimateErrorCoeffs[endog;exog;params;n]; - ARMAparams:ts.i.ARMA.parameters[endog;errCoeff`coeffs;params;errCoeff`errors;n]; - mdlKeys:`params`tr_param`exog_param`p_param`q_param`lags`resid`estresid`pred_dict; - mdlParams:(errCoeff[`coeffs](::;params[`tr]-1;params[`tr]+til count exog 0)),ARMAparams; - mdlKeys!mdlParams + ARMAvals:ts.i.ARMA.sortValues[endog;;params;;n] . 
errCoeff`coeffs`errors; + dictKeys:`coefficients`trendCoeff`exogCoeff`pCoeff`qCoeff`lagVals, + `residualVals`residualCoeffs`paramDict; + dictVals:(errCoeff[`coeffs](::;params[`trend]-1; + params[`trend]+til count exog 0)),ARMAvals; + dictKeys!dictVals } // @private // @kind function // @category fitUtility -// @fileoverview SARMA model generation -// @param endog {num[]} Endogenous variable (time-series) from which to build a model -// this is the target variable from which a value is to be predicted -// @param exog {tab} Exogenous variables, are additional variables which -// may be accounted for to improve the model -// @param params {dict} parameter sets used to fit the SARMA model -// @return {dict} dictionary containing all information required to make predictions -// using an SARMA based model +// @desc SARMA model generation +// @param endog {number[]} Endogenous variable (time-series) from which to +// build a model. This is the target variable from which a value is to be +// predicted +// @param exog {float[]|(::)} Exogenous variables are additional variables +// which may be accounted for to improve the model, if (::)/() +// this will be ignored +// @param params {dictionary} Parameter sets used to fit the SARMA model +// @return {dictionary} Dictionary containing all information required to make +// predictions using an SARMA based model ts.i.SARMA.model:{[endog;exog;params] n:1+max params`p`q; errCoeff:ts.i.estimateErrorCoeffs[endog;exog;params;n]; - coeffs:ts.i.SARMA.coefficients[endog;exog;errCoeff[`errors]`err;errCoeff`coeffs;params]; - SARMAparams:ts.i.SARMA.parameters[endog;coeffs;params;errCoeff`errors;n]; - modelKeys:`params`tr_param`exog_param`p_param`q_param, - `P_param`Q_param`lags`resid`estresid`pred_dict; - modelParams:(coeffs(::;params[`tr]-1;params[`tr]+til count exog 0)),SARMAparams; - modelKeys!modelParams + coeffs:ts.i.SARMA.coefficients[endog;exog;errCoeff[`errors]`errorVals; + errCoeff`coeffs;params]; + 
SARMAvals:ts.i.SARMA.sortValues[endog;coeffs;params;errCoeff`errors;n]; + dictKeys:`coefficients`trendCoeff`exogCoeff`pCoeff`qCoeff, + `PCoeff`QCoeff`lagVals`residualVals`residualCoeffs`paramDict; + dictVals:(coeffs(::;params[`trend]-1;params[`trend]+til count exog 0)), + SARMAvals; + dictKeys!dictVals } // @private // @kind function // @category fitUtility -// @fileoverview Estimate error coefficients -// @param endog {num[]} Endogenous variable (time-series) from which to build a model -// this is the target variable from which a value is to be predicted -// @param exog {tab} Exogenous variables, are additional variables which -// may be accounted for to improve the model -// @param params {dict} parameter sets used to estimate error coefficients -// @param n {integer} number of error coefficients to estimate -// @return {dict} dictionary returning coefficients and errors required for -// model generation +// @desc Estimate error coefficients +// @param endog {number[]} Endogenous variable (time-series) from which to +// build a model. 
This is the target variable from which a value is to be +// predicted +// @param exog {float[]|(::)} Exogenous variables are additional variables +// which may be accounted for to improve the model, if (::)/() +// this will be ignored +// @param params {dictionary} Parameter sets used to estimate coefficients +// @param n {int} Number of error coefficients to estimate +// @return {dictionary} Dictionary returning coefficients and errors required +// for model generation ts.i.estimateErrorCoeffs:{[endog;exog;params;n] - errs :ts.i.estimateErrors[endog;exog;n]; - coeff:ts.i.estimateParams[endog;exog;errs`err;params]; - `errors`coeffs!(errs;coeff) + errors:ts.i.estimateErrors[endog;exog;n]; + coeffs:ts.i.estimateCoefficients[endog;exog;errors`errorVals;params]; + `errors`coeffs!(errors;coeffs) } // @private // @kind function // @category fitUtility -// @fileoverview Estimate ARMA model parameters using ordinary least squares -// @param endog {num[]} Endogenous variable (time-series) from which to build a model -// this is the target variable from which a value is to be predicted -// @param exog {tab} Exogenous variables, are additional variables which -// may be accounted for to improve the model -// @param errors {dict} errors estimated using `i.estimateErrorCoeffs` -// @param params {dict} parameter sets used to estimate model parameters -// @return {float[]} estimated ARMA model parameters -ts.i.estimateParams:{[endog;exog;errors;params] +// @desc Estimate ARMA model parameters using ordinary least squares +// @param endog {number[]} Endogenous variable (time-series) from which to +// build a model. 
This is the target variable from which a value is to be +// predicted +// @param exog {float[]|(::)} Exogenous variables are additional variables +// which may be accounted for to improve the model, if (::)/() +// this will be ignored +// @param errors {dictionary} Errors estimated using `i.estimateErrorCoeffs` +// @param params {dictionary} Parameter sets used to estimate model +// coefficients +// @return {float[]} Estimated ARMA model coefficients +ts.i.estimateCoefficients:{[endog;exog;errors;params] // Create lagged matrices for the endogenous variable and residual errors - endogm:ts.i.lagMatrix[endog ;params`p]; - resid :ts.i.lagMatrix[errors;params`q]; + endogMatrix:ts.i.lagMatrix[endog ;params`p]; + residMatrix:ts.i.lagMatrix[errors;params`q]; // Collect the data needed for estimation - vals:(exog;endogm;resid); + values:(exog;endogMatrix;residMatrix); // How many data points are required m:neg min raze(count[endog]-params[`p`P]),count[errors]-params[`q`Q]; - x:(,'/)m#'vals; - // add seasonality components + x:(,'/)m#'values; + // Add seasonality components if[not 0N~params[`P];x:x,'(m #flip[params[`P]xprev\:endog])]; if[not 0N~params[`Q];x:x,'(m #flip[params[`Q]xprev\:errors])]; // If required add a trend line variable - if[params`tr;x:1f,'x]; + if[params`trend;x:1f,'x]; y:m#endog; first enlist[y]lsq flip x } @@ -94,373 +111,423 @@ ts.i.estimateParams:{[endog;exog;errors;params] // @private // @kind function // @category fitUtility -// @fileoverview Durbin Levinson function to calculate the coefficients +// @desc Durbin Levinson function to calculate the coefficients // in a pure AR model with no trend for a univariate dataset // Implementation can be found here // https://www.stat.purdue.edu/~zhanghao/STAT520/handout/DurbinLevHandout.pdf -// @param data {float[][]} dataset from which to estimate the coefficients -// @param lags {integer} order of the AR(p) model being fit +// @param data {float[]} Dataset from which to estimate the coefficients +// @param 
p {int} Order of the AR(p) model being fit // @return {float[]} AR(p) coefficients for specified lagged value -ts.i.durbinLevinson:{[data;lags] - // cast to float +ts.i.durbinLevinson:{[data;p] data:"f"$data; - mat:(1+lags;1+lags)#0f; - vec:(1+lags)#0f; - mat[1;1]:ts.i.autoCorrFunction[data;1]; - vec[1] :var[data]*(1-xexp[mat[1;1];2]); - reverse 1_last first(lags-1){[data;d] - mat:d[0];vec:d[1];n:d[2]; - k:n+1; - dval:sum mat[n;1+til n]mmu ts.i.lagCovariance[data]each k-1+til n; - mat[k;k]:(ts.i.lagCovariance[data;k]-dval)%vec[n]; - upd:{[data;n;mat;j]mat[n;j]-(mat[n+1;n+1]*mat[n;1+n-j])}[data;n;mat]each 1+til n; - mat[k;1+til n]:upd; - vec[k]:vec[n]*(1-xexp[mat[k;k];2]); - (mat;vec;n+1) - }[data]/(mat;vec;1) + matrix:(1+p;1+p)#0f; + vector:(1+p)#0f; + matrix[1;1]:ts.i.autoCorrFunction[data;1]; + vector[1] :var[data]*(1-xexp[matrix[1;1];2]); + estParams:first(p-1) ts.i.durbinLevinsonEstimate[data]/(matrix;vector;1); + reverse 1_last estParams } +// @private +// @kind function +// @category fitUtility +// @desc Recursive function to estimate the coefficients +// in a pure AR model with no trend for a univariate dataset +// @param data {float[]} Dataset from which to estimate the coefficients +// @param info {number[]} Matrix, vector and n information +// @return {float[]} New matrix,vector and n information +ts.i.durbinLevinsonEstimate:{[data;info] + matrix:info 0;vector:info 1;n:info 2; + k:n+1; + dVal:sum matrix[n;1+til n]mmu ts.i.lagCovariance[data]each k-1+til n; + matrix[k;k]:(ts.i.lagCovariance[data;k]-dVal)%vector n; + updateMatrix:ts.i.durbinUpdateMatrix[data;n;matrix]each 1+til n; + matrix[k;1+til n]:updateMatrix; + vector[k]:vector[n]*(1-xexp[matrix[k;k];2]); + (matrix;vector;n+1) + } // @private // @kind function // @category fitUtility -// @fileoverview Estimate residual errors for the Hannan Riessanan method -// @param endog {num[]} Endogenous variable (time-series) from which to build a model -// this is the target variable from which a value is to be 
predicted -// @param exog {tab/num[][]} Exogenous variables, are additional variables which -// may be accounted for to improve the model -// @param lags {integer} The number/order of time lags of the model -// @return {dict} Residual errors and parameters for calculation of these parameters -ts.i.estimateErrors:{[endog;exog;lags] - // Construct an AR model to estimate the residual error parameters - estresid:ts.AR.fit[endog;exog;lags;0b]`params; - // Convert the endogenous variable to lagged matrix - endogm:ts.i.lagMatrix[endog;lags]; - // Predict future values based on estimations from AR model and use to estimate error - err:(lags _endog)-((neg[count endogm]#exog),'endogm)mmu estresid; - `params`err!(estresid;err) +// @desc Update matrix values for calculating AR coefficients using +// Durbin Levinson method +// @param data {float[]} Dataset from which to estimate the coefficients +// @param n {int} Number of iterations +// @param matrix {float[]} Matrix used to calculate coefficients +// @param j {int} Column in the matrix +// @return {float} Updated matrix value for column j +ts.i.durbinUpdateMatrix:{[data;n;matrix;j] + matrix[n;j]-(matrix[n+1;n+1]*matrix[n;1+n-j]) } +// @private +// @kind function +// @category fitUtility +// @desc Estimate residual errors using the Hannan-Rissanen method +// @param endog {number[]} Endogenous variable (time-series) from which to +// build a model.
This is the target variable from which a value is to be +// predicted +// @param exog {float[]|(::)} Exogenous variables are additional variables +// which may be accounted for to improve the model; ignored if (::)/() +// @param p {int} The number/order of time lags of the model +// @return {dictionary} Residual errors and parameters for calculation of these +// parameters +ts.i.estimateErrors:{[endog;exog;p] + // Construct an AR model to estimate the residual error coeffs + estCoeffs:ts.AR.fit[endog;exog;p;0b][`modelInfo;`coefficients]; + // Convert the endogenous variable to lagged matrix + endogMatrix:ts.i.lagMatrix[endog;p]; + // Predict future values based on estimations from AR model and use to + // estimate the error + errors:(p _endog)-((neg[count endogMatrix]#exog),'endogMatrix)mmu estCoeffs; + `estCoeffs`errorVals!(estCoeffs;errors) + } // @private // @kind function // @category fitUtility -// @fileoverview Estimate coefficients as starting points to calculate the sarima coeffs -// @param endog {num[]} Endogenous variable (time-series) from which to build a model -// this is the target variable from which a value is to be predicted -// @param exog {tab} Exogenous variables, are additional variables which -// may be accounted for to improve the model -// @param resid {num[][]} residual errors estimated using i.estimateErrorCoeffs -// @param coeff {num[][]} Estimated coefficients for ARMA model using OLS -// @param params {dict} Information on seasonal and non seasonal lags to be accounted for -// @return {dict} updated optimized coefficients for SARMA model -ts.i.SARMA.coefficients:{[endog;exog;resid;coeff;params] - // data length to use - lenq:count[resid]-max raze params[`q`Q`seas_add_Q]; - lenp:count[endog]-max raze params[`p`P`seas_add_P]; - // prediction values - params[`real]:#[m:neg min lenp,lenq;endog]; - // get lagged values +// @desc Estimate coefficients as starting points to calculate the +// SARIMA coeffs +// @param endog {number[]} Endogenous
variable (time-series) from which to +// build a model. This is the target variable from which a value is to be +// predicted +// @param exog {float[]|(::)} Exogenous variables are additional variables +// which may be accounted for to improve the model, if (::)/() +// this will be ignored +// @param residuals {number[]} Residual errors estimated using +// i.estimateErrorCoeffs +// @param coeffs {number[]} Estimated coefficients for ARMA model using OLS +// @param params {dictionary} Information on seasonal and non seasonal lags to +// be accounted for +// @return {dictionary} Updated optimized coefficients for SARMA model +ts.i.SARMA.coefficients:{[endog;exog;residuals;coeffs;params] + // Data length to use + qLen:count[residuals]-max raze params`q`Q`additionalQ; + pLen:count[endog]-max raze params`p`P`additionalP; + // Prediction values + params[`true]:#[m:neg min pLen,qLen;endog]; + // Get lagged values lagVal:ts.i.lagMatrix[endog;params`p]; - // get seasonal lag values + // Get seasonal lag values seasLag:flip params[`P]xprev\:endog; - // get additional seasonal lag values - params[`seas_lag_add]:$[params[`p]&min count params`P; - m#flip params[`seas_add_P]xprev\:endog; + // Get additional seasonal lag values + params[`additionalLags]:$[params[`p]&count params`P; + m#flip params[`additionalP]xprev\:endog; 2#0f ]; - // get resid vals - residVal:ts.i.lagMatrix[resid;params`q]; - seasResid:flip params[`Q]xprev\:resid; - params[`seas_resid_add]:$[params[`q]&min count params`Q; - m#flip params[`seas_add_Q]xprev\:resid; + // Get resid vals + residVal:ts.i.lagMatrix[residuals;params`q]; + seasResid:flip params[`Q]xprev\:residuals; + params[`additionalResiduals]:$[params[`q]&count params`Q; + m#flip params[`additionalQ]xprev\:residuals; 2#0f ]; - // normal arima vals + // Normal arima vals vals:(exog;lagVal;residVal;seasLag;seasResid); - params[`norm_mat]:(,'/)m#'vals; - optD:`xk`args!(coeff;params); - // use optimizer function to improve SARMA coefficients - 
.ml.optimize.BFGS[ts.i.SARMA.maxLikelihood;coeff;params;::]`xVals + params[`matrix]:(,'/)m#'vals; + // Use optimizer function to improve SARMA coefficients + .ml.optimize.BFGS[ts.i.SARMA.maxLikelihood;coeffs;params;::]`xVals } // @private // @kind function // @category fitUtility -// @fileoverview Calculation of the errors in calculation of the SARIMA coefficients -// @param params {dict} Parameters required for calculation of SARIMA coefficients -// @param dict {dict} Additional parameters required in calculation -// @return {float} returns the square root of the summed, squared errors -ts.i.SARMA.maxLikelihood:{[params;dict] - // get additional seasonal parameters - dict,:ts.i.SARMA.preproc[params;dict]; - // calculate sarima model including the additional seasonal coeffs - preds:ts.i.SARMA.eval[params;dict]; - // calculate error - sqrt sum n*n:preds-dict`real +// @desc Calculation of the error when finding the SARIMA coefficients +// @param coeffs {dictionary} Coefficients of SARIMA model +// @param dict {dictionary} Additional parameters required in calculation +// @return {float} The square root of the summed, squared errors +ts.i.SARMA.maxLikelihood:{[coeffs;dict] + // Get additional seasonal parameters + dict,:ts.i.SARMA.preproc[coeffs;dict]; + // Calculate SARIMA model including the additional seasonal coeffs + preds:ts.i.SARMA.eval[coeffs;dict]; + // Calculate error + n:preds-dict`true; + sqrt wsum[n;n] } // @private // @kind function // @category fitUtility -// @fileoverview Extract fitted ARMA model params to return -// @param endog {num[]} endogenous variable (time-series) from which to build a model -// this is the target variable from which a value is to be predicted -// @param coeff {num[]} error coefficients -// @param params {dict} information on setup of ARMA model -// @param errors {dict} error and parameter dictionary information -// @param lags {integer} the number/order of time lags of the model -// @return {num[]} list of parameters needed 
for future predictions -ts.i.ARMA.parameters:{[endog;coeff;params;errors;lags] +// @desc Sort ARMA coefficients and parameters into correct order +// @param endog {number[]} Endogenous variable (time-series) from which to +// build a model. This is the target variable from which a value is to be +// predicted +// @param coeff {number[]} Coefficients for calculating residuals +// @param params {dictionary} Parameter sets used to fit the ARMA model +// @param errors {dictionary} Error and coefficient dictionary +// @param n {int} The number/order of time lags in estimated AR model +// @return {number[]} Information needed for future predictions +ts.i.ARMA.sortValues:{[endog;coeff;params;errors;n] (params[`p]#neg[sum params`q`p]#coeff;neg[params`q]#coeff), - (neg[lags]#endog;neg[params`q]#errors`err;errors`params), + (neg[n]#endog;neg[params`q]#errors`errorVals;errors`estCoeffs), enlist params } // @private // @kind function // @category fitUtility -// @fileoverview Extract fitted SARMA model params to return -// @param endog {num[]} endogenous variable (time-series) from which to build a model -// this is the target variable from which a value is to be predicted -// @param coeff {num[]} error coefficients -// @param params {dict} information on setup of ARMA model -// @param errors {dict} error and parameter dictionary information -// @param lags {integer} the number/order of time lags of the model -// @return {dict} parameters needed for future predictions -ts.i.SARMA.parameters:{[endog;coeff;params;errors;lags] - // number of seasonal components - ns:count raze params`P`Q; +// @desc Sort SARMA coefficients and parameters into correct order +// @param endog {number[]} Endogenous variable (time-series) from which to +// build a model this is the target variable from which a value is to be +// predicted +// @param coeff {number[]} Coefficients for calculating residuals +// @param params {dictionary} Parameter sets used to fit the SARMA model +// @param errors 
{dictionary} Error and coefficient dictionary +// @param n {int} The number/order of time lags in estimated AR model +// @return {dictionary} Information needed for future predictions +ts.i.SARMA.sortValues:{[endog;coeff;params;errors;n] + // Number of seasonal components + seasParams:count raze params`P`Q; // Separate coeffs into normal and seasonal componants - coefn:neg[ns]_coeff;coefs:neg[ns]#coeff; - sarmaParams:(params[`p]#neg[sum params`q`p]#coefn; - neg[params`q]#coefn;count[params`P]#coefs; - neg count[params`Q]#coefs), - (#[neg lags|max raze params`P`seas_add_P;endog]; - #[neg max raze params`p`Q`seas_add_Q;errors`err]; - errors`params); + coeffNorm:neg[seasParams]_coeff; + coeffSeas:neg[seasParams]#coeff; + SARMAparams:(params[`p]#neg[sum params`q`p]#coeffNorm; + neg[params`q]#coeffNorm;count[params`P]#coeffSeas; + neg count[params`Q]#coeffSeas), + (#[neg n|max raze params`P`additionalP;endog]; + #[neg max raze params`p`Q`additionalQ;errors`errorVals];errors`estCoeffs); // Update dictionary values for seasonality funcs - params[`P`Q`seas_add_P`seas_add_Q]:params[`P`Q`seas_add_P`seas_add_Q]-min params[`m]; - sarmaParams,enlist params,`tr`n!params[`tr],lags + paramKeys:`P`Q`additionalP`additionalQ; + params[paramKeys]:params[paramKeys]-min params`m; + SARMAparams,enlist params,`trend`n!params[`trend],n } - // Prediction function utilities -// @private -// @kind list -// @category predictUtility -// @fileoverview lists of keys which must be present in each application of the -// various prediction functions to ensure the application of prediction is valid -ts.i.AR.keyList :`params`tr_param`exog_param`p_param`lags -ts.i.ARMA.keyList :ts.i.AR.keyList,`q_param`resid`estresid`pred_dict -ts.i.ARIMA.keyList :ts.i.ARMA.keyList,`origd -ts.i.SARIMA.keyList:ts.i.ARIMA.keyList,`origs`P_param`Q_param -ts.i.ARCH.keyList :`params`tr_param`p_param`resid - // @private // @kind function // @category predictUtility -// @fileoverview predict a set number of values based on a 
fit model AR/ARMA/SARMA -// @param mdl {dict} contains all information regarding model parameters and required -// residual information -// @param exog {tab} Exogenous variables, are additional variables which -// may be accounted for to improve the model -// @param len {integer} the number of data points to be predicted -// @param predfn {function} the function to be used for prediction -// @return {num[]} predicted values based on fit model -ts.i.predictFunction:{[mdl;exog;len;predfn] - vals:(mdl`lags;mdl`resid;()); - last{x>count y 2}[len;]predfn[mdl`params;exog;mdl`pred_dict;;mdl`estresid]/vals +// @desc Predict a set number of future values based on a fit model +// AR/ARMA/SARMA +// @param model {dictionary} All information regarding model coefficients and +// required residual information +// @param exog {float[]|(::)} Exogenous variables are additional variables +// which may be accounted for to improve the model, if (::)/() +// this will be ignored +// @param len {int} The number of future data points to be predicted +// @param predFunc {fn} The function to be used for prediction +// @return {number[]} Predicted values based on fit model +ts.i.predictFunction:{[model;exog;len;predFunc] + vals:(model`lagVals;model`residualVals;()); + last{x>count y 2}[len;]predFunc + [model`coefficients;exog;model`paramDict;;model`residualCoeffs]/vals } - // ARMA/AR model prediction functionality // @private // @kind function // @category predictUtility -// @fileoverview prediction function for ARMA model -// @param mdl {dict} contains all information regarding model parameters and required -// residual information -// @param exog {tab} exogenous variables, are additional variables which -// may be accounted for to improve the model -// @param len {integer} the number of data points to be predicted -// @return {num[]} predicted values based on fit ARMA model -ts.i.ARMA.predictFunction:{[mdl;exog;len] - exog:ts.i.predDataCheck[mdl;exog]; - 
ts.i.predictFunction[mdl;exog;len;ts.i.ARMA.singlePredict] +// @desc Prediction function for ARMA model +// @param model {dictionary} All information regarding model coefficients and +// required residual information +// @param exog {float[]|(::)} Exogenous variables are additional variables +// which may be accounted for to improve the model, if (::)/() +// this will be ignored +// @param len {int} The number of future data points to be predicted +// @return {number[]} Predicted values based on fit ARMA model +ts.i.ARMA.predictFunction:{[model;exog;len] + exog:ts.i.predDataCheck[model;exog]; + ts.i.predictFunction[model;exog;len;ts.i.ARMA.singlePredict] } // @private // @kind function // @category predictUtility -// @fileoverview predict a single ARMA value -// @param params {num[]} model parameters retrieved from initial fit model -// @param exog {tab} exogenous variables, are additional variables which -// may be accounted for to improve the model -// @param dict {dict} additional information which can dictate the behaviour -// when making a prediction -// @param pvals {num[]} previously predicted values -// @param estresid {num[]} estimates of the residual errors -// @return {num[]} information required for the prediction of a set of ARMA values -ts.i.ARMA.singlePredict:{[params;exog;dict;pvals;estresid] - exog:exog count pvals 2; - normmat:exog,raze#[neg[dict`p];pvals[0]],pvals[1]; - pred:$[dict`tr; - params[0]+normmat mmu 1_params; - params mmu normmat +// @desc Predict a single ARMA value +// @param coeffs {number[]} Model coefficients retrieved from initial fit model +// @param exog {float[]|(::)} Exogenous variables are additional variables +// which may be accounted for to improve the model, if (::)/() +// this will be ignored +// @param dict {dictionary} Additional information which can dictate the +// behaviour when making a prediction +// @param pastPreds {number[]} Previously predicted values +// @param residualCoeffs {number[]} Coefficients to 
estimate the residuals +// @return {number[]} Information required for the prediction of a set of ARMA +// values +ts.i.ARMA.singlePredict:{[coeffs;exog;dict;pastPreds;residualCoeffs] + exog:exog count pastPreds 2; + matrix:exog,raze#[neg dict`p;pastPreds 0],pastPreds 1; + preds:$[dict`trend; + coeffs[0]+matrix mmu 1_coeffs; + coeffs mmu matrix ]; - if[count pvals 1; - estvals:exog,pvals[0]; - pvals[1]:(1_pvals[1]),pred-mmu[estresid;estvals] + if[count pastPreds 1; + estVals:exog,pastPreds 0; + pastPreds[1]:(1_pastPreds 1),preds-mmu[residualCoeffs;estVals] ]; - ((1_pvals[0]),pred;pvals[1];pvals[2],pred) + ((1_pastPreds 0),preds;pastPreds 1;pastPreds[2],preds) } // @private // @kind function // @category predictUtility -// @fileoverview prediction function for AR model -// @param mdl {dict} contains all information regarding model parameters and required -// residual information -// @param exog {tab} Exogenous variables, are additional variables which -// may be accounted for to improve the model -// @param len {integer} the number of data points to be predicted -// @return {num[]} predicted values based on fit AR model -ts.i.AR.predictFunction:{[mdl;exog;len] - exog:ts.i.predDataCheck[mdl;exog]; - mdl[`pred_dict]:enlist[`p]!enlist count mdl`p_param; - mdl[`estresid]:(); - mdl[`resid]:(); - ts.i.predictFunction[mdl;exog;len;ts.i.AR.singlePredict] +// @desc Prediction function for AR model +// @param model {dictionary} All information regarding model coefficients and +// required residual information +// @param exog {float[]|(::)} Exogenous variables are additional variables +// which may be accounted for to improve the model, if (::)/() +// this will be ignored +// @param len {int} The number of future data points to be predicted +// @return {number[]} Predicted values based on fit AR model +ts.i.AR.predictFunction:{[model;exog;len] + exog:ts.i.predDataCheck[model;exog]; + model[`paramDict]:enlist[`p]!enlist count model`pCoeff; + model[`residualCoeffs]:(); + 
model[`residualVals]:(); + ts.i.predictFunction[model;exog;len;ts.i.AR.singlePredict] } // Predict a single AR value ts.i.AR.singlePredict:ts.i.ARMA.singlePredict - // SARIMA model calculation functionality // @private // @kind function // @category predictUtility -// @fileoverview prediction function for SARMA model -// @param mdl {dict} contains all information regarding model parameters and required -// residual information -// @param exog {tab} Exogenous variables, are additional variables which -// may be accounted for to improve the model -// @param len {integer} the number of data points to be predicted -// @return {num[]} predicted values based on fit SARMA model -ts.i.SARMA.predictFunction:{[mdl;exog;len] - exog:ts.i.predDataCheck[mdl;exog]; - $[count raze mdl[`pred_dict]; - ts.i.predictFunction[mdl;exog;len;ts.i.SARMA.singlePredict]; - ts.i.AR.predictFunction[mdl;exog;len] +// @desc Prediction function for SARMA model +// @param model {dictionary} All information regarding model coefficients and +// required residual information +// @param exog {float[]|(::)} Exogenous variables are additional variables +// which may be accounted for to improve the model, if (::)/() +// this will be ignored +// @param len {int} The number of future data points to be predicted +// @return {number[]} Predicted values based on fit SARMA model +ts.i.SARMA.predictFunction:{[model;exog;len] + exog:ts.i.predDataCheck[model;exog]; + $[count raze model`paramDict; + ts.i.predictFunction[model;exog;len;ts.i.SARMA.singlePredict]; + ts.i.AR.predictFunction[model;exog;len] ] } // @private // @kind function // @category predictUtility -// @fileoverview predict a single SARMA value -// @param params {num[]} model parameters retrieved from initial fit model -// @param exog {tab} exogenous variables, are additional variables which -// may be accounted for to improve the model -// @param dict {dict} additional information which can dictate the behaviour -// when making a prediction -// 
@param pvals {num[]} previously predicted values -// @param estresid {num[]} estimates of the residual errors -// @return {num[]} information required for the prediction of SARMA values -ts.i.SARMA.singlePredict:{[params;exog;dict;pvals;estresid]; - exog:exog count pvals 2; - dict,:ts.i.SARMA.preproc[params;dict]; - pred:ts.i.SARMA.predictValue[params;pvals;exog;dict]; - if[count pvals 1; - estvals:exog,neg[dict`n]#pvals 0; - pvals[1]:(1_pvals[1]),pred-mmu[estresid;estvals] +// @desc Predict a single SARMA value +// @param coeffs {dictionary} Model coefficients retrieved from initial fit +// model +// @param exog {float[]|(::)} Exogenous variables, are additional variables +// which may be accounted for to improve the model, if (::)/() +// this will be ignored +// @param dict {dictionary} Additional information which can dictate the +// behaviour when making a prediction +// @param pastPreds {number[]} Previously predicted values +// @param residualCoeffs {number[]} Coefficients to calculate the residual +// errors +// @return {number[]} Information required for the prediction of SARMA values +ts.i.SARMA.singlePredict:{[coeffs;exog;dict;pastPreds;residualCoeffs]; + exog:exog count pastPreds 2; + dict,:ts.i.SARMA.preproc[coeffs;dict]; + preds:ts.i.SARMA.predictVal[coeffs;pastPreds;exog;dict]; + if[count pastPreds 1; + estVals:exog,neg[dict`n]#pastPreds 0; + pastPreds[1]:(1_pastPreds 1),preds-mmu[residualCoeffs;estVals] ]; - // append new lag values, for next step calculations - ((1_pvals[0]),pred;pvals[1];pvals[2],pred) + // Append new lag values, for next step calculations + ((1_pastPreds 0),preds;pastPreds 1;pastPreds[2],preds) } // @private // @kind function // @category predictUtility -// @fileoverview Calculate new required lags for SARMA prediction surrounding -// seasonal components -// @param params {dict} model parameters retrieved from initial fit model -// @param dict {dict} additional information which can dictate the behaviour -// in different 
situations where predictions are being made -// @return {dict} seasonal parameters for prediction in SARMA models -ts.i.SARMA.preproc:{[params;dict] - // 1. Calculate or retrieve all necessary seasonal lagged values for SARMA prediction - // split up the coefficients to their respective p,q,P,Q parts - lagp:(dict[`tr] _params)[til dict`p]; - lagq:((dict[`tr]+dict`p)_params)[til dict`q]; - lagSeasp:((dict[`tr]+sum dict`q`p)_params)[til count[dict`P]]; - lagSeasq:neg[count dict`Q]#params; - // Function to extract additional seasonal multiplied coefficients - // These coefficients multiply p x P vals and q x Q vals - seas_multi:{[x;y;z;d]$[d[x]&min count d upper x;(*/)flip y cross z;2#0f]}; - // append new lags to original dictionary - dictKeys:`add_lag_param`add_resid_param; - dictVals:(seas_multi[`p;lagp;lagSeasp;dict];seas_multi[`q;lagq;lagSeasq;dict]); +// @desc Calculate additional coefficients for SARMA prediction +// surrounding seasonal components +// @param coeffs {dictionary} Model coefficients retrieved from initial fit +// model +// @param dict {dictionary} Additional information which can dictate the +// behaviour in different situations where predictions are being made +// @return {dictionary} Seasonal parameters for prediction in SARMA models +ts.i.SARMA.preproc:{[coeffs;dict] + // Calculate or retrieve all necessary seasonal lagged values for SARMA + // prediction and split up the coefficients to their respective p,q,P,Q parts + pVals:(dict[`trend] _coeffs)til dict`p; + qVals:((dict[`trend]+dict`p)_coeffs)til dict`q; + pSeasonVals:((dict[`trend]+sum dict`q`p)_coeffs)til count dict`P; + qSeasonVals:neg[count dict`Q]#coeffs; + // Append new lags to original dictionary + dictKeys:`additionalpCoeff`additionalqCoeff; + dictVals:(ts.i.SARMA.multiplySeason[`p;pVals;pSeasonVals;dict]; + ts.i.SARMA.multiplySeason[`q;qVals;qSeasonVals;dict]); dictKeys!dictVals } // @private // @kind function // @category predictUtility -// @fileoverview predict a single SARMA 
value -// @param params {num[]} model parameters retrieved from initial fit model -// @param pvals {num[]} previously predicted values -// @param exog {tab} exogenous variables, are additional variables which -// may be accounted for to improve the model -// @param dict {dict} additional information which can dictate the behaviour -// when making a prediction -// @return {num[]} information required for the prediction of a set of SARMA values -ts.i.SARMA.predictValue:{[params;pvals;exog;dict] - dict[`seas_resid_add]:$[dict[`q]&min count dict`Q; - pvals[1]dict[`seas_add_Q]; +// @desc Function to extract additional seasonal multiplied +// coefficients. These coefficients multiply p x P vals and q x Q vals +// @param dictKey {symbol} Key of dictionary to extract info from +// @param normVals {number[]} Non seasonal coefficients +// @param seasonVals {number[]} Seasonal coefficients +// @param dict {dictionary} Model parameters retrieved from initial fit model +// @return {number[]} Seasonal coefficients multiplied by non seasonal +// coefficients +ts.i.SARMA.multiplySeason:{[dictKey;normVals;seasonVals;dict] + $[dict[dictKey]&min count dict upper dictKey; + (*/)flip normVals cross seasonVals; + 2#0f + ] + } + +// @private +// @kind function +// @category predictUtility +// @desc Predict a single SARMA value +// @param coeffs {number[]} Model coefficients retrieved from initial fit model +// @param pastPreds {number[]} Previously predicted values +// @param exog {float[]|(::)} Exogenous variables are additional variables +// which may be accounted for to improve the model, if (::)/() +// this will be ignored +// @param dict {dictionary} Additional information which can dictate the +// behaviour when making a prediction +// @return {number[]} Information required for the prediction of a set of SARMA +// values +ts.i.SARMA.predictVal:{[coeffs;pastPreds;exog;dict] + dict[`additionalResiduals]:$[dict[`q]&min count dict`Q; + pastPreds[1]dict`additionalQ; 2#0f ]; - 
dict[`seas_lag_add]:$[dict[`p]&min count dict`P; - pvals[0]dict[`seas_add_P]; + dict[`additionalLags]:$[dict[`p]&min count dict`P; + pastPreds[0]dict`additionalP; 2#0f ]; - sarmavals:raze#[neg dict`p;pvals 0],#[neg dict`q;pvals 1],pvals[0][dict`P],pvals[1][dict`Q]; - dict[`norm_mat]:exog,sarmavals; - ts.i.SARMA.eval[params;dict] + SARMAvals:raze#[neg dict`p;pastPreds 0],#[neg dict`q;pastPreds 1], + pastPreds[0][dict`P],pastPreds[1]dict`Q; + dict[`matrix]:exog,SARMAvals; + ts.i.SARMA.eval[coeffs;dict] } // @private // @kind function // @category predictUtility -// @fileoverview calculate the value of a SARMA prediction based on -// provided params/dictionary -// @param params {num[]} model parameters retrieved from initial fit model -// @param dict {dict} additional information which can dictate the behaviour -// when making a prediction -// @return {num[]} the SARMA prediction values -ts.i.SARMA.eval:{[params;dict] - normVal :mmu[dict`norm_mat;dict[`tr] _params]; - seasResid:mmu[dict`seas_resid_add;dict`add_resid_param]; - seasLag :mmu[dict`seas_lag_add;dict`add_lag_param]; - $[dict`tr;params[0]+;]normVal+seasResid+seasLag +// @desc Calculate the value of a SARMA prediction based on +// provided coeffs/dictionary +// @param coeffs {number[]} Model coefficients retrieved from initial fit model +// @param dict {dictionary} Additional information which can dictate the +// behaviour when making a prediction +// @return {number[]} The SARMA prediction values +ts.i.SARMA.eval:{[coeffs;dict] + normVals :mmu[dict`matrix;dict[`trend] _coeffs]; + seasResids:mmu[dict`additionalResiduals;dict`additionalqCoeff]; + seasLags :mmu[dict`additionalLags;dict`additionalpCoeff]; + $[dict`trend;coeffs[0]+;]normVals+seasResids+seasLags } - // @private // @kind function // @category predictUtility -// @fileoverview calculate a single ARCH value, -// @param params {dict} model parameters retrieved from initial fit model -// @param pvals {num[]} list of values over which predictions are 
composed -// @return {num[]} list containing residuals and predicted values -ts.i.ARCH.singlePredict:{[params;pvals] - predict:params[0]+pvals[0] mmu 1_params; - ((1_pvals 0),predict;pvals[1],predict) +// @desc Calculate a single ARCH value, +// @param coeffs {dictionary} Model coefficients retrieved from +// initial fit model +// @param pastPreds {number[]} Previously predicted values +// @return {number[]} Residuals and predicted values +ts.i.ARCH.singlePredict:{[coeffs;pastPreds] + predict:coeffs[0]+pastPreds[0] mmu 1_coeffs; + ((1_pastPreds 0),predict;pastPreds[1],predict) } // Akaike Information Criterion @@ -468,50 +535,52 @@ ts.i.ARCH.singlePredict:{[params;pvals] // @private // @kind function // @category aicUtility -// @fileoverview calculate the Akaike Information Criterion -// @param true {num[]} true values -// @param pred {num[]} predicted values -// @param params {num[]} list of the lag/residual parameters +// @desc Calculate the Akaike Information Criterion +// @param true {number[]} True values +// @param pred {number[]} Predicted values +// @param params {number[]} The lag/residual parameters // @return {float} Akaike Information Criterion score ts.i.aicScore:{[true;pred;params] // Calculate residual sum of squares, normalised for number of values - rss:{wsum[x;x]%y}[true-pred;n:count pred]; + sumSquares:{wsum[x;x]%y}[true-pred;n:count pred]; // Number of parameter k:sum params; - aic:(2*k)+n*log rss; - // if k<40 use the altered aic score + aic:(2*k)+n*log sumSquares; + // If k<40 use the altered aic score $[k<40;aic+(2*k*k+1)%n-k-1;aic] } // @private // @kind function // @category aicUtility -// @fileoverview Fit a model, predict the test, return AIC score +// @desc Fit a model, predict the test, return AIC score // for a single set of input params -// @param train {dict} training data as a dictionary with endog and exog data -// @param test {dict} testing data as a dictionary with endog and exog data -// @param len {integer} number of steps in 
the future to be predicted -// @param params {dict} parameters used in prediction +// @param train {dictionary} Training data as a dictionary with +// endog and exog data +// @param test {dictionary} Testing data as a dictionary with +// endog and exog data +// @param len {integer} Number of steps in the future to be predicted +// @param params {dictionary} Parameters used in prediction // @return {float} Akaike Information Criterion score ts.i.aicFitScore:{[train;test;len;params] // Fit an model using the specified parameters - mdl :ts.ARIMA.fit[train`endog;train`exog;;;;]. params`p`d`q`tr; + model:ts.ARIMA.fit[train`endog;train`exog]. params`p`d`q`trend; // Predict using the fitted model - pred:ts.ARIMA.predict[mdl;test`exog;len]; + preds:model[`predict][test`exog;len]; // Score the predictions - ts.i.aicScore[len#test`endog;pred;params] + ts.i.aicScore[len#test`endog;preds;params] } - // Autocorrelation functionality // @private // @kind function // @category autocorrelationUtility -// @fileoverview Lagged covariance between a dataset at time t and time t-lag -// @param data {num[]} vector on which to calculate the lagged covariance -// @param lag {integer} size of the lag to use when calculating covariance -// @return {float} covariance between a time series and lagged version of itself +// @desc Lagged covariance between a dataset at time t and time t-lag +// @param data {number[]} Vector on which to calculate the lagged covariance +// @param lag {int} Size of the lag to use when calculating covariance +// @return {float} Covariance between a time series and lagged version of +// itself ts.i.lagCovariance:{[data;lag] cov[neg[lag] _ data;lag _ data] } @@ -519,28 +588,28 @@ ts.i.lagCovariance:{[data;lag] // @private // @kind function // @category autocorrelationUtility -// @fileoverview Calculate the autocorrelation between a series +// @desc Calculate the autocorrelation between a time series // and lagged version of itself -// @param data {num[]} vector on 
which to calculate the lagged covariance -// @param lag {integer} size of the lag to use when calculating covariance -// @return {float} autocorrelation between a time series and lagged version of itself +// @param data {number[]} Vector on which to calculate the lagged covariance +// @param lag {int} Size of the lag to use when calculating covariance +// @return {float} Autocorrelation between a time series and lagged version of +// itself ts.i.autoCorrFunction:{[data;lag] ts.i.lagCovariance[data;lag]%var data } - // Matrix creation/manipulation functionality // @private // @kind function // @category matrixUtilities -// @fileoverview create a lagged matrix with each row containing the original +// @desc Create a lagged matrix with each row containing the original // data as its first element and the remaining 'lag' values as additional row // elements -// @param data {num[]} vector from which to create the lagged matrix -// @param lag {integer} size of the lag to use when creating lagged matrix -// @return {num[][]} a numeric matrix containing original data augmented with -// lagged versions of the original dataset. +// @param data {number[]} Vector from which to create the lagged matrix +// @param lag {int} Size of the lag to use when creating lagged matrix +// @return {number[][]} A numeric matrix containing original data augmented +// with lagged versions of the original dataset. 
ts.i.lagMatrix:{[data;lag] data til[count[data]-lag]+\:til lag } @@ -548,90 +617,111 @@ ts.i.lagMatrix:{[data;lag] // @private // @kind function // @category matrixUtilities -// @fileoverview convert a simple table into a matrix -// @param data {tab} simple table to be converted to a matrix representation -// @return {num[][]} matrix representation of the input table in the same 'configuration' +// @desc Convert a simple table into a matrix +// @param data {table} Simple table to be converted to a matrix representation +// @return {number[]} Matrix representation of the input table in the same +// 'configuration' ts.i.tabToMatrix:{[data] flip value flip data } - -// Stationarity functionality used to test if datasets are suitable for application of the ARIMA -// and to facilitate transformation of the data to a more suitable form if relevant +// Stationarity functionality used to test if datasets are suitable for +// application of the ARIMA and to facilitate transformation of the data to a +// more suitable form if relevant // @private // @kind function // @category stationaryUtilities -// @fileoverview calculate relevant augmented dickey fuller statistics using python -// @param data {dict/tab/num[]} dataset to be testing for stationarity -// @param dtype {short} type of the dataset that's being passed to the function -// @return {num[]/num[][]} all relevant scores from an augmented dickey fuller test +// @desc Calculate relevant augmented dickey fuller statistics using +// python +// @param data {dictionary|table|number[]} Dataset to be testing for +// stationarity +// @param dtype {short} Type of the dataset that's being passed to the function +// @return {number[]} All relevant scores from an augmented dickey fuller test ts.i.stationaryScores:{[data;dtype] // Calculate the augmented dickey-fuller scores for a dict/tab/vector input - scores:{.ml.fresh.i.adfuller[x]`}@' - $[98h=dtype;flip data; - 99h=dtype;data; - dtype in(6h;7h;8h;9h);enlist data; - 
'"Inappropriate type provided"]; + scores:{.ml.fresh.i.adFuller[x]`}@' + $[98h=dtype; + flip data; + 99h=dtype; + data; + dtype in(6h;7h;8h;9h); + enlist data; + '"Inappropriate type provided" + ]; flip{x[0 1],(0.05>x 1),value x 4}each$[dtype in 98 99h;value::;]scores } // @private // @kind function // @category stationaryUtilities -// @fileoverview Are all of the series provided by a user stationary, +// @desc Are all of the series provided by a user stationary, // determined using augmented dickey fuller? -// @param data {dict/tab/num[]} dataset to be testing for stationarity -// @return {bool} indicate if all time series are stationary or not +// @param data {dictionary|table|number[]} Dataset to be testing for +// stationarity +// @return {boolean} Indicate if all time series are stationary or not ts.i.stationary:{[data] (all/)ts.i.stationaryScores[data;type data][2] } - // Differencing utilities // @private // @kind function // @category differUtility -// @fileoverview apply time-series differencing and remove first diff elements -// @param data {num[]/num[][]} dataset to apply differencing to -// @param diff {integer} order of time series differencing -// @return {num[]/num[][]} differenced time series -ts.i.diff:{[data;diff] - diffData:diff{deltas x}/data; - diff _ diffData +// @desc Apply time-series differencing and remove first d elements +// @param data {number[]} Dataset to apply differencing to +// @param d {int} Order of time series differencing +// @return {number[]} Differenced time series +ts.i.diff:{[data;d] + diffData:d{deltas x}/data; + d _ diffData } // @private // @kind function // @category differUtility -// @fileoverview apply seasonal differencing and remove first diff elements -// @param diff {integer} how many points in the past does data need to be +// @desc Apply seasonal differencing and remove first d elements +// @param d {int} How many points in the past does data need to be // differenced with respect to -// @param data 
{num[]/num[][]} dataset to apply differencing to -// @return {num[]/num[][]} differenced time series -ts.i.seasonDiff:{[diff;data] - diffData:data - xprev[diff;data]; - diff _ diffData +// @param data {number[]} Dataset to apply differencing to +// @return {number[]} Differenced time series +ts.i.seasonDiff:{[d;data] + diffData:data - xprev[d;data]; + d _ diffData } // @private // @kind function // @category differUtility -// @fileoverview revert seasonally differenced data to correct representation -// @param origd {num[]} set of original dataset saved before being differenced -// @param dfdata {num[]} differenced dataset -// @return {num[]} the data reverted back to its original format before differencing -ts.i.reverseSeasonDiff:{[origd;dfdata] - seasd:origd,dfdata; - n:count origd; - [n]_first{x[1]count exog;ts.i.err.len[]]; + if[not[()~exog]&count[endog]>count exog;ts.i.err.len[]]; // convert exon table to matrix $[98h~type exog;:"f"$ts.i.tabToMatrix exog;()~exog;:exog;:"f"$exog]; } @@ -665,109 +758,130 @@ ts.i.fitDataCheck:{[endog;exog] // @private // @kind function // @category dataCheckUtility -// @fileoverview ensure that all required keys are present for the application of -// the various prediction functions -// @param dict {dict} the dictionary parameter to be validated -// @param keyvals {sym[]} list of the keys which should be present in order to -// fully execute the logic of the function -// @param input {string} name of the input dictionary which issue is -// highlighted in -// @return {err/(::)} will error on incorrect inputs otherwise run silently -ts.i.dictCheck:{[dict;keyvals;input] +// @desc Ensure that all required keys are present for the application +// of the various prediction functions +// @param dict {dictionary} dictionary parameter to be validated +// @param keyVals {symbol[]} Keys which should be present in order to fully +// execute the logic of the function +// @param input {string} Name of input dictionary which issue is 
highlighted in +// @return {err|::} Will error on incorrect inputs otherwise run silently +ts.i.dictCheck:{[dict;keyVals;input] if[99h<>type dict;'input," must be a dictionary input"]; - validKeys:keyvals in key dict; + validKeys:keyVals in key dict; if[not all validKeys; - invalid:sv[", ";string[keyvals]where not validKeys]; - '"The following required dictionary keys for '",input,"' are not provided: ",invalid + invalid:sv[", ";string[keyVals]where not validKeys]; + '"The following required dictionary keys for '",input, + "' are not provided: ",invalid ]; } // @private // @kind function // @category dataCheckUtility -// @fileoverview check that the exogenous data match the expected input when -// predicting data using a the model are consistent, in the case they are not, -// flag an error ensure that the exogenous data is returned as a matrix -// @param mdl {dict} dictionary containing required information to predict -// future values -// @param exog {tab/num[][]} exogenous dataset -// @return {num[][]} exogenous data as a matrix -ts.i.predDataCheck:{[mdl;exog] - // allow null to be provided as exogenous variable +// @desc Check that the exogenous data match the expected input when +// predicting data using a the model are consistent, in the case they are +// not, flag an error ensure that the exogenous data is returned as a matrix +// @param model {dictionary} Dictionary containing required information to +// predict future values +// @param exog {float[]|(::)} Exogenous variables, are additional variables +// which may be accounted for to improve the model, if (::)/() +// this will be ignored +// @return {number[]} Exogenous data as a matrix +ts.i.predDataCheck:{[model;exog] + // Allow null to be provided as exogenous variable if[exog~(::);exog:()]; - // check that the fit and new params are equivalent - if[not count[mdl`exog_param]~count exog[0];ts.i.err.exog[]]; - // convert exogenous variable to a matrix if required + // Check that the fit and new params are 
equivalent + if[not count[model`exogCoeff]~count exog 0;ts.i.err.exog[]]; + // Convert exogenous variable to a matrix if required $[98h~type exog;"f"$ts.i.tabToMatrix exog;()~exog;:exog;"f"$exog] } // @private // @kind function // @category dataCheckUtility -// @fileoverview Apply seasonal and non-seasonal time-series differencing, -// error checking stationarity of the dataset following application of differencing -// @param endog {num[]} endogenous dataset -// @param diff {integer} non seasonal differencing component (integer) -// @param sdict {dict} dictionary containing relevant seasonal differencing components -// @return {dict} Seasonal and non-seasonally differenced stationary time-series -ts.i.differ:{[endog;d;s] +// @desc Apply seasonal and non-seasonal time-series differencing,error +// checking stationarity of the dataset following application of differencing +// @param endog {number[]} Endogenous variable (time-series) from which to +// build a model. This is the target variable from which a value is to be +// predicted +// @param d {int} Non seasonal differencing component +// @param seasonDict {dictionary} Dictionary containing relevant seasonal +// differencing components +// @return {dictionary} Seasonal and nonseasonally differenced stationary +// time-series +ts.i.differ:{[endog;d;seasonDict] // Apply non seasonal differencing if appropriate (handling of AR/ARMA) - if[s~()!();s[`D]:0b]; + if[seasonDict~()!();seasonDict[`D]:0b]; initDiff:ts.i.diff[endog;d]; // Apply seasonal differencing if appropriate - finalDiff:$[s[`D];s[`D]ts.i.seasonDiff[s`m]/initDiff;initDiff]; + finalDiff:$[seasonDict[`D]; + seasonDict[`D]ts.i.seasonDiff[seasonDict`m]/initDiff; + initDiff]; // Check stationarity if[not ts.i.stationary[finalDiff];ts.i.err.stat[]]; // Return integrated data `final`init!(finalDiff;initDiff) } - // Feature extraction utilities // @private // @kind function // @category featureExtractUtilities -// @fileoverview Apply a user defined unary function 
across a dataset +// @desc Apply a user defined unary function across a dataset // using a sliding window of specified length // Note: this is a modified version of a function provided in qidioms -// using floating point windows instead -// of long windows to increase the diversity of functions that can be applied -// @param func {lambda} unary function to be applied with the data in the sliding window -// @param win {integer} size of the sliding window -// @param data {num[]} data on which the sliding window and associated function -// are to be applied -// @return {num[]} result of the application of the function on each of the sliding window -// components over the data vector -ts.i.slidingWindowFunction:{[func;win;data] - 0f,-1_func each{ 1_x,y }\[win#0f;data] +// using floating point windows instead of long windows to increase the +// diversity of functions that can be applied +// @param func {fn} Unary function to be applied with the data in the sliding +// window +// @param winSize {int} Size of the sliding window +// @param data {number[]} Data on which the sliding window and associated +// function are to be applied +// @return {number[]} Result of the application of the function on each of the +// sliding window components over the data vector +ts.i.slidingWindowFunction:{[func;winSize;data] + 0f,-1_func each{1_x,y}\[winSize#0f;data] } +// @private +// @kind function +// @category featureExtractUtilities +// @desc Set up the order for the inputs of the sliding window function +// @param tab {table} Dataset onto which to apply the windowed functions +// @param uniCombs {number[]} Unique combinations of columns/windows and +// functions to be applied to the dataset +// @return {number[]} Result of the application of the function on each of the +// sliding window components over the data vector +ts.i.setupWindow:{[tab;uniCombs] + ts.i.slidingWindowFunction[get string uniCombs 0;uniCombs 1;tab uniCombs 2] + } // Plotting utilities // @private // @kind 
function // @category plottingUtility -// @fileoverview Plotting function used in the creation of plots +// @desc Plotting function used in the creation of plots // for both full and partial autocorrelation graphics -// @param data {num[]} x-axis original dataset -// @param vals {num[]} calculated values -// @param m {num[]} bar plot indices -// @param title {string} title to be given to the plot -// @return {graph} presents a plot to screen associated with relevant analysis +// @param data {number[]} x-axis original dataset +// @param vals {number[]} Calculated values +// @param m {number[]} Bar plot indices +// @param title {string} Title to be given to the plot +// @return {graph} Presents a plot to screen associated with relevant analysis ts.i.plotFunction:{[data;vals;m;width;title] - plt:.p.import[`matplotlib.pyplot]; + plt:.p.import`matplotlib.pyplot; conf:count[m]#1.95%sqrt count data; plt[`:bar][m;vals;`width pykw width%2]; - cfgkeys:`linewidth`linestyle`color`label; - cfgvals:3,`dashed`red`conf_interval; - plt[`:plot][m;conf;pykwargs cfgkeys!cfgvals]; + configKeys:`linewidth`linestyle`color`label; + configVals:3,`dashed`red`conf_interval; + plt[`:plot][m;conf;pykwargs configKeys!configVals]; if[0>min vals; - plt[`:plot][m;neg conf;pykwargs -1_cfgkeys!cfgvals] + plt[`:plot][m;neg conf;pykwargs -1_configKeys!configVals] ]; plt[`:legend][]; - plt[`:xlabel][`lags]; - plt[`:ylabel][`acf]; - plt[`:title][title]; - plt[`:show][];} + plt[`:xlabel]`lags; + plt[`:ylabel]`acf; + plt[`:title]title; + plt[`:show][]; + } diff --git a/util/README.md b/util/README.md index 1a252ce2..d54a4356 100644 --- a/util/README.md +++ b/util/README.md @@ -36,6 +36,6 @@ Documentation is available on the [Utilities](https://code.kx.com/v2/ml/toolkit/ ## Status -The machine-learning utilities library is still in development and is available here as a beta release. Further functionality and improvements will be made to the library in the coming months. 
+The machine-learning utilities library is still in development. Further functionality and improvements will be made to the library on an ongoing basis. If you have any issues, questions or suggestions, please write to ai@kx.com. diff --git a/util/functionMapping.json b/util/functionMapping.json new file mode 100644 index 00000000..515d35a5 --- /dev/null +++ b/util/functionMapping.json @@ -0,0 +1,310 @@ +{ + "util":{ + ".ml.imin":{ + "function":".ml.iMin", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.imax":{ + "function":".ml.iMax", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.df2tab_tz":{ + "function":".ml.df2tabTimezone", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.linspace":{ + "function":".ml.linearSpace", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.traintestsplit":{ + "function":".ml.trainTestSplit", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.classreport":{ + "function":".ml.classReport", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.confdict":{ + "function":".ml.confDict", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.confmat":{ + "function":".ml.confMatrix", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.corrmat":{ + "function":".ml.corrMatrix", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.cvm":{ + "function":".ml.covMatrix", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.f1score":{ + "function":".ml.f1Score", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.fbscore":{ + "function":".ml.fBetaScore", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.logloss":{ + "function":".ml.logLoss", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.r2score":{ + "function":".ml.r2Score", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.rocaucscore":{ + "function":".ml.rocAucScore", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.tscore":{ + "function":".ml.tScore", + 
"warning":"futureWarning", + "version":"3.0" + }, + ".ml.tscoreeq":{ + "function":".ml.tScoreEqual", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.applylabelencode":{ + "function":".ml.applyLabelEncode", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.dropconstant":{ + "function":".ml.dropConstant", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.filltab":{ + "function":".ml.fillTab", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.infreplace":{ + "function":".ml.infReplace", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.labelencode":{ + "function":".ml.labelEncode.fitTransform", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.lexiencode":{ + "function":".ml.lexiEncode.fitTransform", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.minmaxscaler":{ + "function":".ml.minMaxScaler.fitTransform", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.onehot":{ + "function":".ml.oneHot.fitTransform", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.polytab":{ + "function":".ml.polyTab", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.stdscaler":{ + "function":".ml.stdScaler.fitTransform", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.timesplit":{ + "function":".ml.timeSplit", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.describe":{ + "function":".ml.stats.describe", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.percentile":{ + "function":".ml.stats.percentile", + "warning":"futureWarning", + "version":"3.0" + } + }, + "clust":{ + ".ml.clust.cure.cutk":{ + "function":".ml.clust.cure.cutK", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.clust.cure.cutdist":{ + "function":".ml.clust.cure.cutDist", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.clust.hc.cutk":{ + "function":".ml.clust.hc.cutK", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.clust.hc.cutdist":{ + "function":".ml.clust.hc.cutDist", 
+ "warning":"futureWarning", + "version":"3.0" + } + }, + "fresh":{ + ".ml.fresh.createfeatures":{ + "function":".ml.fresh.createFeatures", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.fresh.sigfeat":{ + "function":".ml.fresh.sigFeat", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.fresh.ksigfeat":{ + "function":".ml.fresh.kSigFeat", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.fresh.significantfeatures":{ + "function":".ml.fresh.significantFeatures", + "warning":"futureWarning", + "version":"3.0" + } + }, + "xval":{ + ".ml.gs.kfshuff":{ + "function":".ml.gs.kfShuff", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.gs.kfsplit":{ + "function":".ml.gs.kfSplit", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.gs.kfstrat":{ + "function":".ml.gs.kfStrat", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.gs.mcsplit":{ + "function":".ml.gs.mcSplit", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.gs.pcsplit":{ + "function":".ml.gs.pcSplit", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.gs.tschain":{ + "function":".ml.gs.tsChain", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.gs.tsrolls":{ + "function":".ml.gs.tsRolls", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.rs.kfshuff":{ + "function":".ml.rs.kfShuff", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.rs.kfsplit":{ + "function":".ml.rs.kfSplit", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.rs.kfstrat":{ + "function":".ml.rs.kfStrat", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.rs.mcsplit":{ + "function":".ml.rs.mcSplit", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.rs.pcsplit":{ + "function":".ml.rs.pcSplit", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.rs.tschain":{ + "function":".ml.rs.tsChain", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.rs.tsrolls":{ + "function":".ml.rs.tsRolls", + "warning":"futureWarning", + 
"version":"3.0" + }, + ".ml.xv.kfshuff":{ + "function":".ml.xv.kfShuff", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.xv.kfsplit":{ + "function":".ml.xv.kfSplit", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.xv.kfstrat":{ + "function":".ml.xv.kfStrat", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.xv.mcsplit":{ + "function":".ml.xv.mcSplit", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.xv.pcsplit":{ + "function":".ml.xv.pcSplit", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.xv.tschain":{ + "function":".ml.xv.tsChain", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.xv.tsrolls":{ + "function":".ml.xv.tsRolls", + "warning":"futureWarning", + "version":"3.0" + }, + ".ml.xv.fitscore":{ + "function":".ml.xv.fitScore", + "warning":"futureWarning", + "version":"3.0" + } + } +} diff --git a/util/init.q b/util/init.q index 44bb6867..3284ce43 100644 --- a/util/init.q +++ b/util/init.q @@ -1,3 +1,11 @@ -.ml.loadfile`:util/util.q +// util/init.q - Load utilities library +// Copyright (c) 2021 Kx Systems Inc + +.ml.loadfile`:util/utils.q +.ml.loadfile`:util/utilities.q .ml.loadfile`:util/metrics.q .ml.loadfile`:util/preproc.q +.ml.loadfile`:fresh/utils.q +.ml.loadfile`:stats/init.q + +.ml.i.deprecWarning`util diff --git a/util/metrics.q b/util/metrics.q index 1d956425..08a6d040 100644 --- a/util/metrics.q +++ b/util/metrics.q @@ -1,60 +1,346 @@ +// util/metrics.q - Metrics +// Copyright (c) 2021 Kx Systems Inc +// +// Metrics for scoring ml models + \d .ml -/ descriptive statistics -range:{max[x]-min x} -/ percentile y of list x -percentile:{r[0]+(p-i 0)*last r:0^deltas asc[x]i:0 1+\:floor p:y*-1+count x} -describe:{`count`mean`std`min`q1`q2`q3`max!flip(count;avg;sdev;min;percentile[;.25];percentile[;.5];percentile[;.75];max)@\:/:flip(exec c from meta[x]where t in"hijefpmdznuvt")#x} - -/ classification scores (x predictions, y labels, z positive label) -accuracy:{avg x=y} -precision: {sum[u&y 
=z]%sum u:x =z} -sensitivity:{sum[u&x =z]%sum u:y =z} -specificity:{sum[u&x<>z]%sum u:y<>z} -/ f1&fbeta scores -fbscore:{[x;y;z;b](sum[ap&pp]*1+b*b)%sum[pp:x=z]+b*b*sum ap:y=z} -f1score:fbscore[;;;1] -/ matthews correlation coefficient -matcorr:{.[-;prd raze[m](0 1;3 2)]%sqrt prd sum[m],sum each m:value confmat[x;y]} -/ confusion matrix -confmat:{(k!(2#count k)#0),0^((count each group@)each x group y)@\:k:$[1=type k:asc distinct x,y;01b;k]} -/ confusion dictionary -confdict:{`tn`fp`fn`tp!raze value confmat .(x;y)=z} -/ class report -classreport:{[x;y]k:asc distinct y; - t:`precision`recall`f1_score`support!((precision;sensitivity;f1score;{sum y=z}).\:(x;y))@/:\:k; - ([]class:`$string[k],enlist"avg/total")!flip[t],(avg;avg;avg;sum)@'t} - -/ x list of class labels (0,1,...,n-1), y list of lists of (n) probabilities (one per class) -i.EPS:1e-15 -crossentropy:logloss:{neg avg log i.EPS|y@'x} - -/ regression scores (x predictions, y values) -mse:{avg d*d:x-y} -sse:{sum d*d:x-y} -rmse:{sqrt mse[x;y]} -rmsle:{rmse . 
log(x;y)+1} -mae:{avg abs x-y} -mape:{100*avg abs 1-x%y} -smape:{100*avg abs[y-x]%abs[x]+abs y} -r2score:{1-sse[y;x]%sse[y]avg y} - -/ t-score for a test (one sample) -tscore:{[x;mu](avg[x]-mu)%sdev[x]%sqrt count x} -/ t-score for t-test (two independent samples, not equal variances) -tscoreeq:{abs[avg[x]-avg y]%sqrt(svar[x]%count x)+svar[y]%count y} - -/ covariance/correlation calculate upper triangle only -cvm:{(x+flip(not n=\:n)*x:(n#'0.0),'(x$/:'(n:til count x)_\:x)%count first x)-a*\:a:avg each x:"f"$x} -crm:{cvm[x]%u*/:u:dev each x} -/ correlation matrix, in dictionary format if input is a table -corrmat:{$[t;{x!x!/:y}cols x;]crm$[t:98=type x;value flip@;]x} - -/ exclude colinear point -i.curvepts:{(x;y)@\:where(1b,2_differ deltas[y]%deltas x),1b} -/ area under curve (x,y) -i.auc:{sum 1_deltas[x]*y-.5*deltas y} -/ ROC curve: y the actual class, p the positive probability -roc:{[y;p]{0.,x%last x}each value exec 1+i-y,y from(update sums y from`p xdesc([]y;p))where p<>next p} -/ area under ROC curve -rocaucscore:{[y;p]i.auc . i.curvepts . 
roc[y;p]} +// @kind function +// @category metric +// @desc Accuracy of classification results +// @param pred {int[]|boolean[]|string[]} A vector/matrix of predicted labels +// @param true {int[]|boolean[]|string[]} A vector/matrix of true labels +// @returns {float} The accuracy of predictions made +accuracy:{[pred;true] + avg pred=true + } + +// @kind function +// @category metric +// @desc Precision of a binary classifier +// @param pred {boolean[]} A vector of predicted labels +// @param true {boolean[]} A vector of true labels +// @param posClass {boolean} The positive class +// @returns {float} A measure of the precision +precision:{[pred;true;posClass] + predPos:pred=posClass; + truePos:predPos&true=posClass; + sum[truePos]%sum predPos + } + +// @kind function +// @category metric +// @desc Sensitivity of a binary classifier +// @param pred {boolean[]} A vector of predicted labels +// @param true {boolean[]} A vector of true labels +// @param posClass {boolean} The positive class +// @returns {float} A measure of the sensitivity +sensitivity:{[pred;true;posClass] + realPos:true=posClass; + truePos:realPos&pred=posClass; + sum[truePos]%sum realPos + } + +// @kind function +// @category metric +// @desc Specificity of a binary classifier +// @param pred {boolean[]} A vector of predicted labels +// @param true {boolean[]} A vector of true labels +// @param posClass {boolean} The positive class +// @returns {float} A measure of the specificity +specificity:{[pred;true;posClass] + allNeg:true<>posClass; + trueNeg:allNeg&pred<>posClass; + sum[trueNeg]%sum allNeg + } + +// @kind function +// @category metric +// @desc F-beta score for classification results +// @param pred {number[]|boolean[]} A vector of predicted labels +// @param true {number[]|boolean[]} A vector of true labels +// @param posClass {number|boolean} The positive class +// @param beta {float} The value of beta +// @returns {float} The F-beta score between predicted and true labels 
+fBetaScore:{[pred;true;posClass;beta] + realPos:true=posClass; + predPos:pred=posClass; + minPos:realPos&predPos; + (sum[minPos]*1+beta*beta)%sum[predPos]+beta*beta*sum realPos + } + +// @kind function +// @category metric +// @desc F-1 score for classification results +// @param pred {int[]|boolean[]|string[]} A vector of predicted labels +// @param true {int[]|boolean[]|string[]} A vector of true labels +// @param posClass {number|boolean} The positive class +// @returns {float} The F-1 score between predicted and true labels +f1Score:fBetaScore[;;;1] + +// @kind function +// @category metric +// @desc Matthews-correlation coefficient +// @param pred {int[]|boolean[]|string[]} A vector of predicted labels +// @param true {int[]|boolean[]|string[]} A vector of true labels +// @returns {float} The Matthews-correlation coefficient between predicted +// and true values +matthewCorr:{[true;pred] + confMat:value confMatrix[true;pred]; + sqrtConfMat:sqrt prd sum[confMat],sum each confMat; + .[-;prd raze[confMat](0 1;3 2)]%sqrtConfMat + } + +// @kind function +// @category metric +// @desc Confusion matrix +// @param pred {int[]|boolean[]|string[]} A vector of predicted labels +// @param true {int[]|boolean[]|string[]} A vector of true labels +// @returns {dictionary} A confusion matrix +confMatrix:{[pred;true] + classes:asc distinct pred,true; + if[1=type classes;classes:01b]; + classDict:classes!(2#count classes)#0; + groupClass:0^((count each group@)each pred group true)@\:classes; + classDict,groupClass + } + +// @kind function +// @category metric +// @desc True/false positives and true/false negatives +// @param pred {int[]|boolean[]|string[]} A vector of predicted labels +// @param true {int[]|boolean[]|string[]} A vector of true labels +// @param posClass {number|boolean} The positive class +// @returns {dictionary} The count of true positives (tp), true negatives (tn), +// false positives (fp) and false negatives (fn) +confDict:{[pred;true;posClass] + 
confKeys:`tn`fp`fn`tp; + confVals:raze value confMatrix .(pred;true)=posClass; + confKeys!confVals + } + +// @kind function +// @category metric +// @desc Statistical information about classification result +// @param pred {int[]|boolean[]|string[]} A vector of predicted labels +// @param true {int[]|boolean[]|string[]} A vector of true labels +// @returns {table} The precision, recall, f1 scores and the support +// (number of occurrences) of each class. +classReport:{[pred;true] + trueClass:asc distinct true; + dictCols:`precision`recall`f1_score`support; + funcs:(precision;sensitivity;f1Score;{sum y=z}); + dictVals:(funcs .\:(pred;true))@/:\:trueClass; + dict:dictCols!dictVals; + classTab:([]class:`$string[trueClass],enlist"avg/total"); + classTab!flip[dict],(avg;avg;avg;sum)@'dict + } + +// @kind function +// @category metric +// @desc Logarithmic loss +// @param class {boolean[]} Class labels +// @param prob {float[]} Representing the probability of belonging to +// each class +// @returns {float} Mean logarithmic loss +crossEntropy:logLoss:{[class;prob] + EPS:1e-15; + neg avg log EPS|prob@'class + } + +// @kind function +// @category metric +// @desc Mean square error +// @param pred {float[]} A vector of predicted labels +// @param true {float[]} A vector of true labels +// @returns {float} The mean squared error between predicted values and +// the true values +mse:{[pred;true] + avg diff*diff:pred-true + } + +// @kind function +// @category metric +// @desc Sum squared error +// @param pred {float[]} A vector of predicted labels +// @param true {float[]} A vector of true labels +// @returns {float} The sum squared error between predicted values and +// the true values +sse:{[pred;true] + sum diff*diff:pred-true + } + +// @kind function +// @category metric +// @desc Root mean squared error +// @param pred {float[]} A vector of predicted labels +// @param true {float[]} A vector of true labels +// @returns {float} The root mean squared error between 
predicted values +// and the true values +rmse:{[pred;true] + sqrt mse[pred;true] + } + +// @kind function +// @category metric +// @desc Root mean squared log error +// @param pred {float[]} A vector of predicted labels +// @param true {float[]} A vector of true labels +// @returns {float} The root mean squared log error between predicted values +// and the true values +rmsle:{[pred;true] + rmse . log(pred;true)+1 + } + +// @kind function +// @category metric +// @desc Residual squared error +// @param pred {float[]} A vector of predicted labels +// @param true {float[]} A vector of true labels +// @param n {long} The degrees of freedom of the residual +// @returns {float} The residual squared error between predicted values +// and the true values +rse:{[pred;true;n] + sqrt sse[pred;true]%n + } + +// @kind function +// @category metric +// @desc Mean absolute error +// @param pred {float[]} A vector of predicted labels +// @param true {float[]} A vector of true labels +// @returns {float} The mean absolute error between predicted values +// and the true values +mae:{[pred;true] + avg abs pred-true + } + +// @kind function +// @category metric +// @desc Mean absolute percentage error +// @param pred {float[]} A vector of predicted labels +// @param true {float[]} A vector of true labels +// @returns {float} The mean absolute percentage error between predicted values +// and the true values +mape:{[pred;true] + 100*avg abs 1-pred%true + } + +// @kind function +// @category metric +// @desc Symmetric mean absolute percentage error +// @param pred {float[]} A vector of predicted labels +// @param true {float[]} A vector of true labels +// @returns {float} The symmetric-mean absolute percentage between predicted +// and true values +smape:{[pred;true] + sumAbsVals:abs[pred]+abs true; + 100*avg abs[true-pred]%sumAbsVals + } + +// @kind function +// @category metric +// @desc R2-score for regression model validation +// @param pred {float[]} A vector of predicted labels 
+// @param true {float[]} A vector of true labels +// @returns {float} The R2-score between the true and predicted values. +// Values close to 1 indicate good prediction, while negative values +// indicate poor predictors of the system behavior +r2Score:{[pred;true] + 1-sse[true;pred]%sse[true]avg true + } + +// @kind function +// @category metric +// @desc R2 adjusted score for regression model validation +// @param pred {float[]} A vector of predicted labels +// @param true {float[]} A vector of true labels +// @param p {long} Number of independent regressors, i.e. the number of +// variables in your model, excluding the constant +// @returns {float} The R2 adjusted score between the true and predicted +// values. Values close to 1 indicate good prediction, while negative values +// indicate poor predictors of the system behavior +r2AdjScore:{[pred;true;p] + n:count pred; + r2:r2Score[pred;true]; + 1-(1-r2)*(n-1)%(n-p)-1 + } + +// @kind function +// @category metric +// @desc One-sample t-test score +// @param sample {number[]} A set of samples from a distribution +// @param mu {float} The population mean +// @returns {float} The one sample t-score for a distribution with less than +// 30 samples. 
+tScore:{[sample;mu] + (avg[sample]-mu)%sdev[sample]%sqrt count sample + } + +// @kind function +// @category metric +// @desc T-test for independent samples with equal variances +// and equal sample size +// @param sample1 {number[]} A sample from a distribution +// @param sample2 {number[]} A sample from a distribution +// sample1&2 are independent with equal variance and sample size +// @returns {float} Their t-test score +tScoreEqual:{[sample1;sample2] + count1:count sample1; + count2:count sample2; + absAvg:abs avg[sample1]-avg sample2; + absAvg%sqrt(svar[sample1]%count1)+svar[sample2]%count2 + } + +// @kind function +// @category metric +// @desc Calculate the covariance of a matrix +// @param matrix {number[]} A sample from a distribution +// @returns {number[]} The covariance matrix +covMatrix:{[matrix] + matrix:"f"$matrix; + n:til count matrix; + avgMat:avg each matrix; + upperTri:matrix$/:'n _\:matrix; + diag:not n=\:n; + matrix:(n#'0.0),'upperTri%count first matrix; + multiplyMat:matrix+flip diag*matrix; + multiplyMat-avgMat*\:avgMat + } + +// @kind function +// @category metric +// @desc Calculate the correlation of a matrix or table +// @param data {table|number[]} A sample from a distribution +// @returns {dictionary|number[]} The correlation of the data +corrMatrix:{[data] + dataTab:98=type data; + matrix:$[dataTab;value flip@;]data; + corrMat:i.corrMatrix matrix; + $[dataTab;{x!x!/:y}cols data;]corrMat + } + +// @kind function +// @category metric +// @desc X- and Y-axis values for an ROC curve +// @param label {number[]|boolean[]} Label associated with a prediction +// @param prob {float[]} Probability that each prediction belongs to +// the positive class +// @returns {number[]} The coordinates of the true-positive and false-positive +// values associated with the ROC curve +roc:{[label;prob] + tab:(update sums label from`prob xdesc([]label;prob)); + probDict:exec 1+i-label,label from tab where prob<>next prob; + {0.,x%last x}each value probDict + 
} + +// @kind function +// @category metric +// @desc Area under an ROC curve +// @param label {number[]|boolean[]} Label associated with a prediction +// @param prob {float[]} Probability that each prediction belongs to +// the positive class +// @returns {float} The area under the ROC curve +rocAucScore:{[label;prob] + i.auc . i.curvePts . roc[label;prob] + } diff --git a/util/mproc.q b/util/mproc.q index 8fb5021a..e9de8518 100644 --- a/util/mproc.q +++ b/util/mproc.q @@ -1,11 +1,48 @@ +// util/mproc.q - Utilities for multiprocessing +// Copyright (c) 2021 Kx Systems Inc +// +// Distributes functions to worker processes + \d .ml -if[not `mproc in key .ml;.z.pd:`u#0#0i;mproc.N:0] -.z.pc:{[f;x].z.pd:`u#.z.pd except x;f[x]}@[value;`.z.pc;{{}}] -mproc.reg:{.z.pd,:.z.w;neg[.z.w]@/:mproc.cmds} -mproc.init:{[n;x] +// @kind function +// @category multiProcess +// @desc If the multiProc key is not already loaded, set `.z.pd` to an +// empty list and N to 0 +// @return {::} `.z.pd` is set to an empty list and N to 0 +if[not`multiProc in key .ml;.z.pd:`u#0#0i;multiProc.N:0] + +// @kind function +// @category multiProcess +// @desc Define what happens when the connection is closed +// @param func {fn} Value of `.z.pc` function +// @param proc {int} Handle to the worker process +// @return {::} Appropriate handles are closed +.z.pc:{[func;proc] + .z.pd:`u#.z.pd except proc; + func proc + }@[value;`.z.pc;{{}}] + +// @kind function +// @category multiProcess +// @desc Register the handle and pass any functions required to the +// worker processes +// @return {::} The handle is registered and function is passed to process +multiProc.reg:{ + .z.pd,:.z.w; + neg[.z.w]@/:multiProc.cmds + } + +// @kind function +// @category multiProcess +// @desc Distributes functions to worker processes +// @param n {int} Number of processes open +// @param func {string} Function to be passed to the process +// @return {::} Each of the `n` worker processes evaluate `func` +multiProc.init:{[n;func] if[not p:system"p";'"set port 
to multiprocess"]; - neg[.z.pd]@\:/:x; - mproc.cmds,:x; - do[0|n-mproc.N;system"q ",path,"/util/mprocw.q -pp ",string p]; - mproc.N|:n;} + neg[.z.pd]@\:/:func; + multiProc.cmds,:func; + do[0|n-multiProc.N;system"q ",path,"/util/mprocw.q -pp ",string p]; + multiProc.N|:n; + } diff --git a/util/mprocw.q b/util/mprocw.q index 9930d3d2..ef1a7062 100644 --- a/util/mprocw.q +++ b/util/mprocw.q @@ -1,5 +1,15 @@ +// util/mprocw.q - Multiprocessing +// Copyright (c) 2021 Kx Systems Inc +// +// Multiprocessing based on command line input + +// Exit if `pp isn't passed as a command parameter if[not`pp in key .Q.opt .z.x;exit 1]; +// Exit if no values were passed with pp if[not count .Q.opt[.z.x]`pp;exit 2]; +// Exit if cannot open port if[not h:@[hopen;"J"$first .Q.opt[.z.x]`pp;0];exit 3]; +// Exit if cannot load ml.q @[system;"l ml/ml.q";{exit 4}] -neg[h]`.ml.mproc.reg` +// Register the handle and run appropriate functions +neg[h]`.ml.multiProc.reg` diff --git a/util/pickle.q b/util/pickle.q index 39e30cbb..e6d4d534 100644 --- a/util/pickle.q +++ b/util/pickle.q @@ -1,5 +1,28 @@ +// util/pickle.q - Pickle file utilities +// Copyright (c) 2021 Kx Systems Inc +// +// Save and load python objects to and from pickle files + \d .ml -pickledump:.p.import[`pickle;`:dumps;<] -pickleload:.p.import[`pickle;`:loads] -picklewrap:{[b;x]$[b;{.ml.pickleload y}[;pickledump x];{y}[;x]]} +// @kind function +// @category pickle +// @desc Generate python pickle dump module to save a python object +pickleDump:.p.import[`pickle;`:dumps;<] + +// @kind function +// @category pickle +// @desc Generate python pickle loads module to load a python object +pickleLoad:.p.import[`pickle;`:loads] + +// @kind function +// @category pickle +// @desc A wrapper function to load and save python +// objects using pickle +// @param module {boolean} Whether the pickle load module (1b) or +// dump module (0b) is to be invoked +// @param obj {<} Python object to be saved/loaded +// @return {::;<} Object is 
saved/loaded +pickleWrap:{[module;obj] + $[module;{.ml.pickleLoad y}[;pickleDump obj];{y}[;obj]] + } diff --git a/util/preproc.q b/util/preproc.q index 16129bd6..1e41d93b 100644 --- a/util/preproc.q +++ b/util/preproc.q @@ -1,88 +1,365 @@ +// util/preproc.q - Preprocessing functions +// Copyright (c) 2021 Kx Systems Inc +// +// Preprocessing of data prior to training + \d .ml -/ data preprocessing - -/* x = simple table/dictionary -dropconstant:{ - if[not(typ:type x)in 98 99h;'"Data must be simple table or dictionary"]; - if[99h=typ;if[98h~type value x;'"Data cannot be a keyed table"]]; - // find keys/cols that contain non-numeric data - fc:$[typ=99h;i.fndkey;i.fndcols].(x;"csg ",upper .Q.t); - // store instructions to flip table and execute this - dt:(fdata:$[99=typ;;flip])x; - // drop constant numeric and non numeric cols/keys - fdata i.dropconst.num[fc _ dt],i.dropconst.other fc#dt - } - -// logic to find numeric and drop constant columns -i.dropconst.num:{(where 0=0^var each x)_x} -i.dropconst.other:{(where{all 1_(~':)x}each x)_x} -// Find keys relating to a specific type -i.fndkey:{where({.Q.t abs type x}each x)in y} - - -minmaxscaler:i.ap{(x-mnx)%max[x]-mnx:min x} -stdscaler :i.ap{(x-avg x)%dev x} -/ replace +/- 0w with max/min vals -infreplace :i.ap{@[x;i;:;z@[x;i:where x=y;:;0n]]}/[;-0w 0w;min,max] - -/ produce features which are combinations of n features from table x -polytab:{[x;n]flip(`$"_"sv'string c)!prd each x c@:combs[count c:cols x;n]} - -filltab:{[t;gc;tc;d] - d:$[0=count d;:t;(::)~d;c!(count c:i.fndcols[t;"ghijefcspmdznuvt"]except gc,tc)#`forward;d]; - t:flip flip[t],(`$string[k],\:"_null")!null t k:key d; - ![t;();$[count gc,:();gc!gc;0b];@[i.fillmap;`linear;,';tc][d],'k]} - -/ fill methods -i.fillmap.zero:{0^x} -i.fillmap.median:{med[x]^x} -i.fillmap.mean:{avg[x]^x} -i.fillmap.forward:{"f"$(x first where not null x)^fills x} -i.fillmap.linear:{[t;v] - if[2>count i:where not n:null v;:v]; - g:1_deltas[v i]%deltas t i; - 
"f"$@[v;n;:;v[i][u]+g[u]*t[n]-t[i]u:0|(i:-1_i)bin n:where n]} - -/ encode categorical features using one-hot encoding -i.onehot1:{d!"f"$x=/:d:asc distinct x} -onehot:{[x;c] - if[(::)~c;c:i.fndcols[x;"s"]]; - flip(c _ flip x),raze{[x;c](`$"_"sv'string c,'key r)!value r:i.onehot1 x c}/:[x]c,:()} - -/ encode categorical features with frequency of category occurrence -freqencode:{[x;c] - if[(::)~c;c:i.fndcols[x;"s"]]; - flip(c _ flip x),(`$string[c],\:"_freq")!{(g%sum g:count each group x)x}each x c,:()} - -/ encode categorical features with lexigraphical order -lexiencode:{[x;c] - if[(::)~c;c:i.fndcols[x;"s"]]; - flip(c _ flip x),(`$string[c],\:"_lexi")!{(asc distinct x)?x}each x c,:()} - -// Encode the a dataset to a list of integers, and provide a mapping allowing a user to -// revert new integer lists to the original version -/* x = data to be encoded and mapped -labelencode:{[x] - adx:asc distinct x; - `mapping`encoding!(adx!til count adx;adx?x) - } - -// Map a list of integers to their true representation based on a label encoding schema -/* x = data to be revert to true representation based on -/* y = label encoding map either labelencode[x]`mapping or labelencode[x] -applylabelencode:{[x;y] - if[99h<>type y;'"Input must be a dictionary"]; - $[`mapping`encoding~key y;y[`mapping]?;y?]x - } - -/ split temporal types into constituents -i.timesplit.d:{update wd:1type map;'"Input must be a dictionary"]; + $[`modelInfo`transform~key map;map[`modelInfo]?;map?]data + } + +// @kind function +// @category preprocessing +// @desc Break specified time columns into constituent components +// @param tab {table} Contains time columns +// @param timeCols {symbol[]} Columns to apply encoding to, if set to :: +// all columns with date/time types will be encoded +// @return {dictionary} All time or date types broken into labeled versions +// of their constituent components +timeSplit:{[tab;timeCols] + if[(::)~timeCols;timeCols:i.findCols[tab;"dmntvupz"]]; + 
timeDict:i.timeDict/:[tab]timeCols,:(); + flip(timeCols _ flip tab),raze timeDict + } diff --git a/util/tests/metric.t b/util/tests/metric.t index d18f56bb..8620fd6c 100644 --- a/util/tests/metric.t +++ b/util/tests/metric.t @@ -28,17 +28,6 @@ ymb:100 10#yb plaintab:([]4 5 6.;1 2 3.;-1 -2 -3.;0.4 0.5 0.6) plaintabn:plaintab,'([]x4:1 3 0n) -.ml.range[til 63] ~ 62 -.ml.range[5] ~ 0 -.ml.range[0 1 3 2f]~3f -.ml.range[0 1 0n 2]~2f -.ml.percentile[x;0.75]~np[`:percentile][x;75]` -.ml.percentile[x;0.02]~np[`:percentile][x;2]` -.ml.percentile[xf;0.5]~np[`:percentile][xf;50]` -.ml.percentile[3 0n 4 4 0n 4 4 3 3 4;0.5]~3.5 -("f"$flip value .ml.describe[plaintab])~flip .ml.df2tab .p.import[`pandas][`:DataFrame.describe][.ml.tab2df[plaintab]] -("f"$flip value .ml.describe[plaintabn])~flip (.ml.df2tab .p.import[`pandas][`:DataFrame.describe][.ml.tab2df[plaintab]]),'"f"$([]x4:3 2,sdev[1 3 0n],1 0 1 2 3) - .ml.accuracy[x;y] ~ skmetric[`:accuracy_score][x;y]` .ml.accuracy[xb;yb] ~ 0.5 .ml.accuracy[3 2 2 0n 4;0n 4 3 2 4]~0.2 @@ -65,45 +54,45 @@ plaintabn:plaintab,'([]x4:1 3 0n) .ml.specificity[10#1b;10#0b;1b]~0f .ml.specificity[10#1b;10#1b;0b]~1f -.ml.fbscore[xb;yb;1b;0.02] ~ fbscore[yb;xb;`beta pykw 0.02]` -.ml.fbscore[xb;yb;1b;0.5] ~ fbscore[yb;xb;`beta pykw 0.5]` -.ml.fbscore[xb;yb;1b;1.5] ~ fbscore[yb;xb;`beta pykw 1.5]` -.ml.fbscore[xb;yb;0b;1.5] ~ 0.493670886075949 -.ml.fbscore[1000#1b;yb;0b;.5]~0f -.ml.fbscore[xb;1000#1b;0b;.5]~0f -.ml.fbscore[1000#0b;1000#1b;1b;.2]~0f - -.ml.f1score[xb;yb;0b] ~ f1[xb;yb;`pos_label pykw 0]` -.ml.f1score[xb;yb;1b] ~ f1[xb;yb;`pos_label pykw 1]` -.ml.f1score[xb;1000#0b;1b]~0f -.ml.f1score[1000#1b;yb;1b]~f1[1000#1b;yb;`pos_label pykw 1]` -.ml.f1score[10#1b;10#0b;1b]~f1[10#1b;10#0b;`pos_label pykw 1]` - -.ml.matcorr[xb;yb]~mcoeff[xb;yb]` -.ml.matcorr[110010b;111111b]~0n -.ml.matcorr[111111b;110010b]~0n - -(value .ml.confmat[xb;yb])~(300 400;100 200) -(value .ml.confmat[2 3# 0 0 1 1 0 0;2 3# 1 0 1 0 0 1]) ~ (0 1 0;0 0 0;1 0 0) -(value 
.ml.confmat[1 2 3;3 2 1])~(0 0 1;0 1 0;1 0 0) -(value .ml.confmat[1 2 3f;3 2 1f])~(0 0 1;0 1 0;1 0 0) -(value .ml.confmat[3#1b;3#0b])~(0 3;0 0) - -.ml.confdict[xb;yb;1b] ~ `tn`fp`fn`tp!300 400 100 200 -.ml.confdict[3#0b;3#1b;0b] ~`tn`fp`fn`tp!0 3 0 0 -.ml.confdict[3#1b;3#0b;0b]~`tn`fp`fn`tp!0 0 3 0 - -.ml.classreport[110b;101b]~1!flip`class`precision`recall`f1_score`support!((`$string each 0 1),`$"avg/total";0 0.5 0.25; 0 0.5 0.25;0.0 0.5 0.25;1 2 3i) -.ml.classreport[3 3 5 2 5 1;3 5 2 3 5 1]~1!flip`class`precision`recall`f1_score`support!((`$string each 1 2 3 5),`$"avg/total";1 0 0.5 0.5 0.5;1 0 0.5 0.5 0.5;1 0 0.5 0.5 0.5;1 1 2 2 6i) -.ml.classreport[3 3 5 2 5 1f;3 5 2 3 5 1f]~1!flip`class`precision`recall`f1_score`support!((`$string each 1 2 3 5),`$"avg/total";1 0 0.5 0.5 0.5;1 0 0.5 0.5 0.5;1 0 0.5 0.5 0.5;1 1 2 2 6i) -.ml.classreport[3 3 5 0n 5 1;3 5 2 3 5 0n]~1!flip`class`precision`recall`f1_score`support!((`$string each 0n 2 3 5),`$"avg/total";0 0n 0.5 0.5 0.33333333333333;0 0 0.5 0.5 0.25;0 0 0.5 0.5 0.25;1 1 2 2 6i) - -{.ml.logloss[x;y]~logloss[x;y]`}[1000?0b;(1-p),'p:1000?1f] -{.ml.logloss[x;y]~logloss[x;y]`}[1000?0b;(1-p),'p:1000?1i] -.ml.logloss[10#0b;(1-p),'p:10?1i]~-0f -(floor .ml.logloss[10110b;(2 0n;1 1; 3 1;0n 2; 3 3)])~floor 6 -(floor .ml.logloss[1000?0b;(1-p),'p:1000#0n])~34 -{.ml.crossentropy[x;y]~logloss[x;y]`}[(first idesc@)each p;p%:sum each p:1000 5#5000?1f] +.ml.fBetaScore[xb;yb;1b;0.02] ~ fbscore[yb;xb;`beta pykw 0.02]` +.ml.fBetaScore[xb;yb;1b;0.5] ~ fbscore[yb;xb;`beta pykw 0.5]` +.ml.fBetaScore[xb;yb;1b;1.5] ~ fbscore[yb;xb;`beta pykw 1.5]` +.ml.fBetaScore[xb;yb;0b;1.5] ~ 0.493670886075949 +.ml.fBetaScore[1000#1b;yb;0b;.5]~0f +.ml.fBetaScore[xb;1000#1b;0b;.5]~0f +.ml.fBetaScore[1000#0b;1000#1b;1b;.2]~0f + +.ml.f1Score[xb;yb;0b] ~ f1[xb;yb;`pos_label pykw 0]` +.ml.f1Score[xb;yb;1b] ~ f1[xb;yb;`pos_label pykw 1]` +.ml.f1Score[xb;1000#0b;1b]~0f +.ml.f1Score[1000#1b;yb;1b]~f1[1000#1b;yb;`pos_label pykw 1]` 
+.ml.f1Score[10#1b;10#0b;1b]~f1[10#1b;10#0b;`pos_label pykw 1]` + +.ml.matthewCorr[xb;yb]~mcoeff[xb;yb]` +.ml.matthewCorr[110010b;111111b]~0n +.ml.matthewCorr[111111b;110010b]~0n + +(value .ml.confMatrix[xb;yb])~(300 400;100 200) +(value .ml.confMatrix[2 3# 0 0 1 1 0 0;2 3# 1 0 1 0 0 1]) ~ (0 1 0;0 0 0;1 0 0) +(value .ml.confMatrix[1 2 3;3 2 1])~(0 0 1;0 1 0;1 0 0) +(value .ml.confMatrix[1 2 3f;3 2 1f])~(0 0 1;0 1 0;1 0 0) +(value .ml.confMatrix[3#1b;3#0b])~(0 3;0 0) + +.ml.confDict[xb;yb;1b] ~ `tn`fp`fn`tp!300 400 100 200 +.ml.confDict[3#0b;3#1b;0b] ~`tn`fp`fn`tp!0 3 0 0 +.ml.confDict[3#1b;3#0b;0b]~`tn`fp`fn`tp!0 0 3 0 + +.ml.classReport[110b;101b]~1!flip`class`precision`recall`f1_score`support!((`$string each 0 1),`$"avg/total";0 0.5 0.25; 0 0.5 0.25;0.0 0.5 0.25;1 2 3i) +.ml.classReport[3 3 5 2 5 1;3 5 2 3 5 1]~1!flip`class`precision`recall`f1_score`support!((`$string each 1 2 3 5),`$"avg/total";1 0 0.5 0.5 0.5;1 0 0.5 0.5 0.5;1 0 0.5 0.5 0.5;1 1 2 2 6i) +.ml.classReport[3 3 5 2 5 1f;3 5 2 3 5 1f]~1!flip`class`precision`recall`f1_score`support!((`$string each 1 2 3 5),`$"avg/total";1 0 0.5 0.5 0.5;1 0 0.5 0.5 0.5;1 0 0.5 0.5 0.5;1 1 2 2 6i) +.ml.classReport[3 3 5 0n 5 1;3 5 2 3 5 0n]~1!flip`class`precision`recall`f1_score`support!((`$string each 0n 2 3 5),`$"avg/total";0 0n 0.5 0.5 0.33333333333333;0 0 0.5 0.5 0.25;0 0 0.5 0.5 0.25;1 1 2 2 6i) + +{.ml.logLoss[x;y]~logloss[x;y]`}[1000?0b;(1-p),'p:1000?1f] +{.ml.logLoss[x;y]~logloss[x;y]`}[1000?0b;(1-p),'p:1000?1i] +.ml.logLoss[10#0b;(1-p),'p:10?1i]~-0f +(floor .ml.logLoss[10110b;(2 0n;1 1; 3 1;0n 2; 3 3)])~floor 6 +(floor .ml.logLoss[1000?0b;(1-p),'p:1000#0n])~34 +{.ml.crossEntropy[x;y]~logloss[x;y]`}[(first idesc@)each p;p%:sum each p:1000 5#5000?1f] .ml.mse[x;y] ~ skmetric[`:mean_squared_error][x;y]` .ml.mse[xf;yf] ~ skmetric[`:mean_squared_error][xf;yf]` .ml.mse[x;x]~0f @@ -138,35 +127,35 @@ plaintabn:plaintab,'([]x4:1 3 0n) .ml.smape[xm;ym]~{smape[x;y]}'[flip xm;flip ym] .ml.smape[x;x]~0f .ml.smape[1 0n 4 2 
0n;1 2 4 3 1]~6.666666666666666667 -.ml.r2score[xf;yf] ~ r2[yf;xf]` -.ml.r2score[xf;xf] ~ r2[xf;xf]` -.ml.r2score[2 2 2;1 2 3] ~ r2[1 2 3;2 2 2]` -.ml.r2score[x;x]~1f -.ml.r2score[1 0n 4 2 0n;1 2 4 2 1]~1f -.ml.tscore[x;y] ~first stats[`:ttest_1samp][x;y]` -.ml.tscore[xf;yf]~first stats[`:ttest_1samp][xf;yf]` -.ml.tscore[xb;yb]~first stats[`:ttest_1samp][xb;yb]` -.ml.tscore[x;x]~first stats[`:ttest_1samp][x;x]` -.ml.tscoreeq[x;y]~abs first stats[`:ttest_ind][x;y]` -.ml.tscoreeq[xf;yf]~abs first stats[`:ttest_ind][xf;yf]` -.ml.tscoreeq[xb;yb]~abs first stats[`:ttest_ind][xb;yb]` -.ml.tscoreeq[x;x]~abs first stats[`:ttest_ind][x;x]` -.ml.cvm[flip value flip plaintab]~np[`:cov][flip value flip plaintab;`bias pykw 1b]` -.ml.cvm[(10110b;01110b)]~(0.24 0.04;0.04 0.24) -.ml.cvm[(10110b;11111b)]~(0.24 0f;0 0f) -.ml.cvm[(11111b;11111b)]~(0 0f;0 0f) -.ml.cvm[(10110b;1101b,0n)]~(0.24 0n;2#0n) -.ml.crm[(1 2;2 1)]~(2 2#1 -1 -1 1f) -.ml.crm[(011b;001b)]~(1 0.5;0.5 1) -.ml.crm[(1111b;1111b)]~(2 2#4#0n) -.ml.crm[(1 1 2;1 2 0n)]~(1 0n;2#0n) -(value .ml.corrmat[plaintab]) ~ "f"$([]1 1 -1 1;1 1 -1 1;-1 -1 1 -1;1 1 -1 1) -.ml.corrmat[(0011b;1010b)]~(1 0f;0 1f) -.ml.corrmat[(0011b;1111b)]~(1 0n;2#0n) -.ml.corrmat[(1111b;1111b)]~(2 2#2#0n) -.ml.corrmat[(1 1 2;1 2 0n)]~(1 0n;2#0n) -{.ml.rocaucscore[x;y]~rocau[x;y]`}[10?0b;10?1f] -.ml.rocaucscore[10#01b;10#1f]~0.5 -.ml.rocaucscore[10#0b;10?1f]~0f -.ml.rocaucscore[10#1b;10#0f]~0f -.ml.rocaucscore[1011000110b;0n 0.1 0.2 0.1 0.3 0.4 0.2 0.4 0.3 0.2]~0.525 +.ml.r2Score[xf;yf] ~ r2[yf;xf]` +.ml.r2Score[xf;xf] ~ r2[xf;xf]` +.ml.r2Score[2 2 2;1 2 3] ~ r2[1 2 3;2 2 2]` +.ml.r2Score[x;x]~1f +.ml.r2Score[1 0n 4 2 0n;1 2 4 2 1]~1f +.ml.tScore[x;y] ~first stats[`:ttest_1samp][x;y]` +.ml.tScore[xf;yf]~first stats[`:ttest_1samp][xf;yf]` +.ml.tScore[xb;yb]~first stats[`:ttest_1samp][xb;yb]` +.ml.tScore[x;x]~first stats[`:ttest_1samp][x;x]` +.ml.tScoreEqual[x;y]~abs first stats[`:ttest_ind][x;y]` +.ml.tScoreEqual[xf;yf]~abs first 
stats[`:ttest_ind][xf;yf]` +.ml.tScoreEqual[xb;yb]~abs first stats[`:ttest_ind][xb;yb]` +.ml.tScoreEqual[x;x]~abs first stats[`:ttest_ind][x;x]` +.ml.covMatrix[flip value flip plaintab]~np[`:cov][flip value flip plaintab;`bias pykw 1b]` +.ml.covMatrix[(10110b;01110b)]~(0.24 0.04;0.04 0.24) +.ml.covMatrix[(10110b;11111b)]~(0.24 0f;0 0f) +.ml.covMatrix[(11111b;11111b)]~(0 0f;0 0f) +.ml.covMatrix[(10110b;1101b,0n)]~(0.24 0n;2#0n) +.ml.corrMatrix[(1 2;2 1)]~(2 2#1 -1 -1 1f) +.ml.corrMatrix[(011b;001b)]~(1 0.5;0.5 1) +.ml.corrMatrix[(1111b;1111b)]~(2 2#4#0n) +.ml.corrMatrix[(1 1 2;1 2 0n)]~(1 0n;2#0n) +(value .ml.corrMatrix[plaintab]) ~ "f"$([]1 1 -1 1;1 1 -1 1;-1 -1 1 -1;1 1 -1 1) +.ml.corrMatrix[(0011b;1010b)]~(1 0f;0 1f) +.ml.corrMatrix[(0011b;1111b)]~(1 0n;2#0n) +.ml.corrMatrix[(1111b;1111b)]~(2 2#2#0n) +.ml.corrMatrix[(1 1 2;1 2 0n)]~(1 0n;2#0n) +{.ml.rocAucScore[x;y]~rocau[x;y]`}[10?0b;10?1f] +.ml.rocAucScore[10#01b;10#1f]~0.5 +.ml.rocAucScore[10#0b;10?1f]~0f +.ml.rocAucScore[10#1b;10#0f]~0f +.ml.rocAucScore[1011000110b;0n 0.1 0.2 0.1 0.3 0.4 0.2 0.4 0.3 0.2]~0.525 diff --git a/util/tests/preproctst.t b/util/tests/preproctst.t index c3fc2581..09c97824 100644 --- a/util/tests/preproctst.t +++ b/util/tests/preproctst.t @@ -17,12 +17,18 @@ tab:([]sym:`a`a`a`b`b;time:`time$til 5;@[5#0n;2 4;:;1f];@["f"$til 5;4;:;0n]) timetab:([]`timestamp$(2000.01.01+til 3);1 3 2;2 1 3) timetabn:([]`timestamp$(2000.01.01+til 3),0n;1 3 3 2;2 1 3 3) +\S 42 + x:1000?40 y:1000?40 xf:1000?100f yf:1000?100f xb:1000#0101101011b yb:1000#0000111000b +scale1:(2 3f;4 2f;5 3f) +scale2:3 2 5 4 1f +scale3:0011b +scale4:3 2#3 5 1 0n 4 0n onehotx:`a`p`l`h`j symtf:([]`a`b`b`a`a;"f"$til 5) symti:([]`a`b`b`a`a;til 5) @@ -36,93 +42,144 @@ tf:([]1000?500f;1000#30f;1000?1000f;1000?100f) tb:([]1000?0b;1000#1b;1000?0b;1000?0b) infdict:`x`x1`x2!(0 1 2 0w;0 1 2 -0w;1 2 3 0w) nt:([]101b;000b;1 2 0n) +keyedinfs:([k:1 2]x:0 0W) -.ml.dropconstant[ti]~flip `x`x2`x3!ti`x`x2`x3 -.ml.dropconstant[tf]~flip 
`x`x2`x3!tf`x`x2`x3 -.ml.dropconstant[tb]~flip `x`x2`x3!tb`x`x2`x3 -.ml.dropconstant[nt]~([]101b;x2:1 2 0n) -.ml.dropconstant[nulltab]~select x,x1,x2,x3 from nulltab +.ml.dropConstant[ti]~flip `x`x2`x3!ti`x`x2`x3 +.ml.dropConstant[tf]~flip `x`x2`x3!tf`x`x2`x3 +.ml.dropConstant[tb]~flip `x`x2`x3!tb`x`x2`x3 +.ml.dropConstant[flip ti]~`x`x2`x3!ti`x`x2`x3 +.ml.dropConstant[flip tf]~`x`x2`x3!tf`x`x2`x3 +.ml.dropConstant[flip tb]~`x`x2`x3!tb`x`x2`x3 +.ml.dropConstant[nt]~([]101b;x2:1 2 0n) +.ml.dropConstant[nulltab]~select x,x1,x2,x3 from nulltab MinMaxScaler[`:fit][flip plainmat]; +minMaxKeys:`minData`maxData +minMax1:.ml.minMaxScaler.fit[plainmat] +minMax2:.ml.minMaxScaler.fit[scale1] +minMax3:.ml.minMaxScaler.fit[scale2] +minMax4:.ml.minMaxScaler.fit[scale3] +minMax5:.ml.minMaxScaler.fit[scale4] + +minMax1[`modelInfo]~minMaxKeys!(4 1 -3 0.4f;6 3 -1 0.6f) +minMax2[`modelInfo]~minMaxKeys!(2 2 3f;3 4 5f) +minMax3[`modelInfo]~minMaxKeys!1 5f +minMax4[`modelInfo]~minMaxKeys!01b +minMax5[`modelInfo]~minMaxKeys!(3 1 4f;5 1 4f) + +.ml.minMaxScaler.fitTransform[plainmat]~flip"f"$MinMaxScaler[`:transform][flip plainmat]` +.ml.minMaxScaler.fitTransform[scale1]~(0 1f;1 0f;1 0f) +.ml.minMaxScaler.fitTransform[scale2]~0.5 0.25 1 0.75 0f +.ml.minMaxScaler.fitTransform[scale3]~0 0 1 1f +.ml.minMaxScaler.fitTransform[scale4]~(0 1f;2#0n;2#0n) +minMax2.transform[scale4]~(1 3f;-0.5 0n;0.5 0n) +minMax3.transform[5#y]~5.75 1.75 9.5 5.5 4.25 + StdScaler[`:fit][flip plainmat]; -.ml.minmaxscaler[plainmat] ~ flip"f"$MinMaxScaler[`:transform][flip plainmat]` -.ml.minmaxscaler[(2 3f;4 2f;5 3f)]~(0 1f;1 0f;1 0f) -.ml.minmaxscaler[3 2 5 4 1f]~0.5 0.25 1 0.75 0f -.ml.minmaxscaler[0011b]~0 0 1 1f -.ml.minmaxscaler[3 2#3 5 1 0n 4 0n]~(0 1f;2#0n;2#0n) - -.ml.stdscaler[plainmat] ~ flip"f"$StdScaler[`:transform][flip plainmat]` -.ml.stdscaler[(2 3f;4 2f;5 3f)]~(-1 1f;1 -1f;1 -1f) -.ml.stdscaler[xf]~scale[xf]` -.ml.stdscaler[y]~scale[y]` -.ml.stdscaler[yb]~scale[yb]` -.ml.stdscaler[3 2#2 4 1 0n 2 0n]~(-1 
1f;2#0n;2#0n) - -.ml.infreplace[infdict]~`x`x1`x2!"f"$(0 1 2 2;0 1 2 0;1 2 3 3) -.ml.infreplace[flip infdict]~flip `x`x1`x2!"f"$(0 1 2 2;0 1 2 0;1 2 3 3) -.ml.infreplace[infdict`x]~0 1 2 2f - -.ml.polytab[([] 2 4 1f;3 4 1f;3 2 3f);2]~([]x_x1:6 16 1f;x_x2:6 8 3f;x1_x2:9 8 3f) -.ml.polytab[([] 2 4 1;3 4 1;3 2 3);2]~([]x_x1:6 16 1;x_x2:6 8 3;x1_x2:9 8 3) -.ml.polytab[([]101b;110b;100b);2]~([]x_x1:1 0 0i;x_x2:1 0 0i;x1_x2:1 0 0i) -.ml.polytab[nt;2]~([]x_x1:0 0 0i;x_x2:1 0 0n;x1_x2:0 0 0n) -.ml.polytab[([] 0n 0n;2 3;1 2);2]~([]x_x1:2#0n;x_x2:2#0n;x1_x2:2 6) - -.ml.filltab[tab;0#();`time;`x1`x!`linear`mean]~flip`sym`time`x`x1`x1_null`x_null!(`a`a`a`b`b;00:00:00.000 00:00:00.001 00:00:00.002 00:00:00.003 00:00:00.004;1 1 1 1 1f;0 1 2 3 4f;00001b;11010b) -.ml.filltab[tab;`sym;`time;()!()]~tab -.ml.filltab[tab;`sym;`time;::]~flip`sym`time`x`x1`x_null`x1_null!(`a`a`a`b`b;00:00:00.000 00:00:00.001 00:00:00.002 00:00:00.003 00:00:00.004;1 1 1 1 1f;0 1 2 3 3f;11010b;00001b) -(select x4,x5,x1_null,x3_null from .ml.filltab[nulltab;`x2;x;`x1`x3!`median`mean])~([]x4:5#0;x5:5#0n;x1_null:00100b;x3_null:11000b) -.ml.filltab[tab,'flip (enlist `x2)!enlist 5#0n;`sym;`time;`x1`x`x2!`median`mean`max]~flip`sym`time`x`x1`x2`x1_null`x_null`x2_null!(`a`a`a`b`b;00:00:00.000 00:00:00.001 00:00:00.002 00:00:00.003 00:00:00.004;1 1 1 1 1f;0 1 2 3 3f;5#0n;00001b;11010b;11111b) - -.ml.onehot[symtf;`x] ~"f"$([] x1:til 5;x_a:1 0 0 1 1;x_b: 0 1 1 0 0) -.ml.onehot[symtf;::] ~"f"$([] x1:til 5;x_a:1 0 0 1 1;x_b: 0 1 1 0 0) -.ml.onehot[symti;`x] ~([] x1:til 5;x_a:1 0 0 1 1f;x_b: 0 1 1 0 0f) -.ml.onehot[symti;::] ~([] x1:til 5;x_a:1 0 0 1 1f;x_b: 0 1 1 0 0f) -.ml.onehot[symtb;`x]~([] x1:11001b;x_a:1 0 0 1 1f;x_b: 0 1 1 0 0f) -.ml.onehot[symtb;::]~([] x1:11001b;x_a:1 0 0 1 1f;x_b: 0 1 1 0 0f) -.ml.onehot[symtn;`x]~([]x1:til 5;x_:0 0 0 1 0f;x_a:1 0 0 0 1f;x_b:0 1 1 0 0f) -.ml.onehot[symtn;::]~([]x1:til 5;x_:0 0 0 1 0f;x_a:1 0 0 0 1f;x_b:0 1 1 0 0f) -.ml.onehot[symm;::]~([]x1:til 5;x_a:1 0 0 1 1f;x_b:0 1 1 0 
0f;x2_q:1 0 1 1 0f;x2_w:0 1 0 0 1f) - -.ml.freqencode[symtf;`x]~(delete x from symtf),'([]x_freq:0.6 0.4 0.4 0.6 0.6) -.ml.freqencode[symtf;::]~(delete x from symtf),'([]x_freq:0.6 0.4 0.4 0.6 0.6) -.ml.freqencode[symti;`x]~(delete x from symti),'([]x_freq:0.6 0.4 0.4 0.6 0.6) -.ml.freqencode[symti;::]~(delete x from symti),'([]x_freq:0.6 0.4 0.4 0.6 0.6) -.ml.freqencode[symtb;`x]~(delete x from symtb),'([]x_freq:0.6 0.4 0.4 0.6 0.6) -.ml.freqencode[symtb;::]~(delete x from symtb),'([]x_freq:0.6 0.4 0.4 0.6 0.6) -.ml.freqencode[symtn;`x]~([] x1:til 5;x_freq:0.4 0.4 0.4 0.2 0.4) -.ml.freqencode[symtn;::]~([] x1:til 5;x_freq:0.4 0.4 0.4 0.2 0.4) -.ml.freqencode[symm;::]~([]x1:til 5;x_freq:0.6 0.4 0.4 0.6 0.6;x2_freq:0.6 0.4 0.6 0.6 0.4) - -.ml.lexiencode[symtf;`x]~(delete x from symtf),'([]x_lexi:0 1 1 0 0) -.ml.lexiencode[symtf;::]~(delete x from symtf),'([]x_lexi:0 1 1 0 0) -.ml.lexiencode[symti;`x]~(delete x from symti),'([]x_lexi:0 1 1 0 0) -.ml.lexiencode[symti;::]~(delete x from symti),'([]x_lexi:0 1 1 0 0) -.ml.lexiencode[symtb;`x]~(delete x from symtb),'([]x_lexi:0 1 1 0 0) -.ml.lexiencode[symtb;::]~(delete x from symtb),'([]x_lexi:0 1 1 0 0) -.ml.lexiencode[symtn;`x]~([] x1:til 5;x_lexi:1 2 2 0 1) -.ml.lexiencode[symtn;::]~([] x1:til 5;x_lexi:1 2 2 0 1) -.ml.lexiencode[symm;::]~([]x1:til 5;x_lexi: 0 1 1 0 0;x2_lexi:0 1 0 0 1) +stdScaleKeys:`avgData`devData +stdScale1:.ml.stdScaler.fit[plainmat] +stdScale2:.ml.stdScaler.fit[scale1] +stdScale3:.ml.stdScaler.fit[xf] +stdScale4:.ml.stdScaler.fit[y] +stdScale5:.ml.stdScaler.fit[yb] +stdScale6:.ml.stdScaler.fit[scale4] + +key[stdScale1[`modelInfo]]~stdScaleKeys +key[stdScale2[`modelInfo]]~stdScaleKeys +key[stdScale3[`modelInfo]]~stdScaleKeys +key[stdScale4[`modelInfo]]~stdScaleKeys +key[stdScale5[`modelInfo]]~stdScaleKeys +key[stdScale6[`modelInfo]]~stdScaleKeys + +stdScale1.transform[plainmat]~flip"f"$StdScaler[`:transform][flip plainmat]` +stdScale2.transform[scale1]~(-1 1f;1 -1f;1 -1f) 
+stdScale3.transform[xf]~scale[xf]` +stdScale4.transform[y]~scale[y]` +stdScale5.transform[yb]~scale[yb]` +stdScale6.transform[scale4]~(-1 1f;2#0n;2#0n) +stdScale2.transform[scale4]~(1 5f;-2 0n;0 0n) + +.ml.infReplace[infdict]~`x`x1`x2!"f"$(0 1 2 2;0 1 2 0;1 2 3 3) +.ml.infReplace[flip infdict]~flip `x`x1`x2!"f"$(0 1 2 2;0 1 2 0;1 2 3 3) +.ml.infReplace[infdict`x]~0 1 2 2f +.ml.infReplace[keyedinfs]~([k:1 2]x:0 0) + +.ml.polyTab[([] 2 4 1f;3 4 1f;3 2 3f);2]~([]x_x1:6 16 1f;x_x2:6 8 3f;x1_x2:9 8 3f) +.ml.polyTab[([] 2 4 1;3 4 1;3 2 3);2]~([]x_x1:6 16 1;x_x2:6 8 3;x1_x2:9 8 3) +.ml.polyTab[([]101b;110b;100b);2]~([]x_x1:1 0 0i;x_x2:1 0 0i;x1_x2:1 0 0i) +.ml.polyTab[nt;2]~([]x_x1:0 0 0i;x_x2:1 0 0n;x1_x2:0 0 0n) +.ml.polyTab[([] 0n 0n;2 3;1 2);2]~([]x_x1:2#0n;x_x2:2#0n;x1_x2:2 6) + +.ml.fillTab[tab;0#();`time;`x1`x!`linear`mean]~flip`sym`time`x`x1`x1_null`x_null!(`a`a`a`b`b;00:00:00.000 00:00:00.001 00:00:00.002 00:00:00.003 00:00:00.004;1 1 1 1 1f;0 1 2 3 4f;00001b;11010b) +.ml.fillTab[tab;`sym;`time;()!()]~tab +.ml.fillTab[tab;`sym;`time;::]~flip`sym`time`x`x1`x_null`x1_null!(`a`a`a`b`b;00:00:00.000 00:00:00.001 00:00:00.002 00:00:00.003 00:00:00.004;1 1 1 1 1f;0 1 2 3 3f;11010b;00001b) +(select x4,x5,x1_null,x3_null from .ml.fillTab[nulltab;`x2;x;`x1`x3!`median`mean])~([]x4:5#0;x5:5#0n;x1_null:00100b;x3_null:11000b) +.ml.fillTab[tab,'flip (enlist `x2)!enlist 5#0n;`sym;`time;`x1`x`x2!`median`mean`max]~flip`sym`time`x`x1`x2`x1_null`x_null`x2_null!(`a`a`a`b`b;00:00:00.000 00:00:00.001 00:00:00.002 00:00:00.003 00:00:00.004;1 1 1 1 1f;0 1 2 3 3f;5#0n;00001b;11010b;11111b) + +.ml.oneHot.fitTransform[symtf;`x] ~"f"$([] x1:til 5;x_a:1 0 0 1 1;x_b: 0 1 1 0 0) +.ml.oneHot.fitTransform[symtf;::] ~"f"$([] x1:til 5;x_a:1 0 0 1 1;x_b: 0 1 1 0 0) +.ml.oneHot.fitTransform[symti;`x] ~([] x1:til 5;x_a:1 0 0 1 1f;x_b: 0 1 1 0 0f) +.ml.oneHot.fitTransform[symti;::] ~([] x1:til 5;x_a:1 0 0 1 1f;x_b: 0 1 1 0 0f) +.ml.oneHot.fitTransform[symtb;`x]~([] x1:11001b;x_a:1 0 0 1 1f;x_b: 0 1 1 
0 0f) +.ml.oneHot.fitTransform[symtb;::]~([] x1:11001b;x_a:1 0 0 1 1f;x_b: 0 1 1 0 0f) +.ml.oneHot.fitTransform[symtn;`x]~([]x1:til 5;x_:0 0 0 1 0f;x_a:1 0 0 0 1f;x_b:0 1 1 0 0f) +.ml.oneHot.fitTransform[symtn;::]~([]x1:til 5;x_:0 0 0 1 0f;x_a:1 0 0 0 1f;x_b:0 1 1 0 0f) +.ml.oneHot.fitTransform[symm;::]~([]x1:til 5;x_a:1 0 0 1 1f;x_b:0 1 1 0 0f;x2_q:1 0 1 1 0f;x2_w:0 1 0 0 1f) + +oneHot1:.ml.oneHot.fit[symtf;::] +oneHot1.transform[symtb;::]~([] x1:11001b;x_a:1 0 0 1 1f;x_b: 0 1 1 0 0f) +oneHot1.transform[symti;::]~([] x1:til 5;x_a:1 0 0 1 1f;x_b: 0 1 1 0 0f) +oneHot1.transform[symm;`x`x2!`x`x]~([]x1:til 5;x_a:1 0 0 1 1f;x_b:0 1 1 0 0f;x2_a:5#0f;x2_b:5#0f) + +.ml.freqEncode[symtf;`x]~(delete x from symtf),'([]x_freq:0.6 0.4 0.4 0.6 0.6) +.ml.freqEncode[symtf;::]~(delete x from symtf),'([]x_freq:0.6 0.4 0.4 0.6 0.6) +.ml.freqEncode[symti;`x]~(delete x from symti),'([]x_freq:0.6 0.4 0.4 0.6 0.6) +.ml.freqEncode[symti;::]~(delete x from symti),'([]x_freq:0.6 0.4 0.4 0.6 0.6) +.ml.freqEncode[symtb;`x]~(delete x from symtb),'([]x_freq:0.6 0.4 0.4 0.6 0.6) +.ml.freqEncode[symtb;::]~(delete x from symtb),'([]x_freq:0.6 0.4 0.4 0.6 0.6) +.ml.freqEncode[symtn;`x]~([] x1:til 5;x_freq:0.4 0.4 0.4 0.2 0.4) +.ml.freqEncode[symtn;::]~([] x1:til 5;x_freq:0.4 0.4 0.4 0.2 0.4) +.ml.freqEncode[symm;::]~([]x1:til 5;x_freq:0.6 0.4 0.4 0.6 0.6;x2_freq:0.6 0.4 0.6 0.6 0.4) + +.ml.lexiEncode.fitTransform[symtf;`x]~(delete x from symtf),'([]x_lexi:0 1 1 0 0) +.ml.lexiEncode.fitTransform[symtf;::]~(delete x from symtf),'([]x_lexi:0 1 1 0 0) +.ml.lexiEncode.fitTransform[symti;`x]~(delete x from symti),'([]x_lexi:0 1 1 0 0) +.ml.lexiEncode.fitTransform[symti;::]~(delete x from symti),'([]x_lexi:0 1 1 0 0) +.ml.lexiEncode.fitTransform[symtb;`x]~(delete x from symtb),'([]x_lexi:0 1 1 0 0) +.ml.lexiEncode.fitTransform[symtb;::]~(delete x from symtb),'([]x_lexi:0 1 1 0 0) +.ml.lexiEncode.fitTransform[symtn;`x]~([] x1:til 5;x_lexi:1 2 2 0 1) +.ml.lexiEncode.fitTransform[symtn;::]~([] x1:til 
5;x_lexi:1 2 2 0 1) +.ml.lexiEncode.fitTransform[symm;::]~([]x1:til 5;x_lexi: 0 1 1 0 0;x2_lexi:0 1 0 0 1) + +lexi1:.ml.lexiEncode.fit[symtf;::] +lexi1.transform[symtb;::]~(delete x from symtb),'([]x_lexi:0 1 1 0 0) +lexi1.transform[symti;::]~(delete x from symti),'([]x_lexi:0 1 1 0 0) +lexi1.transform[symm;`x`x2!`x`x]~([]x1:til 5;x_lexi: 0 1 1 0 0;x2_lexi:5#-1) guidList :asc 5?0Ng -symList :`b`a`d`c +symList1 :`b`a`d`c +symList2 :`e`a`d`d floatList:1.2 2 2.5 0.1 -guidReturn:`mapping`encoding!(((asc distinct guidList)!til count distinct guidList);til 5) -.ml.labelencode[guidList] ~guidReturn -.ml.labelencode[symList] ~`mapping`encoding!((`a`b`c`d!til 4);1 0 3 2) -.ml.labelencode[floatList]~`mapping`encoding!((0.1 1.2 2 2.5!til 4);1 2 3 0) - -.ml.applylabelencode[0 0 2 3 4 ;.ml.labelencode floatList]~(0.1;0.1;2f;2.5;0n) -.ml.applylabelencode[1 1 2 5 3 0;.ml.labelencode symList ]~`b`b`c``d`a -.ml.applylabelencode[0 0 0 1 6 ;.ml.labelencode guidList ]~(3#guidList 0),(guidList 1),`guid$0Ng -.ml.applylabelencode[0 0 2 3 4 ;.ml.labelencode[floatList]`mapping]~(0.1;0.1;2f;2.5;0n) -.ml.applylabelencode[1 1 2 5 3 0;.ml.labelencode[symList]`mapping]~`b`b`c``d`a -.ml.applylabelencode[0 0 0 1 6 ;.ml.labelencode[guidList]`mapping]~(3#guidList 0),(guidList 1),`guid$0Ng - -.ml.timesplit[timetab;::]~(delete x from timetab),'flip`x_dow`x_year`x_mm`x_dd`x_qtr`x_wd`x_hh`x_uu`x_ss!(0 1 2i;2000 2000 2000i;1 1 1i;1 2 3i;1 1 1j;001b;0 0 0i;0 0 0i;0 0 0i) -.ml.timesplit[timetab;`x]~(delete x from timetab),'flip`x_dow`x_year`x_mm`x_dd`x_qtr`x_wd`x_hh`x_uu`x_ss!(0 1 2i;2000 2000 2000i;1 1 1i;1 2 3i;1 1 1j;001b;0 0 0i;0 0 0i;0 0 0i) -.ml.timesplit[timetabn;::]~(delete x from timetabn),'flip`x_dow`x_year`x_mm`x_dd`x_qtr`x_wd`x_hh`x_uu`x_ss!(`int$(0 1 2 0n);`int$(2000 2000 2000 0n);`int$(1 1 1 0n);`int$(1 2 3 0n);"j"$(1 1 1 0n);0010b;`int$(0 0 0 0n);`int$(0 0 0 0n);`int$(0 0 0 0n)) -.ml.timesplit[symtf;::]~symtf -.ml.timesplit[symti;::]~symti -.ml.timesplit[symtb;::]~symtb 
+.ml.labelEncode.fit[guidList][`modelInfo] ~(asc distinct guidList)!til count distinct guidList +.ml.labelEncode.fit[symList1][`modelInfo] ~`a`b`c`d!til 4 +.ml.labelEncode.fit[floatList][`modelInfo]~0.1 1.2 2 2.5!til 4 + +label1:.ml.labelEncode.fit[symList1] +label1.transform[symList1]~1 0 3 2 +label1.transform[symList2]~-1 0 3 3 + +.ml.applyLabelEncode[0 0 2 3 4 ;.ml.labelEncode.fit floatList]~(0.1;0.1;2f;2.5;0n) +.ml.applyLabelEncode[1 1 2 5 3 0;.ml.labelEncode.fit symList1 ]~`b`b`c``d`a +.ml.applyLabelEncode[0 0 0 1 6 ;.ml.labelEncode.fit guidList ]~(3#guidList 0),(guidList 1),`guid$0Ng +.ml.applyLabelEncode[0 0 2 3 4 ;.ml.labelEncode.fit [floatList]`modelInfo]~(0.1;0.1;2f;2.5;0n) +.ml.applyLabelEncode[1 1 2 5 3 0;.ml.labelEncode.fit [symList1]`modelInfo]~`b`b`c``d`a +.ml.applyLabelEncode[0 0 0 1 6 ;.ml.labelEncode.fit [guidList]`modelInfo]~(3#guidList 0),(guidList 1),`guid$0Ng + +timesplitKeys:`x_dayOfWeek`x_year`x_month`x_day`x_quarter`x_weekday`x_hour`x_minute`x_second +.ml.timeSplit[timetab;::]~(delete x from timetab),'flip timesplitKeys!(0 1 2i;2000 2000 2000i;1 1 1i;1 2 3i;1 1 1j;001b;0 0 0i;0 0 0i;0 0 0i) +.ml.timeSplit[timetab;`x]~(delete x from timetab),'flip timesplitKeys!(0 1 2i;2000 2000 2000i;1 1 1i;1 2 3i;1 1 1j;001b;0 0 0i;0 0 0i;0 0 0i) +.ml.timeSplit[timetabn;::]~(delete x from timetabn),'flip timesplitKeys!(`int$(0 1 2 0n);`int$(2000 2000 2000 0n);`int$(1 1 1 0n);`int$(1 2 3 0n);"j"$(1 1 1 0n);0010b;`int$(0 0 0 0n);`int$(0 0 0 0n);`int$(0 0 0 0n)) +.ml.timeSplit[symtf;::]~symtf +.ml.timeSplit[symti;::]~symti +.ml.timeSplit[symtb;::]~symtb diff --git a/util/tests/utiltst.t b/util/tests/utiltst.t index a64a44d6..4482721b 100644 --- a/util/tests/utiltst.t +++ b/util/tests/utiltst.t @@ -2,7 +2,6 @@ \l util/init.q np:.p.import[`numpy] - p)import pandas as pd p)import numpy as np p)import datetime @@ -19,6 +18,13 @@ dt1:2019.01.01D01:30:00.000000000 2019.01.02D01:30:00.000000000 plaintab:([]4 5 6.;1 2 3.;-1 -2 -3.;0.4 0.5 0.6) xm:100 10#1000?100f 
+x:1000?1000 +xf:1000?100f + +.ml.range[til 63] ~ 62 +.ml.range[5] ~ 0 +.ml.range[0 1 3 2f]~3f +.ml.range[0 1 0n 2]~2f df :.ml.tab2df tt:([]fcol:12?1.;jcol:12?100;scol:12?`aaa`bbb`ccc) dfj:.ml.tab2df tj:select by jcol from tt @@ -45,9 +51,9 @@ tt2:([]date:2005.07.14 2005.07.15;timesp:("N"$"12:10:30.000500000";"N"$"12:13:30 .ml.arange[2.5;50.2;0.2] ~ np[`:arange][2.5;50.2;0.2]` .ml.arange[2f;10f;1f]~2 3 4 5 6 7 8 9f -.ml.linspace[1;10;9] ~ np[`:linspace][1;10;9]` -.ml.linspace[-0.2;109;62] ~ np[`:linspace][-0.2;109;62]` -.ml.linspace[-0.2;10.4;20] ~ np[`:linspace][-0.2;10.4;20]` +.ml.linearSpace[1;10;9] ~ np[`:linspace][1;10;9]` +.ml.linearSpace[-0.2;109;62] ~ np[`:linspace][-0.2;109;62]` +.ml.linearSpace[-0.2;10.4;20] ~ np[`:linspace][-0.2;10.4;20]` .ml.eye[3] ~ "f"$(1 0 0;0 1 0;0 0 1) first[.ml.eye[1]] ~ enlist 1f @@ -57,10 +63,10 @@ first[.ml.eye[1]] ~ enlist 1f .ml.df2tab[t]~([]fcol:0.1*1+til 5;jcol:10*1+til 5) .ml.df2tab[t2]~([]fcol:5#(::);jcol:10101b) -.ml.df2tab_tz[t3;0b;1b]~([]date:2005.07.14 2005.07.15;time:("N"$"12:10:30.000500000";"N"$"12:13:30.000200000");str:enlist each ("h";"i");ind:1.3 2.5;bool:10b) -.ml.df2tab_tz[t4;0b;1b]~([]bool:10b;date:"p"$(2005.02.25;2015.12.22);timed:(neg "N"$"05:00:00";"N"$"00:16:40")) -.ml.df2tab_tz[t5;1b;0b]~([]dt:dt1;dt_with_tz:dt1) -.ml.df2tab_tz[t5;0b;0b]~([]dt:dt1;dt_with_tz:dt1-"T"$"01:00:00") +.ml.df2tabTimezone[t3;0b;1b]~([]date:2005.07.14 2005.07.15;time:("N"$"12:10:30.000500000";"N"$"12:13:30.000200000");str:enlist each ("h";"i");ind:1.3 2.5;bool:10b) +.ml.df2tabTimezone[t4;0b;1b]~([]bool:10b;date:"p"$(2005.02.25;2015.12.22);timed:(neg "N"$"05:00:00";"N"$"00:16:40")) +.ml.df2tabTimezone[t5;1b;0b]~([]dt:dt1;dt_with_tz:dt1) +.ml.df2tabTimezone[t5;0b;0b]~([]dt:dt1;dt_with_tz:dt1-"T"$"01:00:00") tt~update`$scol from .ml.df2tab df tj~update`$scol from .ml.df2tab dfj @@ -71,10 +77,8 @@ tx~update`$scol from`scol`jcol xcol .ml.df2tab dfxj tx~update`$scol from`scol`jcol xcol .ml.df2tab dfxx \S 43 -.ml.traintestsplit[til 
10;1+til 10;0.2]~`xtrain`ytrain`xtest`ytest!(2 3 7 1 6 4 9 5;3 4 8 2 7 5 10 6;0 8;1 9) +.ml.trainTestSplit[til 10;1+til 10;0.2]~`xtrain`ytrain`xtest`ytest!(2 3 7 1 6 4 9 5;3 4 8 2 7 5 10 6;0 8;1 9) \S 43 -.ml.traintestsplit["f"$til 10;1+"f"$til 10;0.2]~`xtrain`ytrain`xtest`ytest!(2 3 7 1 6 4 9 5f;3 4 8 2 7 5 10 6f;0 8f;1 9f) +.ml.trainTestSplit["f"$til 10;1+"f"$til 10;0.2]~`xtrain`ytrain`xtest`ytest!(2 3 7 1 6 4 9 5f;3 4 8 2 7 5 10 6f;0 8f;1 9f) \S 22 -.ml.traintestsplit[1010110011b;1001100011b;0.33]~`xtrain`ytrain`xtest`ytest!(110100b;111100b;1011b;0001b) - - +.ml.trainTestSplit[1010110011b;1001100011b;0.33]~`xtrain`ytrain`xtest`ytest!(110100b;111100b;1011b;0001b) diff --git a/util/util.q b/util/util.q deleted file mode 100644 index 12e98b7f..00000000 --- a/util/util.q +++ /dev/null @@ -1,72 +0,0 @@ -\d .ml - -/ values between x and y in steps of length z -arange:{x+z*til 0|ceiling(y-x)%z} -/ combinations of k elements from 0,1,...,n-1 -combs:{[n;k]flip(k-1){[n;x]j@:i:where 0<>k:n-j:1+last x;(x@\:where k),enlist -1_sums@[(1+sum k i)#1;0,sums k i;:;(j,0)-0,-1+j+k i]}[n]/enlist til n} -/ identity matrix -eye:{@[x#0.;;:;1.]each til x} -/ indexing functions -imax:{x?max x} -imin:{x?min x} -/ z evenly spaced values between x and y -linspace:{x+til[z]*(y-x)%z-1} -/ shape of matrix/table -shape:{-1_count each first scan x} -/ split into train/test sets with sz% in test -traintestsplit:{[x;y;sz]`xtrain`ytrain`xtest`ytest!raze(x;y)@\:/:(0,floor n*1-sz)_neg[n]?n:count x} - -/ q vector to numpy datetime -i.q2npdt:{.p.import[`numpy;`:array;("p"$@[4#+["d"$0];-16+type x]x)-"p"$1970.01m;"datetime64[ns]"]`.} -/ q tab to pandas dataframe -tab2df:{ - updx:@[flip 0!x;i.fndcols[x;"c"];enlist each]; - r:.p.import[`pandas;`:DataFrame;@[updx;i.fndcols[x]"pmdznuvt";i.q2npdt]][@;cols x]; - $[count k:keys x;r[`:set_index]k;r]} -/ pandas dataframe to q tab -df2tab_tz:{ - n:$[enlist[::]~x[`:index.names]`;0;x[`:index.nlevels]`]; - c:`$(x:$[n;x[`:reset_index][];x])[`:columns.to_numpy][]`; - 
d:x[`:select_dtypes][pykwargs enlist[`exclude]!enlist`float32`datetime`datetimetz`timedelta][`:to_dict;`list]`; - d,:dt_convert x[`:select_dtypes][`include pykw`datetime]; - d,:dt_dict[x[`:select_dtypes][`include pykw`timedelta]]+"n"$0; - d,:tz_convert[;y]x[`:select_dtypes][`include pykw`datetimetz]; - d,:float32_convert[;y]x[`:select_dtypes][`include pykw`float32][`:to_dict;`list]`; - / check if the first value in columns are foreign - if[0x;"F"$string x;0.000001*floor 0.5+x*1000000]]} -/ Convert time zone data (0b -> UTC time; 1b -> local time) -tz_convert:{$[y~0b;dt_convert;{"P"$neg[6]_/:'x[`:astype;`str][`:to_dict;<;`list]}]x} -/ Convert datetime/datetimetz to timestamp -dt_convert:{ - $[count nulCols:where any each x[`:isnull;::][`:to_dict;<;`list]; - [c:`$x[`:columns.to_numpy][]`; - null_data:"P"$x[`:drop;c except nulCols;`axis pykw 1][`:astype;`str][`:to_dict;<;`list]; - non_null_data:dt_dict x[`:drop;nulCols;`axis pykw 1]; - null_data,non_null_data+1970.01.01D0]; - dt_dict[x]+1970.01.01D0]} -/ Convert data to integer representation and return as a dict -dt_dict:{x[`:astype;`int64][`:to_dict;<;`list]} -/ Convert datetime.date/time types to kdb+ date/time -date_time_convert:{ - $[y~0b;x; - [ fval:.p.wrap first x; - / convert datetime.time/date to iso string format and convert to kdb+ - / otherwise return foreign - $[i.isinstance[fval;i.dt`:time];{"N"$.p.wrap[x][`:isoformat][]`}each x; - i.isinstance[fval;i.dt`:date];{"D"$.p.wrap[x][`:isoformat][]`}each x; - x]]]} -/ function defaults to return UTC timezone(y) and non converted date/times(z) -df2tab:df2tab_tz[;0b;0b] - -/ apply to list, mixed list, dictionary, table, keyed table -i.ap:{$[0=type y;x each y;98=type y;flip x each flip y;99<>type y;x y;98=type key y;key[y]!.z.s value y;x each y]} -/ find columns of x with type in y -i.fndcols:{m[`c]where(m:0!meta x)[`t]in y} -/ required python utilities for df2tab -i.isinstance:.p.import[`builtins][`:isinstance;<] -i.dt :.p.import[`datetime] diff --git 
// @kind function
// @category utilities
// @desc Create an identity matrix
// @param n {int} Width/height of the identity matrix
// @return {float[][]} n rows of n floats, with 1f on the diagonal and 0f
//   everywhere else
eye:{[n]
  // Row `pos` is a zero vector of length n with 1f amended in at `pos`
  unitRow:{[width;pos]@[width#0f;pos;:;1f]}[n];
  unitRow each til n
  }
// @kind function
// @category utilities
// @desc Create an array of evenly-spaced values
// @param start {number} Start of the interval (inclusive)
// @param end {number} End of the interval (inclusive when n>1)
// @param n {int} How many values are to be created
// @return {number[]} A vector of `n` evenly-spaced values running from
//   start to end; when n is 1 only `start` is returned
linearSpace:{[start;end;n]
  // There are n-1 gaps between n points. Flooring the gap count at 1 keeps
  // the n=1 case from dividing by zero, which previously produced a null
  // instead of `start`; results for n>1 are unchanged
  start+til[n]*(end-start)%1|n-1
  }
timezone objects are to be converted +// to local time (1b) or UTC (0b) +// @param qObj {boolean} Indicates if python datetime.date/datetime.time +// objects are returned as q (1b) or foreign objects (0b) +// @return {<} a q table +df2tabTimezone:{[tab;local;qObj] + index:$[enlist[::]~tab[`:index.names]`;0;tab[`:index.nlevels]`]; + tab:$[index;tab[`:reset_index][];tab]; + numpyCols:`$tab[`:columns.to_numpy][]`; + dataArgs:enlist[`exclude]!enlist`float32`datetime`datetimetz`timedelta; + dict:tab[`:select_dtypes][pykwargs dataArgs][`:to_dict;`list]`; + dateTimeData:tab[`:select_dtypes][`include pykw`datetime]; + dict,:i.dateConvert dateTimeData; + timeDeltaData:tab[`:select_dtypes][`include pykw`timedelta]; + dict,:i.dateDict[timeDeltaData]+"n"$0; + timezoneData:tab[`:select_dtypes][`include pykw`datetimetz]; + dict,:i.timezoneConvert[timezoneData;local]; + float32Data:tab[`:select_dtypes][`include pykw`float32][`:to_dict;`list]`; + dict,:i.float32Convert[float32Data;local]; + // Check if the first value in columns are foreign + foreign:where 112h=type each first each value dict; + if[0k:n-j:1+last vals; + sumVals:-1_sums@[(1+sum k i)#1;0,sums k i;:;(j,0)-0,-1+j+k i]; + (vals@\:where k),enlist sumVals + } + +// @private +// @kind function +// @category utilitiesUtility +// @desc Transform q object to numpy date +// @param date {date} q datetime object +// @return {<} Numpy datetime object +i.q2npDate:{[date] + dateConvert:("p"$@[4#+["d"$0];-16+type date]date)-"p"$1970.01m; + .p.import[`numpy;`:array;dateConvert;"datetime64[ns]"]`. 
// @private
// @kind function
// @category utilitiesUtility
// @desc Convert datetime/datetimetz objects to kdb timestamp
// @param dataFrame {<} Pandas dataFrame containing datetime data
// @return {dictionary} Datetime objects are converted to timestamps in kdb
i.dateConvert:{[dataFrame]
  // Names of the columns that contain at least one null value
  nullCols:where any each dataFrame[`:isnull;::][`:to_dict;<;`list];
  $[count nullCols;
    [npCols:`$dataFrame[`:columns.to_numpy][]`;
     // Keep only the null-containing columns and parse them from their
     // string form with "P"$
     dropCols:dataFrame[`:drop;npCols except nullCols;`axis pykw 1];
     nullData:"P"$dropCols[`:astype;`str][`:to_dict;<;`list];
     // Remaining columns go through the int64 representation and are offset
     // from 1970.01.01D0 (presumably nanoseconds since the Unix epoch -
     // confirm against the pandas dtype in use)
     nonNullData:i.dateDict dataFrame[`:drop;nullCols;`axis pykw 1];
     nullData,nonNullData+1970.01.01D0
     ];
    i.dateDict[dataFrame]+1970.01.01D0
    ]
  }
// @private
// @kind function
// @category utilitiesUtility
// @desc Map a function over data, choosing how it is applied from the
//   q type of the data
// @param func {fn} Function to apply
// @param data {any} Generic list, table, keyed table, dictionary or any
//   other q object
// @return {any} func applied per item for generic lists and dictionaries,
//   per column for tables, to the value of a keyed table (keys retained),
//   and directly to the data otherwise
i.ap:{[func;data]
  dataType:type data;
  // Generic list: apply to each item
  if[0=dataType;:func each data];
  // Table: apply to each column and rebuild the table
  if[98=dataType;:flip func each flip data];
  // Anything that is not a dictionary: apply directly
  if[99<>dataType;:func data];
  // Keyed table: recurse on the value table and reattach the keys
  if[98=type key data;:key[data]!.z.s[func;value data]];
  // Plain dictionary: apply to each value
  func each data
  }
// @private
// @kind function
// @category utilitiesUtility
// @desc Apply a function to data, choosing how it is applied from the
//   q type of the data
// @param func {fn} Function to apply to data
// @param data {any} Data of various types
// @return {any} func applied to each row of a table, to the value of a
//   keyed table (keys retained), and to the data as a whole otherwise
i.apUpd:{[func;data]
  $[0=type data;
      func data;
    98=type data;
      func each data;
    99<>type data;
      func data;
    98=type key data;
      // func must be passed on when recursing: `.z.s value data` only
      // builds a projection of this two-argument function
      key[data]!.z.s[func] value data;
    func data
    ]
  }
// @private
// @kind function
// @category metricUtility
// @desc Calculate the correlation matrix of a matrix
// @param matrix {number[]} A sample from a distribution
// @returns {number[]} The correlation matrix: the covariance matrix with
//   each entry divided by the product of the two corresponding standard
//   deviations
i.corrMatrix:{[matrix]
  stdDevs:dev each matrix;
  // Table of pairwise products of the standard deviations
  devProducts:stdDevs*/:stdDevs;
  covMatrix[matrix]%devProducts
  }
// @private
// @kind function
// @category preprocessingUtility
// @desc Fill nulls forward. Nulls that precede the first non-null value are
//   filled with that first non-null value, and the result is cast to float
// @param data {table|number[]} Numerical data
// @return {table|number[]} Data with nulls filled, as floats
i.fillMap.forward:{[data]
  // First non-null value, used for any nulls left at the head after `fills`
  firstVal:data first where not null data;
  forwardFilled:fills data;
  "f"$firstVal^forwardFilled
  }
// @private
// @kind function
// @category hyperparameterUtility
// @desc Random/sobol hyperparameter generation for .ml.rs
// @param randomType {symbol} Type of random search, passed through to the
//   uniform/logUniform generators
// @param n {long} Number of hyperparameter sets
// @param params {any} Parameter type tag (`boolean, `rand, `symbol,
//   `uniform or `loguniform), optionally followed by that type's arguments
// @return {any} Hyperparameters
hp.i.hpGen:{[randomType;n;params]
  // Split into (type tag;remaining arguments)
  parts:@[;0;first](0;1)_params,();
  typ:parts 0;
  args:parts 1;
  // Dispatch on the type tag
  if[typ~`boolean;:n?0b];
  if[typ in`rand`symbol;:n?(),args 0];
  if[typ~`uniform;:hp.i.uniform[randomType]. args];
  if[typ~`loguniform;:hp.i.logUniform[randomType]. args];
  '"please enter a valid type"
  }