From 9661b5712747e091e11ecb080869a5ca16298830 Mon Sep 17 00:00:00 2001 From: Shafaq Siddiqi Date: Wed, 9 Aug 2023 12:22:35 +0200 Subject: [PATCH] [MINOR] Adding decision tree to cleaning pipelines evaluation --- scripts/builtin/abstain.dml | 2 +- scripts/builtin/apply_pipeline.dml | 4 +- scripts/builtin/bandit.dml | 28 +++-- scripts/builtin/correctTyposApply.dml | 1 + scripts/builtin/executePipeline.dml | 21 ++-- scripts/builtin/fit_pipeline.dml | 19 +-- scripts/builtin/frameSort.dml | 2 +- scripts/builtin/mice.dml | 2 +- scripts/builtin/miceApply.dml | 2 +- scripts/builtin/topk_cleaning.dml | 116 ++++++++++-------- scripts/builtin/vectorToCsv.dml | 4 +- .../pipelines/scripts/enumerateLogical.dml | 15 +-- scripts/pipelines/scripts/utils.dml | 16 +-- .../frame/data/columns/StringArray.java | 18 ++- .../sysds/runtime/util/PorterStemmer.java | 54 ++------ .../BuiltinTopkCleaningRegressionTest.java | 2 +- src/test/resources/datasets/Salaries.json | 11 ++ .../pipelines/executePipelineTest.dml | 2 +- .../functions/pipelines/fit_pipelineTest.dml | 4 +- .../classification/applyFunc.csv | 6 +- .../classification/applyFunc.csv.mtd | 4 +- .../intermediates/classification/bestAcc.csv | 6 +- .../classification/bestAcc.csv.mtd | 4 +- .../classification/dirtyScore.csv | 2 +- .../classification/dirtyScore.csv.mtd | 4 +- .../classification/evalHp.csv.mtd | 4 +- .../intermediates/classification/hp.csv | 6 +- .../intermediates/classification/hp.csv.mtd | 8 +- .../intermediates/classification/pip.csv | 6 +- .../intermediates/classification/pip.csv.mtd | 4 +- .../functions/pipelines/topkLogicalTest.dml | 6 +- .../topkcleaningClassificationTest.dml | 46 ++++++- .../pipelines/topkcleaningRegressionTest.dml | 89 +++++++++++--- 33 files changed, 308 insertions(+), 210 deletions(-) create mode 100644 src/test/resources/datasets/Salaries.json diff --git a/scripts/builtin/abstain.dml b/scripts/builtin/abstain.dml index 6e0cb516343..17e0fea2eb2 100644 --- a/scripts/builtin/abstain.dml +++ 
b/scripts/builtin/abstain.dml @@ -43,7 +43,7 @@ return (Matrix[Double] Xout, Matrix[Double] Yout) Yout = Y if(min(Y) != max(Y) & max(Y) <= 2) { - betas = multiLogReg(X=X, Y=Y, icpt=1, reg=1e-4, maxi=100, maxii=0, verbose=verbose) + betas = multiLogReg(X=X, Y=Y, icpt=1, reg=1e-4, maxi=50, maxii=0, verbose=verbose) [prob, yhat, accuracy] = multiLogRegPredict(X, betas, Y, FALSE) inc = ((yhat != Y) & (rowMaxs(prob) > threshold)) diff --git a/scripts/builtin/apply_pipeline.dml b/scripts/builtin/apply_pipeline.dml index b9f660a30d2..b6a114b1811 100644 --- a/scripts/builtin/apply_pipeline.dml +++ b/scripts/builtin/apply_pipeline.dml @@ -52,7 +52,7 @@ s_apply_pipeline = function(Frame[Unknown] testData, Frame[Unknown] metaData = a [schema, mask, fdMask, maskY] = topk::prepareMeta(testData, metaData) pip = removeEmpty(target=pip, margin="cols") applyFunc = removeEmpty(target=applyFunc, margin="cols") - metaList = list(mask=mask, schema=schema, fd=fdMask, applyFunc=as.frame("NULL")) + metaList = list(mask=mask, schema=schema, fd=fdMask, applyFunc=as.frame("NULL"), tfspec=as.scalar(exState[3])) ctx = list(prefix="----"); #TODO include seed # separate the label [Xtest, Ytest] = topk::getLabel(testData, isLastLabel) @@ -75,7 +75,7 @@ s_apply_pipeline = function(Frame[Unknown] testData, Frame[Unknown] metaData = a M = as.frame(exState[2]) if(sum(mask) > 0) { - index = vectorToCsv(mask) + index = vectorToCsv(mask, ncol(mask)) jspecR = "{ids:true, recode:["+index+"]}" eXtest = transformapply(target=Xtest, spec=jspecR, meta=M); } diff --git a/scripts/builtin/bandit.dml b/scripts/builtin/bandit.dml index 3699eb0c6dd..a10ac8324ac 100644 --- a/scripts/builtin/bandit.dml +++ b/scripts/builtin/bandit.dml @@ -57,7 +57,8 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl totalPruneCount = 0 FLAG_VARIABLE = 5 pipelines_executed = 0 - HYPERPARAM_LENGTH = ((ncol(lp) + 2) * FLAG_VARIABLE * 3) + 1 ## num of col in logical * 5 meat flag vars * max hyperparam per 
op + 1 accuracy col + maxValueInParam = max(as.matrix(param[, 3])) + HYPERPARAM_LENGTH = ((ncol(lp) + 2) * FLAG_VARIABLE * maxValueInParam) + 1 ## num of col in logical * 5 meat flag vars * max hyperparam per op + 1 accuracy col bestPipeline = frame("", rows=1, cols=1) bestHyperparams = as.matrix(0) bestAccuracy = as.matrix(0) @@ -111,7 +112,7 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl configurations = configurations[1:n_i, ] pipelines_executed = pipelines_executed + (n_i * r_i) [outPip,outHp, pruneCount] = run_with_hyperparam(ph_pip=configurations, r_i=r_i, X=X_train, Y=Y_train, Xtest=X_test, Ytest=Y_test, metaList=metaList, - evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, param=param, cv=cv, cvk=cvk, ref=ref, seed = seed, enablePruning=enablePruning) + evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, param=param, maxOpCount=maxValueInParam, cv=cv, cvk=cvk, ref=ref, seed = seed, enablePruning=enablePruning) totalPruneCount = totalPruneCount + pruneCount # sort the pipelines by order of accuracy decreasing IX = order(target = outPip, by = 1, decreasing=TRUE, index.return=TRUE) @@ -214,19 +215,21 @@ get_physical_configurations = function(Frame[String] logical, Scalar[int] numCon # # this method will call the execute pipelines with their hyper-parameters run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i = 1, Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, List[Unknown] metaList, String evaluationFunc, Matrix[Double] evalFunHp, - Frame[Unknown] param, Boolean cv = FALSE, Integer cvk = 2, Double ref = 0, Integer seed = -1, Boolean enablePruning = FALSE, Boolean default = FALSE) + Frame[Unknown] param, Boolean cv = FALSE, Integer maxOpCount=3, Integer cvk = 2, Double ref = 0, Integer seed = -1, Boolean enablePruning = FALSE, Boolean default = FALSE) return (Matrix[Double] output_operator, Matrix[Double] output_hyperparam, Integer pruneCount, Matrix[Double] 
changesByPipMatrix) { # # # TODO there is a partial overlap but it is negligible so we will not rewrite the scripts but lineage based reuse will get rid of it + tfspec=as.scalar(metaList["tfspec"]) + mask=as.matrix(metaList["mask"]) changesByPipMatrix = matrix(0, rows=nrow(ph_pip) * r_i, cols=1) pruneCount = 0 - output_hp = matrix(0, nrow(ph_pip)*r_i, (ncol(ph_pip)) * 5 * 3) + output_hp = matrix(0, nrow(ph_pip)*r_i, (ncol(ph_pip)) * 5 * maxOpCount) output_accuracy = matrix(0, nrow(ph_pip)*r_i, 1) output_pipelines = matrix(0, nrow(ph_pip)*r_i, 3) # rows in validation set ids = as.matrix(ph_pip[, 1:2]) ph_pip = ph_pip[, 3:ncol(ph_pip)] - inputHpMatrix = matrix(0, nrow(ph_pip)*r_i, (ncol(ph_pip)) * 5 * 3 + 1) + inputHpMatrix = matrix(0, nrow(ph_pip)*r_i, (ncol(ph_pip)) * 5 * maxOpCount + 1) # prepare the pipelines and resources allPipelines = frame(0, rows = nrow(ph_pip) * r_i, cols=ncol(ph_pip)) allApplyFunctions = frame(0, rows = nrow(ph_pip) * r_i, cols=ncol(ph_pip)) @@ -286,7 +289,7 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i = 1, Matrix[Do hp = hp[, 2:totalVals] applyFunctions = allApplyFunctions[i] no_of_res = nrow(hp) - # print("PIPELINE EXECUTION START ... "+toString(op)) + print("PIPELINE EXECUTION START ... 
"+toString(op)) hpForPruning = matrix(0, rows=1, cols=ncol(op)) changesByOp = matrix(0, rows=1, cols=ncol(op)) metaList2 = metaList; #ensure metaList is no result var @@ -317,7 +320,7 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i = 1, Matrix[Do else if(changesByPip < ref) print("prunningAlert 2: not training the model due to minimum changes") else - evalFunOutput = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp)) + evalFunOutput = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=mask, evalFunHp=evalFunHp, tfspec=tfspec)) accuracy = as.scalar(evalFunOutput[1, 1]) } @@ -506,12 +509,13 @@ crossV = function(Matrix[double] X, Matrix[double] y, Integer cvk, Matrix[Double Matrix[Double] hpForPruning = as.matrix(0), Matrix[Double] changesByOp = as.matrix(0), String evalFunc, Double ref = 0) return (Double accuracy, Matrix[Double] evalFunHp, Matrix[Double] hpForPruning, Matrix[Double] changesByOp, Double allChanges) { - + tfspec = as.scalar(metaList['tfspec']) # # in the below condition we compute the hp using cv method on train dataset - if(is.na(as.scalar(evalFunHp[1,1]))) { - forEvalHp = eval(evalFunc, list(X=X, Y=y, Xtest=X, Ytest=y, Xorig=as.matrix(0), evalFunHp=evalFunHp)) + if(is.na(as.scalar(evalFunHp[1,1])) & tfspec=="NA") { + forEvalHp = eval(evalFunc, list(X=X, Y=y, Xtest=X, Ytest=y, Xorig=as.matrix(0), evalFunHp=evalFunHp, tfspec=tfspec)) evalFunHp = forEvalHp[1, 2:ncol(forEvalHp)] } + mask = as.matrix(metaList['mask']) changesByPip = 0 cvChanges = matrix(0, rows=cvk, cols=ncol(changesByOp)) accuracyMatrix = matrix(0, cvk, 1) @@ -547,9 +551,9 @@ return (Double accuracy, Matrix[Double] evalFunHp, Matrix[Double] hpForPruning, allChanges[i] = changesByPip } if(changesByPip < ref) - print("prunning alert 2: no training the model due to minimum changes") + print("pruning alert 2: no training the model due to minimum changes") else { - res = 
eval(evalFunc, list(X=trainX, Y=trainy, Xtest=testX, Ytest=testy, Xorig=as.matrix(0), evalFunHp=evalFunHp)) + res = eval(evalFunc, list(X=trainX, Y=trainy, Xtest=testX, Ytest=testy, Xorig=mask , evalFunHp=evalFunHp, tfspec=tfspec)) accuracyMatrix[i] = res[1, 1] } diff --git a/scripts/builtin/correctTyposApply.dml b/scripts/builtin/correctTyposApply.dml index addc75940ea..0482f46cc21 100644 --- a/scripts/builtin/correctTyposApply.dml +++ b/scripts/builtin/correctTyposApply.dml @@ -91,3 +91,4 @@ replaceStrings1 = function(String replacement, String to_replace, Frame[String] { strings = map(strings, "s -> s.equals(\""+to_replace+"\") ? \""+replacement+"\" : s"); } + diff --git a/scripts/builtin/executePipeline.dml b/scripts/builtin/executePipeline.dml index b42a49bd0e5..ba44c02862a 100644 --- a/scripts/builtin/executePipeline.dml +++ b/scripts/builtin/executePipeline.dml @@ -101,7 +101,6 @@ s_executePipeline = function(Frame[String] pipeline, Matrix[Double] Xtrain, Mat else { print("not applying operation executeFlag = 0") } - if(ncol(Xtest) == d & nrow(Xtest) == nrow(XtestClone) & ncol(hpForPruning) > 1) { changesSingle = sum(abs(replace(target=Xtest, pattern=NaN, replacement=0) - replace(target=XtestClone, pattern=NaN, replacement=0)) > 0.001 ) changesAll = sum(abs(replace(target=Xtest, pattern=NaN, replacement=0) - replace(target=Xorig, pattern=NaN, replacement=0)) > 0.001 ) @@ -204,7 +203,7 @@ return (Matrix[Double] X) # X without numerics Xcat = removeEmpty(target=originalX, margin="cols", select=mask) nanMask = is.na(Xcat) - Xcat = replace(target = Xcat, pattern = NaN, replacement = -1111) + Xcat = abs(round(replace(target = Xcat, pattern = NaN, replacement = 4444))) # reconstruct the original matrix p = table(seq(1, ncol(nX)), removeEmpty(target=seq(1, ncol(mask)), margin="rows", @@ -214,16 +213,16 @@ return (Matrix[Double] X) X = (nX %*% p) + (Xcat %*% q) X = replace(target = X, pattern = maxDummy, replacement = NaN) - X = replace(target = X, pattern = -1111, 
replacement = NaN) + X = replace(target = X, pattern = 4444, replacement = NaN) } else if(dataFlag == 1 & (sum(mask) > 0) & (sum(mask) != ncol(originalX))) { - maxDummy = max(replace(target=nX, pattern=NaN, replacement=0)) + 1 - nX = replace(target = nX, pattern = NaN, replacement = maxDummy) + maxDummy = abs(round(max(replace(target=nX, pattern=NaN, replacement=0)) + 1)) + nX = abs(round(replace(target = nX, pattern = NaN, replacement = maxDummy))) # X without categorical Xnum = removeEmpty(target=originalX, margin="cols", select=(mask==0)) nanMask = is.na(Xnum) - Xnum = replace(target = Xnum, pattern = NaN, replacement = -1111) + Xnum = replace(target = Xnum, pattern = NaN, replacement = 4444) # reconstruct the original matrix p = table(seq(1, ncol(Xnum)), removeEmpty(target=seq(1, ncol(mask)), margin="rows", select=t(mask==0)), ncol(Xnum), ncol(originalX)) @@ -231,7 +230,7 @@ return (Matrix[Double] X) select=t(mask)), ncol(nX), ncol(originalX)) X = (nX %*% q) + (Xnum %*% p) X = replace(target = X, pattern = maxDummy, replacement = NaN) - X = replace(target = X, pattern = -1111, replacement = NaN) + X = replace(target = X, pattern = 4444, replacement = NaN) } else X = nX @@ -247,14 +246,14 @@ return (Matrix[Double] X) ####################################################################### dummycoding = function(Matrix[Double] X, Matrix[Double] mask) -return (Matrix[Double] X, String jspec, Frame[Unknown] meta) { - +return (Matrix[Double] X, String jspec, Frame[Unknown] meta) { + meta = as.frame("NULL") jspec = "" if(sum(mask) > 0) { X = replace(target=X, pattern=NaN, replacement=0) - idx = vectorToCsv(mask) + idx = vectorToCsv(mask, ncol(X)) # specifications for one-hot encoding of categorical features jspec = "{ids:true, dummycode:["+idx+"]}"; # OHE of categorical features @@ -268,6 +267,7 @@ return (Matrix[Double] Y) { if(jspec != "") { + X = replace(target=X, pattern=NaN, replacement=0) Y = transformapply(target=as.frame(X), spec=jspec, meta=meta); } else Y = 
X @@ -286,6 +286,7 @@ return (Matrix[Double] X, Matrix[Double] fillMatrix) if(sum(fdMask) > 0) { t = replace(target=X, pattern=NaN, replacement=1) + t = replace(target=t, pattern=0, replacement=1) fdMask = removeEmpty(target=fdMask, margin="cols") FD = discoverFD(X=t, Mask=fdMask, threshold=threshold) FD = (diag(matrix(1, rows=nrow(FD), cols=1)) ==0) * FD diff --git a/scripts/builtin/fit_pipeline.dml b/scripts/builtin/fit_pipeline.dml index e31bf656765..f9095d1d607 100644 --- a/scripts/builtin/fit_pipeline.dml +++ b/scripts/builtin/fit_pipeline.dml @@ -48,7 +48,7 @@ source("scripts/builtin/bandit.dml") as bandit; s_fit_pipeline = function(Frame[Unknown] trainData, Frame[Unknown] testData, Frame[Unknown] metaData = as.frame("NULL"), Frame[Unknown] pip, Frame[Unknown] applyFunc, Matrix[Double] hp, Integer cvk=3, String evaluationFunc, Matrix[Double] evalFunHp, - Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE) + Boolean isLastLabel = TRUE, String tfspec="NA", Boolean OHE=TRUE, Boolean correctTypos=FALSE) return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTest, List[Unknown] externalState, List[Unknown] iState) { externalState = list() @@ -57,7 +57,7 @@ return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTe pip = removeEmpty(target=pip, margin="cols") applyFunc = removeEmpty(target=applyFunc, margin="cols") - metaList = list(mask=mask, schema=schema, fd=fdMask, applyFunc=as.frame("NULL")) + metaList = list(mask=mask, schema=schema, fd=fdMask, applyFunc=as.frame("NULL"), tfspec=tfspec) ctx = list(prefix="----"); #TODO include seed # separate the label [Xtrain, Ytrain] = topk::getLabel(trainData, isLastLabel) @@ -65,6 +65,9 @@ return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTe # always recode the label if(maskY == 1) { + sc = detectSchema(Ytrain) + Ytrain = applySchema(Ytrain, sc) + Ytest = applySchema(Ytest, sc) [eYtrain, M] = transformencode(target=Ytrain, spec= "{ids:true, 
recode:[1]}"); eYtest = transformapply(target=Ytest, spec= "{ids:true, recode:[1]}", meta=M); externalState = append(externalState, M) @@ -77,12 +80,13 @@ return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTe # # # when the evaluation function is called first we also compute and keep hyperparams of target application ctx = list(prefix="evaluate Pipeline") dirtyScore = topk::getDirtyScore(X=Xtrain, Y=eYtrain, Xtest=Xtest, Ytest=eYtest, metaList=metaList, - evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, ctx=ctx) + evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, OHE=OHE, ctx=ctx) [Xtrain, Xtest] = topk::runStringPipeline(Xtrain, Xtest, schema, mask, FALSE, correctTypos, ctx) # # # if mask has 1s then there are categorical features [eXtrain, eXtest, M1] = topk::recodeData(Xtrain, Xtest, mask, FALSE, "recode") externalState = append(externalState, M1) + externalState = append(externalState, tfspec) # # # do the early dropping # [eXtrain, eXtest, metaList] = topk::featureDrop(eXtrain, eXtest, metaList, FALSE) metaList["applyFunc"] = applyFunc @@ -94,25 +98,22 @@ return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTe [trainScore, evalFunHp] = bandit::crossV(X=eXtrain, y=eYtrain, cvk=cvk, evalFunHp=evalFunHp, pipList=pipList, metaList=metaList, evalFunc=evaluationFunc) - print("train score cv: "+toString(trainScore)) - # # # now test accuracy [eXtrain, eYtrain, eXtest, eYtest, a, b, c, d, iState] = executePipeline(pipeline=pip, Xtrain=eXtrain, Ytrain=eYtrain, Xtest=eXtest, Ytest=eYtest, metaList=metaList, hyperParameters=hp_matrix, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE) - + if(max(eYtrain) == min(eYtrain)) stop("Y contains only one class") # score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtrain, Ytest=eYtrain, Xorig=as.matrix(0), evalFunHp=evalFunHp)) # trainAccuracy = as.scalar(score[1, 1]) - - score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, 
Xorig=as.matrix(0), evalFunHp=evalFunHp)) + score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=mask, evalFunHp=evalFunHp, tfspec=tfspec)) testAccuracy = as.scalar(score[1, 1]) scores = matrix(0, rows=1, cols=3) scores[1, 1] = dirtyScore - # scores[1, 2] = trainAccuracy + scores[1, 2] = trainScore scores[1, 3] = testAccuracy cleanTrain = cbind(eXtrain, eYtrain) cleanTest = cbind(eXtest, eYtest) diff --git a/scripts/builtin/frameSort.dml b/scripts/builtin/frameSort.dml index fa85a28946b..5da8b57ae81 100644 --- a/scripts/builtin/frameSort.dml +++ b/scripts/builtin/frameSort.dml @@ -36,7 +36,7 @@ s_frameSort = function(Frame[String] F, Matrix[Double] mask, Boolean orderDesc = TRUE) return (Frame[String] f_ordered) { - index = vectorToCsv(mask) + index = vectorToCsv(mask, ncol(F)) # recode logical pipelines for easy handling jspecR = "{ids:true, recode:["+index+"]}"; [X, M] = transformencode(target=F, spec=jspecR); diff --git a/scripts/builtin/mice.dml b/scripts/builtin/mice.dml index 8d7b1af69e3..18b27c34419 100644 --- a/scripts/builtin/mice.dml +++ b/scripts/builtin/mice.dml @@ -72,7 +72,7 @@ m_mice= function(Matrix[Double] X, Matrix[Double] cMask, Integer iter = 3, d = ncol(X1) n = nrow(X1) # compute index of categorical features - index = vectorToCsv(cMask) + index = vectorToCsv(cMask, ncol(cMask)) # specifications for one-hot encoding of categorical features jspecDC = "{ids:true, dummycode:["+index+"]}"; [dX, dM] = transformencode(target=as.frame(X1), spec=jspecDC); diff --git a/scripts/builtin/miceApply.dml b/scripts/builtin/miceApply.dml index 448310ef3ca..16ab856c7a0 100644 --- a/scripts/builtin/miceApply.dml +++ b/scripts/builtin/miceApply.dml @@ -72,7 +72,7 @@ m_miceApply = function(Matrix[Double] X, Matrix[Double] meta, Double threshold, n = nrow(X1) # compute index of categorical features - index = vectorToCsv(mask) + index = vectorToCsv(mask, ncol(mask)) # specifications for one-hot encoding of categorical features 
jspecDC = "{ids:true, dummycode:["+index+"]}"; diff --git a/scripts/builtin/topk_cleaning.dml b/scripts/builtin/topk_cleaning.dml index ed5a00572e8..3cc13d31ce4 100644 --- a/scripts/builtin/topk_cleaning.dml +++ b/scripts/builtin/topk_cleaning.dml @@ -28,9 +28,9 @@ source("scripts/builtin/bandit.dml") as bandit; s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = as.frame("NULL"), Frame[Unknown] metaData = as.frame("NULL"), Frame[Unknown] primitives, Frame[Unknown] parameters, Frame[String] refSol = as.frame("NaN"), String evaluationFunc, Matrix[Double] evalFunHp, Integer topK = 5, Integer resource_val = 20, - Integer max_iter = 10, Double lq = 0.1, Double uq=0.7, Double sample = 1.0, Double expectedIncrease=1.0, Integer seed = -1, Boolean cv=TRUE, Integer cvk = 2, - Boolean isLastLabel = TRUE, Integer rowCount = 3700, - Boolean correctTypos=FALSE, Boolean enablePruning = FALSE) + Integer max_iter = 10, Double lq = 0.1, Double uq=0.7, Double sample = 1.0, Double expectedIncrease=0.8, Integer seed = -1, Boolean cv=TRUE, Integer cvk = 2, + Boolean isLastLabel = TRUE, Integer rowCount = 3700, String tfspec="NA", + Boolean correctTypos=FALSE, Boolean enablePruning = FALSE, Boolean OHE = TRUE) return (Frame[Unknown] topKPipelines, Matrix[Double] topKHyperParams, Matrix[Double] topKScores, Double dirtyScore, Matrix[Double] evalFunHp, Frame[Unknown] applyFunc) { @@ -43,7 +43,7 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a # prepare meta data # # keeping the meta list format if we decide to add more stuff in metadata [schema, mask, fdMask, maskY] = prepareMeta(dataTrain, metaData) - metaList = list(mask=mask, schema=schema, fd=fdMask, applyFunc=as.frame("null"), distY=0, minFold=0) + metaList = list(mask=mask, schema=schema, fd=fdMask, applyFunc=as.frame("null"), distY=0, tfspec=tfspec) t2 = time(); print("-- Cleaning - Prepare Metadata: "+(t2-t1)/1e9+"s"); # separate the label @@ -53,6 +53,10 @@ 
s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a # always recode the label if(maskY == 1) { + # detect and apply the same schema to labels + sc = detectSchema(Ytrain) + Ytrain = applySchema(Ytrain, sc) + Ytest = applySchema(Ytest, sc) [eYtrain, M] = transformencode(target=Ytrain, spec= "{ids:true, recode:[1]}"); eYtest = transformapply(target=Ytest, spec= "{ids:true, recode:[1]}", meta=M); } @@ -64,8 +68,13 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a # # # when the evaluation function is called first we also compute and keep hyperparams of target application print("-- Cleaning - Get Dirty Score: "); - [dirtyScore, evalFunHp] = getDirtyScore(X=Xtrain, Y=eYtrain, Xtest=Xtest, Ytest=eYtest, evaluationFunc=evaluationFunc, - metaList=metaList, cv=cv, cvk=cvk, evalFunHp=evalFunHp, ctx=ctx) + scaledCond = ifelse(expectedIncrease < 1, 0.5, 50) + if(expectedIncrease <= scaledCond) #TODO fix in more general way (to avoid computing dirty score on huge datasets in case it is already provided ) + [dirtyScore, evalFunHp] = getDirtyScore(X=Xtrain, Y=eYtrain, Xtest=Xtest, Ytest=eYtest, evaluationFunc=evaluationFunc, + metaList=metaList, cv=cv, cvk=cvk, evalFunHp=evalFunHp, OHE=OHE, ctx=ctx) + else + dirtyScore = expectedIncrease + print("-- Dirty Score: "+dirtyScore) t4 = time(); print("---- finalized in: "+(t4-t3)/1e9+"s"); # # do the string processing @@ -77,9 +86,8 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a # # # do the early dropping # [eXtrain, eXtest, metaList] = featureDrop(eXtrain, eXtest, metaList, cv) # apply sampling on training data for pipeline enumeration - # TODO why recoding/sampling twice (within getDirtyScore) - print("---- class-stratified sampling of feature matrix w/ f="+sample); - if(nrow(eYtrain) >= rowCount & sample == 1.0 & sum(mask) > ncol(mask)/2) # & + print("---- class-stratified sampling of feature matrix w/ f="+sample+" 
samples="+nrow(eYtrain)); + if(nrow(eYtrain) >= 10000 & sample == 1.0 & (sum(mask) > 0 | OHE == FALSE)) # (sum(mask) > 0 | OHE == FALSE) [eXtrain, eYtrain ] = utils::doErrorSample(eXtrain, eYtrain, lq, uq, rowCount) else [eXtrain, eYtrain] = utils::doSample(eXtrain, eYtrain, sample, mask, metaR, TRUE) @@ -115,7 +123,7 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a [bestLogical, bestHp, con, refChanges, acc] = lg::enumerateLogical(X=eXtrain, y=eYtrain, Xtest=eXtest, ytest=eYtest, initial_population=logical, refSol=refSol, seed = seed, max_iter=max_iter, metaList = metaList, evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, primitives=primitives, param=parameters, - dirtyScore = (dirtyScore + expectedIncrease), cv=cv, cvk=cvk, verbose=TRUE, ctx=ctx) + dirtyScore = (dirtyScore + expectedIncrease), cv=cv, cvk=cvk, verbose=TRUE, OHE=OHE, ctx=ctx) t6 = time(); print("---- finalized in: "+(t6-t5)/1e9+"s"); topKPipelines = as.frame("NULL"); topKHyperParams = matrix(0,0,0); topKScores = matrix(0,0,0); applyFunc = as.frame("NULL") # write(acc, output+"/acc.csv", format="csv") @@ -180,32 +188,35 @@ return(Frame[Unknown] Xtrain, Frame[Unknown] Xtest) } getDirtyScore = function(Frame[Unknown] X, Matrix[Double] Y, Frame[Unknown] Xtest, Matrix[Double] Ytest, String evaluationFunc, List[Unknown] metaList, - Matrix[Double] evalFunHp, Boolean cv = FALSE, Integer cvk = 3, List[Unknown] ctx=list() ) + Matrix[Double] evalFunHp, Boolean cv = FALSE, Integer cvk = 3, Boolean OHE=TRUE, List[Unknown] ctx=list() ) return(Double dirtyScore, Matrix[Double] evalFunHp) { dirtyScore = 100 dschema = detectSchema(X) + tfspec = as.scalar(metaList['tfspec']) dmask = matrix(0, rows=1, cols=ncol(dschema)) + for(i in 1:ncol(dschema)) - if(as.scalar(dschema[1, i]) == "STRING" | as.scalar(dschema[1, i]) == "BOOLEAN") + if(as.scalar(dschema[1, i]) == "STRING") dmask[1, i] = 1 prefix = as.scalar(ctx["prefix"]); - mask = as.matrix(metaList['mask']) - mask = 
ifelse(sum(mask == dmask) < ncol(mask), matrix(1, rows=1, cols=ncol(mask)), mask) + mask = as.matrix(metaList['mask']) + mask = ifelse(sum(mask) < sum(dmask), dmask, mask) [eXtrain, eXtest] = recodeData(X, Xtest, mask, cv, "recode") - eXtrain = replace(target=eXtrain, pattern=NaN, replacement = 1) - eXtest = replace(target=eXtest, pattern=NaN, replacement = 1) - [eXtrain, eXtest] = recodeData(as.frame(eXtrain), as.frame(eXtest), mask, cv, "dummycode") + eXtrain = imputeByMean(eXtrain, mask) + eXtest = imputeByMean(eXtest, mask) + if(OHE) + [eXtrain, eXtest] = recodeData(as.frame(eXtrain), as.frame(eXtest), mask, cv, "dummycode") pipList = list(lp = as.frame("NULL"), ph = as.frame("NULL"), hp = as.matrix(0), flags = 0) print(prefix+" hyper-parameter tuning and dirtyscore computation"); if(cv) { [dirtyScore, evalFunHp] = bandit::crossV(X=eXtrain, y=Y, cvk=cvk, evalFunHp=evalFunHp, pipList=pipList, metaList=metaList, evalFunc=evaluationFunc) - print("dirtyScore cv: "+dirtyScore) + print("dirtyScore cv: "+dirtyScore+" evla hp "+toString(evalFunHp)) } else { - res = eval(evaluationFunc, list(X=eXtrain, Y=Y, Xtest=eXtest, Ytest=Ytest, Xorig=as.matrix(0), evalFunHp=evalFunHp)) + res = eval(evaluationFunc, list(X=eXtrain, Y=Y, Xtest=eXtest, Ytest=Ytest, Xorig=mask , evalFunHp=evalFunHp, tfspec=tfspec)) dirtyScore = as.scalar(res[1, 1]) evalFunHp = res[1, 2:ncol(res)] print("Dirty Accuracy holdout: "+dirtyScore) @@ -217,11 +228,12 @@ return(Matrix[Double] eXtrain, Matrix[Double] eXtest, Frame[Unknown] X_meta) { if(sum(mask) > 0) { - index = vectorToCsv(mask) + index = vectorToCsv(mask=mask, n=ncol(Xtrain)) jspecR = "{ids:true, "+code+":["+index+"]}" [eXtrain, X_meta] = transformencode(target=Xtrain, spec=jspecR); - if(!cv) - eXtest = transformapply(target=Xtest, spec=jspecR, meta=X_meta); + if(!cv) { + eXtest = transformapply(target=Xtest, spec=jspecR, meta=X_meta); + } else eXtest = as.matrix(Xtest) } # if no categorical value exist then just cast the frame into matrix @@ 
-232,31 +244,37 @@ return(Matrix[Double] eXtrain, Matrix[Double] eXtest, Frame[Unknown] X_meta) } } -# featureDrop = function(Matrix[Double] eXtrain, Matrix[Double] eXtest, List[Unknown] metaList, Boolean cv) -# return(Matrix[Double] eXtrain, Matrix[Double] eXtest, List[Unknown] metaList) -# { - # mask = as.matrix(metaList['mask']) - # fdMask = as.matrix(metaList['fd']) - # schema = as.frame(metaList['schema']) - # # # 1. if 90% of the column is empty - # # # # 2. if the column has only single value - # # # # have all unique values - # Xtmp = replace(target = eXtrain, pattern = NaN, replacement = 0) - # nullMask = is.na(eXtrain) - # singleValuesCol = ((colMins(Xtmp) == 0) & (colMaxs(Xtmp) == 1)) | (colMaxs(Xtmp) == colMins(Xtmp)) - # allmostEmpty = colSums(nullMask) - # allmostEmptyRatio = allmostEmpty >= (nrow(Xtmp) * 0.9) - # allSum = singleValuesCol | allmostEmptyRatio - # if(sum(allSum) > 0) { - # eXtrain = removeEmpty(target=eXtrain, margin="cols", select = (allSum == 0)) - # if(!cv) - # eXtest = removeEmpty(target=eXtest, margin="cols", select = (allSum == 0)) - # mask = removeEmpty(target=mask, margin="cols", select = (allSum == 0)) - # fdMask = removeEmpty(target=fdMask, margin="cols", select = (allSum == 0)) - # schema = removeEmpty(target=schema, margin="cols", select = (allSum == 0)) - # metaList['mask'] = mask - # metaList['schema'] = schema - # metaList['fd'] = fdMask - # } -# } +featureDrop = function(Matrix[Double] eXtrain, Matrix[Double] eXtest, List[Unknown] metaList, Boolean cv) +return(Matrix[Double] eXtrain, Matrix[Double] eXtest, List[Unknown] metaList) +{ + mask = as.matrix(metaList['mask']) + fdMask = as.matrix(metaList['fd']) + schema = as.frame(metaList['schema']) + # # 1. if 90% of the column is empty + # # # 2. 
if the column has only single value + # # # have all unique values + Xtmp = replace(target = eXtrain, pattern = NaN, replacement = 0) + nullMask = is.na(eXtrain) + singleValuesCol = ((colMins(Xtmp) == 0) & (colMaxs(Xtmp) == 1)) | (colMaxs(Xtmp) == colMins(Xtmp)) + allmostEmpty = colSums(nullMask) + allmostEmptyRatio = allmostEmpty >= (nrow(Xtmp) * 0.9) + print("colMax "+toString(colMaxs(Xtmp))) + print("Empty "+toString(allmostEmpty)) + allUnique = (colMaxs(Xtmp)+allmostEmpty) >= (nrow(Xtmp) - nrow(Xtmp)*0.1) + # # if dummycode create more columns than rows + + allSum = singleValuesCol | allmostEmptyRatio | allUnique + print("dropping columns :" +toString(allSum)) + if(sum(allSum) > 0) { + eXtrain = removeEmpty(target=eXtrain, margin="cols", select = (allSum == 0)) + if(!cv) + eXtest = removeEmpty(target=eXtest, margin="cols", select = (allSum == 0)) + mask = removeEmpty(target=mask, margin="cols", select = (allSum == 0)) + fdMask = removeEmpty(target=fdMask, margin="cols", select = (allSum == 0)) + schema = removeEmpty(target=schema, margin="cols", select = (allSum == 0)) + metaList['mask'] = mask + metaList['schema'] = schema + metaList['fd'] = fdMask + } +} diff --git a/scripts/builtin/vectorToCsv.dml b/scripts/builtin/vectorToCsv.dml index 9a28cbb1b44..46cc0e2337c 100644 --- a/scripts/builtin/vectorToCsv.dml +++ b/scripts/builtin/vectorToCsv.dml @@ -32,10 +32,10 @@ # indexes indexes # ---------------------------------------------------------------------------------------- -m_vectorToCsv = function(Matrix[Double] mask) +m_vectorToCsv = function(Matrix[Double] mask = as.matrix(0), Integer n=1) return (String indexes){ - vector = mask * t(seq(1, ncol(mask))) + vector = mask * t(seq(1, n)) vector = removeEmpty(target = vector, margin = "cols") if(nrow(vector) > ncol(vector)) vector = t(vector) diff --git a/scripts/pipelines/scripts/enumerateLogical.dml b/scripts/pipelines/scripts/enumerateLogical.dml index cb933787b47..0a399470a38 100644 --- 
a/scripts/pipelines/scripts/enumerateLogical.dml +++ b/scripts/pipelines/scripts/enumerateLogical.dml @@ -55,7 +55,7 @@ source("scripts/builtin/bandit.dml") as bandit; enumerateLogical = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] Xtest, Matrix[Double] ytest, Frame[Unknown] initial_population, Frame[String] refSol = as.frame("NaN"), Integer seed = -1, Integer max_iter=10, List[Unknown] metaList, String evaluationFunc, Matrix[Double] evalFunHp, - Frame[Unknown] primitives, Frame[Unknown] param, Double dirtyScore = 79, Boolean cv=FALSE, Boolean cvk=3, + Frame[Unknown] primitives, Frame[Unknown] param, Double dirtyScore = 79, Boolean cv=FALSE, Boolean cvk=3, Boolean OHE = TRUE, Boolean verbose, List[Unknown] ctx=list(prefix="----")) return (Frame[Unknown] outputPip, Matrix[Double] outputHp, boolean converged, Double refChanges, Frame[Unknown] acc) { @@ -90,11 +90,12 @@ return (Frame[Unknown] outputPip, Matrix[Double] outputHp, boolean converged, Do ROWS = (max_iter+1)*(nrow(pipelines)) finalOutputFrame = frame(0, rows=ROWS, cols=max_iter*2) # num of max operations * max hp per operation * no of flag + buffer for pipeline no and acc - maxParam = ncol(finalOutputFrame) * max(as.matrix(param[, 3])) * FLAGS + 2 + maxValueInParam = max(as.matrix(param[, 3])) + maxParam = ncol(finalOutputFrame) * maxValueInParam * FLAGS + 2 finalOutputMatrix = matrix(0, rows=ROWS, cols=maxParam) # # if the data has categorical columns then add the dummycode operation - if(sum(mask) > 0) + if(sum(mask) > 0 & OHE) { dummyEncode = frame("dummycoding", rows=nrow(pipelines), cols=1) pipelines[, 2] = dummyEncode @@ -121,7 +122,7 @@ return (Frame[Unknown] outputPip, Matrix[Double] outputHp, boolean converged, Do # # # execute the physical pipelines [outPip, outHp, p, refChanges] = bandit::run_with_hyperparam(ph_pip=cbind(as.frame(id), population), X=X, Y=y, Xtest=Xtest, Ytest=ytest, metaList=metaList, evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, param=param, - cv=cv, 
cvk=cvk, seed=seed, default=TRUE) + maxOpCount=maxValueInParam, cv=cv, cvk=cvk, seed=seed, default=TRUE) # # sort the configurations score-wise actPip = cbind(as.frame(outPip[, 1]), as.frame(refChanges)) @@ -174,7 +175,7 @@ return (Frame[Unknown] outputPip, Matrix[Double] outputHp, boolean converged, Do idx2 = min(max(pipRand), idx + 1) top2 = removeEmpty(target=topk[idx2], margin="cols") # # # keep the tail "dummycode" operation from transitions - if(sum(mask) > 0) { + if(sum(mask) > 0 & OHE) { tail = top[, ncol(top)] tail2 = top2[, ncol(top2)] top = top[, 1:ncol(top) - 1] @@ -192,7 +193,7 @@ return (Frame[Unknown] outputPip, Matrix[Double] outputHp, boolean converged, Do c1 = crossover(top, top2, seed) # # # put back the tail operation - if(sum(mask) > 0) + if(sum(mask) > 0 & OHE) c1 = cbind(c1, tail) children[i, 1:ncol(c1)] = c1 } @@ -215,7 +216,7 @@ return (Frame[Unknown] outputPip, Matrix[Double] outputHp, boolean converged, Do outputPip = outputPip[,3:ncol(outputPip)] # # # prepare the hyp output - hpLength = ((ncol(outputPip) + 2) * FLAGS * 3) + 1 + hpLength = ((ncol(outputPip) + 2) * FLAGS * maxValueInParam) + 1 outputHp = finalOutputMatrix[, 1:hpLength] outputHp = order(target = outputHp, by = 1, decreasing=FALSE) } diff --git a/scripts/pipelines/scripts/utils.dml b/scripts/pipelines/scripts/utils.dml index 45688db8837..ab43c0a03e4 100644 --- a/scripts/pipelines/scripts/utils.dml +++ b/scripts/pipelines/scripts/utils.dml @@ -184,14 +184,14 @@ return(Frame[Unknown] data, List[Unknown] distanceMatrix, List[Unknown] dictiona # data = valueSwap(data, schema) # step 3 drop invalid types - print(prefix+" drop values with type mismatch"); - data = dropInvalidType(data, schema) + # print(prefix+" drop values with type mismatch"); + # data = dropInvalidType(data, schema) # step 5 porter stemming on all features - print(prefix+" porter-stemming on all features"); - data = map(data, "x -> PorterStemmer.stem(x)", 0) + # print(prefix+" porter-stemming on all 
features"); + # data = map(data, "x -> PorterStemmer.stem(x)", 0) } # step 6 typo correction if(CorrectTypos) @@ -245,12 +245,12 @@ return(Frame[Unknown] data) # # # step 3 fix swap values # data = valueSwap(data, schema) - # step 3 drop invalid types - data = dropInvalidType(data, schema) + # # step 3 drop invalid types + # data = dropInvalidType(data, schema) - # step 5 porter stemming on all features - data = map(data, "x -> PorterStemmer.stem(x)", 0) + # # step 5 porter stemming on all features + # data = map(data, "x -> PorterStemmer.stem(x)", 0) # step 6 typo correction diff --git a/src/main/java/org/apache/sysds/runtime/frame/data/columns/StringArray.java b/src/main/java/org/apache/sysds/runtime/frame/data/columns/StringArray.java index 8eddc37707d..aeebd136e00 100644 --- a/src/main/java/org/apache/sysds/runtime/frame/data/columns/StringArray.java +++ b/src/main/java/org/apache/sysds/runtime/frame/data/columns/StringArray.java @@ -571,12 +571,20 @@ public double getAsDouble(int i) { @Override public double getAsNaNDouble(int i) { - if(_data[i] != null && !_data[i].isEmpty()) { - return getAsDouble(_data[i]); - } - else { - return Double.NaN; + String value = _data[i]; + if(value != null && !value.isEmpty()) + { + /* look past a leading minus sign only when a character follows it, so a lone "-" cannot index past the end */ char c = (value.charAt(0) == '-' && value.length() > 1)? value.charAt(1): value.charAt(0); + if(Character.isDigit(c)) + return DoubleArray.parseDouble(_data[i]); + else { + if (FrameUtil.isType(value.toString(), ValueType.BOOLEAN) == ValueType.BOOLEAN) + /* case-insensitive so that "TRUE"/"True" are not silently mapped to 0 */ return (value.equalsIgnoreCase("true")? 
1: 0 ); + else + throw new DMLRuntimeException("Type mismatch String found when Double expected "+value); + } } + else return Double.NaN; } private static double getAsDouble(String s) { diff --git a/src/main/java/org/apache/sysds/runtime/util/PorterStemmer.java b/src/main/java/org/apache/sysds/runtime/util/PorterStemmer.java index 9c8b241b497..6712f9b4d29 100644 --- a/src/main/java/org/apache/sysds/runtime/util/PorterStemmer.java +++ b/src/main/java/org/apache/sysds/runtime/util/PorterStemmer.java @@ -25,24 +25,17 @@ import java.util.Map.Entry; /** - * Stemmer, implementing the Porter Stemming Algorithm - * - * The Stemmer class transforms a word into its root form. The input - * word can be provided a character at time (by calling add()), or at once - * by calling one of the various stem(something) methods. + * Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, + * no. 3, pp 130-137 + */ public class PorterStemmer { - /* m() measures the number of consonant sequences between 0 and j. if c is - a consonant sequence and v a vowel sequence, and <..> indicates arbitrary - presence, - - gives 0 + /* m() measures the number of consonant sequences for vowels v and consonants c vc gives 1 vcvc gives 2 vcvcvc gives 3 - .... */ private static int calcM(String word) @@ -65,7 +58,7 @@ private static int calcM(String word) return count; } - /* doublec(j) is true <=> j,(j-1) contain a double consonant. */ + /* ends on a double consonant, e.g., ss, tt, ll */ private static boolean doublec(String word) { int len = word.length() - 1; @@ -74,13 +67,9 @@ private static boolean doublec(String word) return cons(word, len); } - /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant - and also if the second c is not w,x or y. this is used when trying to - restore an e at the end of a short word. e.g. - - cav(e), lov(e), hop(e), crim(e), but - snow, box, tray. -* */ + /* ends with cvc and second c is not w,x or y e.g. + snow, box, tray. 
+ * */ private static boolean cvc(String word) { int len = word.length(); @@ -94,7 +83,7 @@ private static boolean cvc(String word) return !exceptions.contains(ch); } - /* vowelinstem() is true <=> 0,...j contains a vowel */ + /* vowelinstem() is true if stem contains a vowel */ private static boolean vowelinStem(String word, String suffix) { int length = word.length() - suffix.length(); for(int i=0; i b[i] is a consonant. */ private static boolean cons(String stem, int i) { @@ -112,13 +100,6 @@ private static boolean cons(String stem, int i) char ch = stem.charAt(i); if(vowels.contains(String.valueOf(stem.charAt(i)))) return false; - if(ch == 'y') - { - if(i == 0) - return true; - else - return (!cons(stem, i - 1)); - } return true; } // process the collection of tuples to find which prefix matches the case. @@ -154,20 +135,6 @@ private static String replacer(String word, String orig, String replace, int mCo return null; } - /* step1() gets rid of plurals and -ed or -ing. e.g. - i.e., condition & suffix -> replacement - SSES -> SS - IES -> I - SS -> SS - S -> "" - (m > 0) EED -> EE - vowelSequence(ED) -> "" - vowelsequence(ING) -> "" - any("at, bl, iz") -> add(e) - doubleconsonant and not("l", "s", "z") -> remove single letter from end - (m == 1 and cvc) -> add(e) - turns terminal y to i when there is another vowel in the stem. 
- */ private static String step1(String word) { @@ -323,6 +290,7 @@ private static String step5(String word) } public static String stem (String word) { + word = StringUtils.lowerCase(word); if(word.length() >= 3) { word = step1(word); word = step2(word); @@ -333,4 +301,4 @@ public static String stem (String word) } return word; } -} \ No newline at end of file +} diff --git a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningRegressionTest.java b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningRegressionTest.java index b5f11445e30..71d57f42bc7 100644 --- a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningRegressionTest.java +++ b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningRegressionTest.java @@ -43,7 +43,7 @@ public void setUp() { addTestConfiguration(TEST_NAME1,new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1,new String[]{"R"})); } - @Test + /* keep @Test so JUnit still discovers the method and reports it as skipped rather than dropping it silently */ @Test @Ignore public void testRegressionPipelinesCP1() { runFindPipelineTest(1.0, 5,20, "FALSE", 3, 0.8, Types.ExecMode.SINGLE_NODE); diff --git a/src/test/resources/datasets/Salaries.json b/src/test/resources/datasets/Salaries.json new file mode 100644 index 00000000000..e58a9171ce2 --- /dev/null +++ b/src/test/resources/datasets/Salaries.json @@ -0,0 +1,11 @@ +{ + "ids":true, + "recode":[1,2,5], + "bin":[ + {"id":3, "method":"equi-width", "numbins":10}, + {"id":4, "method":"equi-width", "numbins":10}, + {"id":6, "method":"equi-width", "numbins":10} + + ] +} + diff --git a/src/test/scripts/functions/pipelines/executePipelineTest.dml b/src/test/scripts/functions/pipelines/executePipelineTest.dml index d80abe3a9e6..178a9943931 100644 --- a/src/test/scripts/functions/pipelines/executePipelineTest.dml +++ b/src/test/scripts/functions/pipelines/executePipelineTest.dml @@ -68,7 +68,7 @@ return(Matrix[Double] eXtrain, 
Matrix[Double] eXtest) { if(sum(mask) > 0) { - index = vectorToCsv(mask) + index = vectorToCsv(mask, ncol(mask)) 
jspecR = "{ids:true, "+code+":["+index+"]}" [eXtrain, X_meta] = transformencode(target=Xtrain, spec=jspecR); if(!cv) diff --git a/src/test/scripts/functions/pipelines/fit_pipelineTest.dml b/src/test/scripts/functions/pipelines/fit_pipelineTest.dml index 34ae24bbe25..ebfbc4d06b1 100644 --- a/src/test/scripts/functions/pipelines/fit_pipelineTest.dml +++ b/src/test/scripts/functions/pipelines/fit_pipelineTest.dml @@ -60,7 +60,7 @@ testData = F[split+1:nrow(F),] print("pipeline: "+toString(pip[1])) -[result, trX, tsX, exState, iState] = fit_pipeline(trainData, testData, metaInfo, pip[1,], applyFunc[1,], hp[1,], 3, "evalClassification", evalHp, TRUE, FALSE) +[result, trX, tsX, exState, iState] = fit_pipeline(trainData, testData, metaInfo, pip[1,], applyFunc[1,], hp[1,], 3, "evalClassification", evalHp, TRUE, "NA", TRUE, FALSE) eXtest = apply_pipeline(testData, metaInfo, pip[1,], applyFunc[1,], hp[1,], TRUE, exState, iState, FALSE) @@ -84,7 +84,7 @@ print(toString(writeRes)) # UDF for evaluation # choice of parameters provided by API, X, Y, clone_X, evalFunHp (hyper-param), trainML (boolean for optimizing hp internally or passed by externally ) evalClassification = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0), - Matrix[Double] evalFunHp) + Matrix[Double] evalFunHp, String tfspec="NA") return(Matrix[Double] output, Matrix[Double] error) { if(is.na(as.scalar(evalFunHp[1,1]))) diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv index eb368f7d612..1a632b6e242 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv @@ -1,3 +1,3 @@ 
-outlierBySdApply,forward_fill,fillDefaultApply,imputeByMedianApply,fillDefaultApply,fillDefaultApply,forward_fill,dummycodingApply,0,0,0,0,0,0,0,0,0,0 -outlierBySdApply,forward_fill,fillDefaultApply,imputeByMedianApply,fillDefaultApply,fillDefaultApply,forward_fill,dummycodingApply,0,0,0,0,0,0,0,0,0,0 -outlierBySdApply,forward_fill,fillDefaultApply,imputeByMedianApply,fillDefaultApply,fillDefaultApply,forward_fill,dummycodingApply,0,0,0,0,0,0,0,0,0,0 +forward_fill,imputeByMedianApply,winsorizeApply,NA,dummycodingApply,0,0,0,0,0,0,0,0,0,0,0,0,0 +forward_fill,imputeByMedianApply,NA,dummycodingApply,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +scaleApply,forward_fill,scaleApply,NA,NA,dummycodingApply,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv.mtd b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv.mtd index 2d2e90dae95..3e24d3ef3b2 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv.mtd +++ b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv.mtd @@ -4,8 +4,8 @@ "rows": 3, "cols": 18, "format": "csv", - "author": "mboehm", + "author": "Shafaq Siddiqui", "header": false, "sep": ",", - "created": "2023-06-05 17:02:47 CEST" + "created": "2023-08-09 16:16:42 CEST" } \ No newline at end of file diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv index 84389d26b5b..4bd69ab3b38 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv @@ -1,3 +1,3 @@ -67.57246376811595 -67.57246376811595 -67.3913043478261 +72.10144927536231 +70.65217391304348 +70.65217391304347 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv.mtd 
b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv.mtd index 09ab53a0f17..0a1101e3692 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv.mtd +++ b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv.mtd @@ -5,8 +5,8 @@ "cols": 1, "nnz": 3, "format": "csv", - "author": "mboehm", + "author": "Shafaq Siddiqui", "header": false, "sep": ",", - "created": "2023-06-05 17:02:46 CEST" + "created": "2023-08-09 16:16:42 CEST" } \ No newline at end of file diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv b/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv index 4e5b1a5042c..f27a836bc66 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv @@ -1 +1 @@ -61.050724637681164 \ No newline at end of file +68.29710144927536 \ No newline at end of file diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv.mtd b/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv.mtd index b10577b1475..5529b4fd16d 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv.mtd +++ b/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv.mtd @@ -2,6 +2,6 @@ "data_type": "scalar", "value_type": "double", "format": "text", - "author": "mboehm", - "created": "2023-06-05 17:02:47 CEST" + "author": "Shafaq Siddiqui", + "created": "2023-08-09 16:16:42 CEST" } \ No newline at end of file diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv.mtd b/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv.mtd index bb702909902..50bdbb78af0 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv.mtd +++ 
b/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv.mtd @@ -5,8 +5,8 @@ "cols": 3, "nnz": 3, "format": "csv", - "author": "mboehm", + "author": "Shafaq Siddiqui", "header": false, "sep": ",", - "created": "2023-06-05 17:02:47 CEST" + "created": "2023-08-09 16:16:42 CEST" } \ No newline at end of file diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv index 70369c81f1d..b3ef79f4be2 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv @@ -1,3 +1,3 @@ -72.0,3.0,3.0,2.0,1.0,0,0,0,1.0,0,1.0,1.0,0,0,0,0,0,1.0,2.0,0,0,0,0,0,0,0,0,2.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,2.0,1.0,1.0,0,0,0,0,0,1.0,2.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -72.0,3.0,5.0,2.0,1.0,0,0,0,1.0,0,1.0,1.0,0,0,0,0,0,1.0,2.0,0,0,0,0,0,0,0,0,2.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,2.0,1.0,1.0,0,0,0,0,0,1.0,2.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 
-72.0,3.0,1.0,2.0,1.0,0,0,0,1.0,0,1.0,1.0,0,0,0,0,0,1.0,2.0,0,0,0,0,0,0,0,0,2.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,2.0,0,0,0,0,0,0,0,0,2.0,1.0,1.0,0,0,0,0,0,1.0,2.0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +40.0,1.0,1.0,0,0,0,0,1.0,2.0,0,0,0,1.0,0,0,0,2.0,2.0,0.05,0.95,0,0,0,1.0,0,0,0,0,0,0,1.0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 
+28.0,1.0,1.0,0,0,0,1.0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +48.0,2.0,1.0,0,0,0,0,0,0,1.0,1.0,0,0,0,0,1.0,2.0,2.0,1.0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,2.0,0,0,0,0,0,1.0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv.mtd b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv.mtd index 43f7d62a58a..b4a71c552c9 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv.mtd +++ 
b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv.mtd @@ -2,11 +2,11 @@ "data_type": "matrix", "value_type": "double", "rows": 3, - "cols": 300, - "nnz": 63, + "cols": 400, + "nnz": 41, "format": "csv", - "author": "mboehm", + "author": "Shafaq Siddiqui", "header": false, "sep": ",", - "created": "2023-06-05 17:02:46 CEST" + "created": "2023-08-09 16:16:42 CEST" } \ No newline at end of file diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv index dc6138ae279..8788b4a1d18 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv @@ -1,3 +1,3 @@ -outlierBySd,forward_fill,fillDefault,imputeByMedian,fillDefault,fillDefault,forward_fill,dummycoding,0,0,0,0,0,0,0,0,0,0 -outlierBySd,forward_fill,fillDefault,imputeByMedian,fillDefault,fillDefault,forward_fill,dummycoding,0,0,0,0,0,0,0,0,0,0 -outlierBySd,forward_fill,fillDefault,imputeByMedian,fillDefault,fillDefault,forward_fill,dummycoding,0,0,0,0,0,0,0,0,0,0 +forward_fill,imputeByMedian,winsorize,tomeklink,dummycoding,0,0,0,0,0,0,0,0,0,0,0,0,0 +forward_fill,imputeByMedian,tomeklink,dummycoding,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +scale,forward_fill,scale,tomeklink,tomeklink,dummycoding,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv.mtd b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv.mtd index cc523889d28..3e24d3ef3b2 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv.mtd +++ b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv.mtd @@ -4,8 +4,8 @@ "rows": 3, "cols": 18, "format": "csv", - "author": "mboehm", + "author": "Shafaq Siddiqui", "header": false, "sep": ",", - "created": "2023-06-05 17:02:46 CEST" + "created": "2023-08-09 16:16:42 
CEST" } \ No newline at end of file diff --git a/src/test/scripts/functions/pipelines/topkLogicalTest.dml b/src/test/scripts/functions/pipelines/topkLogicalTest.dml index 3c6e70cd7bb..582dbf268ad 100644 --- a/src/test/scripts/functions/pipelines/topkLogicalTest.dml +++ b/src/test/scripts/functions/pipelines/topkLogicalTest.dml @@ -45,7 +45,7 @@ getFdMask = as.matrix(metaInfo[3, 2:ncol(metaInfo)]) # columns of interest for F if(sum(getMask) > 0) { # always recode the label - index = vectorToCsv(getMask) + index = vectorToCsv(getMask, ncol(getMask)) jspecR = "{ids:true, recode:["+index+"]}" [eX, X_meta] = transformencode(target=X, spec=jspecR); # change the schema to reflect the encoded values @@ -66,7 +66,7 @@ getMask = getMask[, 1:ncol(getMask) - 1] # strip the mask of class label getFdMask = getFdMask[, 1:ncol(getFdMask) - 1] # strip the mask of class label getSchema = getSchema[, 1:ncol(getSchema) - 1] # strip the mask of class label -metaList = list(mask=getMask, schema=getSchema, fd=as.matrix(0), applyFunc=as.frame("NULL"), distY = 20) +metaList = list(mask=getMask, schema=getSchema, fd=as.matrix(0), applyFunc=as.frame("NULL"), distY = 20, tfspec="NA") logical = frame([ "MVI", @@ -107,7 +107,7 @@ write(converged , $O) # UDF for evaluation # choice of parameters provided by API, X, Y, clone_X, evalFunHp (hyper-param), trainML (boolean for optimizing hp internally or passed by externally ) evalML = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0), - Matrix[Double] evalFunHp) + Matrix[Double] evalFunHp, String tfspec="NA") return(Matrix[Double] accuracy) { diff --git a/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml b/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml index 84549f199d9..11608d35269 100644 --- a/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml +++ 
b/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml @@ -52,14 +52,12 @@ else { if(nrow(metaInfo) < 2) stop("incomplete meta info") - metaInfo = metaInfo[, 2:ncol(metaInfo)] # # # split in train/test 70/30 - [topKPipelines, topKHyperParams, topKScores, baseLineScore, evalFunHp, applyFunc] = topk_cleaning(dataTrain=trainData, dataTest=testData, metaData=metaInfo, primitives=primitives, parameters=param, - refSol = frame(["imputeByMean", "scale", "dummycoding"], rows=1, cols=3), - evaluationFunc=evalFunc, evalFunHp=as.matrix(NaN),topK=topK, resource_val=resources, enablePruning=TRUE, - expectedIncrease=expectedIncrease, seed = 23, max_iter=max_iter, cv=testCV, cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE) + refSol = frame(["imputeByMean", "scale"], rows=1, cols=2), tfspec="NA", + evaluationFunc=evalFunc, evalFunHp=as.matrix(NaN),topK=topK, resource_val=resources, enablePruning=TRUE, OHE=TRUE, + expectedIncrease=expectedIncrease, seed = 41, max_iter=max_iter, cv=testCV, cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE) write(topKPipelines, output+"/pip.csv", format="csv") write(topKHyperParams, output+"/hp.csv", format="csv") @@ -74,7 +72,7 @@ write(result, $O) # UDF for evaluation # choice of parameters provided by API, X, Y, clone_X, evalFunHp (hyper-param), trainML (boolean for optimizing hp internally or passed by externally ) evalClassification = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0), - Matrix[Double] evalFunHp) + Matrix[Double] evalFunHp, String tfspec="NA") return(Matrix[Double] output, Matrix[Double] error) { if(is.na(as.scalar(evalFunHp[1,1]))) @@ -144,9 +142,45 @@ return(Matrix[Double] output, Matrix[Double] error) } output = cbind(accuracy, evalFunHp) } + accuracyMSVM = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B) return (Matrix[Double] err) { yhat = msvmPredict(X=X, W=B); yhat = rowIndexMax(yhat) acc = 
mean(yhat == y) err = as.matrix(1-(acc)); +} + + +evalClassificationDT = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0), + Matrix[Double] evalFunHp, String tfspec) +return(Matrix[Double] output) +{ + + [X, meta] = transformencode(target=as.frame(X), spec=tfspec); + Xtest = transformapply(target=as.frame(Xtest), spec = tfspec, meta=meta) + X = imputeByMode(X); + Xtest = imputeByMode(Xtest); + X = replace(target=X, pattern=0, replacement=1) + Xtest = replace(target=Xtest, pattern=0, replacement=1) + + print("column minimums \n"+toString(colMins(X))) + R = cbind(Xorig, as.matrix(1)) + 1 + print(toString(R)) + if(min(Y) == max(Y)) + { + accuracy = as.matrix(0) + a = 0 + } + else { + M = decisionTree(X = X, y = Y, ctypes = R, max_features=1, min_split=4, min_leaf=2, verbose=FALSE); + [accuracy, err] = accuracyDT(X=Xtest, y=Ytest, M=M, R=R); + } + output = cbind(accuracy, evalFunHp) +} + +accuracyDT = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] M, Matrix[Double] R) + return (Matrix[Double] acc, Matrix[Double] err) { + yhat = decisionTreePredict(X=X, ctypes=R, M=M) + acc = as.matrix(mean(yhat == y)) * 100 + /* acc is a percentage, so the complementary error is taken against 100 */ err = 100 - acc; } \ No newline at end of file diff --git a/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml b/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml index 6a13253e08a..7f8cb75a0c5 100644 --- a/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml +++ b/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml @@ -22,7 +22,7 @@ source("scripts/pipelines/scripts/utils.dml") as utils; # read the inputs -F = read($dirtyData, data_type="frame", format="csv", header=TRUE, +F = read($dirtyData, data_type="frame", format="csv", header=TRUE, naStrings= ["NA", "null"," ","NaN", "nan", "", " ", "_nan_", "inf", "?", "NAN", "99999"]); F = 
F[,2:ncol(F)] primitives = read($primitives, data_type = "frame", format="csv", 
header= TRUE) @@ -34,9 +34,9 @@ output=$output testCV = as.logical($testCV) trainTestSplit = as.double($split) cvk = as.integer($cvk) - +/* transform spec is read relative to the project root (like the source() path above) instead of a developer-local absolute path */ tfspec = read("src/test/resources/datasets/Salaries.json", data_type="scalar", value_type="string"); split = nrow(F) * trainTestSplit - evalFunc = "evalRegression" + evalFunc = "evalRegressionDT" if(testCV) { trainData = F[1:split,] testData = frame("", rows=0, cols=0) @@ -47,10 +47,10 @@ else { } # # # split in train/test 70/30 -#matrix("1 1e-6 1e-9 1000", rows=1, cols=4) -[topKPipelines, topKHyperParams, topKScores, baseLineScore, evalFunHp, applyFunc] = topk_cleaning(dataTrain=trainData, dataTest=testData, - primitives=primitives, parameters=param, evaluationFunc=evalFunc, evalFunHp=as.matrix(NaN), - topK=topK, resource_val=resources, cv=testCV, cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE) +[topKPipelines, topKHyperParams, topKScores, baseLineScore, evalFunHp, applyFunc] = topk_cleaning(dataTrain=trainData, dataTest=testData,primitives=primitives, parameters=param, + refSol = frame(["imputeByMean", "scale"], rows=1, cols=2), tfspec=tfspec, + evaluationFunc=evalFunc, evalFunHp=as.matrix(NaN),topK=topK, resource_val=resources, enablePruning=TRUE, OHE=FALSE, + expectedIncrease=1, seed = 23, max_iter=10, cv=testCV, cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE) write(topKPipelines, output+"/pip.csv", format="csv") write(topKHyperParams, output+"/hp.csv", format="csv") @@ -58,29 +58,27 @@ write(topKScores, output+"/bestAcc.csv", format="csv") write(baseLineScore, output+"/dirtyScore.csv", format="csv") write(evalFunHp, output+"/evalHp.csv", format="csv") write(applyFunc, output+"/applyFunc.csv", format="csv") -result = baseLineScore < as.scalar(topKScores[1, 1]) +result = baseLineScore < as.scalar(topKScores[1, 1]) write(result, $O) -# UDF for evaluation +# UDF for evaluation # 
choice of parameters provided by API, X, Y, clone_X, evalFunHp (hyper-param), trainML 
(boolean for optimizing hp internally or passed by externally ) evalRegression = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0), - Matrix[Double] evalFunHp) + Matrix[Double] evalFunHp, String tfspec="NA") return(Matrix[Double] output) { if(is.na(as.scalar(evalFunHp[1,1]))) { # do the gridsearch for hyper-parameters - lArgs=list(X=X, y=Y, icpt=0, reg=-1, tol=-1, maxi=-1, verbose=FALSE); params = list("icpt","reg", "tol"); - paramRanges = list(seq(0,2,1), 10^seq(0,-4), 10^seq(-6,-12)); - [B1, opt] = gridSearch(X=X, y=Y, train="lm", predict="wmape", trainArgs=lArgs, + paramRanges = list(seq(0,2,1),10^seq(0,-4), 10^seq(-6,-12)); + [B1, opt] = gridSearch(X=X, y=Y, train="lm", predict="wmape", numB=ncol(X)+1, params=params, paramValues=paramRanges, cv=TRUE, cvk=3, verbose=FALSE); - evalFunHp = as.matrix(opt) + evalFunHp = as.matrix(opt) } - beta = lm(X=X, y=Y, icpt=as.scalar(evalFunHp[1,1]), reg=as.scalar(evalFunHp[1,2]), tol=as.scalar(evalFunHp[1,3]), + beta = lm(X=X, y=Y, icpt=as.scalar(evalFunHp[1,1]), reg=as.scalar(evalFunHp[1,2]), tol=as.scalar(evalFunHp[1,3]), maxi=1000, verbose=FALSE); - acc = wmape(Xtest, Ytest, beta) accuracy = (1 - acc) output = cbind(accuracy, evalFunHp) @@ -90,13 +88,66 @@ return(Matrix[Double] output) # # loss = as.matrix(sum((y - X%*%B)^2)); # pred = lmPredict(X=X, B=B, ytest=y); # WMAPE = sum(abs(y - pred))/sum(abs(y)) #this will give the lose into range of [0,1] - # loss = ifelse(is.na(as.matrix(WMAPE)), as.matrix(0), as.matrix(WMAPE)) + # loss = ifelse(is.na(as.matrix(WMAPE)), as.matrix(0), as.matrix(WMAPE)) # } wmape = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B) return (Matrix[Double] loss) { # loss = as.matrix(sum((y - X%*%B)^2)); - pred = lmPredict(X=X, B=B, ytest=y, verbose=FALSE); + pred = lmPredict(X=X, B=B, ytest=y); # print("WMAPO: "+(1 - (sum(abs((pred - y)/(pred + y)))/nrow(y)))) WMAPE = 1 - (sum(abs((pred - y)/(pred + y)))/nrow(y)) 
#this will give the lose into range of [0,1] - loss = ifelse(is.na(as.matrix(WMAPE)), as.matrix(0), as.matrix(WMAPE)) + loss = ifelse(is.na(as.matrix(WMAPE)), as.matrix(0), as.matrix(WMAPE)) +} + + + +evalRegressionDT = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0), + Matrix[Double] evalFunHp, String tfspec) +return(Matrix[Double] output) +{ + + [X, meta] = transformencode(target=as.frame(X), spec=tfspec); + Xtest = transformapply(target=as.frame(Xtest), spec = tfspec, meta=meta) + X = imputeByMode(X); + Xtest = imputeByMode(Xtest); + X = replace(target=X, pattern=0, replacement=1) + Xtest = replace(target=Xtest, pattern=0, replacement=1) + + print("column minimums \n"+toString(colMins(X))) + R = cbind(Xorig, as.matrix(0)) + 1 + print(toString(R)) + if(min(Y) == max(Y)) + { + accuracy = as.matrix(0) + a = 0 + } + else { + M = decisionTree(X = X, y = Y, ctypes = R, max_features=1, min_split=4, min_leaf=2, verbose=FALSE); + accuracy = Rsquared(X=Xtest, y=Ytest, M=M, R=R, jspec=tfspec, meta=meta); + } + output = cbind(accuracy, evalFunHp) } + +# accuracyDT = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] M, Matrix[Double] R) + # return (Matrix[Double] acc, Matrix[Double] err) { + # yhat = decisionTreePredict(X=X, ctypes=R, M=M) + # acc = as.matrix(mean(yhat == y)) + # err = 1-(acc); +# } + +Rsquared = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] M, Matrix[Double] R, String jspec, Frame[Unknown] meta) return (Matrix[Double] gain) { + # loss = as.matrix(sum((y - X%*%B)^2)); + yhat = decisionTreePredict(X=X, ctypes=R, M=M) #, B=B, ytest=y, verbose = TRUE); + print("predicted bins: "+toString(yhat)) + while(FALSE){} + decoded = transformdecode(target=cbind(X, yhat), spec=jspec, meta=meta) + pred = as.matrix(decoded[, ncol(decoded)]) + print("prediction: "+toString(pred)) + while(FALSE){} + Rsqu = sum((y - pred)^2)/sum((y - mean(y))^2) # residual sum of squares over total sum of squares (SSE/SST); may exceed 1
+ Rsqu = 1 - (Rsqu) + # adjRsqu = 1 - (((n - 1)/(n - k - 1)) * (1 - Rsqu)) + print("Rs: "+Rsqu) + while(FALSE){} + gain = ifelse(is.na(as.matrix(Rsqu)), as.matrix(0), as.matrix(Rsqu)) +} \ No newline at end of file