Skip to content

Commit

Permalink
[MINOR] Adding decision tree to cleaning pipelines evaluation
Browse files Browse the repository at this point in the history
  • Loading branch information
Shafaq-Siddiqi committed Aug 9, 2023
1 parent d28921a commit 9661b57
Show file tree
Hide file tree
Showing 33 changed files with 308 additions and 210 deletions.
2 changes: 1 addition & 1 deletion scripts/builtin/abstain.dml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ return (Matrix[Double] Xout, Matrix[Double] Yout)
Yout = Y
if(min(Y) != max(Y) & max(Y) <= 2)
{
betas = multiLogReg(X=X, Y=Y, icpt=1, reg=1e-4, maxi=100, maxii=0, verbose=verbose)
betas = multiLogReg(X=X, Y=Y, icpt=1, reg=1e-4, maxi=50, maxii=0, verbose=verbose)
[prob, yhat, accuracy] = multiLogRegPredict(X, betas, Y, FALSE)

inc = ((yhat != Y) & (rowMaxs(prob) > threshold))
Expand Down
4 changes: 2 additions & 2 deletions scripts/builtin/apply_pipeline.dml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ s_apply_pipeline = function(Frame[Unknown] testData, Frame[Unknown] metaData = a
[schema, mask, fdMask, maskY] = topk::prepareMeta(testData, metaData)
pip = removeEmpty(target=pip, margin="cols")
applyFunc = removeEmpty(target=applyFunc, margin="cols")
metaList = list(mask=mask, schema=schema, fd=fdMask, applyFunc=as.frame("NULL"))
metaList = list(mask=mask, schema=schema, fd=fdMask, applyFunc=as.frame("NULL"), tfspec=as.scalar(exState[3]))
ctx = list(prefix="----"); #TODO include seed
# separate the label
[Xtest, Ytest] = topk::getLabel(testData, isLastLabel)
Expand All @@ -75,7 +75,7 @@ s_apply_pipeline = function(Frame[Unknown] testData, Frame[Unknown] metaData = a
M = as.frame(exState[2])
if(sum(mask) > 0)
{
index = vectorToCsv(mask)
index = vectorToCsv(mask, ncol(mask))
jspecR = "{ids:true, recode:["+index+"]}"
eXtest = transformapply(target=Xtest, spec=jspecR, meta=M);
}
Expand Down
28 changes: 16 additions & 12 deletions scripts/builtin/bandit.dml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl
totalPruneCount = 0
FLAG_VARIABLE = 5
pipelines_executed = 0
HYPERPARAM_LENGTH = ((ncol(lp) + 2) * FLAG_VARIABLE * 3) + 1 ## num of col in logical * 5 meta flag vars * max hyperparam per op + 1 accuracy col
maxValueInParam = max(as.matrix(param[, 3]))
HYPERPARAM_LENGTH = ((ncol(lp) + 2) * FLAG_VARIABLE * maxValueInParam) + 1 ## num of col in logical * 5 meta flag vars * max hyperparam per op + 1 accuracy col
bestPipeline = frame("", rows=1, cols=1)
bestHyperparams = as.matrix(0)
bestAccuracy = as.matrix(0)
Expand Down Expand Up @@ -111,7 +112,7 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl
configurations = configurations[1:n_i, ]
pipelines_executed = pipelines_executed + (n_i * r_i)
[outPip,outHp, pruneCount] = run_with_hyperparam(ph_pip=configurations, r_i=r_i, X=X_train, Y=Y_train, Xtest=X_test, Ytest=Y_test, metaList=metaList,
evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, param=param, cv=cv, cvk=cvk, ref=ref, seed = seed, enablePruning=enablePruning)
evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, param=param, maxOpCount=maxValueInParam, cv=cv, cvk=cvk, ref=ref, seed = seed, enablePruning=enablePruning)
totalPruneCount = totalPruneCount + pruneCount
# sort the pipelines by order of accuracy decreasing
IX = order(target = outPip, by = 1, decreasing=TRUE, index.return=TRUE)
Expand Down Expand Up @@ -214,19 +215,21 @@ get_physical_configurations = function(Frame[String] logical, Scalar[int] numCon
# # this method executes the pipelines with their hyper-parameters
run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i = 1, Matrix[Double] X, Matrix[Double] Y,
Matrix[Double] Xtest, Matrix[Double] Ytest, List[Unknown] metaList, String evaluationFunc, Matrix[Double] evalFunHp,
Frame[Unknown] param, Boolean cv = FALSE, Integer cvk = 2, Double ref = 0, Integer seed = -1, Boolean enablePruning = FALSE, Boolean default = FALSE)
Frame[Unknown] param, Boolean cv = FALSE, Integer maxOpCount=3, Integer cvk = 2, Double ref = 0, Integer seed = -1, Boolean enablePruning = FALSE, Boolean default = FALSE)
return (Matrix[Double] output_operator, Matrix[Double] output_hyperparam, Integer pruneCount, Matrix[Double] changesByPipMatrix)
{
# # # TODO there is a partial overlap, but it is negligible, so we will not rewrite the scripts; lineage-based reuse will get rid of it
tfspec=as.scalar(metaList["tfspec"])
mask=as.matrix(metaList["mask"])
changesByPipMatrix = matrix(0, rows=nrow(ph_pip) * r_i, cols=1)
pruneCount = 0
output_hp = matrix(0, nrow(ph_pip)*r_i, (ncol(ph_pip)) * 5 * 3)
output_hp = matrix(0, nrow(ph_pip)*r_i, (ncol(ph_pip)) * 5 * maxOpCount)
output_accuracy = matrix(0, nrow(ph_pip)*r_i, 1)
output_pipelines = matrix(0, nrow(ph_pip)*r_i, 3)
# rows in validation set
ids = as.matrix(ph_pip[, 1:2])
ph_pip = ph_pip[, 3:ncol(ph_pip)]
inputHpMatrix = matrix(0, nrow(ph_pip)*r_i, (ncol(ph_pip)) * 5 * 3 + 1)
inputHpMatrix = matrix(0, nrow(ph_pip)*r_i, (ncol(ph_pip)) * 5 * maxOpCount + 1)
# prepare the pipelines and resources
allPipelines = frame(0, rows = nrow(ph_pip) * r_i, cols=ncol(ph_pip))
allApplyFunctions = frame(0, rows = nrow(ph_pip) * r_i, cols=ncol(ph_pip))
Expand Down Expand Up @@ -286,7 +289,7 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i = 1, Matrix[Do
hp = hp[, 2:totalVals]
applyFunctions = allApplyFunctions[i]
no_of_res = nrow(hp)
# print("PIPELINE EXECUTION START ... "+toString(op))
print("PIPELINE EXECUTION START ... "+toString(op))
hpForPruning = matrix(0, rows=1, cols=ncol(op))
changesByOp = matrix(0, rows=1, cols=ncol(op))
metaList2 = metaList; #ensure metaList is no result var
Expand Down Expand Up @@ -317,7 +320,7 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i = 1, Matrix[Do
else if(changesByPip < ref)
print("prunningAlert 2: not training the model due to minimum changes")
else
evalFunOutput = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp))
evalFunOutput = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=mask, evalFunHp=evalFunHp, tfspec=tfspec))
accuracy = as.scalar(evalFunOutput[1, 1])
}

Expand Down Expand Up @@ -506,12 +509,13 @@ crossV = function(Matrix[double] X, Matrix[double] y, Integer cvk, Matrix[Double
Matrix[Double] hpForPruning = as.matrix(0), Matrix[Double] changesByOp = as.matrix(0), String evalFunc, Double ref = 0)
return (Double accuracy, Matrix[Double] evalFunHp, Matrix[Double] hpForPruning, Matrix[Double] changesByOp, Double allChanges)
{

tfspec = as.scalar(metaList['tfspec'])
# # in the condition below, we compute the hyperparameters using the CV method on the training dataset
if(is.na(as.scalar(evalFunHp[1,1]))) {
forEvalHp = eval(evalFunc, list(X=X, Y=y, Xtest=X, Ytest=y, Xorig=as.matrix(0), evalFunHp=evalFunHp))
if(is.na(as.scalar(evalFunHp[1,1])) & tfspec=="NA") {
forEvalHp = eval(evalFunc, list(X=X, Y=y, Xtest=X, Ytest=y, Xorig=as.matrix(0), evalFunHp=evalFunHp, tfspec=tfspec))
evalFunHp = forEvalHp[1, 2:ncol(forEvalHp)]
}
mask = as.matrix(metaList['mask'])
changesByPip = 0
cvChanges = matrix(0, rows=cvk, cols=ncol(changesByOp))
accuracyMatrix = matrix(0, cvk, 1)
Expand Down Expand Up @@ -547,9 +551,9 @@ return (Double accuracy, Matrix[Double] evalFunHp, Matrix[Double] hpForPruning,
allChanges[i] = changesByPip
}
if(changesByPip < ref)
print("prunning alert 2: no training the model due to minimum changes")
print("pruning alert 2: no training the model due to minimum changes")
else {
res = eval(evalFunc, list(X=trainX, Y=trainy, Xtest=testX, Ytest=testy, Xorig=as.matrix(0), evalFunHp=evalFunHp))
res = eval(evalFunc, list(X=trainX, Y=trainy, Xtest=testX, Ytest=testy, Xorig=mask , evalFunHp=evalFunHp, tfspec=tfspec))
accuracyMatrix[i] = res[1, 1]
}

Expand Down
1 change: 1 addition & 0 deletions scripts/builtin/correctTyposApply.dml
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,4 @@ replaceStrings1 = function(String replacement, String to_replace, Frame[String]
{
strings = map(strings, "s -> s.equals(\""+to_replace+"\") ? \""+replacement+"\" : s");
}

21 changes: 11 additions & 10 deletions scripts/builtin/executePipeline.dml
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,6 @@ s_executePipeline = function(Frame[String] pipeline, Matrix[Double] Xtrain, Mat
else {
print("not applying operation executeFlag = 0")
}

if(ncol(Xtest) == d & nrow(Xtest) == nrow(XtestClone) & ncol(hpForPruning) > 1) {
changesSingle = sum(abs(replace(target=Xtest, pattern=NaN, replacement=0) - replace(target=XtestClone, pattern=NaN, replacement=0)) > 0.001 )
changesAll = sum(abs(replace(target=Xtest, pattern=NaN, replacement=0) - replace(target=Xorig, pattern=NaN, replacement=0)) > 0.001 )
Expand Down Expand Up @@ -204,7 +203,7 @@ return (Matrix[Double] X)
# X without numerics
Xcat = removeEmpty(target=originalX, margin="cols", select=mask)
nanMask = is.na(Xcat)
Xcat = replace(target = Xcat, pattern = NaN, replacement = -1111)
Xcat = abs(round(replace(target = Xcat, pattern = NaN, replacement = 4444)))

# reconstruct the original matrix
p = table(seq(1, ncol(nX)), removeEmpty(target=seq(1, ncol(mask)), margin="rows",
Expand All @@ -214,24 +213,24 @@ return (Matrix[Double] X)
X = (nX %*% p) + (Xcat %*% q)

X = replace(target = X, pattern = maxDummy, replacement = NaN)
X = replace(target = X, pattern = -1111, replacement = NaN)
X = replace(target = X, pattern = 4444, replacement = NaN)
}
else if(dataFlag == 1 & (sum(mask) > 0) & (sum(mask) != ncol(originalX)))
{
maxDummy = max(replace(target=nX, pattern=NaN, replacement=0)) + 1
nX = replace(target = nX, pattern = NaN, replacement = maxDummy)
maxDummy = abs(round(max(replace(target=nX, pattern=NaN, replacement=0)) + 1))
nX = abs(round(replace(target = nX, pattern = NaN, replacement = maxDummy)))
# X without categorical
Xnum = removeEmpty(target=originalX, margin="cols", select=(mask==0))
nanMask = is.na(Xnum)
Xnum = replace(target = Xnum, pattern = NaN, replacement = -1111)
Xnum = replace(target = Xnum, pattern = NaN, replacement = 4444)
# reconstruct the original matrix
p = table(seq(1, ncol(Xnum)), removeEmpty(target=seq(1, ncol(mask)), margin="rows",
select=t(mask==0)), ncol(Xnum), ncol(originalX))
q = table(seq(1, ncol(nX)), removeEmpty(target=seq(1, ncol(mask)), margin="rows",
select=t(mask)), ncol(nX), ncol(originalX))
X = (nX %*% q) + (Xnum %*% p)
X = replace(target = X, pattern = maxDummy, replacement = NaN)
X = replace(target = X, pattern = -1111, replacement = NaN)
X = replace(target = X, pattern = 4444, replacement = NaN)

}
else X = nX
Expand All @@ -247,14 +246,14 @@ return (Matrix[Double] X)
#######################################################################

dummycoding = function(Matrix[Double] X, Matrix[Double] mask)
return (Matrix[Double] X, String jspec, Frame[Unknown] meta) {

return (Matrix[Double] X, String jspec, Frame[Unknown] meta) {
meta = as.frame("NULL")
jspec = ""
if(sum(mask) > 0)
{
X = replace(target=X, pattern=NaN, replacement=0)
idx = vectorToCsv(mask)
idx = vectorToCsv(mask, ncol(X))
# specifications for one-hot encoding of categorical features
jspec = "{ids:true, dummycode:["+idx+"]}";
# OHE of categorical features
Expand All @@ -268,6 +267,7 @@ return (Matrix[Double] Y) {

if(jspec != "")
{
X = replace(target=X, pattern=NaN, replacement=0)
Y = transformapply(target=as.frame(X), spec=jspec, meta=meta);
}
else Y = X
Expand All @@ -286,6 +286,7 @@ return (Matrix[Double] X, Matrix[Double] fillMatrix)
if(sum(fdMask) > 0)
{
t = replace(target=X, pattern=NaN, replacement=1)
t = replace(target=t, pattern=0, replacement=1)
fdMask = removeEmpty(target=fdMask, margin="cols")
FD = discoverFD(X=t, Mask=fdMask, threshold=threshold)
FD = (diag(matrix(1, rows=nrow(FD), cols=1)) ==0) * FD
Expand Down
19 changes: 10 additions & 9 deletions scripts/builtin/fit_pipeline.dml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ source("scripts/builtin/bandit.dml") as bandit;

s_fit_pipeline = function(Frame[Unknown] trainData, Frame[Unknown] testData, Frame[Unknown] metaData = as.frame("NULL"),
Frame[Unknown] pip, Frame[Unknown] applyFunc, Matrix[Double] hp, Integer cvk=3, String evaluationFunc, Matrix[Double] evalFunHp,
Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE)
Boolean isLastLabel = TRUE, String tfspec="NA", Boolean OHE=TRUE, Boolean correctTypos=FALSE)
return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTest, List[Unknown] externalState, List[Unknown] iState)
{
externalState = list()
Expand All @@ -57,14 +57,17 @@ return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTe

pip = removeEmpty(target=pip, margin="cols")
applyFunc = removeEmpty(target=applyFunc, margin="cols")
metaList = list(mask=mask, schema=schema, fd=fdMask, applyFunc=as.frame("NULL"))
metaList = list(mask=mask, schema=schema, fd=fdMask, applyFunc=as.frame("NULL"), tfspec=tfspec)
ctx = list(prefix="----"); #TODO include seed
# separate the label
[Xtrain, Ytrain] = topk::getLabel(trainData, isLastLabel)
[Xtest, Ytest] = topk::getLabel(testData, isLastLabel)

# always recode the label
if(maskY == 1) {
sc = detectSchema(Ytrain)
Ytrain = applySchema(Ytrain, sc)
Ytest = applySchema(Ytest, sc)
[eYtrain, M] = transformencode(target=Ytrain, spec= "{ids:true, recode:[1]}");
eYtest = transformapply(target=Ytest, spec= "{ids:true, recode:[1]}", meta=M);
externalState = append(externalState, M)
Expand All @@ -77,12 +80,13 @@ return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTe
# # # when the evaluation function is called for the first time, we also compute and keep the hyperparameters of the target application
ctx = list(prefix="evaluate Pipeline")
dirtyScore = topk::getDirtyScore(X=Xtrain, Y=eYtrain, Xtest=Xtest, Ytest=eYtest, metaList=metaList,
evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, ctx=ctx)
evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, OHE=OHE, ctx=ctx)
[Xtrain, Xtest] = topk::runStringPipeline(Xtrain, Xtest, schema, mask, FALSE, correctTypos, ctx)

# # # if mask has 1s then there are categorical features
[eXtrain, eXtest, M1] = topk::recodeData(Xtrain, Xtest, mask, FALSE, "recode")
externalState = append(externalState, M1)
externalState = append(externalState, tfspec)
# # # do the early dropping
# [eXtrain, eXtest, metaList] = topk::featureDrop(eXtrain, eXtest, metaList, FALSE)
metaList["applyFunc"] = applyFunc
Expand All @@ -94,25 +98,22 @@ return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTe

[trainScore, evalFunHp] = bandit::crossV(X=eXtrain, y=eYtrain, cvk=cvk, evalFunHp=evalFunHp,
pipList=pipList, metaList=metaList, evalFunc=evaluationFunc)
print("train score cv: "+toString(trainScore))


# # # now test accuracy
[eXtrain, eYtrain, eXtest, eYtest, a, b, c, d, iState] = executePipeline(pipeline=pip, Xtrain=eXtrain, Ytrain=eYtrain,
Xtest=eXtest, Ytest=eYtest, metaList=metaList, hyperParameters=hp_matrix, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE)

if(max(eYtrain) == min(eYtrain))
stop("Y contains only one class")

# score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtrain, Ytest=eYtrain, Xorig=as.matrix(0), evalFunHp=evalFunHp))
# trainAccuracy = as.scalar(score[1, 1])

score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp))
score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=mask, evalFunHp=evalFunHp, tfspec=tfspec))
testAccuracy = as.scalar(score[1, 1])

scores = matrix(0, rows=1, cols=3)
scores[1, 1] = dirtyScore
# scores[1, 2] = trainAccuracy
scores[1, 2] = trainScore
scores[1, 3] = testAccuracy
cleanTrain = cbind(eXtrain, eYtrain)
cleanTest = cbind(eXtest, eYtest)
Expand Down
2 changes: 1 addition & 1 deletion scripts/builtin/frameSort.dml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
s_frameSort = function(Frame[String] F, Matrix[Double] mask, Boolean orderDesc = TRUE)
return (Frame[String] f_ordered)
{
index = vectorToCsv(mask)
index = vectorToCsv(mask, ncol(F))
# recode logical pipelines for easy handling
jspecR = "{ids:true, recode:["+index+"]}";
[X, M] = transformencode(target=F, spec=jspecR);
Expand Down
2 changes: 1 addition & 1 deletion scripts/builtin/mice.dml
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ m_mice= function(Matrix[Double] X, Matrix[Double] cMask, Integer iter = 3,
d = ncol(X1)
n = nrow(X1)
# compute index of categorical features
index = vectorToCsv(cMask)
index = vectorToCsv(cMask, ncol(cMask))
# specifications for one-hot encoding of categorical features
jspecDC = "{ids:true, dummycode:["+index+"]}";
[dX, dM] = transformencode(target=as.frame(X1), spec=jspecDC);
Expand Down
2 changes: 1 addition & 1 deletion scripts/builtin/miceApply.dml
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ m_miceApply = function(Matrix[Double] X, Matrix[Double] meta, Double threshold,
n = nrow(X1)

# compute index of categorical features
index = vectorToCsv(mask)
index = vectorToCsv(mask, ncol(mask))
# specifications for one-hot encoding of categorical features
jspecDC = "{ids:true, dummycode:["+index+"]}";

Expand Down
Loading

0 comments on commit 9661b57

Please sign in to comment.