From a71aa8e3a3a6849e9822555fd2dfa058c353fcfb Mon Sep 17 00:00:00 2001 From: Philipp Kuntschik Date: Fri, 29 Mar 2019 12:04:50 +0100 Subject: [PATCH 1/9] add: comments --- R/autoCoding.R | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/R/autoCoding.R b/R/autoCoding.R index f397590..cee22f1 100644 --- a/R/autoCoding.R +++ b/R/autoCoding.R @@ -60,18 +60,28 @@ codingBySearchOneFile <- function(pattern, fid, cid, seperator, concatenate, ... ## by providing approperiate seperator, it allows flexible control on the unit of autocoding txt <- RQDAQuery(sprintf("select file from source where status=1 and id=%s",fid))$file Encoding(txt) <- "UTF-8" + + ## get all separator matches and calculate start and end of each analysis unit pidx <- gregexpr(sprintf("(%s){1,}", seperator),txt) idx1 <- c(0,pidx[[1]]+attr(pidx[[1]],"match.length")-1) idx2 <- c(pidx[[1]]-1,nchar(txt)) + + ## find all pattern matches sidx <- gregexpr(pattern,txt, ...)[[1]] + if (length(sidx) > 1 || (sidx != -1)) { + + ## get the matching analysis units residx <- unique(findInterval(sidx,sort(c(idx1,idx2)))) idx <- (residx + 1)/2 + if (concatenate) + ## mark bordering matching analysis units removeidx <- which(diff(idx)==1) else removeidx <- NULL + ## receive start and end indexes of the matching analysis units if (length(removeidx) > 0) { selfirst = idx1[idx[-(removeidx+1)]] elend = idx2[idx[-removeidx]] @@ -80,6 +90,7 @@ codingBySearchOneFile <- function(pattern, fid, cid, seperator, concatenate, ... selend = idx2[idx] } + ## add the codings for (c in cid) for (i in (1:length(selfirst))) insertCoding (fid=fid, cid=c, start=selfirst[i], end=selend[i], txt) From 2e1b9becf698f64199914173bc9272d8f9ede992 Mon Sep 17 00:00:00 2001 From: Philipp Kuntschik Date: Fri, 29 Mar 2019 12:05:32 +0100 Subject: [PATCH 2/9] fix: bug where concatenated pattern-matches were not merged. --- R/autoCoding.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/autoCoding.R b/R/autoCoding.R index cee22f1..9ef7793 100644 --- a/R/autoCoding.R +++ b/R/autoCoding.R @@ -84,7 +84,7 @@ codingBySearchOneFile <- function(pattern, fid, cid, seperator, concatenate, ... ## receive start and end indexes of the matching analysis units if (length(removeidx) > 0) { selfirst = idx1[idx[-(removeidx+1)]] - elend = idx2[idx[-removeidx]] + selend = idx2[idx[-removeidx]] } else { selfirst = idx1[idx] selend = idx2[idx] From 6f188cd947a38e3b32f1e0b034b8aef3985c8a9d Mon Sep 17 00:00:00 2001 From: Philipp Kuntschik Date: Fri, 29 Mar 2019 13:51:13 +0100 Subject: [PATCH 3/9] chg: if there is no pattern-match, there is nothing more to do --- R/autoCoding.R | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/R/autoCoding.R b/R/autoCoding.R index 9ef7793..a4be587 100644 --- a/R/autoCoding.R +++ b/R/autoCoding.R @@ -60,17 +60,16 @@ codingBySearchOneFile <- function(pattern, fid, cid, seperator, concatenate, ... ## by providing approperiate seperator, it allows flexible control on the unit of autocoding txt <- RQDAQuery(sprintf("select file from source where status=1 and id=%s",fid))$file Encoding(txt) <- "UTF-8" - - ## get all separator matches and calculate start and end of each analysis unit - pidx <- gregexpr(sprintf("(%s){1,}", seperator),txt) - idx1 <- c(0,pidx[[1]]+attr(pidx[[1]],"match.length")-1) - idx2 <- c(pidx[[1]]-1,nchar(txt)) - + ## find all pattern matches sidx <- gregexpr(pattern,txt, ...)[[1]] - - if (length(sidx) > 1 || (sidx != -1)) { + if (length(sidx) > 1 || (sidx != -1)) { + ## get all separator matches and calculate start and end of each analysis unit + pidx <- gregexpr(sprintf("(%s){1,}", seperator),txt) + idx1 <- c(0,pidx[[1]]+attr(pidx[[1]],"match.length")-1) + idx2 <- c(pidx[[1]]-1,nchar(txt)) + ## get the matching analysis units residx <- unique(findInterval(sidx,sort(c(idx1,idx2)))) idx <- (residx + 1)/2 From 0ced73bb444875a447a9978a49cf538317c86242 Mon Sep 17 00:00:00 2001 From: Philipp Kuntschik Date: Fri, 29 Mar 2019 13:53:30 +0100 Subject: [PATCH 4/9] chg: renamed sidx to patternmatches --- R/autoCoding.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/autoCoding.R b/R/autoCoding.R index a4be587..e54e372 100644 --- a/R/autoCoding.R +++ b/R/autoCoding.R @@ -62,8 +62,8 @@ codingBySearchOneFile <- function(pattern, fid, cid, seperator, concatenate, ... Encoding(txt) <- "UTF-8" ## find all pattern matches - sidx <- gregexpr(pattern,txt, ...)[[1]] - if (length(sidx) > 1 || (sidx != -1)) { + patternmatches <- gregexpr(pattern,txt, ...)[[1]] + if (length(patternmatches) > 1 || (patternmatches != -1)) { ## get all separator matches and calculate start and end of each analysis unit pidx <- gregexpr(sprintf("(%s){1,}", seperator),txt) @@ -71,7 +71,7 @@ codingBySearchOneFile <- function(pattern, fid, cid, seperator, concatenate, ... idx2 <- c(pidx[[1]]-1,nchar(txt)) ## get the matching analysis units - residx <- unique(findInterval(sidx,sort(c(idx1,idx2)))) + residx <- unique(findInterval(patternmatches,sort(c(idx1,idx2)))) idx <- (residx + 1)/2 if (concatenate) From f1555fa66a9b7dabfcd124215ad9bbd26e982bab Mon Sep 17 00:00:00 2001 From: Philipp Kuntschik Date: Fri, 29 Mar 2019 13:58:33 +0100 Subject: [PATCH 5/9] chg: renamed regex match variables --- R/autoCoding.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/R/autoCoding.R b/R/autoCoding.R index e54e372..0843296 100644 --- a/R/autoCoding.R +++ b/R/autoCoding.R @@ -62,16 +62,16 @@ codingBySearchOneFile <- function(pattern, fid, cid, seperator, concatenate, ... Encoding(txt) <- "UTF-8" ## find all pattern matches - patternmatches <- gregexpr(pattern,txt, ...)[[1]] - if (length(patternmatches) > 1 || (patternmatches != -1)) { + pattern_matches <- gregexpr(pattern,txt, ...)[[1]] + if (length(pattern_matches) > 1 || (pattern_matches != -1)) { ## get all separator matches and calculate start and end of each analysis unit - pidx <- gregexpr(sprintf("(%s){1,}", seperator),txt) - idx1 <- c(0,pidx[[1]]+attr(pidx[[1]],"match.length")-1) - idx2 <- c(pidx[[1]]-1,nchar(txt)) + separator_matches <- gregexpr(sprintf("(%s){1,}", seperator),txt)[[1]] + idx1 <- c(0,separator_matches+attr(separator_matches,"match.length")-1) + idx2 <- c(separator_matches-1,nchar(txt)) ## get the matching analysis units - residx <- unique(findInterval(patternmatches,sort(c(idx1,idx2)))) + residx <- unique(findInterval(pattern_matches,sort(c(idx1,idx2)))) idx <- (residx + 1)/2 if (concatenate) From df8ae4db9263e7ef38ac490e5b548c336442a8bd Mon Sep 17 00:00:00 2001 From: Philipp Kuntschik Date: Fri, 29 Mar 2019 13:58:59 +0100 Subject: [PATCH 6/9] chg: renamed unitindex variables --- R/autoCoding.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/R/autoCoding.R b/R/autoCoding.R index 0843296..782c0f6 100644 --- a/R/autoCoding.R +++ b/R/autoCoding.R @@ -67,11 +67,11 @@ codingBySearchOneFile <- function(pattern, fid, cid, seperator, concatenate, ... ## get all separator matches and calculate start and end of each analysis unit separator_matches <- gregexpr(sprintf("(%s){1,}", seperator),txt)[[1]] - idx1 <- c(0,separator_matches+attr(separator_matches,"match.length")-1) - idx2 <- c(separator_matches-1,nchar(txt)) + unit_start_indexes <- c(0,separator_matches+attr(separator_matches,"match.length")-1) + unit_end_indexes <- c(separator_matches-1,nchar(txt)) ## get the matching analysis units - residx <- unique(findInterval(pattern_matches,sort(c(idx1,idx2)))) + residx <- unique(findInterval(pattern_matches,sort(c(unit_start_indexes,unit_end_indexes)))) idx <- (residx + 1)/2 if (concatenate) @@ -82,11 +82,11 @@ codingBySearchOneFile <- function(pattern, fid, cid, seperator, concatenate, ... ## receive start and end indexes of the matching analysis units if (length(removeidx) > 0) { - selfirst = idx1[idx[-(removeidx+1)]] - selend = idx2[idx[-removeidx]] + selfirst = unit_start_indexes[idx[-(removeidx+1)]] + selend = unit_end_indexes[idx[-removeidx]] } else { - selfirst = idx1[idx] - selend = idx2[idx] + selfirst = unit_start_indexes[idx] + selend = unit_end_indexes[idx] } ## add the codings From 133aceed3cd2e93841800fc48187b7d255e59272 Mon Sep 17 00:00:00 2001 From: Philipp Kuntschik Date: Fri, 29 Mar 2019 14:01:04 +0100 Subject: [PATCH 7/9] chg: improved readability --- R/autoCoding.R | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/R/autoCoding.R b/R/autoCoding.R index 782c0f6..8749495 100644 --- a/R/autoCoding.R +++ b/R/autoCoding.R @@ -58,32 +58,32 @@ insertCoding <- function(fid, cid, start, end, fulltext) { codingBySearchOneFile <- function(pattern, fid, cid, seperator, concatenate, ...) { ## auto coding: when seperator is \n, each paragraph is a analysis unit ## by providing approperiate seperator, it allows flexible control on the unit of autocoding - txt <- RQDAQuery(sprintf("select file from source where status=1 and id=%s",fid))$file + txt <- RQDAQuery(sprintf("select file from source where status=1 and id=%s", fid))$file Encoding(txt) <- "UTF-8" ## find all pattern matches - pattern_matches <- gregexpr(pattern,txt, ...)[[1]] + pattern_matches <- gregexpr(pattern, txt, ...)[[1]] if (length(pattern_matches) > 1 || (pattern_matches != -1)) { ## get all separator matches and calculate start and end of each analysis unit - separator_matches <- gregexpr(sprintf("(%s){1,}", seperator),txt)[[1]] - unit_start_indexes <- c(0,separator_matches+attr(separator_matches,"match.length")-1) - unit_end_indexes <- c(separator_matches-1,nchar(txt)) + separator_matches <- gregexpr(sprintf("(%s){1,}", seperator), txt)[[1]] + unit_start_indexes <- c(0, separator_matches + attr(separator_matches, "match.length") - 1) + unit_end_indexes <- c(separator_matches - 1, nchar(txt)) ## get the matching analysis units - residx <- unique(findInterval(pattern_matches,sort(c(unit_start_indexes,unit_end_indexes)))) - idx <- (residx + 1)/2 + residx <- unique(findInterval(pattern_matches, sort(c(unit_start_indexes, unit_end_indexes)))) + idx <- (residx + 1) / 2 if (concatenate) ## mark bordering matching analysis units - removeidx <- which(diff(idx)==1) + removeidx <- which(diff(idx) == 1) else removeidx <- NULL ## receive start and end indexes of the matching analysis units if (length(removeidx) > 0) { - selfirst = unit_start_indexes[idx[-(removeidx+1)]] - selend = unit_end_indexes[idx[-removeidx]] + selfirst = unit_start_indexes[idx[ - (removeidx + 1)]] + selend = unit_end_indexes[idx[ - removeidx]] } else { selfirst = unit_start_indexes[idx] selend = unit_end_indexes[idx] @@ -91,7 +91,7 @@ codingBySearchOneFile <- function(pattern, fid, cid, seperator, concatenate, ... ## add the codings for (c in cid) - for (i in (1:length(selfirst))) + for (i in (1 : length(selfirst))) insertCoding (fid=fid, cid=c, start=selfirst[i], end=selend[i], txt) } } From 0166f7220299174b25c7067097dcf518bfcf714f Mon Sep 17 00:00:00 2001 From: Philipp Kuntschik Date: Mon, 1 Apr 2019 15:13:26 +0200 Subject: [PATCH 8/9] chg: allow patterns accross multiple annotation units --- R/autoCoding.R | 58 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 18 deletions(-) diff --git a/R/autoCoding.R b/R/autoCoding.R index 8749495..cc7087e 100644 --- a/R/autoCoding.R +++ b/R/autoCoding.R @@ -66,33 +66,55 @@ codingBySearchOneFile <- function(pattern, fid, cid, seperator, concatenate, ... if (length(pattern_matches) > 1 || (pattern_matches != -1)) { ## get all separator matches and calculate start and end of each analysis unit - separator_matches <- gregexpr(sprintf("(%s){1,}", seperator), txt)[[1]] + separator_matches <- gregexpr(sprintf("(%s){1,}", seperator), txt)[[1]] unit_start_indexes <- c(0, separator_matches + attr(separator_matches, "match.length") - 1) - unit_end_indexes <- c(separator_matches - 1, nchar(txt)) + unit_end_indexes <- c(separator_matches - 1, nchar(txt)) ## get the matching analysis units - residx <- unique(findInterval(pattern_matches, sort(c(unit_start_indexes, unit_end_indexes)))) - idx <- (residx + 1) / 2 + unit_start_reference <- findInterval(pattern_matches, unit_start_indexes) + unit_end_reference <- findInterval(pattern_matches + attr(pattern_matches, "match.length"), unit_end_indexes) - if (concatenate) - ## mark bordering matching analysis units - removeidx <- which(diff(idx) == 1) - else - removeidx <- NULL - - ## receive start and end indexes of the matching analysis units - if (length(removeidx) > 0) { - selfirst = unit_start_indexes[idx[ - (removeidx + 1)]] - selend = unit_end_indexes[idx[ - removeidx]] + if(concatenate){ + ## get a logical array with true values for start references that we need to skip + unit_start_reference <- c(sort(unit_start_reference), NA) + unit_end_reference <- c(NA, sort(unit_end_reference)) + bordering <- unit_start_reference - unit_end_reference <= 1 + + ## receive start and end indexes of the matching analysis unit + for(i in (1 : length(bordering))){ + current_end = i + + if(is.na(bordering[i]) || !bordering[i]){ + + ## if this is the first pattern match: + if(!exists("current_start")) + current_start = i + + else{ + ## if exists append, otherwise create: + if(exists("match_start_index")) + { + match_start_index <- c(match_start_index, unit_start_indexes[unit_start_reference[current_start]]) + match_end_index <- c(match_end_index, unit_end_indexes[unit_end_reference[current_end]]) + }else { + match_start_index <- c(unit_start_indexes[unit_start_reference[current_start]]) + match_end_index <- c(unit_end_indexes[unit_end_reference[current_end]]) + } + current_start = i + } + } + } } else { - selfirst = unit_start_indexes[idx] - selend = unit_end_indexes[idx] + ## if we do not concatenate, things are easier + ## receive start and end indexes of the matching analysis units + match_start_index <- unit_start_indexes[unit_start_reference] + match_end_index <- unit_end_indexes[unit_end_reference] } ## add the codings for (c in cid) - for (i in (1 : length(selfirst))) - insertCoding (fid=fid, cid=c, start=selfirst[i], end=selend[i], txt) + for (i in (1 : length(match_start_index))) + insertCoding (fid=fid, cid=c, start=match_start_index[i], end=match_end_index[i], txt) } } From 3cca66332a3644297cbd927a06c8c578b25eedaf Mon Sep 17 00:00:00 2001 From: Philipp Kuntschik Date: Tue, 2 Apr 2019 17:29:28 +0200 Subject: [PATCH 9/9] fix: a bug when a pattern is in the middle of a annotation unit --- R/autoCoding.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/autoCoding.R b/R/autoCoding.R index cc7087e..91e8651 100644 --- a/R/autoCoding.R +++ b/R/autoCoding.R @@ -72,7 +72,7 @@ codingBySearchOneFile <- function(pattern, fid, cid, seperator, concatenate, ... ## get the matching analysis units unit_start_reference <- findInterval(pattern_matches, unit_start_indexes) - unit_end_reference <- findInterval(pattern_matches + attr(pattern_matches, "match.length"), unit_end_indexes) + unit_end_reference <- findInterval(pattern_matches + attr(pattern_matches, "match.length"), unit_start_indexes, left.open = TRUE) if(concatenate){ ## get a logical array with true values for start references that we need to skip