From a71aa8e3a3a6849e9822555fd2dfa058c353fcfb Mon Sep 17 00:00:00 2001
From: Philipp Kuntschik <Philipp.Kuntschik@htwchur.ch>
Date: Fri, 29 Mar 2019 12:04:50 +0100
Subject: [PATCH 1/9] add: comments

---
 R/autoCoding.R | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/R/autoCoding.R b/R/autoCoding.R
index f397590..cee22f1 100644
--- a/R/autoCoding.R
+++ b/R/autoCoding.R
@@ -60,18 +60,28 @@ codingBySearchOneFile <- function(pattern, fid, cid, seperator, concatenate, ...
   ## by providing approperiate seperator, it allows flexible control on the unit of autocoding
     txt <- RQDAQuery(sprintf("select file from source where status=1 and id=%s",fid))$file
     Encoding(txt) <- "UTF-8"
+    
+    ## get all separator matches and calculate start and end of each analysis unit
     pidx <- gregexpr(sprintf("(%s){1,}", seperator),txt)
     idx1 <- c(0,pidx[[1]]+attr(pidx[[1]],"match.length")-1)
     idx2 <- c(pidx[[1]]-1,nchar(txt))
+    
+    ## find all pattern matches
     sidx <- gregexpr(pattern,txt, ...)[[1]]
+    
     if (length(sidx) > 1 || (sidx != -1)) {
+      
+        ## get the matching analysis units
         residx <- unique(findInterval(sidx,sort(c(idx1,idx2))))
         idx <- (residx + 1)/2
+        
         if (concatenate)
+            ## mark bordering matching analysis units
             removeidx <- which(diff(idx)==1)
         else
             removeidx <- NULL
         
+        ## receive start and end indexes of the matching analysis units
         if (length(removeidx) > 0) {
           selfirst = idx1[idx[-(removeidx+1)]]
           elend   = idx2[idx[-removeidx]]
@@ -80,6 +90,7 @@ codingBySearchOneFile <- function(pattern, fid, cid, seperator, concatenate, ...
           selend   = idx2[idx]
         }
         
+        ## add the codings
         for (c in cid)
           for (i in (1:length(selfirst)))
             insertCoding (fid=fid, cid=c, start=selfirst[i], end=selend[i], txt)

From 2e1b9becf698f64199914173bc9272d8f9ede992 Mon Sep 17 00:00:00 2001
From: Philipp Kuntschik <Philipp.Kuntschik@htwchur.ch>
Date: Fri, 29 Mar 2019 12:05:32 +0100
Subject: [PATCH 2/9] fix: bug where concatenated pattern-matches were not
 merged.

---
 R/autoCoding.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/autoCoding.R b/R/autoCoding.R
index cee22f1..9ef7793 100644
--- a/R/autoCoding.R
+++ b/R/autoCoding.R
@@ -84,7 +84,7 @@ codingBySearchOneFile <- function(pattern, fid, cid, seperator, concatenate, ...
         ## receive start and end indexes of the matching analysis units
         if (length(removeidx) > 0) {
           selfirst = idx1[idx[-(removeidx+1)]]
-          elend   = idx2[idx[-removeidx]]
+          selend   = idx2[idx[-removeidx]]
         } else {
           selfirst = idx1[idx]
           selend   = idx2[idx]

From 6f188cd947a38e3b32f1e0b034b8aef3985c8a9d Mon Sep 17 00:00:00 2001
From: Philipp Kuntschik <Philipp.Kuntschik@htwchur.ch>
Date: Fri, 29 Mar 2019 13:51:13 +0100
Subject: [PATCH 3/9] chg: if there is no pattern-match, there is nothing more
 to do

---
 R/autoCoding.R | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/R/autoCoding.R b/R/autoCoding.R
index 9ef7793..a4be587 100644
--- a/R/autoCoding.R
+++ b/R/autoCoding.R
@@ -60,17 +60,16 @@ codingBySearchOneFile <- function(pattern, fid, cid, seperator, concatenate, ...
   ## by providing approperiate seperator, it allows flexible control on the unit of autocoding
     txt <- RQDAQuery(sprintf("select file from source where status=1 and id=%s",fid))$file
     Encoding(txt) <- "UTF-8"
-    
-    ## get all separator matches and calculate start and end of each analysis unit
-    pidx <- gregexpr(sprintf("(%s){1,}", seperator),txt)
-    idx1 <- c(0,pidx[[1]]+attr(pidx[[1]],"match.length")-1)
-    idx2 <- c(pidx[[1]]-1,nchar(txt))
-    
+
     ## find all pattern matches
     sidx <- gregexpr(pattern,txt, ...)[[1]]
-    
-    if (length(sidx) > 1 || (sidx != -1)) {
+    if (length(sidx) > 1 || (sidx != -1)) {    
       
+        ## get all separator matches and calculate start and end of each analysis unit
+        pidx <- gregexpr(sprintf("(%s){1,}", seperator),txt)
+        idx1 <- c(0,pidx[[1]]+attr(pidx[[1]],"match.length")-1)
+        idx2 <- c(pidx[[1]]-1,nchar(txt))
+    
         ## get the matching analysis units
         residx <- unique(findInterval(sidx,sort(c(idx1,idx2))))
         idx <- (residx + 1)/2

From 0ced73bb444875a447a9978a49cf538317c86242 Mon Sep 17 00:00:00 2001
From: Philipp Kuntschik <Philipp.Kuntschik@htwchur.ch>
Date: Fri, 29 Mar 2019 13:53:30 +0100
Subject: [PATCH 4/9] chg: renamed sidx to patternmatches

---
 R/autoCoding.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/autoCoding.R b/R/autoCoding.R
index a4be587..e54e372 100644
--- a/R/autoCoding.R
+++ b/R/autoCoding.R
@@ -62,8 +62,8 @@ codingBySearchOneFile <- function(pattern, fid, cid, seperator, concatenate, ...
     Encoding(txt) <- "UTF-8"
 
     ## find all pattern matches
-    sidx <- gregexpr(pattern,txt, ...)[[1]]
-    if (length(sidx) > 1 || (sidx != -1)) {    
+    patternmatches <- gregexpr(pattern,txt, ...)[[1]]
+    if (length(patternmatches) > 1 || (patternmatches != -1)) {
       
         ## get all separator matches and calculate start and end of each analysis unit
         pidx <- gregexpr(sprintf("(%s){1,}", seperator),txt)
@@ -71,7 +71,7 @@ codingBySearchOneFile <- function(pattern, fid, cid, seperator, concatenate, ...
         idx2 <- c(pidx[[1]]-1,nchar(txt))
     
         ## get the matching analysis units
-        residx <- unique(findInterval(sidx,sort(c(idx1,idx2))))
+        residx <- unique(findInterval(patternmatches,sort(c(idx1,idx2))))
         idx <- (residx + 1)/2
         
         if (concatenate)

From f1555fa66a9b7dabfcd124215ad9bbd26e982bab Mon Sep 17 00:00:00 2001
From: Philipp Kuntschik <Philipp.Kuntschik@htwchur.ch>
Date: Fri, 29 Mar 2019 13:58:33 +0100
Subject: [PATCH 5/9] chg: renamed regex match variables

---
 R/autoCoding.R | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/R/autoCoding.R b/R/autoCoding.R
index e54e372..0843296 100644
--- a/R/autoCoding.R
+++ b/R/autoCoding.R
@@ -62,16 +62,16 @@ codingBySearchOneFile <- function(pattern, fid, cid, seperator, concatenate, ...
     Encoding(txt) <- "UTF-8"
 
     ## find all pattern matches
-    patternmatches <- gregexpr(pattern,txt, ...)[[1]]
-    if (length(patternmatches) > 1 || (patternmatches != -1)) {
+    pattern_matches <- gregexpr(pattern,txt, ...)[[1]]
+    if (length(pattern_matches) > 1 || (pattern_matches != -1)) {
       
         ## get all separator matches and calculate start and end of each analysis unit
-        pidx <- gregexpr(sprintf("(%s){1,}", seperator),txt)
-        idx1 <- c(0,pidx[[1]]+attr(pidx[[1]],"match.length")-1)
-        idx2 <- c(pidx[[1]]-1,nchar(txt))
+        separator_matches <- gregexpr(sprintf("(%s){1,}", seperator),txt)[[1]]
+        idx1 <- c(0,separator_matches+attr(separator_matches,"match.length")-1)
+        idx2 <- c(separator_matches-1,nchar(txt))
     
         ## get the matching analysis units
-        residx <- unique(findInterval(patternmatches,sort(c(idx1,idx2))))
+        residx <- unique(findInterval(pattern_matches,sort(c(idx1,idx2))))
         idx <- (residx + 1)/2
         
         if (concatenate)

From df8ae4db9263e7ef38ac490e5b548c336442a8bd Mon Sep 17 00:00:00 2001
From: Philipp Kuntschik <Philipp.Kuntschik@htwchur.ch>
Date: Fri, 29 Mar 2019 13:58:59 +0100
Subject: [PATCH 6/9] chg: renamed unitindex variables

---
 R/autoCoding.R | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/R/autoCoding.R b/R/autoCoding.R
index 0843296..782c0f6 100644
--- a/R/autoCoding.R
+++ b/R/autoCoding.R
@@ -67,11 +67,11 @@ codingBySearchOneFile <- function(pattern, fid, cid, seperator, concatenate, ...
       
         ## get all separator matches and calculate start and end of each analysis unit
         separator_matches <- gregexpr(sprintf("(%s){1,}", seperator),txt)[[1]]
-        idx1 <- c(0,separator_matches+attr(separator_matches,"match.length")-1)
-        idx2 <- c(separator_matches-1,nchar(txt))
+        unit_start_indexes <- c(0,separator_matches+attr(separator_matches,"match.length")-1)
+        unit_end_indexes <- c(separator_matches-1,nchar(txt))
     
         ## get the matching analysis units
-        residx <- unique(findInterval(pattern_matches,sort(c(idx1,idx2))))
+        residx <- unique(findInterval(pattern_matches,sort(c(unit_start_indexes,unit_end_indexes))))
         idx <- (residx + 1)/2
         
         if (concatenate)
@@ -82,11 +82,11 @@ codingBySearchOneFile <- function(pattern, fid, cid, seperator, concatenate, ...
         
         ## receive start and end indexes of the matching analysis units
         if (length(removeidx) > 0) {
-          selfirst = idx1[idx[-(removeidx+1)]]
-          selend   = idx2[idx[-removeidx]]
+          selfirst = unit_start_indexes[idx[-(removeidx+1)]]
+          selend   = unit_end_indexes[idx[-removeidx]]
         } else {
-          selfirst = idx1[idx]
-          selend   = idx2[idx]
+          selfirst = unit_start_indexes[idx]
+          selend   = unit_end_indexes[idx]
         }
         
         ## add the codings

From 133aceed3cd2e93841800fc48187b7d255e59272 Mon Sep 17 00:00:00 2001
From: Philipp Kuntschik <Philipp.Kuntschik@htwchur.ch>
Date: Fri, 29 Mar 2019 14:01:04 +0100
Subject: [PATCH 7/9] chg: improved readability

---
 R/autoCoding.R | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/R/autoCoding.R b/R/autoCoding.R
index 782c0f6..8749495 100644
--- a/R/autoCoding.R
+++ b/R/autoCoding.R
@@ -58,32 +58,32 @@ insertCoding <- function(fid, cid, start, end, fulltext) {
 codingBySearchOneFile <- function(pattern, fid, cid, seperator, concatenate, ...) {
   ## auto coding: when seperator is \n, each paragraph is a analysis unit
   ## by providing approperiate seperator, it allows flexible control on the unit of autocoding
-    txt <- RQDAQuery(sprintf("select file from source where status=1 and id=%s",fid))$file
+    txt <- RQDAQuery(sprintf("select file from source where status=1 and id=%s", fid))$file
     Encoding(txt) <- "UTF-8"
 
     ## find all pattern matches
-    pattern_matches <- gregexpr(pattern,txt, ...)[[1]]
+    pattern_matches <- gregexpr(pattern, txt, ...)[[1]]
     if (length(pattern_matches) > 1 || (pattern_matches != -1)) {
       
         ## get all separator matches and calculate start and end of each analysis unit
-        separator_matches <- gregexpr(sprintf("(%s){1,}", seperator),txt)[[1]]
-        unit_start_indexes <- c(0,separator_matches+attr(separator_matches,"match.length")-1)
-        unit_end_indexes <- c(separator_matches-1,nchar(txt))
+        separator_matches <- gregexpr(sprintf("(%s){1,}", seperator), txt)[[1]]
+        unit_start_indexes <- c(0, separator_matches + attr(separator_matches, "match.length") - 1)
+        unit_end_indexes <- c(separator_matches - 1, nchar(txt))
     
         ## get the matching analysis units
-        residx <- unique(findInterval(pattern_matches,sort(c(unit_start_indexes,unit_end_indexes))))
-        idx <- (residx + 1)/2
+        residx <- unique(findInterval(pattern_matches, sort(c(unit_start_indexes, unit_end_indexes))))
+        idx <- (residx + 1) / 2
         
         if (concatenate)
             ## mark bordering matching analysis units
-            removeidx <- which(diff(idx)==1)
+            removeidx <- which(diff(idx) == 1)
         else
             removeidx <- NULL
         
         ## receive start and end indexes of the matching analysis units
         if (length(removeidx) > 0) {
-          selfirst = unit_start_indexes[idx[-(removeidx+1)]]
-          selend   = unit_end_indexes[idx[-removeidx]]
+          selfirst = unit_start_indexes[idx[ - (removeidx + 1)]]
+          selend   = unit_end_indexes[idx[ - removeidx]]
         } else {
           selfirst = unit_start_indexes[idx]
           selend   = unit_end_indexes[idx]
@@ -91,7 +91,7 @@ codingBySearchOneFile <- function(pattern, fid, cid, seperator, concatenate, ...
         
         ## add the codings
         for (c in cid)
-          for (i in (1:length(selfirst)))
+          for (i in (1 : length(selfirst)))
             insertCoding (fid=fid, cid=c, start=selfirst[i], end=selend[i], txt)
     }
 }

From 0166f7220299174b25c7067097dcf518bfcf714f Mon Sep 17 00:00:00 2001
From: Philipp Kuntschik <Philipp.Kuntschik@htwchur.ch>
Date: Mon, 1 Apr 2019 15:13:26 +0200
Subject: [PATCH 8/9] chg: allow patterns accross multiple annotation units

---
 R/autoCoding.R | 58 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 40 insertions(+), 18 deletions(-)

diff --git a/R/autoCoding.R b/R/autoCoding.R
index 8749495..cc7087e 100644
--- a/R/autoCoding.R
+++ b/R/autoCoding.R
@@ -66,33 +66,55 @@ codingBySearchOneFile <- function(pattern, fid, cid, seperator, concatenate, ...
     if (length(pattern_matches) > 1 || (pattern_matches != -1)) {
       
         ## get all separator matches and calculate start and end of each analysis unit
-        separator_matches <- gregexpr(sprintf("(%s){1,}", seperator), txt)[[1]]
+        separator_matches  <- gregexpr(sprintf("(%s){1,}", seperator), txt)[[1]]
         unit_start_indexes <- c(0, separator_matches + attr(separator_matches, "match.length") - 1)
-        unit_end_indexes <- c(separator_matches - 1, nchar(txt))
+        unit_end_indexes   <- c(separator_matches - 1, nchar(txt))
     
         ## get the matching analysis units
-        residx <- unique(findInterval(pattern_matches, sort(c(unit_start_indexes, unit_end_indexes))))
-        idx <- (residx + 1) / 2
+        unit_start_reference <- findInterval(pattern_matches, unit_start_indexes)
+        unit_end_reference   <- findInterval(pattern_matches + attr(pattern_matches, "match.length"), unit_end_indexes)
         
-        if (concatenate)
-            ## mark bordering matching analysis units
-            removeidx <- which(diff(idx) == 1)
-        else
-            removeidx <- NULL
-        
-        ## receive start and end indexes of the matching analysis units
-        if (length(removeidx) > 0) {
-          selfirst = unit_start_indexes[idx[ - (removeidx + 1)]]
-          selend   = unit_end_indexes[idx[ - removeidx]]
+        if(concatenate){
+            ## get a logical array with true values for start references that we need to skip
+            unit_start_reference <- c(sort(unit_start_reference), NA)
+            unit_end_reference   <- c(NA, sort(unit_end_reference))
+            bordering <- unit_start_reference - unit_end_reference <= 1
+      
+            ## receive start and end indexes of the matching analysis unit
+            for(i in (1 : length(bordering))){
+                current_end = i
+                
+                if(is.na(bordering[i]) || !bordering[i]){
+                  
+                    ## if this is the first pattern match:
+                    if(!exists("current_start"))
+                        current_start = i
+                    
+                    else{
+                        ## if exists append, otherwise create:
+                        if(exists("match_start_index"))
+                        {
+                            match_start_index <- c(match_start_index, unit_start_indexes[unit_start_reference[current_start]])
+                            match_end_index   <- c(match_end_index, unit_end_indexes[unit_end_reference[current_end]])
+                        }else {
+                            match_start_index <- c(unit_start_indexes[unit_start_reference[current_start]])
+                            match_end_index   <- c(unit_end_indexes[unit_end_reference[current_end]])
+                        }
+                        current_start = i
+                    }
+                }
+            }
         } else {
-          selfirst = unit_start_indexes[idx]
-          selend   = unit_end_indexes[idx]
+            ## if we do not concatenate, things are easier
+            ## receive start and end indexes of the matching analysis units
+            match_start_index <- unit_start_indexes[unit_start_reference]
+            match_end_index   <- unit_end_indexes[unit_end_reference]
         }
         
         ## add the codings
         for (c in cid)
-          for (i in (1 : length(selfirst)))
-            insertCoding (fid=fid, cid=c, start=selfirst[i], end=selend[i], txt)
+            for (i in (1 : length(match_start_index)))
+                insertCoding (fid=fid, cid=c, start=match_start_index[i], end=match_end_index[i], txt)
     }
 }
 

From 3cca66332a3644297cbd927a06c8c578b25eedaf Mon Sep 17 00:00:00 2001
From: Philipp Kuntschik <Philipp.Kuntschik@htwchur.ch>
Date: Tue, 2 Apr 2019 17:29:28 +0200
Subject: [PATCH 9/9] fix: a bug when a pattern is in the middle of a
 annotation unit

---
 R/autoCoding.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/autoCoding.R b/R/autoCoding.R
index cc7087e..91e8651 100644
--- a/R/autoCoding.R
+++ b/R/autoCoding.R
@@ -72,7 +72,7 @@ codingBySearchOneFile <- function(pattern, fid, cid, seperator, concatenate, ...
     
         ## get the matching analysis units
         unit_start_reference <- findInterval(pattern_matches, unit_start_indexes)
-        unit_end_reference   <- findInterval(pattern_matches + attr(pattern_matches, "match.length"), unit_end_indexes)
+        unit_end_reference   <- findInterval(pattern_matches + attr(pattern_matches, "match.length"), unit_start_indexes, left.open = TRUE)
         
         if(concatenate){
             ## get a logical array with true values for start references that we need to skip