fixed combinations, added category

christofs · Apr 25, 2016 · 711a6f2 · 711a6f2
1 parent cdc0b0d
commit 711a6f2
Showing 1 changed file with 96 additions and 29 deletions.
diff --git a/martians.py b/martians.py
@@ -3,7 +3,7 @@
 # Filename: martians.py
 # Author: Christof Schöch, 2016.
 
-WorkDir = "/media/christof/data/Dropbox/0-Analysen/2016/martians/diffs3/"
+WorkDir = "/media/christof/data/Dropbox/0-Analysen/2016/martians/diffs4/"
 InputTexts = WorkDir + "texts/*.txt"
 DiffText = WorkDir + "martians_wdiffed.txt"
 DiffTextPrep = WorkDir + "martians_wdiffed-prep.txt"
@@ -62,18 +62,19 @@ def extract_diffs(DiffTextPrep, DiffTable):
         diff_text = df.read()
         diff_text = re.split("\n", diff_text)
         all_diffs = []
-        sent_number = 0
-        for sent in diff_text:
+        line_number = 0
+        for line in diff_text:
             #print(sent)
-            sent_number +=1
-            pairs = re.findall("\[-.*?\-\] {\+.*?\+}", sent, re.DOTALL)
+            line_number +=1
+            pairs = re.findall("\[-.*?\-\] {\+.*?\+}", line, re.DOTALL)
             item_number = 0
             for item in pairs: 
                 item_number += 1
-                item_id = str(sent_number)+"-"+str(item_number)
+                item_id = '{:05d}'.format(line_number)+"-"+str(item_number)
                 item = re.split("\] {", item)
                 #print(item_id, item)
                 type = ""
+                category = ""
                 item1 = item[0][2:-1]
                 item2 = item[1][1:-2]
                 insertion = 0
@@ -84,74 +85,140 @@ def extract_diffs(DiffTextPrep, DiffTable):
                 punctuation = 0
                 hyphenation = 0
                 numbers = 0
+                abreviation = 0
                 condensation = 0
                 expansion = 0
                 tbc = 0
+                combination = 0
+                copyedit = 0
+                content = 0
+                cat_tbc = 0
                 # Complete deletion or insertion
                 if len(item1) == 0:
+                    category = "tbc"
+                    cat_tbc = 1
                     type = "insertion"
                     insertion = 1
                 elif len(item2) == 0:
+                    category = "tbc"
+                    cat_tbc = 1
                     type = "deletion"
                     deletion = 1
-                # Composite cases: two criteria apply
-                elif (re.sub(" ","",item1.lower()) == re.sub(" ","",item2.lower())) and (re.sub(" ","",item1) != re.sub(" ","",item2)) and (item1.lower() != item2.lower()):
-                    type = "combination"
-                    capitalization = 1                
-                    whitespace = 1
-#                elif re.sub("\*","",item1) == re.sub("\*","",item2): 
-#                    type = "combination"
-#                    capitalization = 1                
-#                    italics = 1
-#                elif re.sub("[\",';:!?\.\(\)]","",item1.lower()) == re.sub("[\",';:!?\.\(\)]","",item2.lower()):
-#                    type = "combination"
-#                    capitalization = 1                
-#                    punctuation = 1
-#                elif re.sub("\-","",item1.lower()) == re.sub(" ","",item2.lower()): 
-#                    type = "combination"
-#                    hyphenation = 1
-#                    capitalization = 1                
-#                elif re.sub(" ","",item1.lower()) == re.sub("\-","",item2.lower()): 
-#                    type = "combination"
-#                    hyphenation = 1
-#                    capitalization = 1                
                 # Simple cases: only one criterion applies
                 elif item1.lower() == item2.lower(): 
                     type = "capitalization"
+                    category = "copyedit"
                     capitalization = 1
+                    copyedit = 1
                 elif re.sub(" ","",item1) == re.sub(" ","",item2): 
                     type = "whitespace"
+                    category = "copyedit"
+                    copyedit = 1
                     whitespace = 1
                 elif re.sub("\*","",item1) == re.sub("\*","",item2): 
                     type = "italics"
+                    category = "copyedit"
+                    copyedit = 1
                     italics = 1
                 elif re.sub("[\",';:!?\.\(\)]","",item1) == re.sub("[\",';:!?\.\(\)]","",item2): 
                     type = "punctuation"
+                    category = "copyedit"
+                    copyedit = 1
                     punctuation = 1
                 elif re.sub("\-","",item1) == re.sub(" ","",item2): 
                     type = "hyphenation"
+                    category = "copyedit"
+                    copyedit = 1
                     hyphenation = 1
                 elif re.sub(" ","",item1) == re.sub("\-","",item2): 
                     type = "hyphenation"
+                    category = "copyedit"
+                    copyedit = 1
                     hyphenation = 1
                 elif bool(re.search(r'\d', item1+item2)) == True:
                     type = "numbers"
+                    category = "copyedit"
+                    copyedit = 1
                     numbers = 1
+                # Composite cases: two criteria apply
+                elif re.sub(" ","",item1.lower()) == re.sub(" ","",item2.lower()):
+                    type = "combination"
+                    category = "copyedit"
+                    copyedit = 1
+                    capitalization = 1                
+                    whitespace = 1
+                    combination = 1
+                elif re.sub("\*","",item1) == re.sub("\*","",item2): 
+                    type = "combination"
+                    category = "copyedit"
+                    copyedit = 1
+                    capitalization = 1                
+                    italics = 1
+                    combination = 1
+                elif re.sub("[\",';:!?\.\(\)]","",item1.lower()) == re.sub("[\",';:!?\.\(\)]","",item2.lower()):
+                    category = "copyedit"
+                    copyedit = 1
+                    type = "combination"
+                    capitalization = 1                
+                    punctuation = 1
+                    combination = 1
+                elif re.sub("\-","",item1.lower()) == re.sub(" ","",item2.lower()): 
+                    category = "copyedit"
+                    copyedit = 1
+                    type = "combination"
+                    capitalization = 1                
+                    hyphenation = 1
+                    combination = 1
+                elif re.sub(" ","",item1.lower()) == re.sub("\-","",item2.lower()): 
+                    category = "copyedit"
+                    copyedit = 1
+                    type = "combination"
+                    capitalization = 1       
+                    hyphenation = 1
+                    combination = 1
+                elif re.sub("[\",';:!?\.\(\) ]","",item1.lower()) == re.sub("[\",';:!?\.\(\) ]","",item2.lower()):
+                    category = "copyedit"
+                    copyedit = 1
+                    type = "combination"
+                    punctuation = 1       
+                    whitespace = 1
+                    combination = 1
+                elif re.sub("[\",';:!?\.\(\)\-]","",item1.lower()) == re.sub("[\",';:!?\.\(\)\-]","",item2.lower()):
+                    category = "copyedit"
+                    copyedit = 1
+                    type = "combination"
+                    punctuation = 1
+                    hyphenation = 1       
+                    combination = 1
+                elif re.sub("[\",';:!?\.\(\)\- ]","",item1.lower()) == re.sub("[\",';:!?\.\(\)\- ]","",item2.lower()):
+                    category = "copyedit"
+                    copyedit = 1
+                    type = "combination"
+                    punctuation = 1
+                    hyphenation = 1       
+                    whitespace = 1       
+                    combination = 1
                 # If none of the more specific cases apply:
                 elif len(item1) > len(item2)+3:
+                    category = "tbc"
+                    cat_tbc = 1
                     type = "condensation"
                     condensation = 1
                 elif len(item2) > len(item1)+3:
+                    category = "tbc"
+                    cat_tbc = 1
                     type = "expansion"
                     expansion = 1
                 # All still unclassified cases:
                 else: 
+                    category = "tbc"
+                    cat_tbc = 1
                     type = "tbc"
                     tbc = 1
                 levenshtein = ld.distance(item1, item2)
-                complete_item = [item_id, item1, item2, levenshtein, type, insertion, deletion, capitalization, whitespace, italics, punctuation, hyphenation, numbers, condensation, expansion, tbc]
+                complete_item = [item_id, item1, item2, category, type, levenshtein, insertion, deletion, capitalization, whitespace, italics, punctuation, hyphenation, numbers, abreviation, condensation, expansion, tbc, combination, copyedit, content, cat_tbc]
                 all_diffs.append(complete_item)
-    diff_df = pd.DataFrame(all_diffs, columns=["item-id","version1","version2", "levenshtein",  "type", "insertion", "deletion", "capitalization", "whitespace", "italics", "punctuation", "hyphenation", "numbers", "condensation", "expansion", "tbc"])
+    diff_df = pd.DataFrame(all_diffs, columns=["item-id", "version1", "version2", "category", "type", "levenshtein", "insertion", "deletion", "capitalization", "whitespace", "italics", "punctuation", "hyphenation", "numbers", "abreviation", "condensation", "expansion", "tbc", "combination", "cat=copyedit", "cat=content", "cat=tbc"])
     #print(diff_df)
     with open(DiffTable, "w") as dt: 
         diff_df.to_csv(dt, index=False, sep="\t")