From 8971c415cc20763afda159241425bc32fab702bf Mon Sep 17 00:00:00 2001 From: christofs Date: Sun, 24 Apr 2016 22:18:31 +0200 Subject: [PATCH] added combinations --- martians.py | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/martians.py b/martians.py index 0a2ba27..3ef941a 100644 --- a/martians.py +++ b/martians.py @@ -3,7 +3,7 @@ # Filename: martians.py # Author: Christof Schöch, 2016. -WorkDir = "/home/christof/Dropbox/0-Analysen/2016/martians/diffs2/" +WorkDir = "/media/christof/data/Dropbox/0-Analysen/2016/martians/diffs3/" InputTexts = WorkDir + "texts/*.txt" DiffText = WorkDir + "martians_wdiffed.txt" DiffTextPrep = WorkDir + "martians_wdiffed-prep.txt" @@ -33,14 +33,13 @@ def sentence_splitter(InputTexts): """ -Now do: wdiff --avoid-wraps martians1-sent.txt martians2-sent.txt > martians_wdiffed.txt +Now do, in [WorkDir]/texts: wdiff --avoid-wraps martian1-sent.txt martian2-sent.txt > martians_wdiffed.txt """ def prepare_text(DiffText, DiffTextPrep): """Make sure all locations of a modification are marked coherently.""" with open(DiffText, "r") as df: - filename = os.path.basename(DiffText)[:-4] diff_text = df.read() diff_text = re.sub("]\n{", "] {", diff_text) diff_text = re.sub("]{", "] {", diff_text) @@ -74,6 +73,7 @@ def extract_diffs(DiffTextPrep, DiffTable): item_id = str(sent_number)+"-"+str(item_number) item = re.split("\] {", item) #print(item_id, item) + type = "" item1 = item[0][2:-1] item2 = item[1][1:-2] insertion = 0 @@ -87,12 +87,35 @@ def extract_diffs(DiffTextPrep, DiffTable): condensation = 0 expansion = 0 tbc = 0 + # Complete deletion or insertion if len(item1) == 0: type = "insertion" insertion = 1 elif len(item2) == 0: type = "deletion" deletion = 1 + # Composite cases: two criteria apply + elif re.sub(" ","",item1.lower()) == re.sub(" ","",item2.lower()): + type = "combination" + capitalization = 1 + whitespace = 1 + elif re.sub("\*","",item1) == re.sub("\*","",item2): + type = "combination" + capitalization = 1 + italics = 1 + elif re.sub("[\",';:!?\.\(\)]","",item1.lower()) == re.sub("[\",';:!?\.\(\)]","",item2.lower()): + type = "combination" + capitalization = 1 + punctuation = 1 + elif re.sub("\-","",item1.lower()) == re.sub(" ","",item2.lower()): + type = "combination" + hyphenation = 1 + capitalization = 1 + elif re.sub(" ","",item1.lower()) == re.sub("\-","",item2.lower()): + type = "combination" + hyphenation = 1 + capitalization = 1 + # Simple cases: only one criterion applies elif item1.lower() == item2.lower(): type = "capitalization" capitalization = 1 @@ -114,12 +137,14 @@ def extract_diffs(DiffTextPrep, DiffTable): elif bool(re.search(r'\d', item1+item2)) == True: type = "numbers" numbers = 1 + # If none of the more specific cases apply: elif len(item1) > len(item2)+3: type = "condensation" condensation = 1 elif len(item2) > len(item1)+3: type = "expansion" expansion = 1 + # All still unclassified cases: else: type = "tbc" tbc = 1 @@ -129,7 +154,7 @@ def extract_diffs(DiffTextPrep, DiffTable): diff_df = pd.DataFrame(all_diffs, columns=["item-id","version1","version2", "levenshtein", "type", "insertion", "deletion", "capitalization", "whitespace", "italics", "punctuation", "hyphenation", "numbers", "condensation", "expansion", "tbc"]) #print(diff_df) with open(DiffTable, "w") as dt: - diff_df.to_csv(dt, index=False) + diff_df.to_csv(dt, index=False, sep="\t") def main(InputTexts, DiffText, DiffTextPrep, DiffTable):