diff --git a/martians.py b/martians.py index 845a692..0a2ba27 100644 --- a/martians.py +++ b/martians.py @@ -1,37 +1,79 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- # Filename: martians.py -# Author: #cf, 2016. +# Author: Christof Schöch, 2016. - -#diff_file = "wdiff_test.txt" -diff_file = "wdiffed_nowraps.txt" -diff_table_file = "diff_table.csv" +WorkDir = "/home/christof/Dropbox/0-Analysen/2016/martians/diffs2/" +InputTexts = WorkDir + "texts/*.txt" +DiffText = WorkDir + "martians_wdiffed.txt" +DiffTextPrep = WorkDir + "martians_wdiffed-prep.txt" +DiffTable = WorkDir + "DiffTable.csv" +import re import pandas as pd import Levenshtein as ld -import re +import nltk +import glob +import os + +def sentence_splitter(InputTexts): + """Splits a text into individual sentences. (Do before using wdiff.)""" + for file in glob.glob(InputTexts): + filename = os.path.basename(file)[:-4] + with open(file, "r") as inf: + text = inf.read() + text = nltk.sent_tokenize(text) + newtext = "" + for sent in text: + newtext = newtext + sent + "\n" + with open(WorkDir + filename + "-sent.txt", "w") as outf: + outf.write(newtext) -def extract_diffs(diff_file, diff_table_file): - with open(diff_file, "r") as df: + +""" +Now do: wdiff --avoid-wraps martians1-sent.txt martians2-sent.txt > martians_wdiffed.txt +""" + + +def prepare_text(DiffText, DiffTextPrep): + """Make sure all locations of a modification are marked coherently.""" + with open(DiffText, "r") as df: + filename = os.path.basename(DiffText)[:-4] + diff_text = df.read() + diff_text = re.sub("]\n{", "] {", diff_text) + diff_text = re.sub("]{", "] {", diff_text) + diff_text = re.split("\n", diff_text) + newtext = "" + for sent in diff_text: + sent = re.sub("-] ([^{])", "-] {++} \\1", sent) + sent = re.sub("-]$", "-] {++}", sent) + sent = re.sub("(\w) ({)", "\\1 [--] \\2", sent) + sent = re.sub("^{\+", "[--] {+", sent) + newtext = newtext + sent + "\n" + with open(DiffTextPrep, "w") as outf: + outf.write(newtext) + + + +def extract_diffs(DiffTextPrep, DiffTable): + """Extract each location of a modification and classify it in a number of types.""" + with open(DiffTextPrep, "r") as df: diff_text = df.read() diff_text = re.split("\n", diff_text) - line_number = 0 all_diffs = [] - for line in diff_text: - line_number +=1 - line = re.sub("]{", "] {", line) - line = re.sub("(-\]) ([^{])", "\\1 {++} \\2", line) - line = re.sub("(\w) ({)", "\\1 [--] \\2", line) - pairs = re.findall("\[-.*?\-\] {\+.*?\+}", line, re.DOTALL) + sent_number = 0 + for sent in diff_text: + #print(sent) + sent_number +=1 + pairs = re.findall("\[-.*?\-\] {\+.*?\+}", sent, re.DOTALL) item_number = 0 for item in pairs: item_number += 1 - #print(item_number, item) + item_id = str(sent_number)+"-"+str(item_number) item = re.split("\] {", item) - item_id = str(line_number)+"-"+str(item_number) + #print(item_id, item) item1 = item[0][2:-1] item2 = item[1][1:-2] insertion = 0 @@ -44,7 +86,7 @@ def extract_diffs(diff_file, diff_table_file): numbers = 0 condensation = 0 expansion = 0 - xxTBCxx = 0 + tbc = 0 if len(item1) == 0: type = "insertion" insertion = 1 @@ -79,14 +121,20 @@ def extract_diffs(diff_file, diff_table_file): type = "expansion" expansion = 1 else: - type = "xxTBCxx" - xxTBCxx = 1 + type = "tbc" + tbc = 1 levenshtein = ld.distance(item1, item2) - complete_item = [item_id, item1, item2, levenshtein, type, insertion, deletion, capitalization, whitespace, italics, punctuation, hyphenation, numbers, condensation, expansion, xxTBCxx] + complete_item = [item_id, item1, item2, levenshtein, type, insertion, deletion, capitalization, whitespace, italics, punctuation, hyphenation, numbers, condensation, expansion, tbc] all_diffs.append(complete_item) - diff_df = pd.DataFrame(all_diffs, columns=["item-id","version1","version2", "levenshtein", "type", "insertion", "deletion", "capitalization", "whitespace", "italics", "punctuation", "hyphenation", "numbers", "condensation", "expansion", "xxTBCxx"]) - print(diff_df) - with open(diff_table_file, "w") as dt: + diff_df = pd.DataFrame(all_diffs, columns=["item-id","version1","version2", "levenshtein", "type", "insertion", "deletion", "capitalization", "whitespace", "italics", "punctuation", "hyphenation", "numbers", "condensation", "expansion", "tbc"]) + #print(diff_df) + with open(DiffTable, "w") as dt: diff_df.to_csv(dt, index=False) - -extract_diffs(diff_file, diff_table_file) \ No newline at end of file + + +def main(InputTexts, DiffText, DiffTextPrep, DiffTable): + #sentence_splitter(InputTexts) + #prepare_text(DiffText, DiffTextPrep) + extract_diffs(DiffTextPrep, DiffTable) + +main(InputTexts, DiffText, DiffTextPrep, DiffTable)