Skip to content

Commit

Permalink
Separate into three functions
Browse files Browse the repository at this point in the history
  • Loading branch information
christofs committed Apr 20, 2016
1 parent 5437bc0 commit 27a3951
Showing 1 changed file with 74 additions and 26 deletions.
100 changes: 74 additions & 26 deletions martians.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,79 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Filename: martians.py
# Author: #cf, 2016.
# Author: Christof Schöch, 2016.


# File names handed to the module-level extract_diffs(diff_file, diff_table_file)
# call; they carry no directory part, so they resolve against the current
# working directory.
#diff_file = "wdiff_test.txt"
diff_file = "wdiffed_nowraps.txt"
diff_table_file = "diff_table.csv"
# Base directory for the path constants below. It is concatenated with plain
# "+", so the trailing slash is required.
WorkDir = "/home/christof/Dropbox/0-Analysen/2016/martians/diffs2/"
# Glob pattern selecting the input texts for sentence_splitter().
InputTexts = WorkDir + "texts/*.txt"
# wdiff output file: read by prepare_text().
DiffText = WorkDir + "martians_wdiffed.txt"
# Normalized wdiff output: written by prepare_text(), read by extract_diffs().
DiffTextPrep = WorkDir + "martians_wdiffed-prep.txt"
# CSV table of modifications: written by extract_diffs().
DiffTable = WorkDir + "DiffTable.csv"


import re
import pandas as pd
import Levenshtein as ld
import re
import nltk
import glob
import os


def sentence_splitter(InputTexts):
    """Split each input text into one sentence per line (run before wdiff).

    Every file matching the glob pattern ``InputTexts`` is read, tokenized
    into sentences with ``nltk.sent_tokenize`` and written to
    ``WorkDir + <basename without extension> + "-sent.txt"``, with each
    sentence followed by a newline.

    Args:
        InputTexts: Glob pattern selecting the text files to process.
    """
    for path in glob.glob(InputTexts):
        # Strip the extension properly rather than assuming it is exactly
        # four characters long (".txt").
        stem = os.path.splitext(os.path.basename(path))[0]
        with open(path, "r") as inf:
            sentences = nltk.sent_tokenize(inf.read())
        # One sentence per line; every sentence is newline-terminated.
        newtext = "".join(sent + "\n" for sent in sentences)
        with open(WorkDir + stem + "-sent.txt", "w") as outf:
            outf.write(newtext)

def extract_diffs(diff_file, diff_table_file):
with open(diff_file, "r") as df:

"""
Now do: wdiff --avoid-wraps martians1-sent.txt martians2-sent.txt > martians_wdiffed.txt
"""


def prepare_text(DiffText, DiffTextPrep):
    """Make sure all locations of a modification are marked coherently.

    Reads the wdiff output in ``DiffText`` and rewrites it so that every
    modification appears as a deletion marker followed by an insertion
    marker (``[-...-] {+...+}``): a lone deletion gets an empty insertion
    ``{++}`` appended, a lone insertion gets an empty deletion ``[--]``
    prepended. The result is written to ``DiffTextPrep``.

    Args:
        DiffText: Path of the wdiff output file to read.
        DiffTextPrep: Path of the normalized file to write.
    """
    with open(DiffText, "r") as df:
        diff_text = df.read()

    # Join a deletion with an insertion that starts on the next line, and
    # separate directly adjacent markers with a single space.
    diff_text = re.sub("]\n{", "] {", diff_text)
    diff_text = re.sub("]{", "] {", diff_text)

    prepared = []
    for sent in diff_text.split("\n"):
        # Deletion followed by ordinary text: add an empty insertion.
        sent = re.sub("-] ([^{])", "-] {++} \\1", sent)
        # Deletion at the very end of the line: add an empty insertion.
        sent = re.sub("-]$", "-] {++}", sent)
        # Insertion preceded by ordinary text: add an empty deletion.
        sent = re.sub(r"(\w) ({)", "\\1 [--] \\2", sent)
        # Insertion at the very start of the line: add an empty deletion.
        sent = re.sub(r"^{\+", "[--] {+", sent)
        prepared.append(sent + "\n")

    with open(DiffTextPrep, "w") as outf:
        outf.write("".join(prepared))



def extract_diffs(DiffTextPrep, DiffTable):
"""Extract each location of a modification and classify it in a number of types."""
with open(DiffTextPrep, "r") as df:
diff_text = df.read()
diff_text = re.split("\n", diff_text)
line_number = 0
all_diffs = []
for line in diff_text:
line_number +=1
line = re.sub("]{", "] {", line)
line = re.sub("(-\]) ([^{])", "\\1 {++} \\2", line)
line = re.sub("(\w) ({)", "\\1 [--] \\2", line)
pairs = re.findall("\[-.*?\-\] {\+.*?\+}", line, re.DOTALL)
sent_number = 0
for sent in diff_text:
#print(sent)
sent_number +=1
pairs = re.findall("\[-.*?\-\] {\+.*?\+}", sent, re.DOTALL)
item_number = 0
for item in pairs:
item_number += 1
#print(item_number, item)
item_id = str(sent_number)+"-"+str(item_number)
item = re.split("\] {", item)
item_id = str(line_number)+"-"+str(item_number)
#print(item_id, item)
item1 = item[0][2:-1]
item2 = item[1][1:-2]
insertion = 0
Expand All @@ -44,7 +86,7 @@ def extract_diffs(diff_file, diff_table_file):
numbers = 0
condensation = 0
expansion = 0
xxTBCxx = 0
tbc = 0
if len(item1) == 0:
type = "insertion"
insertion = 1
Expand Down Expand Up @@ -79,14 +121,20 @@ def extract_diffs(diff_file, diff_table_file):
type = "expansion"
expansion = 1
else:
type = "xxTBCxx"
xxTBCxx = 1
type = "tbc"
tbc = 1
levenshtein = ld.distance(item1, item2)
complete_item = [item_id, item1, item2, levenshtein, type, insertion, deletion, capitalization, whitespace, italics, punctuation, hyphenation, numbers, condensation, expansion, xxTBCxx]
complete_item = [item_id, item1, item2, levenshtein, type, insertion, deletion, capitalization, whitespace, italics, punctuation, hyphenation, numbers, condensation, expansion, tbc]
all_diffs.append(complete_item)
diff_df = pd.DataFrame(all_diffs, columns=["item-id","version1","version2", "levenshtein", "type", "insertion", "deletion", "capitalization", "whitespace", "italics", "punctuation", "hyphenation", "numbers", "condensation", "expansion", "xxTBCxx"])
print(diff_df)
with open(diff_table_file, "w") as dt:
diff_df = pd.DataFrame(all_diffs, columns=["item-id","version1","version2", "levenshtein", "type", "insertion", "deletion", "capitalization", "whitespace", "italics", "punctuation", "hyphenation", "numbers", "condensation", "expansion", "tbc"])
#print(diff_df)
with open(DiffTable, "w") as dt:
diff_df.to_csv(dt, index=False)

extract_diffs(diff_file, diff_table_file)


def main(InputTexts, DiffText, DiffTextPrep, DiffTable):
    """Run the currently enabled stage of the diff-analysis workflow.

    Only the extraction stage is active: it reads ``DiffTextPrep`` and
    writes ``DiffTable``. ``InputTexts`` and ``DiffText`` are accepted for
    the two disabled stages listed in the body.
    """
    # Disabled stages; uncomment to run them:
    #   sentence_splitter(InputTexts)
    #   prepare_text(DiffText, DiffTextPrep)
    extract_diffs(DiffTextPrep, DiffTable)


main(InputTexts, DiffText, DiffTextPrep, DiffTable)

0 comments on commit 27a3951

Please sign in to comment.