Skip to content

Commit

Permalink
Added preliminary significance tests; they still need fixing
Browse files: browse the repository at this point in the history
  • Loading branch information
christofs committed May 22, 2016
1 parent 30e3d14 commit a107233
Showing 1 changed file with 48 additions and 2 deletions.
50 changes: 48 additions & 2 deletions split_narration.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@
import re
import os
import glob
import numpy as np
import pandas as pd

from scipy import stats

WorkDir = "/media/christof/data/Dropbox/0-Analysen/2016/martians/narration/"
DiffTable = WorkDir+"DiffTable_narration.csv"
Expand Down Expand Up @@ -103,7 +104,11 @@ def split_narration(DiffTable, TextFirst, TextThird):
CharDeltaAbsRelFirst = CharDeltaAbsFirst / len(TextFirstLines)
CharDeltaAbsRelThird = LevenshteinThird / len(TextFirstLines)
print("Absolute Char Delta (relative to lines): First", CharDeltaAbsRelFirst, "; Third", CharDeltaAbsRelThird)

#print(EditsFirst.loc[:,"levenshtein"].mean())
#print(EditsThird.loc[:,"levenshtein"].mean())
#print(np.mean(EditsFirst.loc[:,"levenshtein"]))
#print(np.mean(EditsThird.loc[:,"levenshtein"]))

# What was the relative levenshtein difference and absolute difference of characters relative to the number of tokens?
LevenshteinRelFirst = LevenshteinFirst / len(TextFirstTokens)
LevenshteinRelThird = LevenshteinThird / len(TextThirdTokens)
Expand All @@ -124,6 +129,27 @@ def split_narration(DiffTable, TextFirst, TextThird):
print("Number of edits: Third copyedits", len(ThirdCopy), "; Third significant", len(ThirdSign))
print("Proportion of significant edits: First", len(FirstSign)/len(EditsFirst), "; Third", len(ThirdSign)/len(EditsThird))
print("Proportion of copyedits: First", len(FirstCopy)/len(EditsFirst), "; Third", len(ThirdCopy)/len(EditsThird))

print("\n== Significance tests ==")
# See http://docs.scipy.org/doc/scipy-0.16.1/reference/generated/scipy.stats.ttest_ind.html

LevenshteinFirstList = EditsFirst.loc[:,"levenshtein"][-2200:]#.div(len(EditsFirst))
LevenshteinThirdList = EditsThird.loc[:,"levenshtein"][-2200:]#.div(len(EditsThird))
#print(LevenshteinFirstList, LevenshteinThirdList)
ttest = stats.ttest_ind(LevenshteinFirstList, LevenshteinThirdList, axis=0, equal_var=False)
print("Welch's t-test for the Levenshtein distances first vs. third: statistics", ttest[0], "p-value", ttest[1])


CharDeltaFirstList = EditsFirst.loc[:,"char-delta-abs"]#.div(len(EditsFirst))
CharDeltaThirdList = EditsThird.loc[:,"char-delta-abs"]#.div(len(EditsThird))
#print(CharDeltaFirstList, CharDeltaThirdList)
ttest = stats.ttest_ind(CharDeltaFirstList, CharDeltaThirdList, axis=0, equal_var=False)
print("Welch's t-test for the absolute character differences first vs. third: statistics", ttest[0], "p-value", ttest[1])


# See http://docs.scipy.org/doc/scipy-0.16.1/reference/generated/scipy.stats.chisquare.html
#scipy.stats.ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2, equal_var=True)



print("\nDone.")
Expand All @@ -132,3 +158,23 @@ def split_narration(DiffTable, TextFirst, TextThird):
split_narration(DiffTable, TextFirst, TextThird)






















0 comments on commit a107233

Please sign in to comment.