-
Notifications
You must be signed in to change notification settings - Fork 0
/
task1.py
100 lines (87 loc) · 4.34 KB
/
task1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#########################################################################
# Handling with File Contents and Preprocessing {Assignment2 of FIT9133}#
#########################################################################
#############################################################
# Student name: Sohail.Sankanur #
# Monash Student ID: 29996368 #
# Start Date: 07 Oct 2018 #
# Last Modified Date: 12 Oct 2018 #
#############################################################
## Explanation of task:
#>> In this task we have created a method named 'cleanFile'. It takes one input argument, which is the transcript filename.
#>> This method first filters out the relevant data.
#>> After filtering out the relevant data, we write the filtered data to a new file.
## Explanation of the code structure:
#>> In the code we have a method named 'cleanFile' which takes one input argument, a string.
#>> The first part of the method checks whether the directories 'SLI_cleaned' and 'TD_cleaned' exist in the
# ENNI folder. If they are not present, the directories are created.
#>> After this step we take the filename and prepend the path to it.
#>> We then read the file transcripts.
#>> The transcript text is split with the string split function, using "*CHI:" as the delimiter.
#>> We further split each piece on the "%mor:" delimiter so that only the relevant data is obtained.
#>> All the relevant data is stored in a list in the form of strings.
#>> All the filtering tasks which are mentioned in task1 are performed on each string and the filtered strings are placed in a
# new list.
#>> We then write all the filtered strings to a file in the SLI_cleaned or the TD_cleaned folder.
import os
def _clean_utterance(utterance):
    """Return one child utterance with the unwanted transcript markup removed.

    Kept: "[//]", "[/]", "[* m:+ed]", "(.)", and the text that was wrapped in
    "<...>" or in other parentheses (only the symbols themselves are dropped).
    Removed: "(..)", "(...)", a bare "[*]", every other "[...]" group together
    with the space in front of it, and any run of text from an "&" or "+" up
    to and including the next space.

    NOTE(review): the nesting of the scan below was reconstructed from a copy
    of this file that had lost its indentation -- confirm against the original.
    """
    cleaned = utterance.replace("(..)", "").replace("(...)", "")
    # Drop bare "[*]" first, then park "[* m:+ed]" as "[*]" so the bracket
    # scan below cannot match and delete it; it is restored on return.
    cleaned = cleaned.replace("[*]", "").replace("[* m:+ed]", "[*]")

    pending = ""  # markup text collected from the original utterance
    for start, char in enumerate(utterance):
        if char == "[":
            # Collect the whole "[...]" group beginning at this position.
            for ch in utterance[start:]:
                pending += ch
                if ch == "]":
                    if pending not in ("[//]", "[/]", "[*]"):
                        cleaned = cleaned.replace(" " + pending, "")
                    break
            pending = ""

        # Deliberately inside the scan: later "&"/"+" removals are matched
        # against text that already has the angle brackets stripped.
        cleaned = cleaned.replace("<", "").replace(">", "")

        if char in ("&", "+"):
            # No break here: 'pending' keeps growing, so only the first
            # space-terminated run is normally still present in 'cleaned'.
            # A run with no following space is never removed.
            for ch in utterance[start:]:
                pending += ch
                if ch == " ":
                    cleaned = cleaned.replace(pending, "")
            pending = ""

    # Protect the "(.)" pause marker while all other parentheses are removed.
    cleaned = cleaned.replace("(.)", "#TokenForReplacement#")
    cleaned = cleaned.replace("(", "").replace(")", "")
    cleaned = cleaned.replace("#TokenForReplacement#", "(.)")
    return cleaned.replace("[*]", "[* m:+ed]")


def cleanFile(filename):
    """Filter the child utterances of one transcript and write them to a new file.

    The transcript is read from "ENNI/SLI/<filename>" when the name contains
    "SLI", otherwise from "ENNI/TD/<filename>". Every piece of text between a
    "*CHI:" marker and the following "%mor:" marker is treated as one
    utterance, cleaned, printed, and written on its own line to
    "ENNI/SLI_cleaned/<name>_cleaned.txt" or "ENNI/TD_cleaned/<name>_cleaned.txt".
    Both output directories are created if they do not exist yet.

    Args:
        filename: Name of the transcript file, without any directory part.

    Raises:
        OSError: If a directory cannot be created, the transcript cannot be
            read, or the cleaned file cannot be written.
    """
    os.makedirs("ENNI/SLI_cleaned", exist_ok=True)
    os.makedirs("ENNI/TD_cleaned", exist_ok=True)

    print("\n\nFiltered output of " + filename)

    group = "SLI" if "SLI" in filename else "TD"
    cleaned_path = ("ENNI/" + group + "_cleaned/"
                    + filename.replace(".txt", "") + "_cleaned.txt")
    source_path = "ENNI/" + group + "/" + filename

    # Read everything up front so the input handle is released before the
    # filtering starts, even if the filtering raises.
    with open(source_path, 'r') as transcript:
        # The text before the first "*CHI:" marker is not a child utterance.
        child_segments = transcript.read().split("*CHI:")[1:]

    cleaned_lines = []
    for segment in child_segments:
        # Keep only the spoken line; "\n\t" marks a continuation line.
        utterance = segment.split("%mor:")[0].replace("\n\t", ' ').strip()
        cleaned = _clean_utterance(utterance)
        print(cleaned)
        cleaned_lines.append(cleaned)

    with open(cleaned_path, 'w') as cleaned_file:
        cleaned_file.writelines(line + "\n" for line in cleaned_lines)