-
Notifications
You must be signed in to change notification settings - Fork 0
/
check
120 lines (100 loc) · 2.85 KB
/
check
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/bin/bash
# Validate the command line: exactly one argument (the mail to check) is required
if [ $# -eq 1 ]
then
  TOCHECK=$1
else
  echo "Invalid arguments"
  echo "Usage: $0 <mail-to-check>"
  exit 1
fi
# From here on, send both stdout and stderr to a timestamped log file
exec >"$(date +'%Y%m%d%H%M%S').log" 2>&1
# Generates the 1-gram training data for one class of mails (NotSpam or Spam).
#
# Must be called with the class directory as the current directory; the
# function finishes in the parent directory (it does "cd ..").
#
# Steps:
# 1. Split every file below the current directory with ../fileSplitter
# 2. Merge all "*.1" files into ../$mrgFile, one newline appended per file
# 3. Verify that every merged line holds exactly one word
# 4. Sort the merged file and count the occurrences of each word with uniq
# 5. Write "<word> <probability>" lines, in decreasing order of probability,
#    to $mrgFile.sort.prob.<label>.<total word count>
#
# Globals:   mrgFile (read) - base name of the merged / generated files
# Arguments: $1 - class label used in the output file name (N or P)
# Outputs:   progress messages on stdout
# Exits:     with status 1 on any failure; no probability file is left behind
getProb() {
  local label=$1
  local -a inputs=() stages=()
  local f totCnt probFile

  # Snapshot the file list before splitting: the splitter creates new files
  # inside the tree being scanned. NUL-delimited so names with blanks survive.
  while IFS= read -r -d '' f
  do
    inputs+=("$f")
  done < <(find . -type f -print0)

  for f in "${inputs[@]}"
  do
    echo "Processing $f ..."
    if ! ../fileSplitter "$f" 1
    then
      echo "Some error occured during file splitting for 1grams"
      exit 1
    fi
  done
  echo ""

  # Merge all 1-gram files; truncating here guarantees no stale data is reused.
  while IFS= read -r -d '' f
  do
    cat "$f"
    echo ""
  done < <(find . -name "*.1" -type f -print0) > "../$mrgFile"

  if ! cd ..
  then
    echo "Cannot change to the parent directory"
    exit 1
  fi

  # This check is really important: uniq -c / awk below assume exactly one
  # word per line, so reject any line with a different number of fields.
  if ! awk 'NF != 1 { exit 1 }' "$mrgFile"
  then
    echo "Number of fields in file $mrgFile are more than 1"
    exit 1
  fi

  # Sort the split file so that uniq can count the occurrences of each word
  sort "$mrgFile" > "$mrgFile.sort"
  uniq -c "$mrgFile.sort" > "$mrgFile.sort.2"

  # Count total no of words
  totCnt=$(awk 'BEGIN { i = 0 } { i += $1 } END { print i }' "$mrgFile.sort.2")
  if [ "$totCnt" -eq 0 ]
  then
    echo "No words found in the training data"
    rm -f "$mrgFile" "$mrgFile.sort" "$mrgFile.sort.2"
    exit 1
  fi

  probFile=$mrgFile.sort.prob.$label.$totCnt
  awk -v den="$totCnt" '{ printf( "%s %f\n", $2, $1/den ) }' "$mrgFile.sort.2" \
    | sort -r -k 2 > "$probFile"
  # $? would only report sort; look at every stage of the pipeline.
  stages=("${PIPESTATUS[@]}")
  if [ "${stages[0]}" -ne 0 ] || [ "${stages[1]}" -ne 0 ]
  then
    echo "Some error occured while calculating the probabilities"
    # A partial file would be mistaken for valid training data on the next run.
    rm -f "$probFile"
    exit 1
  fi
  echo "Training data generated successfully"
  echo "Total count of words :: $totCnt"
  echo ""

  # Remove intermediate files created for nGrams
  rm -f "$mrgFile" "$mrgFile.sort" "$mrgFile.sort.2"
}
# Base name of the merged 1-gram file; getProb derives the names of all
# intermediate and probability files from it.
mrgFile=1.txt

# Train on the NotSpam mails unless a probability file already exists
cnt=$(find . -maxdepth 1 -name "$mrgFile.sort.prob.N*" -type f | wc -l)
if [ "$cnt" -eq 0 ]
then
  echo "Probability file(for NotSpam) does not exists. Calculating Probability from the training data ..."
  # getProb processes the current directory and ends with "cd ..", so it must
  # not run when the training directory could not be entered.
  if ! cd ./NotSpam/
  then
    echo "Cannot enter training directory ./NotSpam/"
    exit 1
  fi
  getProb N
else
  echo "Probability file(for NotSpam) already exists."
fi

# Train on the Spam mails unless a probability file already exists
cnt=$(find . -maxdepth 1 -name "$mrgFile.sort.prob.P*" -type f | wc -l)
if [ "$cnt" -eq 0 ]
then
  echo "Probability file(for Spam) does not exists. Calculating Probability from the training data ..."
  if ! cd ./Spam/
  then
    echo "Cannot enter training directory ./Spam/"
    exit 1
  fi
  getProb P
else
  echo "Probability file(for Spam) already exists. Checking if the I/P is SPAM ..."
fi

# Split the to be checked Spam file
if ! ./fileSplitter "$TOCHECK" 1
then
  echo "Some error occured during file splitting for 1grams"
  exit 1
fi

# Prints the total word count stored as the last dot-separated component of
# the probability file name for class $1 (N or P).
# Returns 1 unless exactly one such file exists and its suffix is numeric.
probWordCount() {
  local found
  found=$(find . -maxdepth 1 -name "$mrgFile.sort.prob.$1.*" -type f)
  case $found in
    '' | *$'\n'*) return 1 ;;
  esac
  case ${found##*.} in
    '' | *[!0-9]*) return 1 ;;
  esac
  printf '%s\n' "${found##*.}"
}

if ! nSPCnt=$(probWordCount N) || ! sPCnt=$(probWordCount P)
then
  echo "Could not determine the word counts from the probability files"
  exit 1
fi

# Check for spam
if ! ./checkForSpam "$mrgFile.sort.prob.N.$nSPCnt" "$mrgFile.sort.prob.P.$sPCnt" "$TOCHECK.1" "$nSPCnt" "$sPCnt"
then
  echo "Some error occured while checking for SPAM ..."
  exit 1
else
  # The split copy of the checked mail is only needed for the classification
  rm -f -- "$TOCHECK.1"
fi