-
Notifications
You must be signed in to change notification settings - Fork 0
/
SpellingCorrector.java
158 lines (114 loc) · 4.24 KB
/
SpellingCorrector.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import java.io.*;
import java.util.*;
import java.util.regex.*;
/**
* Spelling Corrector
*
* @author sagar [email protected]
* @version 1.0
*
* Correct spelling if word doesn't exist in dictionery
*
* @since 2017-07-15
*/
class SpellingCorrector{
// Name of dictionery text file
String fileName = "wordlist.txt";
private HashMap<String, Integer> dictionery;
ArrayList<String> dictionary = new ArrayList<String>();
/**
* Constructor
*
* Read dictionery file
*
* @param void
* @return void
*
*/
SpellingCorrector(){
try{
FileReader fileReader = new FileReader(fileName);
BufferedReader bufferedReader = new BufferedReader(fileReader);
for(String line = bufferedReader.readLine(); line!=null; line = bufferedReader.readLine()){
String[] words = line.split(" ");
Collections.addAll(dictionary,words);
}
this.createHashMap(fileName);
}catch(Exception e){
System.out.println("ERROR: "+e.getMessage());
}
}
/**
*
* Builds up a map of correct words with
* their frequencies, based on the words in the given file.
*
* @param file the text to process
* @throws IOException
*/
public void createHashMap(String file) throws IOException {
dictionery = new HashMap<String, Integer>();
BufferedReader bufferedReader = new BufferedReader(new FileReader(file));
Pattern p = Pattern.compile("\\w+");
for(String temp = ""; temp != null; temp = bufferedReader.readLine()){
Matcher m = p.matcher(temp.toLowerCase());
while(m.find()){
dictionery.put((temp = m.group()), dictionery.containsKey(temp) ? dictionery.get(temp) + 1 : 1);
}
}
bufferedReader.close();
}
/**
* Constructs a list of all words within edit distance 1 of the given word.
*
*
* @param word the word to construct the list from
* @return a list of words with in edit distance 1 of word
*/
private ArrayList<String> probableEdits(String word) {
ArrayList<String> closeWords = new ArrayList<String>();
// Possible words with single letter delete
for(int i=0; i < word.length(); ++i)
closeWords.add(word.substring(0, i) + word.substring(i+1));
// Possible words with two letter swap
for(int i=0; i < word.length()-1; ++i)
closeWords.add(word.substring(0, i) + word.substring(i+1, i+2) +
word.substring(i, i+1) + word.substring(i+2));
// Possible words with one letter replaced with any letter
for(int i=0; i < word.length(); ++i)
for(char c='a'; c <= 'z'; ++c)
closeWords.add(word.substring(0, i) + String.valueOf(c) + word.substring(i+1));
// Possible words with insertion of any letter at any position in word
for(int i=0; i <= word.length(); ++i)
for(char c='a'; c <= 'z'; ++c)
closeWords.add(word.substring(0, i) + String.valueOf(c) + word.substring(i));
return closeWords;
}
/**
*
* Correct spelling of given word using dictionery of words
* If word exists in the dictionery, just return same word otherwise
* find probable closer words with simple edits or errors and return most
* probable word one from the probable list.
*
* @param word -> word to be corrected
*
* @return corrected word
*/
public String correctWord(String word) {
if(dictionery.containsKey(word))
return word;
ArrayList<String> list = probableEdits(word);
HashMap<Integer, String> candidates = new HashMap<Integer, String>();
for(String s : list)
if(dictionery.containsKey(s))
candidates.put(dictionery.get(s),s);
if(candidates.size() > 0)
return candidates.get(Collections.max(candidates.keySet()));
for(String s : list)
for(String w : probableEdits(s))
if(dictionery.containsKey(w))
candidates.put(dictionery.get(w),w);
return candidates.size() > 0 ? candidates.get(Collections.max(candidates.keySet())) : "?" + word;
}
}