IRSystem.java
import java.io.*;
import java.util.*;
class IRSystem
{
    private File[] collection;
    private Set<String> seen = new HashSet<>(); // Words already entered into the inverted index (a Set gives an O(1) membership test)
    private Map<String, String> permutermIndex = new TreeMap<>(); // Maps every rotation of word$ back to its root word
    private Set<String> permutermSeen = new HashSet<>(); // Words whose rotations are already in the permuterm index
    private Map<String, InvertedIndex> completeInvertedIndex = new HashMap<>(); // This HashMap assigns every word in the collection its own InvertedIndex object,
    // which contains all the information needed to calculate its TF-IDF.
    private Map<String, Double> documentWordCounts = new HashMap<>(); // Stores the word count of every document, needed for calculating the term frequency later

    public IRSystem()
    {
        collection = getFiles();
    }

    public File[] getFiles()
    {
        File[] files = null;
        try
        {
            System.out.println();
            System.out.print("Enter the filepath of a directory> ");
            Scanner scan = new Scanner(System.in);
            File dir = new File(scan.nextLine());
            files = dir.listFiles(); // Note: returns null if the path does not name a readable directory
            System.out.println();
        }
        catch (Exception e)
        {
            System.out.println("Caught error in getFiles: " + e.toString());
        }
        return files;
    }
    public String process(String w) // Standardizes every word: lowercases it and eliminates any non-alphanumeric character
    {
        w = w.toLowerCase();
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < w.length(); i++) {
            char c = w.charAt(i);
            if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) { // keep only ASCII letters and digits
                sb.append(c);
            }
        }
        return sb.toString();
    }
    public boolean seenBefore(String w)
    {
        return !seen.add(w); // Set.add returns false when the word is already present
    }
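    /*
     * Permuterm background: a wildcard query such as "mo*ey" is handled by
     * appending the end-of-word marker "$" and rotating until the asterisk is
     * last, e.g. rotateAsterisk("mo*ey"):
     *   append "$"     -> "mo*ey$"
     *   rotate 3 times -> "o*ey$m" -> "*ey$mo" -> "ey$mo*"
     * Every indexed word is stored under all of its rotations, so the rotated
     * query lines up with index keys that share its prefix.
     */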
    public String rotateAsterisk(String w) { // Rotates the permuterm so that the asterisk is moved to the end.
        int asteriskIndex = w.indexOf("*");
        w += "$";
        for (int i = 0; i < (asteriskIndex + 1); i++) {
            w = w + w.charAt(0); // append the leading character, then drop it: one left rotation
            w = w.substring(1);
        }
        return w; // Returns the permuterm version of the initial query, with the asterisk at the end
    }
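    /*
     * Edit-distance background: the loop below fills the standard Levenshtein
     * table, where m[i][j] is the cost of turning the first i characters of s1
     * into the first j characters of s2:
     *   m[i][j] = min( m[i-1][j-1] + f,   // substitution (f = 0 if the chars match)
     *                  m[i-1][j]   + 1,   // deletion from s1
     *                  m[i][j-1]   + 1 )  // insertion into s1
     * For example, the distance between "cat$" and "cut$" is 1 (one
     * substitution), so a one-letter typo still reaches its intended word.
     */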
    public String getClosestWord(String w) { // Input: the initial query. Compares its permuterm to every permuterm in the index for the closest match.
        if (w.contains("*")) { // First, the word is adjusted for use in the permuterm index (in case the query contains wildcard (*) characters)
            w = rotateAsterisk(w);
        }
        else {
            w += "$";
        }
        // This algorithm compares the edit distance between the query and every word found in the document collection
        int min = Integer.MAX_VALUE;
        int f = 0;
        String s1;
        String s2;
        String realWord = "";
        for (String x : permutermIndex.keySet()) {
            s1 = x;
            s2 = w;
            int len1 = s1.length();
            int len2 = s2.length();
            int[][] m = new int[len1 + 1][len2 + 1];
            for (int i = 0; i < len1 + 1; i++) {
                m[i][0] = i;
            }
            for (int i = 0; i < len2 + 1; i++) {
                m[0][i] = i;
            }
            for (int i = 1; i < (len1 + 1); i++) {
                for (int j = 1; j < (len2 + 1); j++) {
                    if (s1.charAt(i-1) == s2.charAt(j-1)) {
                        f = 0;
                    }
                    else {
                        f = 1;
                    }
                    m[i][j] = Math.min(Math.min(
                            m[i-1][j-1] + f,
                            m[i-1][j] + 1),
                            m[i][j-1] + 1
                    );
                }
            }
            // end of edit-distance calculation
            // If a closer match has been found, store the root word of the closest match and record the new minimum
            if (m[len1][len2] < min) {
                min = m[len1][len2];
                realWord = permutermIndex.get(x);
            }
        }
        System.out.println("Word within collection that closest matches query: " + realWord);
        return realWord; // Returns the root word in the collection that most closely matches the initial query
    }
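    /*
     * For example, indexPermuterms("cat") stores four entries, all mapping back
     * to the root word "cat":
     *   "cat$" -> "cat",  "at$c" -> "cat",  "t$ca" -> "cat",  "$cat" -> "cat"
     */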
    public void indexPermuterms(String w) {
        if (permutermSeen.add(w)) { // Set.add returns true only the first time this word is seen
            String originalString = w;
            w += "$";
            for (int i = 0; i < w.length(); i++) { // store every rotation of word$, each mapping back to the root word
                permutermIndex.put(w, originalString);
                w = w + w.charAt(0);
                w = w.substring(1);
            }
        }
    }
    public void getTFIDF(String w) { // Calculates the TF-IDF of the query and prints all documents relevant to it, ranked by that TF-IDF
        List<Map.Entry<String, Double>> rankedDocuments = new ArrayList<>();
        double idf = completeInvertedIndex.get(w).giveIDF();
        for (Map.Entry<String, Double> i : completeInvertedIndex.get(w).giveTermFrequency().entrySet()) {
            double tf = i.getValue() / documentWordCounts.get(i.getKey()); // normalize the raw count by the document's length
            rankedDocuments.add(new AbstractMap.SimpleEntry<>(i.getKey(), tf * idf));
        }
        rankedDocuments.sort(Map.Entry.<String, Double>comparingByValue(Comparator.reverseOrder())); // highest score first; unlike a TreeMap keyed on score, documents with equal scores are all kept
        System.out.println("Results: ");
        for (Map.Entry<String, Double> i : rankedDocuments) {
            System.out.println(i.getKey());
        }
    }
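    /*
     * Worked example of the ranking above (illustrative numbers): if the query
     * term occurs 3 times in a 100-word document, tf = 3 / 100 = 0.03; with
     * idf = 2.0 reported by the InvertedIndex, that document scores
     * 0.03 * 2.0 = 0.06. The same 3 occurrences in a 300-word document score
     * only 0.01 * 2.0 = 0.02 and rank lower.
     */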
    public void start()
    {
        try
        {
            for (File f : collection)
            {
                double wordCount = 0;
                Scanner sc = new Scanner(f);
                while (sc.hasNextLine())
                {
                    StringTokenizer st = new StringTokenizer(sc.nextLine());
                    while (st.hasMoreTokens())
                    {
                        String inputWord = st.nextToken();
                        String outputWord = process(inputWord);
                        if (outputWord.isEmpty()) { // skips tokens with no alphanumeric characters, e.g. stray periods or commas in the document
                            continue;
                        }
                        indexPermuterms(outputWord);
                        if (!seenBefore(outputWord))
                        {
                            InvertedIndex data = new InvertedIndex(); // The first time a word is seen, it is added to the inverted index HashMap
                            // and assigned its own InvertedIndex object to store information in.
                            completeInvertedIndex.put(outputWord, data);
                        }
                        completeInvertedIndex.get(outputWord).updateTermFrequency(f.getName()); // increments the raw term frequency every time the word is encountered
                        wordCount++;
                    }
                }
                sc.close(); // release the file handle once the document has been fully read
                documentWordCounts.put(f.getName(), wordCount); // Once a file is fully processed, its word count is stored for later calculations.
            }
        }
        catch (Exception e)
        {
            System.out.println("Error in start: " + e.toString());
        }
    }
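    /*
     * A minimal driver sketch showing one way to exercise the class; the query
     * string "retrieval" below is purely illustrative.
     */
    public static void main(String[] args)
    {
        IRSystem ir = new IRSystem();                  // prompts for a directory and loads its files
        ir.start();                                    // tokenizes every file and builds both indexes
        String match = ir.getClosestWord("retrieval"); // resolves the query to its closest indexed word
        ir.getTFIDF(match);                            // prints the documents containing it, best score first
    }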
}
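/*
 * IRSystem depends on a companion InvertedIndex class (defined elsewhere in the
 * repository) through three calls: updateTermFrequency, giveTermFrequency and
 * giveIDF. The sketch below is a hypothetical reconstruction of that interface,
 * assuming the standard weighting idf = log10(N / df); the real implementation
 * may differ. Because the real giveIDF() takes no arguments, the collection-size
 * parameter here is purely an assumption to keep the sketch self-contained.
 */
class InvertedIndexSketch
{
    private Map<String, Double> termFrequency = new HashMap<>(); // document name -> raw count of this word in that document

    public void updateTermFrequency(String docName) // called once per occurrence of the word
    {
        termFrequency.merge(docName, 1.0, Double::sum);
    }

    public Map<String, Double> giveTermFrequency()
    {
        return termFrequency;
    }

    public double giveIDF(int totalDocuments) // hypothetical signature; see the note above
    {
        // df = number of documents containing the word
        return Math.log10((double) totalDocuments / termFrequency.size());
    }
}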