/* This file is part of DocumentCluster, a program for clustering text
documents based on similarity. To use, specify the number of clusters
followed by the documents, which must be located in the data subdirectory.
Stopwords are eliminated by filtering the document contents against
stopwords.txt in the same directory. Words are stemmed using the Porter
Stemming algorithm. k-means clustering based on cosine similarity is used
for the clustering.
Copyright (C) 2013 Ezra Erb
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License version 3 as published
by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
I'd appreciate a note if you find this program useful or make
updates. Please contact me through LinkedIn or GitHub (my profile also has
a link to the code repository)
*/
import java.util.*;
import java.security.InvalidParameterException;
/* This class takes a document corpus data set, organized by document, and
manipulates it as though it were organized by word instead.
WARNING: If multiple objects are created on the same document corpus, they
are not synchronized! Deleting with one while reading with the other will
create data races */
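/* A minimal usage sketch (hypothetical client code; it assumes the
DocWordData list, here called docData, has already been built from the
corpus): iterate word by word and read each word's per-document frequencies.

    CorpusByWord corpus = new CorpusByWord(docData);
    while (corpus.hasNext()) {
        ArrayList<FrequencyByDoc> freqs = corpus.nextWord();
        String word = corpus.getProcessedWord();
        // ... use word and its per-document frequencies ...
    }
*/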
public class CorpusByWord
{
/* This class implements the classic synchronized scanning algorithm. The
data lists for each document are sorted in word order, so finding words
in order requires keeping a pointer to the current word of each
document and sorting the pointers. Multiple pointers may point to the
same word in different documents, so they are sorted in groups. The
first group is the current word. To advance to the next, move the
pointers in the group, and then re-sort them in the list. */
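/* For illustration, consider a hypothetical corpus whose sorted word lists
are D0=[a,c], D1=[b], D2=[a,b]. The initial pointer map is
{a:[D0,D2], b:[D1]}, so "a" is the current word. Advancing its group
(D0 to "c", D2 to "b") re-sorts the pointers into {b:[D1,D2], c:[D0]}. */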
// Pointer to a word entry for a document
private class DocIndex
{
private DocWordData _docPointer; // Cache of pointer to document data
private int _document;
private int _index;
/* True if a deletion has been done since the last time the pointer was
advanced, meaning it already points to the next entry */
private boolean _deleteSinceLastAdvance;
// Set index to first word of a document
public DocIndex(int document) throws Exception
{
if ((document < 0) || (document >= _corpusData.size()))
throw new InvalidParameterException("Document " + document + " does not exist in data set");
_document = document;
_index = 0;
_deleteSinceLastAdvance = false;
/* Extract a pointer to the document data and cache it in the
object.
WARNING: These pointers are shared and not synchronized, so
creating multiple of these objects per document will create
race conditions! */
_docPointer = _corpusData.get(document);
}
// Return the word it points to
public String getWord() throws Exception
{
if ((!hasData()) || _deleteSinceLastAdvance)
return null;
else
return _docPointer.getWordForIndex(_index);
}
// Return the frequency data it points to
public FrequencyByDoc getData() throws Exception
{
if ((!hasData()) || _deleteSinceLastAdvance)
return new FrequencyByDoc(0.0, _document);
else
return new FrequencyByDoc(_docPointer.getData(_index),
_document);
}
/* Erase the word this index points to. This automatically moves the
next word for the document into its place */
public void removeData() throws Exception
{
/* If data was already deleted, don't delete it again, because the
index points to the wrong place! Calling advance will reset
the pointer to a consistent state */
if (!_deleteSinceLastAdvance) {
if (hasData())
_docPointer.removeData(_index);
_deleteSinceLastAdvance = true;
}
}
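/* Example of the bookkeeping above: if a document's word list is
[a, b, c] and _index is 0 (pointing at "a"), removeData() erases "a",
the list becomes [b, c], and index 0 now refers to "b". While
_deleteSinceLastAdvance is set, getWord() returns null and getData() a
zero-frequency placeholder, until advance() clears the flag without
incrementing the index. */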
// Scale the frequency value this index points to
public void scaleData(double scaleAmt) throws Exception
{
if (hasData() && (!_deleteSinceLastAdvance))
_docPointer.scaleData(_index, scaleAmt);
}
// Return true if the index is valid
public boolean hasData()
{
return (_index < _docPointer.size());
}
// Returns true if the index points to the last word for the document
public boolean isLastWord()
{
/* If the last word was deleted, the index still effectively
points to that word, but the underlying data actually points
to the NEXT word to process (advancing the index restores
consistent state). This means that if something was deleted, the
index points to the last word if the pointer is now invalid.
Otherwise, check whether the indexed word is the last one in
the list */
if (_deleteSinceLastAdvance)
return (!hasData()); // Now invalid means deleted word was last
else
return ((_index + 1) == _docPointer.size());
}
// Moves index to next document entry
public void advance()
{
/* If the current entry was removed, the next entry automatically
replaced it, so no advance is actually needed; the pointer is
already there! */
if (_deleteSinceLastAdvance)
_deleteSinceLastAdvance = false;
else if (hasData())
_index++;
}
public String toString()
{
try {
if (_deleteSinceLastAdvance)
return "DocIndex:" + _document + " WordIndex:" + _index + " Data:DELETED";
else
return "DocIndex:" + _document + " WordIndex:" + _index + " Data:" + getWord() + "-" + getData();
}
catch (Exception e) {
return "DocIndex:" + _document + " WordIndex:" + _index + " Data:INVALID";
}
}
}
// Frequency data from the document corpus
private ArrayList<DocWordData> _corpusData;
/* List of current pointer locations, sorted by the words they point to.
First item in the map is the next word to process */
private TreeMap<String, ArrayList<DocIndex>> _wordList;
// True if at least one word has been accessed
private boolean _haveProcessedWord;
public CorpusByWord(ArrayList<DocWordData> corpusData) throws Exception
{
_corpusData = corpusData;
/* Set initial pointers, one per document. If a pointer is invalid
(indicating the document data is empty), discard it. If ALL are
discarded, the corpus is invalid */
_wordList = new TreeMap<String, ArrayList<DocIndex>>();
for (int index = 0; index < _corpusData.size(); index++)
// Ignore documents with no data
if (_corpusData.get(index) != null)
insertDocIndex(new DocIndex(index));
if (_wordList.isEmpty())
throw new InvalidParameterException("Corpus to analyze " + _corpusData + " has no data");
_haveProcessedWord = false;
}
// Inserts a valid document pointer into the current word list
private void insertDocIndex(DocIndex newIndex) throws Exception
{
// Only insert if pointer is valid
if (newIndex.hasData()) {
ArrayList<DocIndex> wantList = _wordList.get(newIndex.getWord());
if (wantList == null) {
// Create a new list
wantList = new ArrayList<DocIndex>();
_wordList.put(newIndex.getWord(), wantList);
} // List for word does not already exist
wantList.add(newIndex);
} // Valid doc index
}
// True if there are still words to process
public boolean hasNext()
{
/* The word pointer must be advanced just before getting the data for
the next word in order for delete to work properly (a delete changes
the underlying indexes in the corpus lists). This means the list is
exhausted only when there is a single word left and advancing all of
its pointers leads to invalid data */
if ((_wordList.size() > 1) || (!_haveProcessedWord))
return true;
else if (_wordList.isEmpty())
return false; // Avoid null pointer error
else {
/* Iterate through the pointers for the word. If any does not point
to the last word of its document, there are still words to process */
Iterator<DocIndex> testIterator = _wordList.firstEntry().getValue().iterator();
boolean lastWord = true;
while (testIterator.hasNext() && lastWord) {
DocIndex testIndex = testIterator.next();
// Test validity in case the word was deleted
lastWord = (!testIndex.hasData()) || testIndex.isLastWord();
}
return (!lastWord);
} // Exactly one entry in the word list
}
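/* For example, if the only remaining map entry is for a word that sits in
the last position of every document containing it, and that word has
already been returned by nextWord(), the loop above finds every pointer
on its document's last word and hasNext() returns false. */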
// Returns data for the next word to process
/* NOTE: The document order of the results is not guaranteed. Clients
should sort the data if needed */
public ArrayList<FrequencyByDoc> nextWord() throws Exception
{
/* The index values must be advanced just before reading the data for
the next word, in order for delete to work correctly (it changes the
index values in the underlying word lists). The object is constructed
with the pointer at the first word, so advance if something was
previously read. */
if (!_haveProcessedWord)
_haveProcessedWord = true; // Just about to
else {
/* To advance the pointers, extract those for the current first
entry in the word list, advance them, and re-insert them at their
new positions. If there is no next entry for a given pointer, it
becomes invalid and the insertion operation will throw it out.
If the word list is empty afterward, this method was called when
no words are left to process, which is an error */
Iterator<DocIndex> procIterator = _wordList.pollFirstEntry().getValue().iterator();
while (procIterator.hasNext()) {
DocIndex newIndex = procIterator.next();
newIndex.advance();
insertDocIndex(newIndex);
}
if (_wordList.isEmpty())
throw new NoSuchElementException();
} // At least one word was previously processed
// Extract the entry and iterate through it to get the data
Iterator<DocIndex> dataIterator = _wordList.firstEntry().getValue().iterator();
ArrayList<FrequencyByDoc> results = new ArrayList<FrequencyByDoc>();
while (dataIterator.hasNext())
results.add(dataIterator.next().getData());
return results;
}
// Returns the most recently processed word
public String getProcessedWord()
{
if ((!_haveProcessedWord) || (_wordList.isEmpty()))
return null; // Haven't processed anything, or none left
else
return _wordList.firstKey();
}
// Deletes from the data set the last word for which data was returned
public void deleteProcessedWord() throws Exception
{
if (_haveProcessedWord && (!_wordList.isEmpty())) {
// Extract the current entry and delete every index in it
Iterator<DocIndex> dataIterator = _wordList.firstEntry().getValue().iterator();
while (dataIterator.hasNext())
dataIterator.next().removeData();
} // Have processed a word when method was called
}
// Scales the frequency values for the last processed word
public void scaleProcessedWord(double scaleValue) throws Exception
{
if (_haveProcessedWord && (!_wordList.isEmpty())) {
// Extract the current entry and scale every index in it
Iterator<DocIndex> dataIterator = _wordList.firstEntry().getValue().iterator();
while (dataIterator.hasNext())
dataIterator.next().scaleData(scaleValue);
} // Have processed a word when method was called
}
// Output function. Prints the indexes only to keep the output a reasonable size
public String toString()
{
return _wordList.toString();
}
public static void main(String[] args) throws Exception
{
try {
/* The test document corpus must be carefully constructed. It
needs words that appear in all documents, in only some of them,
and in just one. Having a different number of words in each
document is also useful, and the last words should all be different */
DocWordData data1 = new DocWordData();
DocWordData data2 = new DocWordData();
DocWordData data3 = new DocWordData();
data1.addData("a", 0.1);
data3.addData("a", 0.3);
data2.addData("b", 0.2);
data3.addData("b", 0.4);
data1.addData("c", 0.5);
data1.addData("d", 0.111);
data2.addData("d", 0.112);
data3.addData("d", 0.113);
data2.addData("e", 0.6);
data1.addData("f", 0.7);
data3.addData("f", 0.8);
data3.addData("g", 0.9);
ArrayList<DocWordData> testData = new ArrayList<DocWordData>();
testData.add(data1);
testData.add(data2);
testData.add(data3);
CorpusByWord test = new CorpusByWord(testData);
System.out.println(testData);
System.out.println(test);
while (test.hasNext()) {
System.out.println(test.nextWord());
System.out.println("Word was: " + test.getProcessedWord());
System.out.println(test);
}
test = new CorpusByWord(testData);
test.nextWord(); // 'a'
test.nextWord(); // 'b'
test.scaleProcessedWord(0.05);
test.nextWord(); // 'c'
test.deleteProcessedWord();
test.deleteProcessedWord(); // Note the lack of a next call first
test.nextWord(); // 'd'
test.scaleProcessedWord(-1.0);
test.nextWord(); // 'e'
test.nextWord(); // 'f'
test.deleteProcessedWord();
test.nextWord(); // 'g'
test.deleteProcessedWord(); // Deletes last word of set
System.out.println(testData); // Original data, not the test object
System.out.println("Still have data: " + test.hasNext());
// Stress test: delete everything!
// Current content is 'a', 'b', 'd', 'e'
test = new CorpusByWord(testData);
while (test.hasNext()) {
System.out.println(test.nextWord());
System.out.println("Word was: " + test.getProcessedWord());
test.deleteProcessedWord();
System.out.println(test);
}
}
catch (Exception e) {
System.out.println("Exception " + e + " caught");
throw e; // Rethrow so improper termination is obvious
}
}
}