Indexer.java
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.math.RoundingMode;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Scanner;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * Indexes a collection of documents and provides search over the resulting index.
 *
 * @author M. Habib Rosyad
 */
public class Indexer {
private final static String INDEX_FILE = "index.txt";
private final static String STOPWORDS_FILE = "stopwords.txt";
private final static int PSEUDO_RF_ITER = 2;
private final static int PSEUDO_RF_K = 5;
    private final static double PSEUDO_RF_ALPHA = 1.0;
    private final static double PSEUDO_RF_BETA = 0.7;
    private final static double PSEUDO_RF_GAMMA = 0.25;
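    // ALPHA, BETA and GAMMA above are the Rocchio relevance-feedback weights
    // for the original query, the relevant-document centroid and the
    // non-relevant centroid respectively; see applyPseudoRF below.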
private final Map<String,MutableDouble> documentVectorLength;
    private final Map<String,Map<String,Double>> documentVectors;
private final Map<String,Postings> index; // Format <term>,<postings>,<idf>
private final Path indexPath;
    private Set<String> stopwords = new HashSet<>(); // Empty until loaded, so tokenizing never NPEs.
private Path stopwordsPath;
private final PorterStemmer stemmer;
    /**
     * Initialize the indexer for indexing and searching the collection.
     *
     * @param dir index directory
     * @throws java.io.IOException if the index directory cannot be created
     */
public Indexer(String dir) throws IOException {
indexPath = Paths.get(dir, INDEX_FILE); // Get path/index.txt.
stemmer = new PorterStemmer();
index = new HashMap<>();
documentVectorLength = new HashMap<>();
documentVectors = new HashMap<>();
        // Create the index directory in case it doesn't exist.
        if (!Files.isDirectory(indexPath.getParent()))
            Files.createDirectories(indexPath.getParent());
        // Try to load the default stopwords (in the index directory); don't
        // throw here, because stopwords can still be loaded later.
        // The default stopwords list is the SMART system's stop word list, as
        // used in http://www.jmlr.org/papers/volume5/lewis04a/lewis04a.pdf
try {
loadStopwords(Paths.get(dir, STOPWORDS_FILE).toString());
} catch (IOException e) {
System.out.println("No default stopwords source available");
}
}
    /**
     * Get normalized tokens.
     * Builds a tokenizer over the source with the default filters applied.
     *
     * @param source source text or file
     * @return a tokenizer that skips single characters, stopwords and numerics
     */
private Tokenizer getNormalizedTokens(Object source) {
        return new Tokenizer(source)
                .addFilter(t -> t.length() > 1) // Remove single characters
                .addFilter(t -> !stopwords.contains(t.toLowerCase())) // Remove stopwords
                .addFilter(t -> !isNumeric(t)); // Remove numerics
}
    /**
     * Check whether a String is numeric.
     * Also accepts combinations of numeric tokens joined by '-', e.g. 10-10.
     *
     * @param value String to check
     * @return true if the String consists only of a numeric combination, false otherwise
     */
private boolean isNumeric(String value) {
        return value.matches("(?:-?\\d+(?:\\.\\d+)?)+"); // Match numbers with optional '-' joins and decimals.
}
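    // For illustration: isNumeric("42"), isNumeric("3.14") and isNumeric("10-10")
    // all return true, while isNumeric("v2") returns false.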
    /**
     * Load stopwords for tokenization.
     *
     * @param path stopwords file path, one word per line
     * @throws IOException if the stopwords file cannot be read
     */
public final void loadStopwords(String path) throws IOException {
        try (Scanner parser = new Scanner(new FileReader(path))) {
            this.stopwords = new HashSet<>(Arrays.asList(
                    parser.useDelimiter("\\Z").next().split("[\\r\\n]+")));
        }
        // Save the path so the stopwords can be copied into the index directory later.
stopwordsPath = Paths.get(path);
}
    /**
     * Parse the index file.
     * Loads the index into memory for searching.
     *
     * @throws Exception if the index file cannot be read or parsed
     */
    public void read() throws Exception {
        // Use try-with-resources so the index file handle is always closed.
        try (Scanner parser = new Scanner(new FileReader(indexPath.toFile()))) {
            // Also calculate each document vector length.
            while (parser.hasNextLine()) {
                String record = parser.nextLine();
                Postings postings = new Postings(record);
                String token = record.substring(0, record.indexOf(','));
                double idf = postings.getIdf();
                index.put(token, postings);
                // Update the document set.
                postings.getTfs().keySet().forEach(documentId -> {
                    double wf = 1 + Math.log(postings.getTf(documentId));
                    MutableDouble length = documentVectorLength.get(documentId);
                    if (length == null) {
                        length = new MutableDouble();
                        documentVectorLength.put(documentId, length);
                    }
                    // Accumulate the square of wf-idf.
                    length.add(Math.pow(wf * idf, 2));
                    // Put into the document vectors.
                    Map<String,Double> documentVector = documentVectors.get(documentId);
                    if (documentVector == null) {
                        documentVector = new HashMap<>();
                        documentVectors.put(documentId, documentVector);
                    }
                    documentVector.put(token, wf * idf);
                });
            }
        }
        // The document vector length is the square root of the sum of the squares.
        documentVectorLength.replaceAll((k, v) -> v.setValue(Math.sqrt(v.getValue())));
    }
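    /*
     * Worked example of the weighting used above: a term with tf = 3 in a
     * document and idf = 2.0 contributes w = (1 + ln 3) * 2.0 ≈ 4.197 to that
     * document's vector, and w^2 to its squared length, so the final length is
     * |d| = sqrt(sum of w^2 over all indexed terms of d).
     */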
    /**
     * Search terms inside the index.
     * Weights are calculated with sublinear tf scaling; phrasal queries are
     * supported, and results are ranked by cosine similarity.
     *
     * @param query String to search in the index
     * @param topN number of top documents to include in the search results
     * @param useRF whether to apply pseudo relevance feedback
     * @return the top documents as "id,score" strings, or null if nothing matches
     */
public List<String> search(String query, long topN, boolean useRF) {
Map<String,MutableInteger> tfs = new HashMap<>();
Map<String,Double> queryVector = new HashMap<>();
Map<String,Double> ranks = new HashMap<>();
Tokenizer tokens = getNormalizedTokens(query);
List<String> phrase = new ArrayList<>();
        // The collection size n is taken from the number of known document vectors.
        double n = (double) documentVectorLength.size();
        // Count token frequencies; `phrase` keeps the tokens in first-occurrence
        // order for the phrasal query below.
        while (tokens.hasNext()) {
            // Lowercase, then stem.
            String token = stemmer.stem(tokens.next().toLowerCase());
            if (!isNumeric(token)) {
                MutableInteger tf = tfs.get(token);
                if (tf == null) {
                    tf = new MutableInteger();
                    tfs.put(token, tf);
                    phrase.add(token);
                }
                tf.increment();
            }
        }
        // Construct the query vector: wf = 1 + ln(tf), weighted by idf.
        tfs.forEach((token, tf) -> {
            double wf = 1 + Math.log(tf.getValue());
            Postings postings = index.get(token);
            if (postings != null) {
                queryVector.put(token, wf * postings.getIdf());
            } else {
                // Term not in the index: fall back to log(n) as the idf estimate.
                queryVector.put(token, wf * Math.log(n));
            }
        });
        // Add the phrase itself (if any) to the query vector.
        if (phrase.size() > 1) {
            String joined = String.join(" ", phrase);
            Postings postings = getPhrasalPostings(joined, (int) n);
            if (postings != null) {
                queryVector.put(joined, postings.getIdf());
            }
        }
        // Run the scoring loop; with pseudo RF enabled it repeats PSEUDO_RF_ITER times.
        int i = 0;
        while (i < (useRF ? PSEUDO_RF_ITER : 1)) {
            // Calculate the query vector length (Euclidean norm).
            double queryVectorLength = Math.sqrt(queryVector.values().stream()
                    .mapToDouble(v -> v * v)
                    .sum());
            // Score each document against the query vector.
            documentVectorLength.forEach((documentId, length) -> {
                MutableDouble dotProduct = new MutableDouble();
                queryVector.forEach((token, wq) -> {
                    Postings postings;
                    if (isPhrasal(token)) {
                        postings = getPhrasalPostings(token, (int) n);
                    } else {
                        postings = index.get(token);
                    }
                    if (postings != null) {
                        int tf = postings.getTf(documentId);
                        if (tf != 0) {
                            double wf = 1 + Math.log(tf);
                            dotProduct.add(wf * postings.getIdf() * wq);
                        }
                    }
                });
                // Cosine similarity: dot product over the product of vector lengths.
                if (dotProduct.getValue() > 0.)
                    ranks.put(documentId, dotProduct.getValue()
                            / (queryVectorLength * length.getValue()));
            });
            if (ranks.isEmpty())
                return null;
            // Only feed the ranks back into the query when RF is requested.
            if (useRF)
                applyPseudoRF(queryVector, documentVectors, sort(ranks));
            i++;
        }
        // Ranks are non-empty here (the loop returns null otherwise).
        DecimalFormat format = new DecimalFormat("#.###");
        format.setRoundingMode(RoundingMode.CEILING);
        return ranks.entrySet().stream()
                .filter(r -> r.getValue() > .0)
                .sorted((a, b) -> b.getValue().compareTo(a.getValue()))
                .limit(topN)
                .map(r -> r.getKey() + "," + format.format(r.getValue()))
                .collect(Collectors.toList());
}
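    /*
     * Scoring note: search() ranks documents by cosine similarity,
     * score(q, d) = (q . d) / (|q| * |d|), where both vectors carry wf-idf
     * weights with wf = 1 + ln(tf); only documents with a positive dot
     * product are ranked.
     */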
    /**
     * Check whether a token is in phrasal format (whitespace-separated words).
     *
     * @param token token to check
     * @return true if the token contains more than one word
     */
    private boolean isPhrasal(String token) {
        return token.matches("\\S+(?:\\s\\S+)+");
    }
    /**
     * Get the intersection of the postings of a phrase's terms.
     *
     * @param token phrase whose terms are looked up in the index
     * @param n collection size, used for the idf of the merged postings
     * @return the merged postings, or null if any term is missing from the index
     */
    private Postings getPhrasalPostings(String token, int n) {
        String[] split = token.split(" ");
        List<Postings> postings = Arrays.stream(split)
                .map(index::get)
                .filter(Objects::nonNull)
                .collect(Collectors.toList());
        // Merge only when every term occurs in the index; a phrase cannot
        // match if one of its terms is missing.
        if (postings.size() == split.length)
            return Postings.merge(n, postings.toArray(new Postings[0]));
        return null;
    }
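    // The positional intersection itself is delegated to Postings.merge, which
    // is assumed here to keep only documents where the phrase terms occur at
    // consecutive positions.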
    /**
     * Update the query vector based on pseudo relevance feedback.
     * This method implements the Rocchio algorithm and assumes the ranks map
     * is sorted in descending order.
     *
     * @param query query vector, updated in place
     * @param documents document vectors
     * @param ranks sorted document ranks
     */
    private void applyPseudoRF(Map<String, Double> query, Map<String, Map<String,Double>> documents, Map<String,Double> ranks) {
        // Assume the top PSEUDO_RF_K documents are relevant.
        final int k = Math.min(ranks.size(), PSEUDO_RF_K);
        // Number of documents outside the relevant set.
        int ik = documents.size() - k;
        MutableInteger i = new MutableInteger();
        Map<String,MutableDouble> relevant = new HashMap<>();
        Map<String,MutableDouble> irrelevant = new HashMap<>();
        // Accumulate the term sums for the relevant and non-relevant sets.
ranks.entrySet().stream().forEach(r -> {
Map<String,Double> document = documents.get(r.getKey());
document.entrySet().stream().forEach(d -> {
String token = d.getKey();
if (i.getValue() < k) {
// Calc as relevant.
MutableDouble value = relevant.get(token);
if (value == null) {
value = new MutableDouble();
relevant.put(token, value);
}
value.add(d.getValue());
} else {
// Calc as irrelevant.
MutableDouble value = irrelevant.get(token);
if (value == null) {
value = new MutableDouble();
irrelevant.put(token, value);
}
value.add(d.getValue());
}
});
i.increment();
});
        // Scale the sums to centroids: beta/k for relevant, gamma/(n-k) for non-relevant.
        relevant.replaceAll((key, value) -> value.setValue(value.getValue() * PSEUDO_RF_BETA / k));
        irrelevant.replaceAll((key, value) -> value.setValue(value.getValue() * PSEUDO_RF_GAMMA / ik));
        // Add the relevant centroid to the query vector.
        relevant.forEach((token, value) -> {
            double docValue = value.getValue();
            Double queryValue = query.get(token);
            if (queryValue == null) {
                query.put(token, docValue);
            } else {
                query.put(token, PSEUDO_RF_ALPHA * queryValue + docValue);
            }
        });
        // Subtract the non-relevant centroid from the query vector.
        irrelevant.forEach((token, value) -> {
            double docValue = value.getValue();
            Double queryValue = query.get(token);
            if (queryValue == null) {
                query.put(token, -docValue);
            } else {
                query.put(token, PSEUDO_RF_ALPHA * queryValue - docValue);
            }
        });
}
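    /*
     * The update above is the standard Rocchio formula,
     *   q' = alpha*q + (beta/|Dr|) * sum(d in Dr) - (gamma/|Dnr|) * sum(d in Dnr),
     * where Dr is the top-k ranked documents; note that the divisor used here
     * for the non-relevant centroid is the collection size minus k.
     */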
    /**
     * Update the index from a document collection.
     *
     * @param dir document collection directory
     * @throws IOException if the collection cannot be read or the index cannot be written
     */
public void update(String dir) throws IOException {
Path path = Paths.get(dir);
        // Check that the collection path exists and is readable.
if (!Files.isDirectory(path) ||
!Files.isReadable(path))
throw new IOException("Invalid collection path");
// Count documents in the collection.
long n;
try (Stream<Path> collections = Files.walk(path, 1)) {
n = collections.filter(p -> Files.isRegularFile(p)).count();
}
// Update index.
try (Stream<Path> collections = Files.walk(path, 1)) {
collections.filter(p -> Files.isRegularFile(p)).forEach(p -> {
            // Strip ',' from the filename since it is the delimiter in the index.
            String fileName = p.getFileName().toString().replace(",", "");
try {
Tokenizer tokens = getNormalizedTokens(new FileReader(p.toFile()));
// Init token position.
int position = 0;
while (tokens.hasNext()) {
                    // Normalize: lowercase, then stem.
String token = stemmer.stem(tokens.next().toLowerCase());
// Ensure the stemmed token is not a number.
if (!isNumeric(token)) {
Postings postings = index.get(token);
if (postings == null) {
postings = new Postings();
index.put(token, postings);
}
postings.addTf(fileName, n, position++);
}
}
} catch (FileNotFoundException e) {
System.out.println("Unable to read document " + fileName);
}
});
}
        // Save the index; this can only be done after the idf values are fixed.
try (BufferedWriter writer = Files.newBufferedWriter(indexPath)) {
index.entrySet().stream().forEach(record -> {
try {
writer.write(record.getKey() + "," + record.getValue().toString());
writer.newLine();
} catch (IOException e) {
System.out.println("Unable to save the index " + e.getMessage());
}
});
}
        // Copy the stopwords file into the index directory for later searches,
        // if stopwords were loaded.
        if (stopwordsPath != null)
            Files.copy(stopwordsPath, indexPath.getParent().resolve(STOPWORDS_FILE),
                    StandardCopyOption.REPLACE_EXISTING);
}
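    // Each line of index.txt is "<term>,<postings>"; the postings string
    // written by Postings.toString() is assumed to carry the per-document
    // term positions and the term's idf, matching the format parsed in read().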
    /**
     * Sort a map by value in descending order.
     * Adapted from https://www.mkyong.com/java/how-to-sort-a-map-in-java/
     *
     * @param <K> key type
     * @param <V> value type
     * @param unsortMap map to sort
     * @return a LinkedHashMap with entries in descending value order
     */
    private static <K, V extends Comparable<? super V>> Map<K, V> sort(Map<K, V> unsortMap) {
        List<Map.Entry<K, V>> list = new LinkedList<>(unsortMap.entrySet());
        Collections.sort(list, (o1, o2) -> o2.getValue().compareTo(o1.getValue()));
        Map<K, V> result = new LinkedHashMap<>();
        list.forEach(entry -> result.put(entry.getKey(), entry.getValue()));
        return result;
    }
}
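
/*
 * A minimal usage sketch (hypothetical, not part of the original class):
 * it assumes the companion classes referenced above (Tokenizer, Postings,
 * PorterStemmer, MutableDouble, MutableInteger) are on the classpath and
 * that "collection" is a directory of plain-text documents.
 */
class IndexerDemo {
    public static void main(String[] args) throws Exception {
        Indexer indexer = new Indexer("index");  // index directory; created if missing
        indexer.update("collection");            // tokenize, stem and write index/index.txt
        indexer.read();                          // load the index back into memory
        // Top 10 documents, with pseudo relevance feedback enabled.
        List<String> results = indexer.search("information retrieval", 10, true);
        if (results != null)
            results.forEach(System.out::println);
    }
}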