Merge lp:~jpakkane/libcolumbus/hud-rework into lp:libcolumbus
- hud-rework
- Merge into trunk
Status: Merged
Approved by: Pete Woods
Approved revision: 485
Merged at revision: 461
Proposed branch: lp:~jpakkane/libcolumbus/hud-rework
Merge into: lp:libcolumbus
Diff against target: 468 lines (+291/-19), 9 files modified
  CMakeLists.txt (+1/-1), debian/changelog (+6/-0), include/Matcher.hh (+11/-0), include/WordStore.hh (+1/-0), src/MatchResults.cc (+4/-1), src/Matcher.cc (+71/-12), src/WordStore.cc (+8/-4), test/MatcherTest.cc (+171/-1), test/TrieTest.cc (+18/-0)
To merge this branch: bzr merge lp:~jpakkane/libcolumbus/hud-rework
Related bugs: none listed
Reviewer | Review Type | Date Requested | Status
---|---|---|---
Pete Woods (community) | | | Approve
PS Jenkins bot (community) | continuous-integration | | Approve

Review via email: mp+208799@code.launchpad.net
Commit message
Add new online search mode to get better performance in HUD.
Description of the change
Add new online search mode to get better performance in HUD.
483. By Jussi Pakkanen — Bumped version number.
Pete Woods (pete-woods) wrote:
PS Jenkins bot (ps-jenkins) wrote:
PASSED: Continuous integration, rev:483
http://
Executed test runs:
SUCCESS: http://
SUCCESS: http://
SUCCESS: http://
Click here to trigger a rebuild:
http://
484. By Jussi Pakkanen — Some test cleanup.
Jussi Pakkanen (jpakkane) wrote:
Fixed.
PS Jenkins bot (ps-jenkins) wrote:
FAILED: Continuous integration, rev:484
http://
Executed test runs:
FAILURE: http://
FAILURE: http://
FAILURE: http://
Click here to trigger a rebuild:
http://
485. By Jussi Pakkanen — Blah.
PS Jenkins bot (ps-jenkins) wrote:
PASSED: Continuous integration, rev:485
http://
Executed test runs:
SUCCESS: http://
SUCCESS: http://
SUCCESS: http://
Click here to trigger a rebuild:
http://
Pete Woods (pete-woods) wrote:
Looks good to me! Tried with the dependent branch of HUD (lp:~pete-woods/hud/tweak-search-parameters).
Preview Diff
1 | === modified file 'CMakeLists.txt' |
2 | --- CMakeLists.txt 2013-08-09 19:35:42 +0000 |
3 | +++ CMakeLists.txt 2014-02-28 14:46:51 +0000 |
4 | @@ -38,7 +38,7 @@ |
5 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden") |
6 | |
7 | set(SO_VERSION_MAJOR "1") |
8 | -set(SO_VERSION_MINOR "0") |
9 | +set(SO_VERSION_MINOR "1") |
10 | set(SO_VERSION_PATCH "0") |
11 | |
12 | set(SO_VERSION "${SO_VERSION_MAJOR}.${SO_VERSION_MINOR}.${SO_VERSION_PATCH}") |
13 | |
14 | === modified file 'debian/changelog' |
15 | --- debian/changelog 2014-01-20 19:43:49 +0000 |
16 | +++ debian/changelog 2014-02-28 14:46:51 +0000 |
17 | @@ -1,3 +1,9 @@ |
18 | +libcolumbus (1.1.0-0ubuntu1) UNRELEASED; urgency=medium |
19 | + |
20 | + * New online search mode |
21 | + |
22 | + -- Jussi Pakkanen <jussi.pakkanen@ubuntu.com> Fri, 28 Feb 2014 15:44:25 +0200 |
23 | + |
24 | libcolumbus (1.0.0+14.04.20140120-0ubuntu1) trusty; urgency=low |
25 | |
26 | * New rebuild forced |
27 | |
28 | === modified file 'include/Matcher.hh' |
29 | --- include/Matcher.hh 2013-08-07 12:20:52 +0000 |
30 | +++ include/Matcher.hh 2014-02-28 14:46:51 +0000 |
31 | @@ -60,6 +60,17 @@ |
32 | void index(const Corpus &c); |
33 | ErrorValues& getErrorValues(); |
34 | IndexWeights& getIndexWeights(); |
35 | + /* |
36 | + * This function is optimized for online matches, that is, queries |
37 | + * that are live updated during typing. It uses slightly different |
38 | + * search heuristics to ensure results that "feel good" to humans. |
39 | + * |
40 | + * The second argument is the field that should be the primary focus. |
41 | + * Usually it means having the text that will be shown to the user. |
42 | + * As an example, in the HUD, this field would contain the command |
43 | + * (and nothing else) that will be executed. |
44 | + */ |
45 | + MatchResults onlineMatch(const WordList &query, const Word &primaryIndex); |
46 | }; |
47 | |
48 | COL_NAMESPACE_END |
49 | |
50 | === modified file 'include/WordStore.hh' |
51 | --- include/WordStore.hh 2013-06-14 12:26:10 +0000 |
52 | +++ include/WordStore.hh 2014-02-28 14:46:51 +0000 |
53 | @@ -49,6 +49,7 @@ |
54 | |
55 | |
56 | WordID getID(const Word &w); |
57 | + bool hasWord(const Word &w) const; |
58 | Word getWord(const WordID id) const; |
59 | bool hasWord(const WordID id) const; |
60 | }; |
61 | |
62 | === modified file 'src/MatchResults.cc' |
63 | --- src/MatchResults.cc 2013-06-20 12:25:11 +0000 |
64 | +++ src/MatchResults.cc 2014-02-28 14:46:51 +0000 |
65 | @@ -96,7 +96,10 @@ |
66 | if(p->sorted) |
67 | return; |
68 | MatchResults *me = const_cast<MatchResults*>(this); |
69 | - sort(me->p->results.rbegin(), me->p->results.rend()); |
70 | + stable_sort(me->p->results.rbegin(), me->p->results.rend(), |
71 | + [](const pair<double, DocumentID> &a, const pair<double, DocumentID> &b) -> bool{ |
72 | + return a.first < b.first; |
73 | + }); |
74 | me->p->sorted = true; |
75 | } |
76 | |
77 | |
78 | === modified file 'src/Matcher.cc' |
79 | --- src/Matcher.cc 2013-08-07 12:20:52 +0000 |
80 | +++ src/Matcher.cc 2014-02-28 14:46:51 +0000 |
81 | @@ -36,6 +36,7 @@ |
82 | #include <stdexcept> |
83 | #include <map> |
84 | #include <vector> |
85 | +#include <algorithm> |
86 | |
87 | #ifdef HAS_SPARSE_HASH |
88 | #include <google/sparse_hash_map> |
89 | @@ -98,6 +99,7 @@ |
90 | IndexWeights weights; |
91 | MatcherStatistics stats; |
92 | WordStore store; |
93 | + map<pair<DocumentID, WordID>, size_t> originalSizes; // Lengths of original documents. |
94 | }; |
95 | |
96 | void ReverseIndex::add(const WordID wordID, const WordID indexID, const DocumentID id) { |
97 | @@ -230,15 +232,6 @@ |
98 | } |
99 | } |
100 | |
101 | -static void expandQuery(const WordList &query, WordList &expandedQuery) { |
102 | - for(size_t i=0; i<query.size(); i++) |
103 | - expandedQuery.addWord(query[i]); |
104 | - |
105 | - for(size_t i=0; i<query.size()-1; i++) { |
106 | - expandedQuery.addWord(query[i].join(query[i+1])); |
107 | - } |
108 | -} |
109 | - |
110 | static bool subtermsMatch(MatcherPrivate *p, const ResultFilter &filter, size_t term, DocumentID id) { |
111 | for(size_t subTerm=0; subTerm < filter.numSubTerms(term); subTerm++) { |
112 | const Word &filterName = filter.getField(term, subTerm); |
113 | @@ -286,6 +279,10 @@ |
114 | const Word &fieldName = textNames[ti]; |
115 | const WordID fieldID = p->store.getID(fieldName); |
116 | const WordList &text = d.getText(fieldName); |
117 | + pair<DocumentID, WordID> lengths; |
118 | + lengths.first = d.getID(); |
119 | + lengths.second = fieldID; |
120 | + p->originalSizes[lengths] = text.size(); |
121 | for(size_t wi=0; wi<text.size(); wi++) { |
122 | const Word &word = text[wi]; |
123 | const WordID wordID = p->store.getID(word); |
124 | @@ -336,16 +333,14 @@ |
125 | const int maxIterations = 1; |
126 | const int increment = LevenshteinIndex::getDefaultError(); |
127 | const size_t minMatches = 10; |
128 | - WordList expandedQuery; |
129 | MatchResults allMatches; |
130 | |
131 | if(query.size() == 0) |
132 | return matchedDocuments; |
133 | - expandQuery(query, expandedQuery); |
134 | // Try to search with ever growing error until we find enough matches. |
135 | for(int i=0; i<maxIterations; i++) { |
136 | MatchResults matches; |
137 | - matchWithRelevancy(expandedQuery, params, i*increment, matches); |
138 | + matchWithRelevancy(query, params, i*increment, matches); |
139 | if(matches.size() >= minMatches || i == maxIterations-1) { |
140 | allMatches.addResults(matches); |
141 | break; |
142 | @@ -392,5 +387,69 @@ |
143 | return p->weights; |
144 | } |
145 | |
146 | +static map<DocumentID, size_t> countExacts(MatcherPrivate *p, const WordList &query, const WordID indexID) { |
147 | + map<DocumentID, size_t> matchCounts; |
148 | + for(size_t i=0; i<query.size(); i++) { |
149 | + const Word &w = query[i]; |
150 | + if(w.length() == 0 || !p->store.hasWord(w)) { |
151 | + continue; |
152 | + } |
153 | + WordID curWord = p->store.getID(w); |
154 | + vector<DocumentID> exacts; |
155 | + p->reverseIndex.findDocuments(curWord, indexID, exacts); |
156 | + for(const auto &i : exacts) { |
157 | + matchCounts[i]++; // Default is zero initialisation. |
158 | + } |
159 | + } |
160 | + return matchCounts; |
161 | +} |
162 | + |
163 | +struct DocCount { |
164 | + DocumentID id; |
165 | + size_t matches; |
166 | +}; |
167 | + |
168 | +MatchResults Matcher::onlineMatch(const WordList &query, const Word &primaryIndex) { |
169 | + MatchResults results; |
170 | + set<DocumentID> exactMatched; |
171 | + map<DocumentID, double> accumulator; |
172 | + if(!p->store.hasWord(primaryIndex)) { |
173 | + string msg("Index named "); |
174 | + msg += primaryIndex.asUtf8(); |
175 | + msg += " is not known"; |
176 | + throw invalid_argument(msg); |
177 | + } |
178 | + WordID indexID = p->store.getID(primaryIndex); |
179 | + // How many times each document matched with zero error. |
180 | + vector<DocCount> stats; |
181 | + for(const auto &i : countExacts(p, query, indexID)) { |
182 | + DocCount c; |
183 | + pair<DocumentID, WordID> key; |
184 | + exactMatched.insert(i.first); |
185 | + key.first = i.first; |
186 | + key.second = indexID; |
187 | + c.id = i.first; |
188 | + c.matches = i.second; |
189 | + stats.push_back(c); |
190 | + } |
191 | + for(const auto &i: stats) { |
192 | + accumulator[i.id] = 2*i.matches; |
193 | + if(i.matches == query.size() |
194 | + && i.matches == p->originalSizes[make_pair(i.id, indexID)]) { // Perfect match. |
195 | + accumulator[i.id] += 100; |
196 | + } |
197 | + } |
198 | + // Merge in fuzzy matches. |
199 | + MatchResults fuzzyResults = match(query); |
200 | + for(size_t i = 0; i<fuzzyResults.size(); i++) { |
201 | + DocumentID docid = fuzzyResults.getDocumentID(i); |
202 | + accumulator[docid] += fuzzyResults.getRelevancy(i); |
203 | + } |
204 | + for(const auto &i : accumulator) { |
205 | + results.addResult(i.first, i.second); |
206 | + } |
207 | + return results; |
208 | +} |
209 | + |
210 | COL_NAMESPACE_END |
211 | |
212 | |
213 | === modified file 'src/WordStore.cc' |
214 | --- src/WordStore.cc 2013-01-31 10:23:45 +0000 |
215 | +++ src/WordStore.cc 2014-02-28 14:46:51 +0000 |
216 | @@ -53,15 +53,19 @@ |
217 | } |
218 | |
219 | WordID WordStore::getID(const Word &w) { |
220 | - TrieOffset node = p->words.findWord(w); |
221 | - if(node) |
222 | - return p->words.getWordID(node); |
223 | - node = p->words.insertWord(w, p->wordIndex.size()); |
224 | + if(p->words.hasWord(w)) { |
225 | + return p->words.getWordID(p->words.findWord(w)); |
226 | + } |
227 | + TrieOffset node = p->words.insertWord(w, p->wordIndex.size()); |
228 | p->wordIndex.push_back(node); |
229 | WordID result = p->wordIndex.size()-1; |
230 | return result; |
231 | } |
232 | |
233 | +bool WordStore::hasWord(const Word &w) const { |
234 | + return p->words.hasWord(w); |
235 | +} |
236 | + |
237 | Word WordStore::getWord(const WordID id) const { |
238 | if(!hasWord(id)) { |
239 | throw out_of_range("Tried to access non-existing WordID in WordStore."); |
240 | |
241 | === modified file 'test/MatcherTest.cc' |
242 | --- test/MatcherTest.cc 2013-06-20 12:10:40 +0000 |
243 | +++ test/MatcherTest.cc 2014-02-28 14:46:51 +0000 |
244 | @@ -23,6 +23,7 @@ |
245 | #include "WordList.hh" |
246 | #include "Document.hh" |
247 | #include "MatchResults.hh" |
248 | +#include "ColumbusHelpers.hh" |
249 | #include <cassert> |
250 | |
251 | using namespace Columbus; |
252 | @@ -123,7 +124,169 @@ |
253 | c.addDocument(d2); |
254 | m.index(c); |
255 | |
256 | - matches = m.match("Sara Michell Geller"); |
257 | + matches = m.match("Sari Michell Geller"); |
258 | + assert(matches.getDocumentID(0) == correct); |
259 | +} |
260 | + |
261 | +void testSentence() { |
262 | + Corpus c; |
263 | + DocumentID correct = 1; |
264 | + DocumentID wrong = 0; |
265 | + Document d1(correct); |
266 | + Document d2(wrong); |
267 | + Word fieldName("name"); |
268 | + Word secondName("context"); |
269 | + Matcher m; |
270 | + MatchResults matches; |
271 | + |
272 | + d1.addText(fieldName, "Fit Canvas to Layers"); |
273 | + d1.addText(secondName, "View Zoom (100%)"); |
274 | + d2.addText(fieldName, "Fit image in Window"); |
275 | + d2.addText(secondName, "Image"); |
276 | + |
277 | + c.addDocument(d1); |
278 | + c.addDocument(d2); |
279 | + |
280 | + m.index(c); |
281 | + matches = m.match("fit canvas to layers"); |
282 | + assert(matches.getDocumentID(0) == correct); |
283 | +} |
284 | + |
285 | +void testExactOrder() { |
286 | + Corpus c; |
287 | + DocumentID correct = 1; |
288 | + DocumentID wrong = 0; |
289 | + DocumentID moreWrong = 100; |
290 | + Document d1(correct); |
291 | + Document d2(wrong); |
292 | + Document d3(moreWrong); |
293 | + Word fieldName("name"); |
294 | + Word secondName("context"); |
295 | + Matcher m; |
296 | + MatchResults matches; |
297 | + WordList q = splitToWords("fit canvas to layers"); |
298 | + d1.addText(fieldName, "Fit Canvas to Layers"); |
299 | + d1.addText(secondName, "View Zoom (100%)"); |
300 | + d2.addText(fieldName, "Fit image in Window"); |
301 | + d2.addText(secondName, "Image"); |
302 | + d3.addText(fieldName, "Not matching."); |
303 | + d3.addText(secondName, "fit canvas to layers"); |
304 | + c.addDocument(d1); |
305 | + c.addDocument(d2); |
306 | + c.addDocument(d3); |
307 | + |
308 | + m.index(c); |
309 | + matches = m.onlineMatch(q, fieldName); |
310 | + assert(matches.size() >= 1); |
311 | + assert(matches.getDocumentID(0) == correct); |
312 | +} |
313 | + |
314 | +void testSmallestMatch() { |
315 | + Corpus c; |
316 | + DocumentID correct = 1; |
317 | + DocumentID wrong = 0; |
318 | + Document d1(correct); |
319 | + Document d2(wrong); |
320 | + Word fieldName("name"); |
321 | + Word field2("dummy"); |
322 | + Matcher m; |
323 | + MatchResults matches; |
324 | + WordList q = splitToWords("save"); |
325 | + d1.addText(fieldName, "save"); |
326 | + d1.addText(field2, "lots of text to ensure statistics of this field are ignored"); |
327 | + d2.addText(fieldName, "save as"); |
328 | + c.addDocument(d1); |
329 | + c.addDocument(d2); |
330 | + |
331 | + m.index(c); |
332 | + matches = m.onlineMatch(q, fieldName); |
333 | + assert(matches.size() == 2); |
334 | + assert(matches.getDocumentID(0) == correct); |
335 | +} |
336 | + |
337 | +void noCommonMatch() { |
338 | + Corpus c; |
339 | + DocumentID correct = 1; |
340 | + Document d1(correct); |
341 | + Word fieldName("name"); |
342 | + Word field2("dummy"); |
343 | + Matcher m; |
344 | + MatchResults matches; |
345 | + WordList q = splitToWords("fit canvas to selection"); |
346 | + d1.addText(fieldName, "Preparing your Images for the Web"); |
347 | + d1.addText(fieldName, "Help user manual"); |
348 | + c.addDocument(d1); |
349 | + |
350 | + m.index(c); |
351 | + matches = m.onlineMatch(q, fieldName); |
352 | + assert(matches.size() == 0); |
353 | +} |
354 | + |
355 | +void emptyMatch() { |
356 | + Corpus c; |
357 | + DocumentID correct = 1; |
358 | + Document d1(correct); |
359 | + Word fieldName("name"); |
360 | + Word field2("dummy"); |
361 | + Matcher m; |
362 | + MatchResults matches; |
363 | + WordList q; |
364 | + d1.addText(fieldName, "Preparing your Images for the Web"); |
365 | + d1.addText(fieldName, "Help user manual"); |
366 | + c.addDocument(d1); |
367 | + |
368 | + m.index(c); |
369 | + matches = m.onlineMatch(q, fieldName); |
370 | + assert(matches.size() == 0); |
371 | +} |
372 | + |
373 | +void testMatchCount() { |
374 | + Corpus c; |
375 | + DocumentID correct = 1; |
376 | + DocumentID wrong = 0; |
377 | + Document d1(correct); |
378 | + Document d2(wrong); |
379 | + Word fieldName("name"); |
380 | + Word secondName("context"); |
381 | + Matcher m; |
382 | + MatchResults matches; |
383 | + WordList q = splitToWords("fit canvas to selection"); |
384 | + d1.addText(fieldName, "Fit Canvas to Layers"); |
385 | + d1.addText(secondName, "View Zoom (100%)"); |
386 | + d2.addText(fieldName, "Selection editor"); |
387 | + d2.addText(secondName, "Windows dockable dialogs"); |
388 | + c.addDocument(d1); |
389 | + c.addDocument(d2); |
390 | + |
391 | + m.index(c); |
392 | + matches = m.onlineMatch(q, fieldName); |
393 | + assert(matches.size() == 2); |
394 | + assert(matches.getDocumentID(0) == correct); |
395 | +} |
396 | + |
397 | +void testPerfect() { |
398 | + Corpus c; |
399 | + DocumentID correct = 0; |
400 | + Document d1(1); |
401 | + Document d2(correct); |
402 | + Document d3(2); |
403 | + Document d4(3); |
404 | + Word fieldName("name"); |
405 | + Matcher m; |
406 | + MatchResults matches; |
407 | + WordList q = splitToWords("save"); |
408 | + d1.addText(fieldName, "Save as"); |
409 | + d2.addText(fieldName, "Save"); |
410 | + d3.addText(fieldName, "Save yourself"); |
411 | + d4.addText(fieldName, "Save the whales"); |
412 | + c.addDocument(d1); |
413 | + c.addDocument(d2); |
414 | + c.addDocument(d3); |
415 | + c.addDocument(d4); |
416 | + |
417 | + m.index(c); |
418 | + matches = m.onlineMatch(q, fieldName); |
419 | + assert(matches.size() >= 1); |
420 | assert(matches.getDocumentID(0) == correct); |
421 | } |
422 | |
423 | @@ -132,6 +295,13 @@ |
424 | testMatcher(); |
425 | testRelevancy(); |
426 | testMultiWord(); |
427 | + testSentence(); |
428 | + testExactOrder(); |
429 | + testSmallestMatch(); |
430 | + noCommonMatch(); |
431 | + emptyMatch(); |
432 | + testMatchCount(); |
433 | + testPerfect(); |
434 | } catch(const std::exception &e) { |
435 | fprintf(stderr, "Fail: %s\n", e.what()); |
436 | return 666; |
437 | |
438 | === modified file 'test/TrieTest.cc' |
439 | --- test/TrieTest.cc 2013-04-03 13:50:54 +0000 |
440 | +++ test/TrieTest.cc 2014-02-28 14:46:51 +0000 |
441 | @@ -46,10 +46,28 @@ |
442 | assert(result == w2); |
443 | } |
444 | |
445 | +void testHas() { |
446 | + Trie t; |
447 | + Word w1("abc"); |
448 | + Word w2("abd"); |
449 | + Word w3("a"); |
450 | + Word w4("x"); |
451 | + Word result; |
452 | + |
453 | + WordID i1 = 1; |
454 | + |
455 | + assert(t.numWords() == 0); |
456 | + t.insertWord(w1, i1); |
457 | + assert(t.hasWord(w1)); |
458 | + assert(!t.hasWord(w2)); |
459 | + assert(!t.hasWord(w3)); |
460 | + assert(!t.hasWord(w4)); |
461 | +} |
462 | |
463 | int main(int /*argc*/, char **/*argv*/) { |
464 | // Move basic tests from levtrietest here. |
465 | testWordBuilding(); |
466 | + testHas(); |
467 | return 0; |
468 | } |
469 |
Requests:
* Increment the debian version number
* Add some more "save foo" noise (before and after "save") to the save test