Merge lp:~jpakkane/libcolumbus/hud-rework into lp:libcolumbus

Proposed by Jussi Pakkanen on 2014-02-28
Status: Merged
Approved by: Pete Woods on 2014-03-03
Approved revision: 485
Merged at revision: 461
Proposed branch: lp:~jpakkane/libcolumbus/hud-rework
Merge into: lp:libcolumbus
Diff against target: 468 lines (+291/-19)
9 files modified
CMakeLists.txt (+1/-1)
debian/changelog (+6/-0)
include/Matcher.hh (+11/-0)
include/WordStore.hh (+1/-0)
src/MatchResults.cc (+4/-1)
src/Matcher.cc (+71/-12)
src/WordStore.cc (+8/-4)
test/MatcherTest.cc (+171/-1)
test/TrieTest.cc (+18/-0)
To merge this branch: bzr merge lp:~jpakkane/libcolumbus/hud-rework
Reviewer Review Type Date Requested Status
Pete Woods (community) 2014-02-28 Approve on 2014-03-03
PS Jenkins bot (community) continuous-integration Approve on 2014-02-28
Review via email: mp+208799@code.launchpad.net

Commit message

Add new online search mode to get better performance in HUD.

Description of the change

Add new online search mode to get better performance in HUD.

To post a comment you must log in.
lp:~jpakkane/libcolumbus/hud-rework updated on 2014-02-28
483. By Jussi Pakkanen on 2014-02-28

Bumped version number.

Pete Woods (pete-woods) wrote :

Requests:

* Increment the debian version number
* Add some more "save foo" noise (before and after "save") to the save test

lp:~jpakkane/libcolumbus/hud-rework updated on 2014-02-28
484. By Jussi Pakkanen on 2014-02-28

Some test cleanup.

Jussi Pakkanen (jpakkane) wrote :

Fixed.

lp:~jpakkane/libcolumbus/hud-rework updated on 2014-02-28
485. By Jussi Pakkanen on 2014-02-28

Blah.

Pete Woods (pete-woods) wrote :

Looks good to me! Tried with the dependent branch of HUD (lp:~pete-woods/hud/tweak-search-parameters).

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'CMakeLists.txt'
2--- CMakeLists.txt 2013-08-09 19:35:42 +0000
3+++ CMakeLists.txt 2014-02-28 14:46:51 +0000
4@@ -38,7 +38,7 @@
5 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden")
6
7 set(SO_VERSION_MAJOR "1")
8-set(SO_VERSION_MINOR "0")
9+set(SO_VERSION_MINOR "1")
10 set(SO_VERSION_PATCH "0")
11
12 set(SO_VERSION "${SO_VERSION_MAJOR}.${SO_VERSION_MINOR}.${SO_VERSION_PATCH}")
13
14=== modified file 'debian/changelog'
15--- debian/changelog 2014-01-20 19:43:49 +0000
16+++ debian/changelog 2014-02-28 14:46:51 +0000
17@@ -1,3 +1,9 @@
18+libcolumbus (1.1.0-0ubuntu1) UNRELEASED; urgency=medium
19+
20+ * New online search mode
21+
22+ -- Jussi Pakkanen <jussi.pakkanen@ubuntu.com> Fri, 28 Feb 2014 15:44:25 +0200
23+
24 libcolumbus (1.0.0+14.04.20140120-0ubuntu1) trusty; urgency=low
25
26 * New rebuild forced
27
28=== modified file 'include/Matcher.hh'
29--- include/Matcher.hh 2013-08-07 12:20:52 +0000
30+++ include/Matcher.hh 2014-02-28 14:46:51 +0000
31@@ -60,6 +60,17 @@
32 void index(const Corpus &c);
33 ErrorValues& getErrorValues();
34 IndexWeights& getIndexWeights();
35+ /*
36+ * This function is optimized for online matches, that is, queries
37+ * that are live updated during typing. It uses slightly different
38+ * search heuristics to ensure results that "feel good" to humans.
39+ *
40+ * The second argument is the field that should be the primary focus.
41+ * Usually it means having the text that will be shown to the user.
42+ * As an example, in the HUD, this field would contain the command
43+ * (and nothing else) that will be executed.
44+ */
45+ MatchResults onlineMatch(const WordList &query, const Word &primaryIndex);
46 };
47
48 COL_NAMESPACE_END
49
50=== modified file 'include/WordStore.hh'
51--- include/WordStore.hh 2013-06-14 12:26:10 +0000
52+++ include/WordStore.hh 2014-02-28 14:46:51 +0000
53@@ -49,6 +49,7 @@
54
55
56 WordID getID(const Word &w);
57+ bool hasWord(const Word &w) const;
58 Word getWord(const WordID id) const;
59 bool hasWord(const WordID id) const;
60 };
61
62=== modified file 'src/MatchResults.cc'
63--- src/MatchResults.cc 2013-06-20 12:25:11 +0000
64+++ src/MatchResults.cc 2014-02-28 14:46:51 +0000
65@@ -96,7 +96,10 @@
66 if(p->sorted)
67 return;
68 MatchResults *me = const_cast<MatchResults*>(this);
69- sort(me->p->results.rbegin(), me->p->results.rend());
70+ stable_sort(me->p->results.rbegin(), me->p->results.rend(),
71+ [](const pair<double, DocumentID> &a, const pair<double, DocumentID> &b) -> bool{
72+ return a.first < b.first;
73+ });
74 me->p->sorted = true;
75 }
76
77
78=== modified file 'src/Matcher.cc'
79--- src/Matcher.cc 2013-08-07 12:20:52 +0000
80+++ src/Matcher.cc 2014-02-28 14:46:51 +0000
81@@ -36,6 +36,7 @@
82 #include <stdexcept>
83 #include <map>
84 #include <vector>
85+#include <algorithm>
86
87 #ifdef HAS_SPARSE_HASH
88 #include <google/sparse_hash_map>
89@@ -98,6 +99,7 @@
90 IndexWeights weights;
91 MatcherStatistics stats;
92 WordStore store;
93+ map<pair<DocumentID, WordID>, size_t> originalSizes; // Lengths of original documents.
94 };
95
96 void ReverseIndex::add(const WordID wordID, const WordID indexID, const DocumentID id) {
97@@ -230,15 +232,6 @@
98 }
99 }
100
101-static void expandQuery(const WordList &query, WordList &expandedQuery) {
102- for(size_t i=0; i<query.size(); i++)
103- expandedQuery.addWord(query[i]);
104-
105- for(size_t i=0; i<query.size()-1; i++) {
106- expandedQuery.addWord(query[i].join(query[i+1]));
107- }
108-}
109-
110 static bool subtermsMatch(MatcherPrivate *p, const ResultFilter &filter, size_t term, DocumentID id) {
111 for(size_t subTerm=0; subTerm < filter.numSubTerms(term); subTerm++) {
112 const Word &filterName = filter.getField(term, subTerm);
113@@ -286,6 +279,10 @@
114 const Word &fieldName = textNames[ti];
115 const WordID fieldID = p->store.getID(fieldName);
116 const WordList &text = d.getText(fieldName);
117+ pair<DocumentID, WordID> lengths;
118+ lengths.first = d.getID();
119+ lengths.second = fieldID;
120+ p->originalSizes[lengths] = text.size();
121 for(size_t wi=0; wi<text.size(); wi++) {
122 const Word &word = text[wi];
123 const WordID wordID = p->store.getID(word);
124@@ -336,16 +333,14 @@
125 const int maxIterations = 1;
126 const int increment = LevenshteinIndex::getDefaultError();
127 const size_t minMatches = 10;
128- WordList expandedQuery;
129 MatchResults allMatches;
130
131 if(query.size() == 0)
132 return matchedDocuments;
133- expandQuery(query, expandedQuery);
134 // Try to search with ever growing error until we find enough matches.
135 for(int i=0; i<maxIterations; i++) {
136 MatchResults matches;
137- matchWithRelevancy(expandedQuery, params, i*increment, matches);
138+ matchWithRelevancy(query, params, i*increment, matches);
139 if(matches.size() >= minMatches || i == maxIterations-1) {
140 allMatches.addResults(matches);
141 break;
142@@ -392,5 +387,69 @@
143 return p->weights;
144 }
145
146+static map<DocumentID, size_t> countExacts(MatcherPrivate *p, const WordList &query, const WordID indexID) {
147+ map<DocumentID, size_t> matchCounts;
148+ for(size_t i=0; i<query.size(); i++) {
149+ const Word &w = query[i];
150+ if(w.length() == 0 || !p->store.hasWord(w)) {
151+ continue;
152+ }
153+ WordID curWord = p->store.getID(w);
154+ vector<DocumentID> exacts;
155+ p->reverseIndex.findDocuments(curWord, indexID, exacts);
156+ for(const auto &i : exacts) {
157+ matchCounts[i]++; // Default is zero initialisation.
158+ }
159+ }
160+ return matchCounts;
161+}
162+
163+struct DocCount {
164+ DocumentID id;
165+ size_t matches;
166+};
167+
168+MatchResults Matcher::onlineMatch(const WordList &query, const Word &primaryIndex) {
169+ MatchResults results;
170+ set<DocumentID> exactMatched;
171+ map<DocumentID, double> accumulator;
172+ if(!p->store.hasWord(primaryIndex)) {
173+ string msg("Index named ");
174+ msg += primaryIndex.asUtf8();
175+ msg += " is not known";
176+ throw invalid_argument(msg);
177+ }
178+ WordID indexID = p->store.getID(primaryIndex);
179+ // How many times each document matched with zero error.
180+ vector<DocCount> stats;
181+ for(const auto &i : countExacts(p, query, indexID)) {
182+ DocCount c;
183+ pair<DocumentID, WordID> key;
184+ exactMatched.insert(i.first);
185+ key.first = i.first;
186+ key.second = indexID;
187+ c.id = i.first;
188+ c.matches = i.second;
189+ stats.push_back(c);
190+ }
191+ for(const auto &i: stats) {
192+ accumulator[i.id] = 2*i.matches;
193+ if(i.matches == query.size()
194+ && i.matches == p->originalSizes[make_pair(i.id, indexID)]) { // Perfect match.
195+ accumulator[i.id] += 100;
196+ }
197+ }
198+ // Merge in fuzzy matches.
199+ MatchResults fuzzyResults = match(query);
200+ for(size_t i = 0; i<fuzzyResults.size(); i++) {
201+ DocumentID docid = fuzzyResults.getDocumentID(i);
202+ accumulator[docid] += fuzzyResults.getRelevancy(i);
203+ }
204+ for(const auto &i : accumulator) {
205+ results.addResult(i.first, i.second);
206+ }
207+ return results;
208+}
209+
210 COL_NAMESPACE_END
211
212
213=== modified file 'src/WordStore.cc'
214--- src/WordStore.cc 2013-01-31 10:23:45 +0000
215+++ src/WordStore.cc 2014-02-28 14:46:51 +0000
216@@ -53,15 +53,19 @@
217 }
218
219 WordID WordStore::getID(const Word &w) {
220- TrieOffset node = p->words.findWord(w);
221- if(node)
222- return p->words.getWordID(node);
223- node = p->words.insertWord(w, p->wordIndex.size());
224+ if(p->words.hasWord(w)) {
225+ return p->words.getWordID(p->words.findWord(w));
226+ }
227+ TrieOffset node = p->words.insertWord(w, p->wordIndex.size());
228 p->wordIndex.push_back(node);
229 WordID result = p->wordIndex.size()-1;
230 return result;
231 }
232
233+bool WordStore::hasWord(const Word &w) const {
234+ return p->words.hasWord(w);
235+}
236+
237 Word WordStore::getWord(const WordID id) const {
238 if(!hasWord(id)) {
239 throw out_of_range("Tried to access non-existing WordID in WordStore.");
240
241=== modified file 'test/MatcherTest.cc'
242--- test/MatcherTest.cc 2013-06-20 12:10:40 +0000
243+++ test/MatcherTest.cc 2014-02-28 14:46:51 +0000
244@@ -23,6 +23,7 @@
245 #include "WordList.hh"
246 #include "Document.hh"
247 #include "MatchResults.hh"
248+#include "ColumbusHelpers.hh"
249 #include <cassert>
250
251 using namespace Columbus;
252@@ -123,7 +124,169 @@
253 c.addDocument(d2);
254 m.index(c);
255
256- matches = m.match("Sara Michell Geller");
257+ matches = m.match("Sari Michell Geller");
258+ assert(matches.getDocumentID(0) == correct);
259+}
260+
261+void testSentence() {
262+ Corpus c;
263+ DocumentID correct = 1;
264+ DocumentID wrong = 0;
265+ Document d1(correct);
266+ Document d2(wrong);
267+ Word fieldName("name");
268+ Word secondName("context");
269+ Matcher m;
270+ MatchResults matches;
271+
272+ d1.addText(fieldName, "Fit Canvas to Layers");
273+ d1.addText(secondName, "View Zoom (100%)");
274+ d2.addText(fieldName, "Fit image in Window");
275+ d2.addText(secondName, "Image");
276+
277+ c.addDocument(d1);
278+ c.addDocument(d2);
279+
280+ m.index(c);
281+ matches = m.match("fit canvas to layers");
282+ assert(matches.getDocumentID(0) == correct);
283+}
284+
285+void testExactOrder() {
286+ Corpus c;
287+ DocumentID correct = 1;
288+ DocumentID wrong = 0;
289+ DocumentID moreWrong = 100;
290+ Document d1(correct);
291+ Document d2(wrong);
292+ Document d3(moreWrong);
293+ Word fieldName("name");
294+ Word secondName("context");
295+ Matcher m;
296+ MatchResults matches;
297+ WordList q = splitToWords("fit canvas to layers");
298+ d1.addText(fieldName, "Fit Canvas to Layers");
299+ d1.addText(secondName, "View Zoom (100%)");
300+ d2.addText(fieldName, "Fit image in Window");
301+ d2.addText(secondName, "Image");
302+ d3.addText(fieldName, "Not matching.");
303+ d3.addText(secondName, "fit canvas to layers");
304+ c.addDocument(d1);
305+ c.addDocument(d2);
306+ c.addDocument(d3);
307+
308+ m.index(c);
309+ matches = m.onlineMatch(q, fieldName);
310+ assert(matches.size() >= 1);
311+ assert(matches.getDocumentID(0) == correct);
312+}
313+
314+void testSmallestMatch() {
315+ Corpus c;
316+ DocumentID correct = 1;
317+ DocumentID wrong = 0;
318+ Document d1(correct);
319+ Document d2(wrong);
320+ Word fieldName("name");
321+ Word field2("dummy");
322+ Matcher m;
323+ MatchResults matches;
324+ WordList q = splitToWords("save");
325+ d1.addText(fieldName, "save");
326+ d1.addText(field2, "lots of text to ensure statistics of this field are ignored");
327+ d2.addText(fieldName, "save as");
328+ c.addDocument(d1);
329+ c.addDocument(d2);
330+
331+ m.index(c);
332+ matches = m.onlineMatch(q, fieldName);
333+ assert(matches.size() == 2);
334+ assert(matches.getDocumentID(0) == correct);
335+}
336+
337+void noCommonMatch() {
338+ Corpus c;
339+ DocumentID correct = 1;
340+ Document d1(correct);
341+ Word fieldName("name");
342+ Word field2("dummy");
343+ Matcher m;
344+ MatchResults matches;
345+ WordList q = splitToWords("fit canvas to selection");
346+ d1.addText(fieldName, "Preparing your Images for the Web");
347+ d1.addText(fieldName, "Help user manual");
348+ c.addDocument(d1);
349+
350+ m.index(c);
351+ matches = m.onlineMatch(q, fieldName);
352+ assert(matches.size() == 0);
353+}
354+
355+void emptyMatch() {
356+ Corpus c;
357+ DocumentID correct = 1;
358+ Document d1(correct);
359+ Word fieldName("name");
360+ Word field2("dummy");
361+ Matcher m;
362+ MatchResults matches;
363+ WordList q;
364+ d1.addText(fieldName, "Preparing your Images for the Web");
365+ d1.addText(fieldName, "Help user manual");
366+ c.addDocument(d1);
367+
368+ m.index(c);
369+ matches = m.onlineMatch(q, fieldName);
370+ assert(matches.size() == 0);
371+}
372+
373+void testMatchCount() {
374+ Corpus c;
375+ DocumentID correct = 1;
376+ DocumentID wrong = 0;
377+ Document d1(correct);
378+ Document d2(wrong);
379+ Word fieldName("name");
380+ Word secondName("context");
381+ Matcher m;
382+ MatchResults matches;
383+ WordList q = splitToWords("fit canvas to selection");
384+ d1.addText(fieldName, "Fit Canvas to Layers");
385+ d1.addText(secondName, "View Zoom (100%)");
386+ d2.addText(fieldName, "Selection editor");
387+ d2.addText(secondName, "Windows dockable dialogs");
388+ c.addDocument(d1);
389+ c.addDocument(d2);
390+
391+ m.index(c);
392+ matches = m.onlineMatch(q, fieldName);
393+ assert(matches.size() == 2);
394+ assert(matches.getDocumentID(0) == correct);
395+}
396+
397+void testPerfect() {
398+ Corpus c;
399+ DocumentID correct = 0;
400+ Document d1(1);
401+ Document d2(correct);
402+ Document d3(2);
403+ Document d4(3);
404+ Word fieldName("name");
405+ Matcher m;
406+ MatchResults matches;
407+ WordList q = splitToWords("save");
408+ d1.addText(fieldName, "Save as");
409+ d2.addText(fieldName, "Save");
410+ d3.addText(fieldName, "Save yourself");
411+ d4.addText(fieldName, "Save the whales");
412+ c.addDocument(d1);
413+ c.addDocument(d2);
414+ c.addDocument(d3);
415+ c.addDocument(d4);
416+
417+ m.index(c);
418+ matches = m.onlineMatch(q, fieldName);
419+ assert(matches.size() >= 1);
420 assert(matches.getDocumentID(0) == correct);
421 }
422
423@@ -132,6 +295,13 @@
424 testMatcher();
425 testRelevancy();
426 testMultiWord();
427+ testSentence();
428+ testExactOrder();
429+ testSmallestMatch();
430+ noCommonMatch();
431+ emptyMatch();
432+ testMatchCount();
433+ testPerfect();
434 } catch(const std::exception &e) {
435 fprintf(stderr, "Fail: %s\n", e.what());
436 return 666;
437
438=== modified file 'test/TrieTest.cc'
439--- test/TrieTest.cc 2013-04-03 13:50:54 +0000
440+++ test/TrieTest.cc 2014-02-28 14:46:51 +0000
441@@ -46,10 +46,28 @@
442 assert(result == w2);
443 }
444
445+void testHas() {
446+ Trie t;
447+ Word w1("abc");
448+ Word w2("abd");
449+ Word w3("a");
450+ Word w4("x");
451+ Word result;
452+
453+ WordID i1 = 1;
454+
455+ assert(t.numWords() == 0);
456+ t.insertWord(w1, i1);
457+ assert(t.hasWord(w1));
458+ assert(!t.hasWord(w2));
459+ assert(!t.hasWord(w3));
460+ assert(!t.hasWord(w4));
461+}
462
463 int main(int /*argc*/, char **/*argv*/) {
464 // Move basic tests from levtrietest here.
465 testWordBuilding();
466+ testHas();
467 return 0;
468 }
469

Subscribers

People subscribed via source and target branches

to all changes: