Merge lp:~mhr3/zeitgeist/fts-extras into lp:~zeitgeist/zeitgeist/bluebird
- fts-extras
- Merge into bluebird
Proposed by
Michal Hruby
Status: | Merged |
---|---|
Merged at revision: | 391 |
Proposed branch: | lp:~mhr3/zeitgeist/fts-extras |
Merge into: | lp:~zeitgeist/zeitgeist/bluebird |
Prerequisite: | lp:~zeitgeist/zeitgeist/fts++ |
Diff against target: |
1057 lines (+678/-45) 15 files modified
configure.ac (+37/-0) extensions/fts++/Makefile.am (+5/-0) extensions/fts++/fts.cpp (+30/-0) extensions/fts++/fts.h (+13/-0) extensions/fts++/fts.vapi (+10/-0) extensions/fts++/indexer.cpp (+210/-40) extensions/fts++/indexer.h (+16/-1) extensions/fts++/stringutils.cpp (+87/-1) extensions/fts++/stringutils.h (+8/-0) extensions/fts++/test/Makefile.am (+5/-0) extensions/fts++/test/test-indexer.cpp (+89/-0) extensions/fts++/test/test-stringutils.cpp (+95/-0) extensions/fts++/zeitgeist-fts.vala (+17/-0) extensions/fts.vala (+49/-3) src/remote.vala (+7/-0) |
To merge this branch: | bzr merge lp:~mhr3/zeitgeist/fts-extras |
Related bugs: |
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
Siegfried Gevatter | Approve | ||
Review via email: mp+92430@code.launchpad.net |
Commit message
Description of the change
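Adds a few extra features to the FTS (full-text search) extension: a search-with-relevancies API, preprocessing of indexed strings (underscore removal, un-camelcasing, and ASCII folding via an optional dee-icu dependency), and more tests.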
To post a comment you must log in.
lp:~mhr3/zeitgeist/fts-extras
updated
- 438. By Michal Hruby
-
Lower prio of the timeout source
- 439. By Michal Hruby
-
Add more string utils
- 440. By Michal Hruby
-
Preprocess everything we index
- 441. By Michal Hruby
-
Few more fixes
- 442. By Michal Hruby
-
Add more tests
Preview Diff
[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1 | === modified file 'configure.ac' |
2 | --- configure.ac 2012-02-08 18:54:58 +0000 |
3 | +++ configure.ac 2012-02-10 12:11:19 +0000 |
4 | @@ -40,6 +40,30 @@ |
5 | AC_SUBST(ZEITGEIST_LIBS) |
6 | |
7 | ################################################# |
8 | +# Dee-ICU check |
9 | +################################################# |
10 | +DEE_ICU_REQUIRED=1.0.2 |
11 | + |
12 | +AC_ARG_WITH([dee-icu], |
13 | + AS_HELP_STRING([--with-dee-icu[=@<:@no/auto/yes@:>@]], |
14 | + [Build the FTS extension with dee-icu]), |
15 | + [with_dee_icu=$withval], |
16 | + [with_dee_icu="auto"]) |
17 | + |
18 | +if test "x$with_dee_icu" = "xauto" ; then |
19 | + PKG_CHECK_EXISTS([dee-icu-1.0 >= $DEE_ICU_REQUIRED], |
20 | + with_dee_icu="yes", |
21 | + with_dee_icu="no") |
22 | +fi |
23 | + |
24 | +if test "x$with_dee_icu" = "xyes" ; then |
25 | + PKG_CHECK_MODULES(DEE_ICU, dee-icu-1.0 >= $DEE_ICU_REQUIRED) |
26 | + AC_DEFINE(HAVE_DEE_ICU, 1, [Have dee-icu]) |
27 | +fi |
28 | + |
29 | +AM_CONDITIONAL(HAVE_DEE_ICU, test "x$with_dee_icu" = "xyes") |
30 | + |
31 | +################################################# |
32 | # DBus service |
33 | ################################################# |
34 | |
35 | @@ -88,3 +112,16 @@ |
36 | fi |
37 | |
38 | AC_OUTPUT |
39 | + |
40 | +cat <<EOF |
41 | + |
42 | +${PACKAGE}-${VERSION} |
43 | + |
44 | + Build Environment |
45 | + Install Prefix: ${prefix} |
46 | + |
47 | + Optional dependencies |
48 | + dee-icu: ${with_dee_icu} |
49 | + |
50 | +EOF |
51 | + |
52 | |
53 | === modified file 'extensions/fts++/Makefile.am' |
54 | --- extensions/fts++/Makefile.am 2012-02-08 18:54:58 +0000 |
55 | +++ extensions/fts++/Makefile.am 2012-02-10 12:11:19 +0000 |
56 | @@ -76,6 +76,11 @@ |
57 | -lxapian \ |
58 | $(NULL) |
59 | |
60 | +if HAVE_DEE_ICU |
61 | +AM_CPPFLAGS += $(DEE_ICU_CFLAGS) |
62 | +zeitgeist_fts_LDADD += $(DEE_ICU_LIBS) |
63 | +endif |
64 | + |
65 | BUILT_SOURCES = \ |
66 | zeitgeist-internal.stamp \ |
67 | zeitgeist-fts_vala.stamp \ |
68 | |
69 | === modified file 'extensions/fts++/fts.cpp' |
70 | --- extensions/fts++/fts.cpp 2012-02-09 09:32:33 +0000 |
71 | +++ extensions/fts++/fts.cpp 2012-02-10 12:11:19 +0000 |
72 | @@ -84,6 +84,36 @@ |
73 | return results; |
74 | } |
75 | |
76 | +GPtrArray* |
77 | +zeitgeist_indexer_search_with_relevancies (ZeitgeistIndexer *indexer, |
78 | + const gchar *search_string, |
79 | + ZeitgeistTimeRange *time_range, |
80 | + GPtrArray *templates, |
81 | + guint offset, |
82 | + guint count, |
83 | + ZeitgeistResultType result_type, |
84 | + gdouble **relevancies, |
85 | + gint *relevancies_size, |
86 | + guint *matches, |
87 | + GError **error) |
88 | +{ |
89 | + GPtrArray *results; |
90 | + ZeitgeistFTS::Controller *_indexer; |
91 | + |
92 | + g_return_val_if_fail (indexer != NULL, NULL); |
93 | + g_return_val_if_fail (search_string != NULL, NULL); |
94 | + g_return_val_if_fail (ZEITGEIST_IS_TIME_RANGE (time_range), NULL); |
95 | + g_return_val_if_fail (error == NULL || *error == NULL, NULL); |
96 | + |
97 | + _indexer = (ZeitgeistFTS::Controller*) indexer; |
98 | + |
99 | + results = _indexer->indexer->SearchWithRelevancies ( |
100 | + search_string, time_range, templates, offset, count, result_type, |
101 | + relevancies, relevancies_size, matches, error); |
102 | + |
103 | + return results; |
104 | +} |
105 | + |
106 | void zeitgeist_indexer_index_events (ZeitgeistIndexer *indexer, |
107 | GPtrArray *events) |
108 | { |
109 | |
110 | === modified file 'extensions/fts++/fts.h' |
111 | --- extensions/fts++/fts.h 2012-02-09 09:32:33 +0000 |
112 | +++ extensions/fts++/fts.h 2012-02-10 12:11:19 +0000 |
113 | @@ -43,6 +43,19 @@ |
114 | guint *matches, |
115 | GError **error); |
116 | |
117 | +GPtrArray* zeitgeist_indexer_search_with_relevancies |
118 | + (ZeitgeistIndexer *indexer, |
119 | + const gchar *search_string, |
120 | + ZeitgeistTimeRange *time_range, |
121 | + GPtrArray *templates, |
122 | + guint offset, |
123 | + guint count, |
124 | + ZeitgeistResultType result_type, |
125 | + gdouble **relevancies, |
126 | + gint *relevancies_size, |
127 | + guint *matches, |
128 | + GError **error); |
129 | + |
130 | void zeitgeist_indexer_index_events (ZeitgeistIndexer *indexer, |
131 | GPtrArray *events); |
132 | |
133 | |
134 | === modified file 'extensions/fts++/fts.vapi' |
135 | --- extensions/fts++/fts.vapi 2012-02-07 17:02:30 +0000 |
136 | +++ extensions/fts++/fts.vapi 2012-02-10 12:11:19 +0000 |
137 | @@ -14,6 +14,16 @@ |
138 | ResultType result_type, |
139 | out uint matches) throws GLib.Error; |
140 | |
141 | + public GLib.GenericArray<Event> search_with_relevancies ( |
142 | + string search_string, |
143 | + TimeRange time_range, |
144 | + GLib.GenericArray<Event> templates, |
145 | + uint offset, |
146 | + uint count, |
147 | + ResultType result_type, |
148 | + out double[] relevancies, |
149 | + out uint matches) throws GLib.Error; |
150 | + |
151 | public void index_events (GLib.GenericArray<Event> events); |
152 | |
153 | public void delete_events (uint[] event_ids); |
154 | |
155 | === modified file 'extensions/fts++/indexer.cpp' |
156 | --- extensions/fts++/indexer.cpp 2012-02-09 09:37:48 +0000 |
157 | +++ extensions/fts++/indexer.cpp 2012-02-10 12:11:19 +0000 |
158 | @@ -356,10 +356,40 @@ |
159 | } |
160 | } |
161 | |
162 | +std::string Indexer::PreprocessString (std::string const& input) |
163 | +{ |
164 | + if (input.empty ()) return input; |
165 | + |
166 | + std::string result (StringUtils::RemoveUnderscores (input)); |
167 | + // a simple heuristic for the uncamelcaser |
168 | + size_t num_digits = StringUtils::CountDigits (result); |
169 | + if (result.length () > 3 && num_digits < result.length () / 2) |
170 | + { |
171 | + // FIXME: process digits?, atm they stay attached to the text |
172 | + result = StringUtils::UnCamelcase (result); |
173 | + } |
174 | + |
175 | + std::string folded (StringUtils::AsciiFold (result)); |
176 | + if (!folded.empty ()) |
177 | + { |
178 | + result += ' '; |
179 | + result += folded; |
180 | + } |
181 | + |
182 | +#ifdef DEBUG_PREPROCESSING |
183 | + if (input != result) |
184 | + g_debug ("processed: %s\n-> %s", input.c_str (), result.c_str ()); |
185 | +#endif |
186 | + |
187 | + return result; |
188 | +} |
189 | + |
190 | void Indexer::IndexText (std::string const& text) |
191 | { |
192 | - // FIXME: ascii folding! |
193 | tokenizer->index_text (text, 5); |
194 | + // this is by definition already a human readable display string, |
195 | + // so it shouldn't need removal of underscores and uncamelcase |
196 | + tokenizer->index_text (StringUtils::AsciiFold (text), 5); |
197 | } |
198 | |
199 | void Indexer::IndexUri (std::string const& uri, std::string const& origin) |
200 | @@ -403,9 +433,10 @@ |
201 | gchar *pn = g_file_get_parse_name (f); |
202 | gchar *basename = g_path_get_basename (pn); |
203 | |
204 | - // FIXME: remove unscores, CamelCase and process digits |
205 | - tokenizer->index_text (basename, 5); |
206 | - tokenizer->index_text (basename, 5, "N"); |
207 | + // remove unscores, CamelCase and process digits |
208 | + std::string processed (PreprocessString (basename)); |
209 | + tokenizer->index_text (processed, 5); |
210 | + tokenizer->index_text (processed, 5, "N"); |
211 | |
212 | g_free (basename); |
213 | // limit the directory indexing to just a few levels |
214 | @@ -420,17 +451,17 @@ |
215 | g_free (dir); |
216 | g_free (pn); |
217 | |
218 | - while (path_component.length () > 2 && |
219 | + while (path_component.length () > 2 && |
220 | weight_index < G_N_ELEMENTS (path_weights)) |
221 | { |
222 | // if this is already home directory we don't want it |
223 | - if (path_component.length () == home_dir_path.length () && |
224 | - path_component == home_dir_path) return; |
225 | + if (path_component == home_dir_path) return; |
226 | |
227 | gchar *name = g_path_get_basename (path_component.c_str ()); |
228 | |
229 | - // FIXME: un-underscore, uncamelcase, ascii fold |
230 | - tokenizer->index_text (name, path_weights[weight_index++]); |
231 | + // un-underscore, uncamelcase, ascii fold |
232 | + processed = PreprocessString (name); |
233 | + tokenizer->index_text (processed, path_weights[weight_index++]); |
234 | |
235 | dir = g_path_get_dirname (path_component.c_str ()); |
236 | path_component = dir; |
237 | @@ -471,9 +502,10 @@ |
238 | |
239 | if (g_utf8_validate (unescaped_basename, -1, NULL)) |
240 | { |
241 | - // FIXME: remove unscores, CamelCase and process digits |
242 | - tokenizer->index_text (unescaped_basename, 5); |
243 | - tokenizer->index_text (unescaped_basename, 5, "N"); |
244 | + // remove unscores, CamelCase and process digits |
245 | + std::string processed (PreprocessString (unescaped_basename)); |
246 | + tokenizer->index_text (processed, 5); |
247 | + tokenizer->index_text (processed, 5, "N"); |
248 | } |
249 | |
250 | // and also index hostname (taken from origin field if possible) |
251 | @@ -505,6 +537,7 @@ |
252 | { |
253 | // we *really* don't want to index anything with this scheme |
254 | } |
255 | + // how about special casing (s)ftp and ssh? |
256 | else |
257 | { |
258 | std::string authority, path, query; |
259 | @@ -593,12 +626,11 @@ |
260 | unsigned name_weight = is_subject ? 5 : 2; |
261 | unsigned comment_weight = 2; |
262 | |
263 | - // FIXME: ascii folding somewhere |
264 | - |
265 | val = g_app_info_get_display_name (ai); |
266 | if (val && val[0] != '\0') |
267 | { |
268 | - std::string display_name (val); |
269 | + std::string display_name (PreprocessString (val)); |
270 | + |
271 | tokenizer->index_text (display_name, name_weight); |
272 | tokenizer->index_text (display_name, name_weight, "A"); |
273 | } |
274 | @@ -606,9 +638,14 @@ |
275 | val = g_desktop_app_info_get_generic_name (dai); |
276 | if (val && val[0] != '\0') |
277 | { |
278 | + // this shouldn't need uncamelcasing |
279 | std::string generic_name (val); |
280 | + std::string generic_name_folded (StringUtils::AsciiFold (generic_name)); |
281 | + |
282 | tokenizer->index_text (generic_name, name_weight); |
283 | tokenizer->index_text (generic_name, name_weight, "A"); |
284 | + tokenizer->index_text (generic_name_folded, name_weight); |
285 | + tokenizer->index_text (generic_name_folded, name_weight, "A"); |
286 | } |
287 | |
288 | if (!is_subject) return true; |
289 | @@ -642,7 +679,35 @@ |
290 | return true; |
291 | } |
292 | |
293 | -GPtrArray* Indexer::Search (const gchar *search_string, |
294 | +std::string Indexer::CompileQueryString (const gchar *search_string, |
295 | + ZeitgeistTimeRange *time_range, |
296 | + GPtrArray *templates) |
297 | +{ |
298 | + std::string query_string (search_string); |
299 | + |
300 | + if (templates && templates->len > 0) |
301 | + { |
302 | + std::string filters (CompileEventFilterQuery (templates)); |
303 | + query_string = "(" + query_string + ") AND (" + filters + ")"; |
304 | + } |
305 | + |
306 | + if (time_range) |
307 | + { |
308 | + gint64 start_time = zeitgeist_time_range_get_start (time_range); |
309 | + gint64 end_time = zeitgeist_time_range_get_end (time_range); |
310 | + |
311 | + if (start_time > 0 || end_time < G_MAXINT64) |
312 | + { |
313 | + std::string time_filter (CompileTimeRangeFilterQuery (start_time, end_time)); |
314 | + query_string = "(" + query_string + ") AND (" + time_filter + ")"; |
315 | + } |
316 | + } |
317 | + |
318 | + g_debug ("query: %s", query_string.c_str ()); |
319 | + return query_string; |
320 | +} |
321 | + |
322 | +GPtrArray* Indexer::Search (const gchar *search, |
323 | ZeitgeistTimeRange *time_range, |
324 | GPtrArray *templates, |
325 | guint offset, |
326 | @@ -654,28 +719,22 @@ |
327 | GPtrArray *results = NULL; |
328 | try |
329 | { |
330 | - std::string query_string(search_string); |
331 | - |
332 | - if (templates && templates->len > 0) |
333 | - { |
334 | - std::string filters (CompileEventFilterQuery (templates)); |
335 | - query_string = "(" + query_string + ") AND (" + filters + ")"; |
336 | - } |
337 | - |
338 | - if (time_range) |
339 | - { |
340 | - gint64 start_time = zeitgeist_time_range_get_start (time_range); |
341 | - gint64 end_time = zeitgeist_time_range_get_end (time_range); |
342 | - |
343 | - if (start_time > 0 || end_time < G_MAXINT64) |
344 | - { |
345 | - std::string time_filter (CompileTimeRangeFilterQuery (start_time, end_time)); |
346 | - query_string = "(" + query_string + ") AND (" + time_filter + ")"; |
347 | - } |
348 | - } |
349 | - |
350 | - // FIXME: which result types coalesce? |
351 | - guint maxhits = count * 3; |
352 | + std::string query_string (CompileQueryString (search, time_range, templates)); |
353 | + |
354 | + // When sorting by some result types, we need to fetch some extra events |
355 | + // from the Xapian index because the final result set will be coalesced |
356 | + // on some property of the event |
357 | + guint maxhits; |
358 | + if (result_type == 100 || |
359 | + result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS || |
360 | + result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_EVENTS) |
361 | + { |
362 | + maxhits = count; |
363 | + } |
364 | + else |
365 | + { |
366 | + maxhits = count * 3; |
367 | + } |
368 | |
369 | if (result_type == 100) |
370 | { |
371 | @@ -686,7 +745,6 @@ |
372 | enquire->set_sort_by_value (VALUE_TIMESTAMP, true); |
373 | } |
374 | |
375 | - g_debug ("query: %s", query_string.c_str ()); |
376 | Xapian::Query q(query_parser->parse_query (query_string, QUERY_PARSER_FLAGS)); |
377 | enquire->set_query (q); |
378 | Xapian::MSet hits (enquire->get_mset (offset, maxhits)); |
379 | @@ -753,7 +811,119 @@ |
380 | } |
381 | catch (Xapian::Error const& e) |
382 | { |
383 | - g_warning ("Failed to index event: %s", e.get_msg ().c_str ()); |
384 | + g_warning ("Failed to search index: %s", e.get_msg ().c_str ()); |
385 | + g_set_error_literal (error, |
386 | + ZEITGEIST_ENGINE_ERROR, |
387 | + ZEITGEIST_ENGINE_ERROR_DATABASE_ERROR, |
388 | + e.get_msg ().c_str ()); |
389 | + } |
390 | + |
391 | + return results; |
392 | +} |
393 | + |
394 | +GPtrArray* Indexer::SearchWithRelevancies (const gchar *search, |
395 | + ZeitgeistTimeRange *time_range, |
396 | + GPtrArray *templates, |
397 | + guint offset, |
398 | + guint count, |
399 | + ZeitgeistResultType result_type, |
400 | + gdouble **relevancies, |
401 | + gint *relevancies_size, |
402 | + guint *matches, |
403 | + GError **error) |
404 | +{ |
405 | + GPtrArray *results = NULL; |
406 | + try |
407 | + { |
408 | + std::string query_string (CompileQueryString (search, time_range, templates)); |
409 | + |
410 | + guint maxhits = count; |
411 | + |
412 | + if (result_type == 100) |
413 | + { |
414 | + enquire->set_sort_by_relevance (); |
415 | + } |
416 | + else |
417 | + { |
418 | + enquire->set_sort_by_value (VALUE_TIMESTAMP, true); |
419 | + } |
420 | + |
421 | + Xapian::Query q(query_parser->parse_query (query_string, QUERY_PARSER_FLAGS)); |
422 | + enquire->set_query (q); |
423 | + Xapian::MSet hits (enquire->get_mset (offset, maxhits)); |
424 | + Xapian::doccount hitcount = hits.get_matches_estimated (); |
425 | + |
426 | + if (result_type == 100) |
427 | + { |
428 | + std::vector<unsigned> event_ids; |
429 | + std::vector<gdouble> relevancy_arr; |
430 | + Xapian::MSetIterator iter, end; |
431 | + for (iter = hits.begin (), end = hits.end (); iter != end; ++iter) |
432 | + { |
433 | + Xapian::Document doc(iter.get_document ()); |
434 | + double unserialized = |
435 | + Xapian::sortable_unserialise (doc.get_value (VALUE_EVENT_ID)); |
436 | + unsigned event_id = static_cast<unsigned>(unserialized); |
437 | + event_ids.push_back (event_id); |
438 | + |
439 | + double rank = iter.get_percent () / 100.; |
440 | + relevancy_arr.push_back (rank); |
441 | + } |
442 | + |
443 | + results = zeitgeist_db_reader_get_events (zg_reader, |
444 | + &event_ids[0], |
445 | + event_ids.size (), |
446 | + NULL, |
447 | + error); |
448 | + |
449 | + if (results->len != relevancy_arr.size ()) |
450 | + { |
451 | + g_warning ("Results don't match relevancies!"); |
452 | + g_set_error_literal (error, |
453 | + ZEITGEIST_ENGINE_ERROR, |
454 | + ZEITGEIST_ENGINE_ERROR_DATABASE_ERROR, |
455 | + "Internal database error"); |
456 | + return NULL; |
457 | + } |
458 | + |
459 | + if (relevancies) |
460 | + { |
461 | + *relevancies = (gdouble*) g_memdup (&relevancy_arr[0], |
462 | + sizeof (gdouble) * results->len); |
463 | + } |
464 | + if (relevancies_size) |
465 | + { |
466 | + *relevancies_size = relevancy_arr.size (); |
467 | + } |
468 | + } |
469 | + else |
470 | + { |
471 | + g_set_error_literal (error, |
472 | + ZEITGEIST_ENGINE_ERROR, |
473 | + ZEITGEIST_ENGINE_ERROR_INVALID_ARGUMENT, |
474 | + "Only RELEVANCY result type is supported"); |
475 | + /* |
476 | + * perhaps something like this could be used here? |
477 | + std::map<unsigned, gdouble> relevancy_map; |
478 | + foreach (...) |
479 | + { |
480 | + double rank = iter.get_percent () / 100.; |
481 | + if (rank > relevancy_map[event_id]) |
482 | + { |
483 | + relevancy_map[event_id] = rank; |
484 | + } |
485 | + } |
486 | + */ |
487 | + } |
488 | + |
489 | + if (matches) |
490 | + { |
491 | + *matches = hitcount; |
492 | + } |
493 | + } |
494 | + catch (Xapian::Error const& e) |
495 | + { |
496 | + g_warning ("Failed to search index: %s", e.get_msg ().c_str ()); |
497 | g_set_error_literal (error, |
498 | ZEITGEIST_ENGINE_ERROR, |
499 | ZEITGEIST_ENGINE_ERROR_DATABASE_ERROR, |
500 | |
501 | === modified file 'extensions/fts++/indexer.h' |
502 | --- extensions/fts++/indexer.h 2012-02-09 09:37:48 +0000 |
503 | +++ extensions/fts++/indexer.h 2012-02-10 12:11:19 +0000 |
504 | @@ -77,7 +77,7 @@ |
505 | void DeleteEvent (guint32 event_id); |
506 | void SetDbMetadata (std::string const& key, std::string const& value); |
507 | |
508 | - GPtrArray* Search (const gchar *search_string, |
509 | + GPtrArray* Search (const gchar *search, |
510 | ZeitgeistTimeRange *time_range, |
511 | GPtrArray *templates, |
512 | guint offset, |
513 | @@ -85,11 +85,26 @@ |
514 | ZeitgeistResultType result_type, |
515 | guint *matches, |
516 | GError **error); |
517 | + GPtrArray* SearchWithRelevancies (const gchar *search, |
518 | + ZeitgeistTimeRange *time_range, |
519 | + GPtrArray *templates, |
520 | + guint offset, |
521 | + guint count, |
522 | + ZeitgeistResultType result_type, |
523 | + gdouble **relevancies, |
524 | + gint *relevancies_size, |
525 | + guint *matches, |
526 | + GError **error); |
527 | |
528 | private: |
529 | std::string ExpandType (std::string const& prefix, const gchar* unparsed_uri); |
530 | std::string CompileEventFilterQuery (GPtrArray *templates); |
531 | std::string CompileTimeRangeFilterQuery (gint64 start, gint64 end); |
532 | + std::string CompileQueryString (const gchar *search, |
533 | + ZeitgeistTimeRange *time_range, |
534 | + GPtrArray *templates); |
535 | + |
536 | + std::string PreprocessString (std::string const& input); |
537 | |
538 | void AddDocFilters (ZeitgeistEvent *event, Xapian::Document &doc); |
539 | void IndexText (std::string const& text); |
540 | |
541 | === modified file 'extensions/fts++/stringutils.cpp' |
542 | --- extensions/fts++/stringutils.cpp 2012-02-09 09:32:33 +0000 |
543 | +++ extensions/fts++/stringutils.cpp 2012-02-10 12:11:19 +0000 |
544 | @@ -17,9 +17,14 @@ |
545 | * Authored by Mikkel Kamstrup Erlandsen <mikkel.kamstrup@gmail.com> |
546 | * |
547 | */ |
548 | + |
549 | +#include "stringutils.h" |
550 | #include <string> |
551 | +#include <algorithm> |
552 | |
553 | -#include "stringutils.h" |
554 | +#ifdef HAVE_DEE_ICU |
555 | +#include <dee-icu.h> |
556 | +#endif |
557 | |
558 | using namespace std; |
559 | |
560 | @@ -123,6 +128,87 @@ |
561 | } |
562 | } |
563 | |
564 | +string RemoveUnderscores (string const &input) |
565 | +{ |
566 | + string result (input); |
567 | + std::replace (result.begin (), result.end (), '_', ' '); |
568 | + |
569 | + return result; |
570 | +} |
571 | + |
572 | +static bool is_digit (char c) { return c >= '0' && c <= '9'; } |
573 | + |
574 | +size_t CountDigits (string const &input) |
575 | +{ |
576 | + return std::count_if (input.begin (), input.end (), is_digit); |
577 | +} |
578 | + |
579 | +static GRegex *camelcase_matcher = NULL; |
580 | + |
581 | +static gboolean |
582 | +matcher_cb (const GMatchInfo *match_info, GString *result, gpointer user_data) |
583 | +{ |
584 | + gint start_pos; |
585 | + g_match_info_fetch_pos (match_info, 0, &start_pos, NULL); |
586 | + if (start_pos != 0) g_string_append_c (result, ' '); |
587 | + gchar *word = g_match_info_fetch (match_info, 0); |
588 | + g_string_append (result, word); |
589 | + g_free (word); |
590 | + |
591 | + return FALSE; |
592 | +} |
593 | + |
594 | +string UnCamelcase (string const &input) |
595 | +{ |
596 | + if (camelcase_matcher == NULL) |
597 | + { |
598 | + camelcase_matcher = g_regex_new ("(?<=^|[[:lower:]])[[:upper:]]+[^[:upper:]]+", G_REGEX_OPTIMIZE, (GRegexMatchFlags) 0, NULL); |
599 | + if (camelcase_matcher == NULL) g_critical ("Unable to create matcher!"); |
600 | + } |
601 | + |
602 | + gchar *result = g_regex_replace_eval (camelcase_matcher, input.c_str (), |
603 | + input.length (), 0, |
604 | + (GRegexMatchFlags) 0, |
605 | + matcher_cb, NULL, NULL); |
606 | + |
607 | + string ret (result); |
608 | + g_free (result); |
609 | + return ret; |
610 | +} |
611 | + |
612 | +#ifdef HAVE_DEE_ICU |
613 | +static DeeICUTermFilter *icu_filter = NULL; |
614 | + |
615 | +/** |
616 | + * Use ascii folding filter on the input text and return folded version |
617 | + * of the original string. |
618 | + * |
619 | + * Note that if the folded version is exactly the same as the original |
620 | + * empty string will be returned. |
621 | + */ |
622 | +string AsciiFold (string const& input) |
623 | +{ |
624 | + if (icu_filter == NULL) |
625 | + { |
626 | + icu_filter = dee_icu_term_filter_new_ascii_folder (); |
627 | + if (icu_filter == NULL) return ""; |
628 | + } |
629 | + |
630 | + // FIXME: check first if the input contains any non-ascii chars? |
631 | + |
632 | + gchar *folded = dee_icu_term_filter_apply (icu_filter, input.c_str ()); |
633 | + string result (folded); |
634 | + g_free (folded); |
635 | + |
636 | + return result == input ? "" : result; |
637 | +} |
638 | +#else |
639 | +string AsciiFold (string const& input) |
640 | +{ |
641 | + return ""; |
642 | +} |
643 | +#endif |
644 | + |
645 | } /* namespace StringUtils */ |
646 | |
647 | } /* namespace ZeitgeistFTS */ |
648 | |
649 | === modified file 'extensions/fts++/stringutils.h' |
650 | --- extensions/fts++/stringutils.h 2012-02-09 09:32:33 +0000 |
651 | +++ extensions/fts++/stringutils.h 2012-02-10 12:11:19 +0000 |
652 | @@ -37,6 +37,14 @@ |
653 | std::string &path, |
654 | std::string &basename); |
655 | |
656 | +std::string RemoveUnderscores (std::string const &input); |
657 | + |
658 | +size_t CountDigits (std::string const &input); |
659 | + |
660 | +std::string UnCamelcase (std::string const &input); |
661 | + |
662 | +std::string AsciiFold (std::string const& input); |
663 | + |
664 | } /* namespace StringUtils */ |
665 | |
666 | } /* namespace ZeitgeistFTS */ |
667 | |
668 | === modified file 'extensions/fts++/test/Makefile.am' |
669 | --- extensions/fts++/test/Makefile.am 2012-02-08 18:54:58 +0000 |
670 | +++ extensions/fts++/test/Makefile.am 2012-02-10 12:11:19 +0000 |
671 | @@ -25,3 +25,8 @@ |
672 | -lxapian \ |
673 | $(NULL) |
674 | |
675 | +if HAVE_DEE_ICU |
676 | +AM_CPPFLAGS += $(DEE_ICU_CFLAGS) |
677 | +test_fts_LDADD += $(DEE_ICU_LIBS) |
678 | +endif |
679 | + |
680 | |
681 | === modified file 'extensions/fts++/test/test-indexer.cpp' |
682 | --- extensions/fts++/test/test-indexer.cpp 2012-02-09 09:32:33 +0000 |
683 | +++ extensions/fts++/test/test-indexer.cpp 2012-02-10 12:11:19 +0000 |
684 | @@ -145,6 +145,26 @@ |
685 | return event; |
686 | } |
687 | |
688 | +static ZeitgeistEvent* create_test_event5 (void) |
689 | +{ |
690 | + ZeitgeistEvent *event = zeitgeist_event_new (); |
691 | + ZeitgeistSubject *subject = zeitgeist_subject_new (); |
692 | + |
693 | + zeitgeist_subject_set_interpretation (subject, ZEITGEIST_NFO_SOURCE_CODE); |
694 | + zeitgeist_subject_set_manifestation (subject, ZEITGEIST_NFO_FILE_DATA_OBJECT); |
695 | + zeitgeist_subject_set_uri (subject, "file:///home/username/projects/GLibSignalImplementation.cpp"); |
696 | + zeitgeist_subject_set_text (subject, "Because c++ is awesome"); |
697 | + zeitgeist_subject_set_mimetype (subject, "text/x-c++src"); |
698 | + |
699 | + zeitgeist_event_set_interpretation (event, ZEITGEIST_ZG_CREATE_EVENT); |
700 | + zeitgeist_event_set_manifestation (event, ZEITGEIST_ZG_USER_ACTIVITY); |
701 | + zeitgeist_event_set_actor (event, "application://gedit.desktop"); |
702 | + zeitgeist_event_add_subject (event, subject); |
703 | + |
704 | + g_object_unref (subject); |
705 | + return event; |
706 | +} |
707 | + |
708 | // Steals the event, ref it if you want to keep it |
709 | static guint |
710 | index_event (Fixture *fix, ZeitgeistEvent *event) |
711 | @@ -426,6 +446,71 @@ |
712 | } |
713 | |
714 | static void |
715 | +test_simple_underscores (Fixture *fix, gconstpointer data) |
716 | +{ |
717 | + guint matches; |
718 | + guint event_id; |
719 | + ZeitgeistEvent* event; |
720 | + ZeitgeistSubject *subject; |
721 | + |
722 | + // add test events to DBs |
723 | + index_event (fix, create_test_event1 ()); |
724 | + index_event (fix, create_test_event2 ()); |
725 | + index_event (fix, create_test_event3 ()); |
726 | + event_id = index_event (fix, create_test_event4 ()); |
727 | + |
728 | + GPtrArray *results = |
729 | + zeitgeist_indexer_search (fix->indexer, |
730 | + "fabulo*", |
731 | + zeitgeist_time_range_new_anytime (), |
732 | + g_ptr_array_new (), |
733 | + 0, |
734 | + 10, |
735 | + ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS, |
736 | + &matches, |
737 | + NULL); |
738 | + |
739 | + g_assert_cmpuint (matches, >, 0); |
740 | + g_assert_cmpuint (results->len, ==, 1); |
741 | + |
742 | + event = (ZeitgeistEvent*) results->pdata[0]; |
743 | + g_assert_cmpuint (zeitgeist_event_get_id (event), ==, event_id); |
744 | +} |
745 | + |
746 | +static void |
747 | +test_simple_camelcase (Fixture *fix, gconstpointer data) |
748 | +{ |
749 | + guint matches; |
750 | + guint event_id; |
751 | + ZeitgeistEvent* event; |
752 | + ZeitgeistSubject *subject; |
753 | + |
754 | + // add test events to DBs |
755 | + index_event (fix, create_test_event1 ()); |
756 | + index_event (fix, create_test_event2 ()); |
757 | + index_event (fix, create_test_event3 ()); |
758 | + index_event (fix, create_test_event4 ()); |
759 | + event_id = index_event (fix, create_test_event5 ()); |
760 | + |
761 | + GPtrArray *results = |
762 | + zeitgeist_indexer_search (fix->indexer, |
763 | + "signal", |
764 | + zeitgeist_time_range_new_anytime (), |
765 | + g_ptr_array_new (), |
766 | + 0, |
767 | + 10, |
768 | + ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS, |
769 | + &matches, |
770 | + NULL); |
771 | + |
772 | + g_assert_cmpuint (matches, >, 0); |
773 | + g_assert_cmpuint (results->len, ==, 1); |
774 | + |
775 | + event = (ZeitgeistEvent*) results->pdata[0]; |
776 | + g_assert_cmpuint (zeitgeist_event_get_id (event), ==, event_id); |
777 | +} |
778 | + |
779 | +static void |
780 | test_simple_cjk (Fixture *fix, gconstpointer data) |
781 | { |
782 | guint matches; |
783 | @@ -517,6 +602,10 @@ |
784 | setup, test_simple_noexpand, teardown); |
785 | g_test_add ("/Zeitgeist/FTS/Indexer/SimpleNoexpandValid", Fixture, 0, |
786 | setup, test_simple_noexpand_valid, teardown); |
787 | + g_test_add ("/Zeitgeist/FTS/Indexer/SimpleUnderscores", Fixture, 0, |
788 | + setup, test_simple_underscores, teardown); |
789 | + g_test_add ("/Zeitgeist/FTS/Indexer/SimpleCamelcase", Fixture, 0, |
790 | + setup, test_simple_camelcase, teardown); |
791 | g_test_add ("/Zeitgeist/FTS/Indexer/URLUnescape", Fixture, 0, |
792 | setup, test_simple_url_unescape, teardown); |
793 | g_test_add ("/Zeitgeist/FTS/Indexer/IDNSupport", Fixture, 0, |
794 | |
795 | === modified file 'extensions/fts++/test/test-stringutils.cpp' |
796 | --- extensions/fts++/test/test-stringutils.cpp 2012-02-09 09:32:33 +0000 |
797 | +++ extensions/fts++/test/test-stringutils.cpp 2012-02-10 12:11:19 +0000 |
798 | @@ -163,6 +163,91 @@ |
799 | g_assert_cmpstr ("type=A", ==, query.c_str ()); |
800 | } |
801 | |
802 | +static void |
803 | +test_ascii_fold (Fixture *fix, gconstpointer data) |
804 | +{ |
805 | + std::string folded; |
806 | + |
807 | + folded = StringUtils::AsciiFold (""); |
808 | + g_assert_cmpstr ("", ==, folded.c_str ()); |
809 | + |
810 | + // if the original matches the folded version, AsciiFold returns "" |
811 | + folded = StringUtils::AsciiFold ("a"); |
812 | + g_assert_cmpstr ("", ==, folded.c_str ()); |
813 | + |
814 | + folded = StringUtils::AsciiFold ("abcdef"); |
815 | + g_assert_cmpstr ("", ==, folded.c_str ()); |
816 | + |
817 | + folded = StringUtils::AsciiFold ("å"); |
818 | + g_assert_cmpstr ("a", ==, folded.c_str ()); |
819 | + |
820 | + folded = StringUtils::AsciiFold ("åå"); |
821 | + g_assert_cmpstr ("aa", ==, folded.c_str ()); |
822 | + |
823 | + folded = StringUtils::AsciiFold ("aåaåa"); |
824 | + g_assert_cmpstr ("aaaaa", ==, folded.c_str ()); |
825 | +} |
826 | + |
827 | +static void |
828 | +test_underscores (Fixture *fix, gconstpointer data) |
829 | +{ |
830 | + g_assert_cmpstr ("", ==, StringUtils::RemoveUnderscores ("").c_str ()); |
831 | + |
832 | + g_assert_cmpstr (" ", ==, StringUtils::RemoveUnderscores ("_").c_str ()); |
833 | + |
834 | + g_assert_cmpstr (" ", ==, StringUtils::RemoveUnderscores ("___").c_str ()); |
835 | + |
836 | + g_assert_cmpstr ("abcd", ==, StringUtils::RemoveUnderscores ("abcd").c_str ()); |
837 | + |
838 | + g_assert_cmpstr (" abcd ", ==, StringUtils::RemoveUnderscores ("_abcd_").c_str ()); |
839 | + |
840 | + g_assert_cmpstr ("a b c d", ==, StringUtils::RemoveUnderscores ("a_b_c_d").c_str ()); |
841 | +} |
842 | + |
843 | +static void |
844 | +test_uncamelcase (Fixture *fix, gconstpointer data) |
845 | +{ |
846 | + g_assert_cmpstr ("", ==, StringUtils::UnCamelcase ("").c_str ()); |
847 | + |
848 | + g_assert_cmpstr ("abcd", ==, StringUtils::UnCamelcase ("abcd").c_str ()); |
849 | + |
850 | + g_assert_cmpstr ("Abcd", ==, StringUtils::UnCamelcase ("Abcd").c_str ()); |
851 | + |
852 | + g_assert_cmpstr ("ABCD", ==, StringUtils::UnCamelcase ("ABCD").c_str ()); |
853 | + |
854 | + g_assert_cmpstr ("ABcd", ==, StringUtils::UnCamelcase ("ABcd").c_str ()); |
855 | + |
856 | + g_assert_cmpstr ("Abcd Ef", ==, StringUtils::UnCamelcase ("AbcdEf").c_str ()); |
857 | + |
858 | + g_assert_cmpstr ("Text Editor", ==, StringUtils::UnCamelcase ("Text Editor").c_str ()); |
859 | + |
860 | + g_assert_cmpstr ("py Karaoke", ==, StringUtils::UnCamelcase ("pyKaraoke").c_str ()); |
861 | + |
862 | + g_assert_cmpstr ("Zeitgeist Project", ==, StringUtils::UnCamelcase ("ZeitgeistProject").c_str ()); |
863 | + |
864 | + g_assert_cmpstr ("Very Nice Camel Case Text", ==, StringUtils::UnCamelcase ("VeryNiceCamelCaseText").c_str ()); |
865 | + |
866 | + g_assert_cmpstr ("Ňeedš Ťo Wórk Óń Útf Čhářacters As WelL", ==, |
867 | + StringUtils::UnCamelcase ("ŇeedšŤoWórkÓńÚtfČhářactersAsWelL").c_str ()); |
868 | +} |
869 | + |
870 | +static void |
871 | +test_count_digits (Fixture *fix, gconstpointer data) |
872 | +{ |
873 | + g_assert_cmpuint (0, ==, StringUtils::CountDigits ("")); |
874 | + |
875 | + g_assert_cmpuint (0, ==, StringUtils::CountDigits ("abcdefghijklmnopqrstuvwxyz")); |
876 | + |
877 | + g_assert_cmpuint (10, ==, StringUtils::CountDigits ("0123456789")); |
878 | + |
879 | + g_assert_cmpuint (1, ==, StringUtils::CountDigits ("abc3")); |
880 | + |
881 | + g_assert_cmpuint (3, ==, StringUtils::CountDigits ("::123__poa//weee")); |
882 | + |
883 | + g_assert_cmpuint (5, ==, StringUtils::CountDigits ("PCN30129.JPG")); |
884 | + |
885 | +} |
886 | + |
887 | G_BEGIN_DECLS |
888 | |
889 | void test_stringutils_create_suite (void) |
890 | @@ -173,6 +258,16 @@ |
891 | setup, test_mangle, teardown); |
892 | g_test_add ("/Zeitgeist/FTS/StringUtils/SplitUri", Fixture, 0, |
893 | setup, test_split, teardown); |
894 | + g_test_add ("/Zeitgeist/FTS/StringUtils/RemoveUnderscores", Fixture, 0, |
895 | + setup, test_underscores, teardown); |
896 | + g_test_add ("/Zeitgeist/FTS/StringUtils/UnCamelcase", Fixture, 0, |
897 | + setup, test_uncamelcase, teardown); |
898 | + g_test_add ("/Zeitgeist/FTS/StringUtils/CountDigits", Fixture, 0, |
899 | + setup, test_count_digits, teardown); |
900 | +#ifdef HAVE_DEE_ICU |
901 | + g_test_add ("/Zeitgeist/FTS/StringUtils/AsciiFold", Fixture, 0, |
902 | + setup, test_ascii_fold, teardown); |
903 | +#endif |
904 | } |
905 | |
906 | G_END_DECLS |
907 | |
908 | === modified file 'extensions/fts++/zeitgeist-fts.vala' |
909 | --- extensions/fts++/zeitgeist-fts.vala 2012-02-09 09:32:33 +0000 |
910 | +++ extensions/fts++/zeitgeist-fts.vala 2012-02-10 12:11:19 +0000 |
911 | @@ -132,6 +132,23 @@ |
912 | events = Events.to_variant (results); |
913 | } |
914 | |
915 | + public async void search_with_relevancies ( |
916 | + string query_string, Variant time_range, |
917 | + Variant filter_templates, |
918 | + uint offset, uint count, uint result_type, |
919 | + out Variant events, out double[] relevancies, |
920 | + out uint matches) |
921 | + throws Error |
922 | + { |
923 | + var tr = new TimeRange.from_variant (time_range); |
924 | + var templates = Events.from_variant (filter_templates); |
925 | + var results = instance.indexer.search_with_relevancies ( |
926 | + query_string, tr, templates, offset, count, |
927 | + (ResultType) result_type, out relevancies, out matches); |
928 | + |
929 | + events = Events.to_variant (results); |
930 | + } |
931 | + |
932 | private static void name_acquired_callback (DBusConnection conn) |
933 | { |
934 | name_acquired = true; |
935 | |
936 | === modified file 'extensions/fts.vala' |
937 | --- extensions/fts.vala 2012-02-07 12:47:44 +0000 |
938 | +++ extensions/fts.vala 2012-02-10 12:11:19 +0000 |
939 | @@ -31,6 +31,14 @@ |
940 | uint offset, uint count, uint result_type, |
941 | [DBus (signature = "a(asaasay)")] out Variant events, |
942 | out uint matches) throws Error; |
943 | + public abstract async void search_with_relevancies ( |
944 | + string query_string, |
945 | + [DBus (signature = "(xx)")] Variant time_range, |
946 | + [DBus (signature = "a(asaasay)")] Variant filter_templates, |
947 | + uint offset, uint count, uint result_type, |
948 | + [DBus (signature = "a(asaasay)")] out Variant events, |
949 | + out double[] relevancies, |
950 | + out uint matches) throws Error; |
951 | } |
952 | |
953 | /* Because of a Vala bug we have to define the proxy interface outside of |
954 | @@ -55,6 +63,7 @@ |
955 | private const string INDEXER_NAME = "org.gnome.zeitgeist.SimpleIndexer"; |
956 | |
957 | private RemoteSimpleIndexer siin; |
958 | + private bool siin_connection_failed = false; |
959 | private uint registration_id; |
960 | private MonitorManager? notifier; |
961 | |
962 | @@ -67,6 +76,8 @@ |
963 | { |
964 | if (Utils.using_in_memory_database ()) return; |
965 | |
966 | + // FIXME: check dbus and see if fts is installed? |
967 | + |
968 | // installing a monitor from the daemon will ensure that we don't |
969 | // miss any notifications that would be emitted in between |
970 | // zeitgeist start and fts daemon start |
971 | @@ -109,23 +120,40 @@ |
972 | try |
973 | { |
974 | siin = conn.get_proxy.end<RemoteSimpleIndexer> (res); |
975 | + siin_connection_failed = false; |
976 | } |
977 | catch (IOError err) |
978 | { |
979 | + siin_connection_failed = true; |
980 | warning ("%s", err.message); |
981 | } |
982 | } |
983 | |
984 | - public async void search (string query_string, Variant time_range, |
985 | - Variant filter_templates, uint offset, uint count, uint result_type, |
986 | - out Variant events, out uint matches) throws Error |
987 | + public async void wait_for_proxy () throws Error |
988 | { |
989 | + int i = 0; |
990 | + while (this.siin == null && i < 6 && !siin_connection_failed) |
991 | + { |
992 | + Timeout.add_full (Priority.DEFAULT_IDLE, 250, |
993 | + wait_for_proxy.callback); |
994 | + i++; |
995 | + yield; |
996 | + } |
997 | + |
998 | if (siin == null || !(siin is DBusProxy)) |
999 | { |
1000 | // FIXME: queue until we have the proxy |
1001 | throw new EngineError.DATABASE_ERROR ( |
1002 | "Not connected to SimpleIndexer"); |
1003 | } |
1004 | + } |
1005 | + |
1006 | + public async void search (string query_string, Variant time_range, |
1007 | + Variant filter_templates, uint offset, uint count, uint result_type, |
1008 | + out Variant events, out uint matches) throws Error |
1009 | + { |
1010 | + if (siin == null) yield wait_for_proxy (); |
1011 | + |
1012 | var timer = new Timer (); |
1013 | yield siin.search (query_string, time_range, filter_templates, |
1014 | offset, count, result_type, |
1015 | @@ -134,6 +162,24 @@ |
1016 | (uint) events.n_children (), matches, timer.elapsed ()); |
1017 | } |
1018 | |
1019 | + public async void search_with_relevancies ( |
1020 | + string query_string, Variant time_range, |
1021 | + Variant filter_templates, uint offset, uint count, uint result_type, |
1022 | + out Variant events, out double[] relevancies, out uint matches) |
1023 | + throws Error |
1024 | + { |
1025 | + if (siin == null) yield wait_for_proxy (); |
1026 | + |
1027 | + var timer = new Timer (); |
1028 | + yield siin.search_with_relevancies ( |
1029 | + query_string, time_range, filter_templates, |
1030 | + offset, count, result_type, |
1031 | + out events, out relevancies, out matches); |
1032 | + |
1033 | + debug ("Got %u[/%u] results from indexer (in %f seconds)", |
1034 | + (uint) events.n_children (), matches, timer.elapsed ()); |
1035 | + } |
1036 | + |
1037 | } |
1038 | |
1039 | [ModuleInit] |
1040 | |
1041 | === modified file 'src/remote.vala' |
1042 | --- src/remote.vala 2012-02-05 14:52:13 +0000 |
1043 | +++ src/remote.vala 2012-02-10 12:11:19 +0000 |
1044 | @@ -121,6 +121,13 @@ |
1045 | uint offset, uint count, uint result_type, |
1046 | [DBus (signature = "a(asaasay)")] out Variant events, |
1047 | out uint matches) throws Error; |
1048 | + public abstract async void search_with_relevancies ( |
1049 | + string query_string, |
1050 | + [DBus (signature = "(xx)")] Variant time_range, |
1051 | + [DBus (signature = "a(asaasay)")] Variant filter_templates, |
1052 | + uint offset, uint count, uint result_type, |
1053 | + [DBus (signature = "a(asaasay)")] out Variant events, |
1054 | + out double[] relevancies, out uint matches) throws Error; |
1055 | } |
1056 | |
1057 | /* FIXME: Remove this! Only here because of a bug in Vala (see ext-fts) */ |
Awesome.