Merge lp:~mhr3/zeitgeist/fts-extras into lp:~zeitgeist/zeitgeist/bluebird

Proposed by Michal Hruby
Status: Merged
Merged at revision: 391
Proposed branch: lp:~mhr3/zeitgeist/fts-extras
Merge into: lp:~zeitgeist/zeitgeist/bluebird
Prerequisite: lp:~zeitgeist/zeitgeist/fts++
Diff against target: 1057 lines (+678/-45)
15 files modified
configure.ac (+37/-0)
extensions/fts++/Makefile.am (+5/-0)
extensions/fts++/fts.cpp (+30/-0)
extensions/fts++/fts.h (+13/-0)
extensions/fts++/fts.vapi (+10/-0)
extensions/fts++/indexer.cpp (+210/-40)
extensions/fts++/indexer.h (+16/-1)
extensions/fts++/stringutils.cpp (+87/-1)
extensions/fts++/stringutils.h (+8/-0)
extensions/fts++/test/Makefile.am (+5/-0)
extensions/fts++/test/test-indexer.cpp (+89/-0)
extensions/fts++/test/test-stringutils.cpp (+95/-0)
extensions/fts++/zeitgeist-fts.vala (+17/-0)
extensions/fts.vala (+49/-3)
src/remote.vala (+7/-0)
To merge this branch: bzr merge lp:~mhr3/zeitgeist/fts-extras
Reviewer Review Type Date Requested Status
Siegfried Gevatter Approve
Review via email: mp+92430@code.launchpad.net

Description of the change

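Adds a few more features to the FTS extension: a search-with-relevancies method, optional ASCII folding via dee-icu, and underscore/CamelCase preprocessing of indexed strings.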

To post a comment you must log in.
lp:~mhr3/zeitgeist/fts-extras updated
438. By Michal Hruby

Lower prio of the timeout source

439. By Michal Hruby

Add more string utils

440. By Michal Hruby

Preprocess everything we index

441. By Michal Hruby

Few more fixes

442. By Michal Hruby

Add more tests

Revision history for this message
Siegfried Gevatter (rainct) wrote :

Awesome.

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'configure.ac'
2--- configure.ac 2012-02-08 18:54:58 +0000
3+++ configure.ac 2012-02-10 12:11:19 +0000
4@@ -40,6 +40,30 @@
5 AC_SUBST(ZEITGEIST_LIBS)
6
7 #################################################
8+# Dee-ICU check
9+#################################################
10+DEE_ICU_REQUIRED=1.0.2
11+
12+AC_ARG_WITH([dee-icu],
13+ AS_HELP_STRING([--with-dee-icu[=@<:@no/auto/yes@:>@]],
14+ [Build the FTS extension with dee-icu]),
15+ [with_dee_icu=$withval],
16+ [with_dee_icu="auto"])
17+
18+if test "x$with_dee_icu" = "xauto" ; then
19+ PKG_CHECK_EXISTS([dee-icu-1.0 >= $DEE_ICU_REQUIRED],
20+ with_dee_icu="yes",
21+ with_dee_icu="no")
22+fi
23+
24+if test "x$with_dee_icu" = "xyes" ; then
25+ PKG_CHECK_MODULES(DEE_ICU, dee-icu-1.0 >= $DEE_ICU_REQUIRED)
26+ AC_DEFINE(HAVE_DEE_ICU, 1, [Have dee-icu])
27+fi
28+
29+AM_CONDITIONAL(HAVE_DEE_ICU, test "x$with_dee_icu" = "xyes")
30+
31+#################################################
32 # DBus service
33 #################################################
34
35@@ -88,3 +112,16 @@
36 fi
37
38 AC_OUTPUT
39+
40+cat <<EOF
41+
42+${PACKAGE}-${VERSION}
43+
44+ Build Environment
45+ Install Prefix: ${prefix}
46+
47+ Optional dependencies
48+ dee-icu: ${with_dee_icu}
49+
50+EOF
51+
52
53=== modified file 'extensions/fts++/Makefile.am'
54--- extensions/fts++/Makefile.am 2012-02-08 18:54:58 +0000
55+++ extensions/fts++/Makefile.am 2012-02-10 12:11:19 +0000
56@@ -76,6 +76,11 @@
57 -lxapian \
58 $(NULL)
59
60+if HAVE_DEE_ICU
61+AM_CPPFLAGS += $(DEE_ICU_CFLAGS)
62+zeitgeist_fts_LDADD += $(DEE_ICU_LIBS)
63+endif
64+
65 BUILT_SOURCES = \
66 zeitgeist-internal.stamp \
67 zeitgeist-fts_vala.stamp \
68
69=== modified file 'extensions/fts++/fts.cpp'
70--- extensions/fts++/fts.cpp 2012-02-09 09:32:33 +0000
71+++ extensions/fts++/fts.cpp 2012-02-10 12:11:19 +0000
72@@ -84,6 +84,36 @@
73 return results;
74 }
75
76+GPtrArray*
77+zeitgeist_indexer_search_with_relevancies (ZeitgeistIndexer *indexer,
78+ const gchar *search_string,
79+ ZeitgeistTimeRange *time_range,
80+ GPtrArray *templates,
81+ guint offset,
82+ guint count,
83+ ZeitgeistResultType result_type,
84+ gdouble **relevancies,
85+ gint *relevancies_size,
86+ guint *matches,
87+ GError **error)
88+{
89+ GPtrArray *results;
90+ ZeitgeistFTS::Controller *_indexer;
91+
92+ g_return_val_if_fail (indexer != NULL, NULL);
93+ g_return_val_if_fail (search_string != NULL, NULL);
94+ g_return_val_if_fail (ZEITGEIST_IS_TIME_RANGE (time_range), NULL);
95+ g_return_val_if_fail (error == NULL || *error == NULL, NULL);
96+
97+ _indexer = (ZeitgeistFTS::Controller*) indexer;
98+
99+ results = _indexer->indexer->SearchWithRelevancies (
100+ search_string, time_range, templates, offset, count, result_type,
101+ relevancies, relevancies_size, matches, error);
102+
103+ return results;
104+}
105+
106 void zeitgeist_indexer_index_events (ZeitgeistIndexer *indexer,
107 GPtrArray *events)
108 {
109
110=== modified file 'extensions/fts++/fts.h'
111--- extensions/fts++/fts.h 2012-02-09 09:32:33 +0000
112+++ extensions/fts++/fts.h 2012-02-10 12:11:19 +0000
113@@ -43,6 +43,19 @@
114 guint *matches,
115 GError **error);
116
117+GPtrArray* zeitgeist_indexer_search_with_relevancies
118+ (ZeitgeistIndexer *indexer,
119+ const gchar *search_string,
120+ ZeitgeistTimeRange *time_range,
121+ GPtrArray *templates,
122+ guint offset,
123+ guint count,
124+ ZeitgeistResultType result_type,
125+ gdouble **relevancies,
126+ gint *relevancies_size,
127+ guint *matches,
128+ GError **error);
129+
130 void zeitgeist_indexer_index_events (ZeitgeistIndexer *indexer,
131 GPtrArray *events);
132
133
134=== modified file 'extensions/fts++/fts.vapi'
135--- extensions/fts++/fts.vapi 2012-02-07 17:02:30 +0000
136+++ extensions/fts++/fts.vapi 2012-02-10 12:11:19 +0000
137@@ -14,6 +14,16 @@
138 ResultType result_type,
139 out uint matches) throws GLib.Error;
140
141+ public GLib.GenericArray<Event> search_with_relevancies (
142+ string search_string,
143+ TimeRange time_range,
144+ GLib.GenericArray<Event> templates,
145+ uint offset,
146+ uint count,
147+ ResultType result_type,
148+ out double[] relevancies,
149+ out uint matches) throws GLib.Error;
150+
151 public void index_events (GLib.GenericArray<Event> events);
152
153 public void delete_events (uint[] event_ids);
154
155=== modified file 'extensions/fts++/indexer.cpp'
156--- extensions/fts++/indexer.cpp 2012-02-09 09:37:48 +0000
157+++ extensions/fts++/indexer.cpp 2012-02-10 12:11:19 +0000
158@@ -356,10 +356,40 @@
159 }
160 }
161
162+std::string Indexer::PreprocessString (std::string const& input)
163+{
164+ if (input.empty ()) return input;
165+
166+ std::string result (StringUtils::RemoveUnderscores (input));
167+ // a simple heuristic for the uncamelcaser
168+ size_t num_digits = StringUtils::CountDigits (result);
169+ if (result.length () > 3 && num_digits < result.length () / 2)
170+ {
171+ // FIXME: process digits?, atm they stay attached to the text
172+ result = StringUtils::UnCamelcase (result);
173+ }
174+
175+ std::string folded (StringUtils::AsciiFold (result));
176+ if (!folded.empty ())
177+ {
178+ result += ' ';
179+ result += folded;
180+ }
181+
182+#ifdef DEBUG_PREPROCESSING
183+ if (input != result)
184+ g_debug ("processed: %s\n-> %s", input.c_str (), result.c_str ());
185+#endif
186+
187+ return result;
188+}
189+
190 void Indexer::IndexText (std::string const& text)
191 {
192- // FIXME: ascii folding!
193 tokenizer->index_text (text, 5);
194+ // this is by definition already a human readable display string,
195+ // so it shouldn't need removal of underscores and uncamelcase
196+ tokenizer->index_text (StringUtils::AsciiFold (text), 5);
197 }
198
199 void Indexer::IndexUri (std::string const& uri, std::string const& origin)
200@@ -403,9 +433,10 @@
201 gchar *pn = g_file_get_parse_name (f);
202 gchar *basename = g_path_get_basename (pn);
203
204- // FIXME: remove unscores, CamelCase and process digits
205- tokenizer->index_text (basename, 5);
206- tokenizer->index_text (basename, 5, "N");
207+ // remove underscores, CamelCase and process digits
208+ std::string processed (PreprocessString (basename));
209+ tokenizer->index_text (processed, 5);
210+ tokenizer->index_text (processed, 5, "N");
211
212 g_free (basename);
213 // limit the directory indexing to just a few levels
214@@ -420,17 +451,17 @@
215 g_free (dir);
216 g_free (pn);
217
218- while (path_component.length () > 2 &&
219+ while (path_component.length () > 2 &&
220 weight_index < G_N_ELEMENTS (path_weights))
221 {
222 // if this is already home directory we don't want it
223- if (path_component.length () == home_dir_path.length () &&
224- path_component == home_dir_path) return;
225+ if (path_component == home_dir_path) return;
226
227 gchar *name = g_path_get_basename (path_component.c_str ());
228
229- // FIXME: un-underscore, uncamelcase, ascii fold
230- tokenizer->index_text (name, path_weights[weight_index++]);
231+ // un-underscore, uncamelcase, ascii fold
232+ processed = PreprocessString (name);
233+ tokenizer->index_text (processed, path_weights[weight_index++]);
234
235 dir = g_path_get_dirname (path_component.c_str ());
236 path_component = dir;
237@@ -471,9 +502,10 @@
238
239 if (g_utf8_validate (unescaped_basename, -1, NULL))
240 {
241- // FIXME: remove unscores, CamelCase and process digits
242- tokenizer->index_text (unescaped_basename, 5);
243- tokenizer->index_text (unescaped_basename, 5, "N");
244+ // remove underscores, CamelCase and process digits
245+ std::string processed (PreprocessString (unescaped_basename));
246+ tokenizer->index_text (processed, 5);
247+ tokenizer->index_text (processed, 5, "N");
248 }
249
250 // and also index hostname (taken from origin field if possible)
251@@ -505,6 +537,7 @@
252 {
253 // we *really* don't want to index anything with this scheme
254 }
255+ // how about special casing (s)ftp and ssh?
256 else
257 {
258 std::string authority, path, query;
259@@ -593,12 +626,11 @@
260 unsigned name_weight = is_subject ? 5 : 2;
261 unsigned comment_weight = 2;
262
263- // FIXME: ascii folding somewhere
264-
265 val = g_app_info_get_display_name (ai);
266 if (val && val[0] != '\0')
267 {
268- std::string display_name (val);
269+ std::string display_name (PreprocessString (val));
270+
271 tokenizer->index_text (display_name, name_weight);
272 tokenizer->index_text (display_name, name_weight, "A");
273 }
274@@ -606,9 +638,14 @@
275 val = g_desktop_app_info_get_generic_name (dai);
276 if (val && val[0] != '\0')
277 {
278+ // this shouldn't need uncamelcasing
279 std::string generic_name (val);
280+ std::string generic_name_folded (StringUtils::AsciiFold (generic_name));
281+
282 tokenizer->index_text (generic_name, name_weight);
283 tokenizer->index_text (generic_name, name_weight, "A");
284+ tokenizer->index_text (generic_name_folded, name_weight);
285+ tokenizer->index_text (generic_name_folded, name_weight, "A");
286 }
287
288 if (!is_subject) return true;
289@@ -642,7 +679,35 @@
290 return true;
291 }
292
293-GPtrArray* Indexer::Search (const gchar *search_string,
294+std::string Indexer::CompileQueryString (const gchar *search_string,
295+ ZeitgeistTimeRange *time_range,
296+ GPtrArray *templates)
297+{
298+ std::string query_string (search_string);
299+
300+ if (templates && templates->len > 0)
301+ {
302+ std::string filters (CompileEventFilterQuery (templates));
303+ query_string = "(" + query_string + ") AND (" + filters + ")";
304+ }
305+
306+ if (time_range)
307+ {
308+ gint64 start_time = zeitgeist_time_range_get_start (time_range);
309+ gint64 end_time = zeitgeist_time_range_get_end (time_range);
310+
311+ if (start_time > 0 || end_time < G_MAXINT64)
312+ {
313+ std::string time_filter (CompileTimeRangeFilterQuery (start_time, end_time));
314+ query_string = "(" + query_string + ") AND (" + time_filter + ")";
315+ }
316+ }
317+
318+ g_debug ("query: %s", query_string.c_str ());
319+ return query_string;
320+}
321+
322+GPtrArray* Indexer::Search (const gchar *search,
323 ZeitgeistTimeRange *time_range,
324 GPtrArray *templates,
325 guint offset,
326@@ -654,28 +719,22 @@
327 GPtrArray *results = NULL;
328 try
329 {
330- std::string query_string(search_string);
331-
332- if (templates && templates->len > 0)
333- {
334- std::string filters (CompileEventFilterQuery (templates));
335- query_string = "(" + query_string + ") AND (" + filters + ")";
336- }
337-
338- if (time_range)
339- {
340- gint64 start_time = zeitgeist_time_range_get_start (time_range);
341- gint64 end_time = zeitgeist_time_range_get_end (time_range);
342-
343- if (start_time > 0 || end_time < G_MAXINT64)
344- {
345- std::string time_filter (CompileTimeRangeFilterQuery (start_time, end_time));
346- query_string = "(" + query_string + ") AND (" + time_filter + ")";
347- }
348- }
349-
350- // FIXME: which result types coalesce?
351- guint maxhits = count * 3;
352+ std::string query_string (CompileQueryString (search, time_range, templates));
353+
354+ // When sorting by some result types, we need to fetch some extra events
355+ // from the Xapian index because the final result set will be coalesced
356+ // on some property of the event
357+ guint maxhits;
358+ if (result_type == 100 ||
359+ result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS ||
360+ result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_EVENTS)
361+ {
362+ maxhits = count;
363+ }
364+ else
365+ {
366+ maxhits = count * 3;
367+ }
368
369 if (result_type == 100)
370 {
371@@ -686,7 +745,6 @@
372 enquire->set_sort_by_value (VALUE_TIMESTAMP, true);
373 }
374
375- g_debug ("query: %s", query_string.c_str ());
376 Xapian::Query q(query_parser->parse_query (query_string, QUERY_PARSER_FLAGS));
377 enquire->set_query (q);
378 Xapian::MSet hits (enquire->get_mset (offset, maxhits));
379@@ -753,7 +811,119 @@
380 }
381 catch (Xapian::Error const& e)
382 {
383- g_warning ("Failed to index event: %s", e.get_msg ().c_str ());
384+ g_warning ("Failed to search index: %s", e.get_msg ().c_str ());
385+ g_set_error_literal (error,
386+ ZEITGEIST_ENGINE_ERROR,
387+ ZEITGEIST_ENGINE_ERROR_DATABASE_ERROR,
388+ e.get_msg ().c_str ());
389+ }
390+
391+ return results;
392+}
393+
394+GPtrArray* Indexer::SearchWithRelevancies (const gchar *search,
395+ ZeitgeistTimeRange *time_range,
396+ GPtrArray *templates,
397+ guint offset,
398+ guint count,
399+ ZeitgeistResultType result_type,
400+ gdouble **relevancies,
401+ gint *relevancies_size,
402+ guint *matches,
403+ GError **error)
404+{
405+ GPtrArray *results = NULL;
406+ try
407+ {
408+ std::string query_string (CompileQueryString (search, time_range, templates));
409+
410+ guint maxhits = count;
411+
412+ if (result_type == 100)
413+ {
414+ enquire->set_sort_by_relevance ();
415+ }
416+ else
417+ {
418+ enquire->set_sort_by_value (VALUE_TIMESTAMP, true);
419+ }
420+
421+ Xapian::Query q(query_parser->parse_query (query_string, QUERY_PARSER_FLAGS));
422+ enquire->set_query (q);
423+ Xapian::MSet hits (enquire->get_mset (offset, maxhits));
424+ Xapian::doccount hitcount = hits.get_matches_estimated ();
425+
426+ if (result_type == 100)
427+ {
428+ std::vector<unsigned> event_ids;
429+ std::vector<gdouble> relevancy_arr;
430+ Xapian::MSetIterator iter, end;
431+ for (iter = hits.begin (), end = hits.end (); iter != end; ++iter)
432+ {
433+ Xapian::Document doc(iter.get_document ());
434+ double unserialized =
435+ Xapian::sortable_unserialise (doc.get_value (VALUE_EVENT_ID));
436+ unsigned event_id = static_cast<unsigned>(unserialized);
437+ event_ids.push_back (event_id);
438+
439+ double rank = iter.get_percent () / 100.;
440+ relevancy_arr.push_back (rank);
441+ }
442+
443+ results = zeitgeist_db_reader_get_events (zg_reader,
444+ &event_ids[0],
445+ event_ids.size (),
446+ NULL,
447+ error);
448+
449+ if (results->len != relevancy_arr.size ())
450+ {
451+ g_warning ("Results don't match relevancies!");
452+ g_set_error_literal (error,
453+ ZEITGEIST_ENGINE_ERROR,
454+ ZEITGEIST_ENGINE_ERROR_DATABASE_ERROR,
455+ "Internal database error");
456+ return NULL;
457+ }
458+
459+ if (relevancies)
460+ {
461+ *relevancies = (gdouble*) g_memdup (&relevancy_arr[0],
462+ sizeof (gdouble) * results->len);
463+ }
464+ if (relevancies_size)
465+ {
466+ *relevancies_size = relevancy_arr.size ();
467+ }
468+ }
469+ else
470+ {
471+ g_set_error_literal (error,
472+ ZEITGEIST_ENGINE_ERROR,
473+ ZEITGEIST_ENGINE_ERROR_INVALID_ARGUMENT,
474+ "Only RELEVANCY result type is supported");
475+ /*
476+ * perhaps something like this could be used here?
477+ std::map<unsigned, gdouble> relevancy_map;
478+ foreach (...)
479+ {
480+ double rank = iter.get_percent () / 100.;
481+ if (rank > relevancy_map[event_id])
482+ {
483+ relevancy_map[event_id] = rank;
484+ }
485+ }
486+ */
487+ }
488+
489+ if (matches)
490+ {
491+ *matches = hitcount;
492+ }
493+ }
494+ catch (Xapian::Error const& e)
495+ {
496+ g_warning ("Failed to search index: %s", e.get_msg ().c_str ());
497 g_set_error_literal (error,
498 ZEITGEIST_ENGINE_ERROR,
499 ZEITGEIST_ENGINE_ERROR_DATABASE_ERROR,
500
501=== modified file 'extensions/fts++/indexer.h'
502--- extensions/fts++/indexer.h 2012-02-09 09:37:48 +0000
503+++ extensions/fts++/indexer.h 2012-02-10 12:11:19 +0000
504@@ -77,7 +77,7 @@
505 void DeleteEvent (guint32 event_id);
506 void SetDbMetadata (std::string const& key, std::string const& value);
507
508- GPtrArray* Search (const gchar *search_string,
509+ GPtrArray* Search (const gchar *search,
510 ZeitgeistTimeRange *time_range,
511 GPtrArray *templates,
512 guint offset,
513@@ -85,11 +85,26 @@
514 ZeitgeistResultType result_type,
515 guint *matches,
516 GError **error);
517+ GPtrArray* SearchWithRelevancies (const gchar *search,
518+ ZeitgeistTimeRange *time_range,
519+ GPtrArray *templates,
520+ guint offset,
521+ guint count,
522+ ZeitgeistResultType result_type,
523+ gdouble **relevancies,
524+ gint *relevancies_size,
525+ guint *matches,
526+ GError **error);
527
528 private:
529 std::string ExpandType (std::string const& prefix, const gchar* unparsed_uri);
530 std::string CompileEventFilterQuery (GPtrArray *templates);
531 std::string CompileTimeRangeFilterQuery (gint64 start, gint64 end);
532+ std::string CompileQueryString (const gchar *search,
533+ ZeitgeistTimeRange *time_range,
534+ GPtrArray *templates);
535+
536+ std::string PreprocessString (std::string const& input);
537
538 void AddDocFilters (ZeitgeistEvent *event, Xapian::Document &doc);
539 void IndexText (std::string const& text);
540
541=== modified file 'extensions/fts++/stringutils.cpp'
542--- extensions/fts++/stringutils.cpp 2012-02-09 09:32:33 +0000
543+++ extensions/fts++/stringutils.cpp 2012-02-10 12:11:19 +0000
544@@ -17,9 +17,14 @@
545 * Authored by Mikkel Kamstrup Erlandsen <mikkel.kamstrup@gmail.com>
546 *
547 */
548+
549+#include "stringutils.h"
550 #include <string>
551+#include <algorithm>
552
553-#include "stringutils.h"
554+#ifdef HAVE_DEE_ICU
555+#include <dee-icu.h>
556+#endif
557
558 using namespace std;
559
560@@ -123,6 +128,87 @@
561 }
562 }
563
564+string RemoveUnderscores (string const &input)
565+{
566+ string result (input);
567+ std::replace (result.begin (), result.end (), '_', ' ');
568+
569+ return result;
570+}
571+
572+static bool is_digit (char c) { return c >= '0' && c <= '9'; }
573+
574+size_t CountDigits (string const &input)
575+{
576+ return std::count_if (input.begin (), input.end (), is_digit);
577+}
578+
579+static GRegex *camelcase_matcher = NULL;
580+
581+static gboolean
582+matcher_cb (const GMatchInfo *match_info, GString *result, gpointer user_data)
583+{
584+ gint start_pos;
585+ g_match_info_fetch_pos (match_info, 0, &start_pos, NULL);
586+ if (start_pos != 0) g_string_append_c (result, ' ');
587+ gchar *word = g_match_info_fetch (match_info, 0);
588+ g_string_append (result, word);
589+ g_free (word);
590+
591+ return FALSE;
592+}
593+
594+string UnCamelcase (string const &input)
595+{
596+ if (camelcase_matcher == NULL)
597+ {
598+ camelcase_matcher = g_regex_new ("(?<=^|[[:lower:]])[[:upper:]]+[^[:upper:]]+", G_REGEX_OPTIMIZE, (GRegexMatchFlags) 0, NULL);
599+ if (camelcase_matcher == NULL) g_critical ("Unable to create matcher!");
600+ }
601+
602+ gchar *result = g_regex_replace_eval (camelcase_matcher, input.c_str (),
603+ input.length (), 0,
604+ (GRegexMatchFlags) 0,
605+ matcher_cb, NULL, NULL);
606+
607+ string ret (result);
608+ g_free (result);
609+ return ret;
610+}
611+
612+#ifdef HAVE_DEE_ICU
613+static DeeICUTermFilter *icu_filter = NULL;
614+
615+/**
616+ * Use ascii folding filter on the input text and return folded version
617+ * of the original string.
618+ *
619+ * Note that if the folded version is exactly the same as the original,
620+ * an empty string will be returned.
621+ */
622+string AsciiFold (string const& input)
623+{
624+ if (icu_filter == NULL)
625+ {
626+ icu_filter = dee_icu_term_filter_new_ascii_folder ();
627+ if (icu_filter == NULL) return "";
628+ }
629+
630+ // FIXME: check first if the input contains any non-ascii chars?
631+
632+ gchar *folded = dee_icu_term_filter_apply (icu_filter, input.c_str ());
633+ string result (folded);
634+ g_free (folded);
635+
636+ return result == input ? "" : result;
637+}
638+#else
639+string AsciiFold (string const& input)
640+{
641+ return "";
642+}
643+#endif
644+
645 } /* namespace StringUtils */
646
647 } /* namespace ZeitgeistFTS */
648
649=== modified file 'extensions/fts++/stringutils.h'
650--- extensions/fts++/stringutils.h 2012-02-09 09:32:33 +0000
651+++ extensions/fts++/stringutils.h 2012-02-10 12:11:19 +0000
652@@ -37,6 +37,14 @@
653 std::string &path,
654 std::string &basename);
655
656+std::string RemoveUnderscores (std::string const &input);
657+
658+size_t CountDigits (std::string const &input);
659+
660+std::string UnCamelcase (std::string const &input);
661+
662+std::string AsciiFold (std::string const& input);
663+
664 } /* namespace StringUtils */
665
666 } /* namespace ZeitgeistFTS */
667
668=== modified file 'extensions/fts++/test/Makefile.am'
669--- extensions/fts++/test/Makefile.am 2012-02-08 18:54:58 +0000
670+++ extensions/fts++/test/Makefile.am 2012-02-10 12:11:19 +0000
671@@ -25,3 +25,8 @@
672 -lxapian \
673 $(NULL)
674
675+if HAVE_DEE_ICU
676+AM_CPPFLAGS += $(DEE_ICU_CFLAGS)
677+test_fts_LDADD += $(DEE_ICU_LIBS)
678+endif
679+
680
681=== modified file 'extensions/fts++/test/test-indexer.cpp'
682--- extensions/fts++/test/test-indexer.cpp 2012-02-09 09:32:33 +0000
683+++ extensions/fts++/test/test-indexer.cpp 2012-02-10 12:11:19 +0000
684@@ -145,6 +145,26 @@
685 return event;
686 }
687
688+static ZeitgeistEvent* create_test_event5 (void)
689+{
690+ ZeitgeistEvent *event = zeitgeist_event_new ();
691+ ZeitgeistSubject *subject = zeitgeist_subject_new ();
692+
693+ zeitgeist_subject_set_interpretation (subject, ZEITGEIST_NFO_SOURCE_CODE);
694+ zeitgeist_subject_set_manifestation (subject, ZEITGEIST_NFO_FILE_DATA_OBJECT);
695+ zeitgeist_subject_set_uri (subject, "file:///home/username/projects/GLibSignalImplementation.cpp");
696+ zeitgeist_subject_set_text (subject, "Because c++ is awesome");
697+ zeitgeist_subject_set_mimetype (subject, "text/x-c++src");
698+
699+ zeitgeist_event_set_interpretation (event, ZEITGEIST_ZG_CREATE_EVENT);
700+ zeitgeist_event_set_manifestation (event, ZEITGEIST_ZG_USER_ACTIVITY);
701+ zeitgeist_event_set_actor (event, "application://gedit.desktop");
702+ zeitgeist_event_add_subject (event, subject);
703+
704+ g_object_unref (subject);
705+ return event;
706+}
707+
708 // Steals the event, ref it if you want to keep it
709 static guint
710 index_event (Fixture *fix, ZeitgeistEvent *event)
711@@ -426,6 +446,71 @@
712 }
713
714 static void
715+test_simple_underscores (Fixture *fix, gconstpointer data)
716+{
717+ guint matches;
718+ guint event_id;
719+ ZeitgeistEvent* event;
720+ ZeitgeistSubject *subject;
721+
722+ // add test events to DBs
723+ index_event (fix, create_test_event1 ());
724+ index_event (fix, create_test_event2 ());
725+ index_event (fix, create_test_event3 ());
726+ event_id = index_event (fix, create_test_event4 ());
727+
728+ GPtrArray *results =
729+ zeitgeist_indexer_search (fix->indexer,
730+ "fabulo*",
731+ zeitgeist_time_range_new_anytime (),
732+ g_ptr_array_new (),
733+ 0,
734+ 10,
735+ ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS,
736+ &matches,
737+ NULL);
738+
739+ g_assert_cmpuint (matches, >, 0);
740+ g_assert_cmpuint (results->len, ==, 1);
741+
742+ event = (ZeitgeistEvent*) results->pdata[0];
743+ g_assert_cmpuint (zeitgeist_event_get_id (event), ==, event_id);
744+}
745+
746+static void
747+test_simple_camelcase (Fixture *fix, gconstpointer data)
748+{
749+ guint matches;
750+ guint event_id;
751+ ZeitgeistEvent* event;
752+ ZeitgeistSubject *subject;
753+
754+ // add test events to DBs
755+ index_event (fix, create_test_event1 ());
756+ index_event (fix, create_test_event2 ());
757+ index_event (fix, create_test_event3 ());
758+ index_event (fix, create_test_event4 ());
759+ event_id = index_event (fix, create_test_event5 ());
760+
761+ GPtrArray *results =
762+ zeitgeist_indexer_search (fix->indexer,
763+ "signal",
764+ zeitgeist_time_range_new_anytime (),
765+ g_ptr_array_new (),
766+ 0,
767+ 10,
768+ ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS,
769+ &matches,
770+ NULL);
771+
772+ g_assert_cmpuint (matches, >, 0);
773+ g_assert_cmpuint (results->len, ==, 1);
774+
775+ event = (ZeitgeistEvent*) results->pdata[0];
776+ g_assert_cmpuint (zeitgeist_event_get_id (event), ==, event_id);
777+}
778+
779+static void
780 test_simple_cjk (Fixture *fix, gconstpointer data)
781 {
782 guint matches;
783@@ -517,6 +602,10 @@
784 setup, test_simple_noexpand, teardown);
785 g_test_add ("/Zeitgeist/FTS/Indexer/SimpleNoexpandValid", Fixture, 0,
786 setup, test_simple_noexpand_valid, teardown);
787+ g_test_add ("/Zeitgeist/FTS/Indexer/SimpleUnderscores", Fixture, 0,
788+ setup, test_simple_underscores, teardown);
789+ g_test_add ("/Zeitgeist/FTS/Indexer/SimpleCamelcase", Fixture, 0,
790+ setup, test_simple_camelcase, teardown);
791 g_test_add ("/Zeitgeist/FTS/Indexer/URLUnescape", Fixture, 0,
792 setup, test_simple_url_unescape, teardown);
793 g_test_add ("/Zeitgeist/FTS/Indexer/IDNSupport", Fixture, 0,
794
795=== modified file 'extensions/fts++/test/test-stringutils.cpp'
796--- extensions/fts++/test/test-stringutils.cpp 2012-02-09 09:32:33 +0000
797+++ extensions/fts++/test/test-stringutils.cpp 2012-02-10 12:11:19 +0000
798@@ -163,6 +163,91 @@
799 g_assert_cmpstr ("type=A", ==, query.c_str ());
800 }
801
802+static void
803+test_ascii_fold (Fixture *fix, gconstpointer data)
804+{
805+ std::string folded;
806+
807+ folded = StringUtils::AsciiFold ("");
808+ g_assert_cmpstr ("", ==, folded.c_str ());
809+
810+ // if the original matches the folded version, AsciiFold returns ""
811+ folded = StringUtils::AsciiFold ("a");
812+ g_assert_cmpstr ("", ==, folded.c_str ());
813+
814+ folded = StringUtils::AsciiFold ("abcdef");
815+ g_assert_cmpstr ("", ==, folded.c_str ());
816+
817+ folded = StringUtils::AsciiFold ("å");
818+ g_assert_cmpstr ("a", ==, folded.c_str ());
819+
820+ folded = StringUtils::AsciiFold ("åå");
821+ g_assert_cmpstr ("aa", ==, folded.c_str ());
822+
823+ folded = StringUtils::AsciiFold ("aåaåa");
824+ g_assert_cmpstr ("aaaaa", ==, folded.c_str ());
825+}
826+
827+static void
828+test_underscores (Fixture *fix, gconstpointer data)
829+{
830+ g_assert_cmpstr ("", ==, StringUtils::RemoveUnderscores ("").c_str ());
831+
832+ g_assert_cmpstr (" ", ==, StringUtils::RemoveUnderscores ("_").c_str ());
833+
834+ g_assert_cmpstr (" ", ==, StringUtils::RemoveUnderscores ("___").c_str ());
835+
836+ g_assert_cmpstr ("abcd", ==, StringUtils::RemoveUnderscores ("abcd").c_str ());
837+
838+ g_assert_cmpstr (" abcd ", ==, StringUtils::RemoveUnderscores ("_abcd_").c_str ());
839+
840+ g_assert_cmpstr ("a b c d", ==, StringUtils::RemoveUnderscores ("a_b_c_d").c_str ());
841+}
842+
843+static void
844+test_uncamelcase (Fixture *fix, gconstpointer data)
845+{
846+ g_assert_cmpstr ("", ==, StringUtils::UnCamelcase ("").c_str ());
847+
848+ g_assert_cmpstr ("abcd", ==, StringUtils::UnCamelcase ("abcd").c_str ());
849+
850+ g_assert_cmpstr ("Abcd", ==, StringUtils::UnCamelcase ("Abcd").c_str ());
851+
852+ g_assert_cmpstr ("ABCD", ==, StringUtils::UnCamelcase ("ABCD").c_str ());
853+
854+ g_assert_cmpstr ("ABcd", ==, StringUtils::UnCamelcase ("ABcd").c_str ());
855+
856+ g_assert_cmpstr ("Abcd Ef", ==, StringUtils::UnCamelcase ("AbcdEf").c_str ());
857+
858+ g_assert_cmpstr ("Text Editor", ==, StringUtils::UnCamelcase ("Text Editor").c_str ());
859+
860+ g_assert_cmpstr ("py Karaoke", ==, StringUtils::UnCamelcase ("pyKaraoke").c_str ());
861+
862+ g_assert_cmpstr ("Zeitgeist Project", ==, StringUtils::UnCamelcase ("ZeitgeistProject").c_str ());
863+
864+ g_assert_cmpstr ("Very Nice Camel Case Text", ==, StringUtils::UnCamelcase ("VeryNiceCamelCaseText").c_str ());
865+
866+ g_assert_cmpstr ("Ňeedš Ťo Wórk Óń Útf Čhářacters As WelL", ==,
867+ StringUtils::UnCamelcase ("ŇeedšŤoWórkÓńÚtfČhářactersAsWelL").c_str ());
868+}
869+
870+static void
871+test_count_digits (Fixture *fix, gconstpointer data)
872+{
873+ g_assert_cmpuint (0, ==, StringUtils::CountDigits (""));
874+
875+ g_assert_cmpuint (0, ==, StringUtils::CountDigits ("abcdefghijklmnopqrstuvwxyz"));
876+
877+ g_assert_cmpuint (10, ==, StringUtils::CountDigits ("0123456789"));
878+
879+ g_assert_cmpuint (1, ==, StringUtils::CountDigits ("abc3"));
880+
881+ g_assert_cmpuint (3, ==, StringUtils::CountDigits ("::123__poa//weee"));
882+
883+ g_assert_cmpuint (5, ==, StringUtils::CountDigits ("PCN30129.JPG"));
884+
885+}
886+
887 G_BEGIN_DECLS
888
889 void test_stringutils_create_suite (void)
890@@ -173,6 +258,16 @@
891 setup, test_mangle, teardown);
892 g_test_add ("/Zeitgeist/FTS/StringUtils/SplitUri", Fixture, 0,
893 setup, test_split, teardown);
894+ g_test_add ("/Zeitgeist/FTS/StringUtils/RemoveUnderscores", Fixture, 0,
895+ setup, test_underscores, teardown);
896+ g_test_add ("/Zeitgeist/FTS/StringUtils/UnCamelcase", Fixture, 0,
897+ setup, test_uncamelcase, teardown);
898+ g_test_add ("/Zeitgeist/FTS/StringUtils/CountDigits", Fixture, 0,
899+ setup, test_count_digits, teardown);
900+#ifdef HAVE_DEE_ICU
901+ g_test_add ("/Zeitgeist/FTS/StringUtils/AsciiFold", Fixture, 0,
902+ setup, test_ascii_fold, teardown);
903+#endif
904 }
905
906 G_END_DECLS
907
908=== modified file 'extensions/fts++/zeitgeist-fts.vala'
909--- extensions/fts++/zeitgeist-fts.vala 2012-02-09 09:32:33 +0000
910+++ extensions/fts++/zeitgeist-fts.vala 2012-02-10 12:11:19 +0000
911@@ -132,6 +132,23 @@
912 events = Events.to_variant (results);
913 }
914
915+ public async void search_with_relevancies (
916+ string query_string, Variant time_range,
917+ Variant filter_templates,
918+ uint offset, uint count, uint result_type,
919+ out Variant events, out double[] relevancies,
920+ out uint matches)
921+ throws Error
922+ {
923+ var tr = new TimeRange.from_variant (time_range);
924+ var templates = Events.from_variant (filter_templates);
925+ var results = instance.indexer.search_with_relevancies (
926+ query_string, tr, templates, offset, count,
927+ (ResultType) result_type, out relevancies, out matches);
928+
929+ events = Events.to_variant (results);
930+ }
931+
932 private static void name_acquired_callback (DBusConnection conn)
933 {
934 name_acquired = true;
935
936=== modified file 'extensions/fts.vala'
937--- extensions/fts.vala 2012-02-07 12:47:44 +0000
938+++ extensions/fts.vala 2012-02-10 12:11:19 +0000
939@@ -31,6 +31,14 @@
940 uint offset, uint count, uint result_type,
941 [DBus (signature = "a(asaasay)")] out Variant events,
942 out uint matches) throws Error;
943+ public abstract async void search_with_relevancies (
944+ string query_string,
945+ [DBus (signature = "(xx)")] Variant time_range,
946+ [DBus (signature = "a(asaasay)")] Variant filter_templates,
947+ uint offset, uint count, uint result_type,
948+ [DBus (signature = "a(asaasay)")] out Variant events,
949+ out double[] relevancies,
950+ out uint matches) throws Error;
951 }
952
953 /* Because of a Vala bug we have to define the proxy interface outside of
954@@ -55,6 +63,7 @@
955 private const string INDEXER_NAME = "org.gnome.zeitgeist.SimpleIndexer";
956
957 private RemoteSimpleIndexer siin;
958+ private bool siin_connection_failed = false;
959 private uint registration_id;
960 private MonitorManager? notifier;
961
962@@ -67,6 +76,8 @@
963 {
964 if (Utils.using_in_memory_database ()) return;
965
966+ // FIXME: check dbus and see if fts is installed?
967+
968 // installing a monitor from the daemon will ensure that we don't
969 // miss any notifications that would be emitted in between
970 // zeitgeist start and fts daemon start
971@@ -109,23 +120,40 @@
972 try
973 {
974 siin = conn.get_proxy.end<RemoteSimpleIndexer> (res);
975+ siin_connection_failed = false;
976 }
977 catch (IOError err)
978 {
979+ siin_connection_failed = true;
980 warning ("%s", err.message);
981 }
982 }
983
984- public async void search (string query_string, Variant time_range,
985- Variant filter_templates, uint offset, uint count, uint result_type,
986- out Variant events, out uint matches) throws Error
987+ public async void wait_for_proxy () throws Error
988 {
989+ int i = 0;
990+ while (this.siin == null && i < 6 && !siin_connection_failed)
991+ {
992+ Timeout.add_full (Priority.DEFAULT_IDLE, 250,
993+ wait_for_proxy.callback);
994+ i++;
995+ yield;
996+ }
997+
998 if (siin == null || !(siin is DBusProxy))
999 {
1000 // FIXME: queue until we have the proxy
1001 throw new EngineError.DATABASE_ERROR (
1002 "Not connected to SimpleIndexer");
1003 }
1004+ }
1005+
1006+ public async void search (string query_string, Variant time_range,
1007+ Variant filter_templates, uint offset, uint count, uint result_type,
1008+ out Variant events, out uint matches) throws Error
1009+ {
1010+ if (siin == null) yield wait_for_proxy ();
1011+
1012 var timer = new Timer ();
1013 yield siin.search (query_string, time_range, filter_templates,
1014 offset, count, result_type,
1015@@ -134,6 +162,24 @@
1016 (uint) events.n_children (), matches, timer.elapsed ());
1017 }
1018
1019+ public async void search_with_relevancies (
1020+ string query_string, Variant time_range,
1021+ Variant filter_templates, uint offset, uint count, uint result_type,
1022+ out Variant events, out double[] relevancies, out uint matches)
1023+ throws Error
1024+ {
1025+ if (siin == null) yield wait_for_proxy ();
1026+
1027+ var timer = new Timer ();
1028+ yield siin.search_with_relevancies (
1029+ query_string, time_range, filter_templates,
1030+ offset, count, result_type,
1031+ out events, out relevancies, out matches);
1032+
1033+ debug ("Got %u[/%u] results from indexer (in %f seconds)",
1034+ (uint) events.n_children (), matches, timer.elapsed ());
1035+ }
1036+
1037 }
1038
1039 [ModuleInit]
1040
1041=== modified file 'src/remote.vala'
1042--- src/remote.vala 2012-02-05 14:52:13 +0000
1043+++ src/remote.vala 2012-02-10 12:11:19 +0000
1044@@ -121,6 +121,13 @@
1045 uint offset, uint count, uint result_type,
1046 [DBus (signature = "a(asaasay)")] out Variant events,
1047 out uint matches) throws Error;
1048+ public abstract async void search_with_relevancies (
1049+ string query_string,
1050+ [DBus (signature = "(xx)")] Variant time_range,
1051+ [DBus (signature = "a(asaasay)")] Variant filter_templates,
1052+ uint offset, uint count, uint result_type,
1053+ [DBus (signature = "a(asaasay)")] out Variant events,
1054+ out double[] relevancies, out uint matches) throws Error;
1055 }
1056
1057 /* FIXME: Remove this! Only here because of a bug in Vala (see ext-fts) */

Subscribers

People subscribed via source and target branches