Merge lp:~mhr3/zeitgeist/fts-extras into lp:~zeitgeist/zeitgeist/bluebird

Proposed by Michal Hruby
Status: Merged
Merged at revision: 391
Proposed branch: lp:~mhr3/zeitgeist/fts-extras
Merge into: lp:~zeitgeist/zeitgeist/bluebird
Prerequisite: lp:~zeitgeist/zeitgeist/fts++
Diff against target: 1057 lines (+678/-45)
15 files modified
configure.ac (+37/-0)
extensions/fts++/Makefile.am (+5/-0)
extensions/fts++/fts.cpp (+30/-0)
extensions/fts++/fts.h (+13/-0)
extensions/fts++/fts.vapi (+10/-0)
extensions/fts++/indexer.cpp (+210/-40)
extensions/fts++/indexer.h (+16/-1)
extensions/fts++/stringutils.cpp (+87/-1)
extensions/fts++/stringutils.h (+8/-0)
extensions/fts++/test/Makefile.am (+5/-0)
extensions/fts++/test/test-indexer.cpp (+89/-0)
extensions/fts++/test/test-stringutils.cpp (+95/-0)
extensions/fts++/zeitgeist-fts.vala (+17/-0)
extensions/fts.vala (+49/-3)
src/remote.vala (+7/-0)
To merge this branch: bzr merge lp:~mhr3/zeitgeist/fts-extras
Reviewer | Review Type | Date Requested | Status
Siegfried Gevatter | | | Approve
Review via email: mp+92430@code.launchpad.net

Description of the change

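Adds several extra features to the FTS (full-text search) extension: optional ASCII folding via dee-icu, removal of underscores and un-CamelCasing of text before it is indexed, and a new search_with_relevancies method that returns a relevancy score for each result.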

To post a comment you must log in.
lp:~mhr3/zeitgeist/fts-extras updated
438. By Michal Hruby

Lower prio of the timeout source

439. By Michal Hruby

Add more string utils

440. By Michal Hruby

Preprocess everything we index

441. By Michal Hruby

Few more fixes

442. By Michal Hruby

Add more tests

Revision history for this message
Siegfried Gevatter (rainct) wrote :

Awesome.

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== modified file 'configure.ac'
--- configure.ac 2012-02-08 18:54:58 +0000
+++ configure.ac 2012-02-10 12:11:19 +0000
@@ -40,6 +40,30 @@
40AC_SUBST(ZEITGEIST_LIBS)40AC_SUBST(ZEITGEIST_LIBS)
4141
42#################################################42#################################################
43# Dee-ICU check
44#################################################
45DEE_ICU_REQUIRED=1.0.2
46
47AC_ARG_WITH([dee-icu],
48 AS_HELP_STRING([--with-dee-icu[=@<:@no/auto/yes@:>@]],
49 [Build the FTS extension with dee-icu]),
50 [with_dee_icu=$withval],
51 [with_dee_icu="auto"])
52
53if test "x$with_dee_icu" = "xauto" ; then
54 PKG_CHECK_EXISTS([dee-icu-1.0 >= $DEE_ICU_REQUIRED],
55 with_dee_icu="yes",
56 with_dee_icu="no")
57fi
58
59if test "x$with_dee_icu" = "xyes" ; then
60 PKG_CHECK_MODULES(DEE_ICU, dee-icu-1.0 >= $DEE_ICU_REQUIRED)
61 AC_DEFINE(HAVE_DEE_ICU, 1, [Have dee-icu])
62fi
63
64AM_CONDITIONAL(HAVE_DEE_ICU, test "x$with_dee_icu" = "xyes")
65
66#################################################
43# DBus service67# DBus service
44#################################################68#################################################
4569
@@ -88,3 +112,16 @@
88fi112fi
89113
90AC_OUTPUT114AC_OUTPUT
115
116cat <<EOF
117
118${PACKAGE}-${VERSION}
119
120 Build Environment
121 Install Prefix: ${prefix}
122
123 Optional dependencies
124 dee-icu: ${with_dee_icu}
125
126EOF
127
91128
=== modified file 'extensions/fts++/Makefile.am'
--- extensions/fts++/Makefile.am 2012-02-08 18:54:58 +0000
+++ extensions/fts++/Makefile.am 2012-02-10 12:11:19 +0000
@@ -76,6 +76,11 @@
76 -lxapian \76 -lxapian \
77 $(NULL)77 $(NULL)
7878
79if HAVE_DEE_ICU
80AM_CPPFLAGS += $(DEE_ICU_CFLAGS)
81zeitgeist_fts_LDADD += $(DEE_ICU_LIBS)
82endif
83
79BUILT_SOURCES = \84BUILT_SOURCES = \
80 zeitgeist-internal.stamp \85 zeitgeist-internal.stamp \
81 zeitgeist-fts_vala.stamp \86 zeitgeist-fts_vala.stamp \
8287
=== modified file 'extensions/fts++/fts.cpp'
--- extensions/fts++/fts.cpp 2012-02-09 09:32:33 +0000
+++ extensions/fts++/fts.cpp 2012-02-10 12:11:19 +0000
@@ -84,6 +84,36 @@
84 return results;84 return results;
85}85}
8686
87GPtrArray*
88zeitgeist_indexer_search_with_relevancies (ZeitgeistIndexer *indexer,
89 const gchar *search_string,
90 ZeitgeistTimeRange *time_range,
91 GPtrArray *templates,
92 guint offset,
93 guint count,
94 ZeitgeistResultType result_type,
95 gdouble **relevancies,
96 gint *relevancies_size,
97 guint *matches,
98 GError **error)
99{
100 GPtrArray *results;
101 ZeitgeistFTS::Controller *_indexer;
102
103 g_return_val_if_fail (indexer != NULL, NULL);
104 g_return_val_if_fail (search_string != NULL, NULL);
105 g_return_val_if_fail (ZEITGEIST_IS_TIME_RANGE (time_range), NULL);
106 g_return_val_if_fail (error == NULL || *error == NULL, NULL);
107
108 _indexer = (ZeitgeistFTS::Controller*) indexer;
109
110 results = _indexer->indexer->SearchWithRelevancies (
111 search_string, time_range, templates, offset, count, result_type,
112 relevancies, relevancies_size, matches, error);
113
114 return results;
115}
116
87void zeitgeist_indexer_index_events (ZeitgeistIndexer *indexer,117void zeitgeist_indexer_index_events (ZeitgeistIndexer *indexer,
88 GPtrArray *events)118 GPtrArray *events)
89{119{
90120
=== modified file 'extensions/fts++/fts.h'
--- extensions/fts++/fts.h 2012-02-09 09:32:33 +0000
+++ extensions/fts++/fts.h 2012-02-10 12:11:19 +0000
@@ -43,6 +43,19 @@
43 guint *matches,43 guint *matches,
44 GError **error);44 GError **error);
4545
46GPtrArray* zeitgeist_indexer_search_with_relevancies
47 (ZeitgeistIndexer *indexer,
48 const gchar *search_string,
49 ZeitgeistTimeRange *time_range,
50 GPtrArray *templates,
51 guint offset,
52 guint count,
53 ZeitgeistResultType result_type,
54 gdouble **relevancies,
55 gint *relevancies_size,
56 guint *matches,
57 GError **error);
58
46void zeitgeist_indexer_index_events (ZeitgeistIndexer *indexer,59void zeitgeist_indexer_index_events (ZeitgeistIndexer *indexer,
47 GPtrArray *events);60 GPtrArray *events);
4861
4962
=== modified file 'extensions/fts++/fts.vapi'
--- extensions/fts++/fts.vapi 2012-02-07 17:02:30 +0000
+++ extensions/fts++/fts.vapi 2012-02-10 12:11:19 +0000
@@ -14,6 +14,16 @@
14 ResultType result_type,14 ResultType result_type,
15 out uint matches) throws GLib.Error;15 out uint matches) throws GLib.Error;
1616
17 public GLib.GenericArray<Event> search_with_relevancies (
18 string search_string,
19 TimeRange time_range,
20 GLib.GenericArray<Event> templates,
21 uint offset,
22 uint count,
23 ResultType result_type,
24 out double[] relevancies,
25 out uint matches) throws GLib.Error;
26
17 public void index_events (GLib.GenericArray<Event> events);27 public void index_events (GLib.GenericArray<Event> events);
1828
19 public void delete_events (uint[] event_ids);29 public void delete_events (uint[] event_ids);
2030
=== modified file 'extensions/fts++/indexer.cpp'
--- extensions/fts++/indexer.cpp 2012-02-09 09:37:48 +0000
+++ extensions/fts++/indexer.cpp 2012-02-10 12:11:19 +0000
@@ -356,10 +356,40 @@
356 }356 }
357}357}
358358
359std::string Indexer::PreprocessString (std::string const& input)
360{
361 if (input.empty ()) return input;
362
363 std::string result (StringUtils::RemoveUnderscores (input));
364 // a simple heuristic for the uncamelcaser
365 size_t num_digits = StringUtils::CountDigits (result);
366 if (result.length () > 3 && num_digits < result.length () / 2)
367 {
368 // FIXME: process digits?, atm they stay attached to the text
369 result = StringUtils::UnCamelcase (result);
370 }
371
372 std::string folded (StringUtils::AsciiFold (result));
373 if (!folded.empty ())
374 {
375 result += ' ';
376 result += folded;
377 }
378
379#ifdef DEBUG_PREPROCESSING
380 if (input != result)
381 g_debug ("processed: %s\n-> %s", input.c_str (), result.c_str ());
382#endif
383
384 return result;
385}
386
359void Indexer::IndexText (std::string const& text)387void Indexer::IndexText (std::string const& text)
360{388{
361 // FIXME: ascii folding!
362 tokenizer->index_text (text, 5);389 tokenizer->index_text (text, 5);
390 // this is by definition already a human readable display string,
391 // so it shouldn't need removal of underscores and uncamelcase
392 tokenizer->index_text (StringUtils::AsciiFold (text), 5);
363}393}
364394
365void Indexer::IndexUri (std::string const& uri, std::string const& origin)395void Indexer::IndexUri (std::string const& uri, std::string const& origin)
@@ -403,9 +433,10 @@
403 gchar *pn = g_file_get_parse_name (f);433 gchar *pn = g_file_get_parse_name (f);
404 gchar *basename = g_path_get_basename (pn);434 gchar *basename = g_path_get_basename (pn);
405435
406 // FIXME: remove unscores, CamelCase and process digits436 // remove unscores, CamelCase and process digits
407 tokenizer->index_text (basename, 5);437 std::string processed (PreprocessString (basename));
408 tokenizer->index_text (basename, 5, "N");438 tokenizer->index_text (processed, 5);
439 tokenizer->index_text (processed, 5, "N");
409440
410 g_free (basename);441 g_free (basename);
411 // limit the directory indexing to just a few levels442 // limit the directory indexing to just a few levels
@@ -420,17 +451,17 @@
420 g_free (dir);451 g_free (dir);
421 g_free (pn);452 g_free (pn);
422453
423 while (path_component.length () > 2 && 454 while (path_component.length () > 2 &&
424 weight_index < G_N_ELEMENTS (path_weights))455 weight_index < G_N_ELEMENTS (path_weights))
425 {456 {
426 // if this is already home directory we don't want it457 // if this is already home directory we don't want it
427 if (path_component.length () == home_dir_path.length () &&458 if (path_component == home_dir_path) return;
428 path_component == home_dir_path) return;
429459
430 gchar *name = g_path_get_basename (path_component.c_str ());460 gchar *name = g_path_get_basename (path_component.c_str ());
431461
432 // FIXME: un-underscore, uncamelcase, ascii fold462 // un-underscore, uncamelcase, ascii fold
433 tokenizer->index_text (name, path_weights[weight_index++]);463 processed = PreprocessString (name);
464 tokenizer->index_text (processed, path_weights[weight_index++]);
434465
435 dir = g_path_get_dirname (path_component.c_str ());466 dir = g_path_get_dirname (path_component.c_str ());
436 path_component = dir;467 path_component = dir;
@@ -471,9 +502,10 @@
471 502
472 if (g_utf8_validate (unescaped_basename, -1, NULL))503 if (g_utf8_validate (unescaped_basename, -1, NULL))
473 {504 {
474 // FIXME: remove unscores, CamelCase and process digits505 // remove unscores, CamelCase and process digits
475 tokenizer->index_text (unescaped_basename, 5);506 std::string processed (PreprocessString (unescaped_basename));
476 tokenizer->index_text (unescaped_basename, 5, "N");507 tokenizer->index_text (processed, 5);
508 tokenizer->index_text (processed, 5, "N");
477 }509 }
478510
479 // and also index hostname (taken from origin field if possible)511 // and also index hostname (taken from origin field if possible)
@@ -505,6 +537,7 @@
505 {537 {
506 // we *really* don't want to index anything with this scheme538 // we *really* don't want to index anything with this scheme
507 }539 }
540 // how about special casing (s)ftp and ssh?
508 else541 else
509 {542 {
510 std::string authority, path, query;543 std::string authority, path, query;
@@ -593,12 +626,11 @@
593 unsigned name_weight = is_subject ? 5 : 2;626 unsigned name_weight = is_subject ? 5 : 2;
594 unsigned comment_weight = 2;627 unsigned comment_weight = 2;
595628
596 // FIXME: ascii folding somewhere
597
598 val = g_app_info_get_display_name (ai);629 val = g_app_info_get_display_name (ai);
599 if (val && val[0] != '\0')630 if (val && val[0] != '\0')
600 {631 {
601 std::string display_name (val);632 std::string display_name (PreprocessString (val));
633
602 tokenizer->index_text (display_name, name_weight);634 tokenizer->index_text (display_name, name_weight);
603 tokenizer->index_text (display_name, name_weight, "A");635 tokenizer->index_text (display_name, name_weight, "A");
604 }636 }
@@ -606,9 +638,14 @@
606 val = g_desktop_app_info_get_generic_name (dai);638 val = g_desktop_app_info_get_generic_name (dai);
607 if (val && val[0] != '\0')639 if (val && val[0] != '\0')
608 {640 {
641 // this shouldn't need uncamelcasing
609 std::string generic_name (val);642 std::string generic_name (val);
643 std::string generic_name_folded (StringUtils::AsciiFold (generic_name));
644
610 tokenizer->index_text (generic_name, name_weight);645 tokenizer->index_text (generic_name, name_weight);
611 tokenizer->index_text (generic_name, name_weight, "A");646 tokenizer->index_text (generic_name, name_weight, "A");
647 tokenizer->index_text (generic_name_folded, name_weight);
648 tokenizer->index_text (generic_name_folded, name_weight, "A");
612 }649 }
613650
614 if (!is_subject) return true;651 if (!is_subject) return true;
@@ -642,7 +679,35 @@
642 return true;679 return true;
643}680}
644681
645GPtrArray* Indexer::Search (const gchar *search_string,682std::string Indexer::CompileQueryString (const gchar *search_string,
683 ZeitgeistTimeRange *time_range,
684 GPtrArray *templates)
685{
686 std::string query_string (search_string);
687
688 if (templates && templates->len > 0)
689 {
690 std::string filters (CompileEventFilterQuery (templates));
691 query_string = "(" + query_string + ") AND (" + filters + ")";
692 }
693
694 if (time_range)
695 {
696 gint64 start_time = zeitgeist_time_range_get_start (time_range);
697 gint64 end_time = zeitgeist_time_range_get_end (time_range);
698
699 if (start_time > 0 || end_time < G_MAXINT64)
700 {
701 std::string time_filter (CompileTimeRangeFilterQuery (start_time, end_time));
702 query_string = "(" + query_string + ") AND (" + time_filter + ")";
703 }
704 }
705
706 g_debug ("query: %s", query_string.c_str ());
707 return query_string;
708}
709
710GPtrArray* Indexer::Search (const gchar *search,
646 ZeitgeistTimeRange *time_range,711 ZeitgeistTimeRange *time_range,
647 GPtrArray *templates,712 GPtrArray *templates,
648 guint offset,713 guint offset,
@@ -654,28 +719,22 @@
654 GPtrArray *results = NULL;719 GPtrArray *results = NULL;
655 try720 try
656 {721 {
657 std::string query_string(search_string);722 std::string query_string (CompileQueryString (search, time_range, templates));
658723
659 if (templates && templates->len > 0)724 // When sorting by some result types, we need to fetch some extra events
660 {725 // from the Xapian index because the final result set will be coalesced
661 std::string filters (CompileEventFilterQuery (templates));726 // on some property of the event
662 query_string = "(" + query_string + ") AND (" + filters + ")";727 guint maxhits;
663 }728 if (result_type == 100 ||
664729 result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS ||
665 if (time_range)730 result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_EVENTS)
666 {731 {
667 gint64 start_time = zeitgeist_time_range_get_start (time_range);732 maxhits = count;
668 gint64 end_time = zeitgeist_time_range_get_end (time_range);733 }
669734 else
670 if (start_time > 0 || end_time < G_MAXINT64)735 {
671 {736 maxhits = count * 3;
672 std::string time_filter (CompileTimeRangeFilterQuery (start_time, end_time));737 }
673 query_string = "(" + query_string + ") AND (" + time_filter + ")";
674 }
675 }
676
677 // FIXME: which result types coalesce?
678 guint maxhits = count * 3;
679738
680 if (result_type == 100)739 if (result_type == 100)
681 {740 {
@@ -686,7 +745,6 @@
686 enquire->set_sort_by_value (VALUE_TIMESTAMP, true);745 enquire->set_sort_by_value (VALUE_TIMESTAMP, true);
687 }746 }
688747
689 g_debug ("query: %s", query_string.c_str ());
690 Xapian::Query q(query_parser->parse_query (query_string, QUERY_PARSER_FLAGS));748 Xapian::Query q(query_parser->parse_query (query_string, QUERY_PARSER_FLAGS));
691 enquire->set_query (q);749 enquire->set_query (q);
692 Xapian::MSet hits (enquire->get_mset (offset, maxhits));750 Xapian::MSet hits (enquire->get_mset (offset, maxhits));
@@ -753,7 +811,119 @@
753 }811 }
754 catch (Xapian::Error const& e)812 catch (Xapian::Error const& e)
755 {813 {
756 g_warning ("Failed to index event: %s", e.get_msg ().c_str ());814 g_warning ("Failed to search index: %s", e.get_msg ().c_str ());
815 g_set_error_literal (error,
816 ZEITGEIST_ENGINE_ERROR,
817 ZEITGEIST_ENGINE_ERROR_DATABASE_ERROR,
818 e.get_msg ().c_str ());
819 }
820
821 return results;
822}
823
824GPtrArray* Indexer::SearchWithRelevancies (const gchar *search,
825 ZeitgeistTimeRange *time_range,
826 GPtrArray *templates,
827 guint offset,
828 guint count,
829 ZeitgeistResultType result_type,
830 gdouble **relevancies,
831 gint *relevancies_size,
832 guint *matches,
833 GError **error)
834{
835 GPtrArray *results = NULL;
836 try
837 {
838 std::string query_string (CompileQueryString (search, time_range, templates));
839
840 guint maxhits = count;
841
842 if (result_type == 100)
843 {
844 enquire->set_sort_by_relevance ();
845 }
846 else
847 {
848 enquire->set_sort_by_value (VALUE_TIMESTAMP, true);
849 }
850
851 Xapian::Query q(query_parser->parse_query (query_string, QUERY_PARSER_FLAGS));
852 enquire->set_query (q);
853 Xapian::MSet hits (enquire->get_mset (offset, maxhits));
854 Xapian::doccount hitcount = hits.get_matches_estimated ();
855
856 if (result_type == 100)
857 {
858 std::vector<unsigned> event_ids;
859 std::vector<gdouble> relevancy_arr;
860 Xapian::MSetIterator iter, end;
861 for (iter = hits.begin (), end = hits.end (); iter != end; ++iter)
862 {
863 Xapian::Document doc(iter.get_document ());
864 double unserialized =
865 Xapian::sortable_unserialise (doc.get_value (VALUE_EVENT_ID));
866 unsigned event_id = static_cast<unsigned>(unserialized);
867 event_ids.push_back (event_id);
868
869 double rank = iter.get_percent () / 100.;
870 relevancy_arr.push_back (rank);
871 }
872
873 results = zeitgeist_db_reader_get_events (zg_reader,
874 &event_ids[0],
875 event_ids.size (),
876 NULL,
877 error);
878
879 if (results->len != relevancy_arr.size ())
880 {
881 g_warning ("Results don't match relevancies!");
882 g_set_error_literal (error,
883 ZEITGEIST_ENGINE_ERROR,
884 ZEITGEIST_ENGINE_ERROR_DATABASE_ERROR,
885 "Internal database error");
886 return NULL;
887 }
888
889 if (relevancies)
890 {
891 *relevancies = (gdouble*) g_memdup (&relevancy_arr[0],
892 sizeof (gdouble) * results->len);
893 }
894 if (relevancies_size)
895 {
896 *relevancies_size = relevancy_arr.size ();
897 }
898 }
899 else
900 {
901 g_set_error_literal (error,
902 ZEITGEIST_ENGINE_ERROR,
903 ZEITGEIST_ENGINE_ERROR_INVALID_ARGUMENT,
904 "Only RELEVANCY result type is supported");
905 /*
906 * perhaps something like this could be used here?
907 std::map<unsigned, gdouble> relevancy_map;
908 foreach (...)
909 {
910 double rank = iter.get_percent () / 100.;
911 if (rank > relevancy_map[event_id])
912 {
913 relevancy_map[event_id] = rank;
914 }
915 }
916 */
917 }
918
919 if (matches)
920 {
921 *matches = hitcount;
922 }
923 }
924 catch (Xapian::Error const& e)
925 {
926 g_warning ("Failed to search index: %s", e.get_msg ().c_str ());
757 g_set_error_literal (error,927 g_set_error_literal (error,
758 ZEITGEIST_ENGINE_ERROR,928 ZEITGEIST_ENGINE_ERROR,
759 ZEITGEIST_ENGINE_ERROR_DATABASE_ERROR,929 ZEITGEIST_ENGINE_ERROR_DATABASE_ERROR,
760930
=== modified file 'extensions/fts++/indexer.h'
--- extensions/fts++/indexer.h 2012-02-09 09:37:48 +0000
+++ extensions/fts++/indexer.h 2012-02-10 12:11:19 +0000
@@ -77,7 +77,7 @@
77 void DeleteEvent (guint32 event_id);77 void DeleteEvent (guint32 event_id);
78 void SetDbMetadata (std::string const& key, std::string const& value);78 void SetDbMetadata (std::string const& key, std::string const& value);
7979
80 GPtrArray* Search (const gchar *search_string,80 GPtrArray* Search (const gchar *search,
81 ZeitgeistTimeRange *time_range,81 ZeitgeistTimeRange *time_range,
82 GPtrArray *templates,82 GPtrArray *templates,
83 guint offset,83 guint offset,
@@ -85,11 +85,26 @@
85 ZeitgeistResultType result_type,85 ZeitgeistResultType result_type,
86 guint *matches,86 guint *matches,
87 GError **error);87 GError **error);
88 GPtrArray* SearchWithRelevancies (const gchar *search,
89 ZeitgeistTimeRange *time_range,
90 GPtrArray *templates,
91 guint offset,
92 guint count,
93 ZeitgeistResultType result_type,
94 gdouble **relevancies,
95 gint *relevancies_size,
96 guint *matches,
97 GError **error);
8898
89private:99private:
90 std::string ExpandType (std::string const& prefix, const gchar* unparsed_uri);100 std::string ExpandType (std::string const& prefix, const gchar* unparsed_uri);
91 std::string CompileEventFilterQuery (GPtrArray *templates);101 std::string CompileEventFilterQuery (GPtrArray *templates);
92 std::string CompileTimeRangeFilterQuery (gint64 start, gint64 end);102 std::string CompileTimeRangeFilterQuery (gint64 start, gint64 end);
103 std::string CompileQueryString (const gchar *search,
104 ZeitgeistTimeRange *time_range,
105 GPtrArray *templates);
106
107 std::string PreprocessString (std::string const& input);
93108
94 void AddDocFilters (ZeitgeistEvent *event, Xapian::Document &doc);109 void AddDocFilters (ZeitgeistEvent *event, Xapian::Document &doc);
95 void IndexText (std::string const& text);110 void IndexText (std::string const& text);
96111
=== modified file 'extensions/fts++/stringutils.cpp'
--- extensions/fts++/stringutils.cpp 2012-02-09 09:32:33 +0000
+++ extensions/fts++/stringutils.cpp 2012-02-10 12:11:19 +0000
@@ -17,9 +17,14 @@
17 * Authored by Mikkel Kamstrup Erlandsen <mikkel.kamstrup@gmail.com>17 * Authored by Mikkel Kamstrup Erlandsen <mikkel.kamstrup@gmail.com>
18 *18 *
19 */19 */
20
21#include "stringutils.h"
20#include <string>22#include <string>
23#include <algorithm>
2124
22#include "stringutils.h"25#ifdef HAVE_DEE_ICU
26#include <dee-icu.h>
27#endif
2328
24using namespace std;29using namespace std;
2530
@@ -123,6 +128,87 @@
123 }128 }
124}129}
125130
131string RemoveUnderscores (string const &input)
132{
133 string result (input);
134 std::replace (result.begin (), result.end (), '_', ' ');
135
136 return result;
137}
138
139static bool is_digit (char c) { return c >= '0' && c <= '9'; }
140
141size_t CountDigits (string const &input)
142{
143 return std::count_if (input.begin (), input.end (), is_digit);
144}
145
146static GRegex *camelcase_matcher = NULL;
147
148static gboolean
149matcher_cb (const GMatchInfo *match_info, GString *result, gpointer user_data)
150{
151 gint start_pos;
152 g_match_info_fetch_pos (match_info, 0, &start_pos, NULL);
153 if (start_pos != 0) g_string_append_c (result, ' ');
154 gchar *word = g_match_info_fetch (match_info, 0);
155 g_string_append (result, word);
156 g_free (word);
157
158 return FALSE;
159}
160
161string UnCamelcase (string const &input)
162{
163 if (camelcase_matcher == NULL)
164 {
165 camelcase_matcher = g_regex_new ("(?<=^|[[:lower:]])[[:upper:]]+[^[:upper:]]+", G_REGEX_OPTIMIZE, (GRegexMatchFlags) 0, NULL);
166 if (camelcase_matcher == NULL) g_critical ("Unable to create matcher!");
167 }
168
169 gchar *result = g_regex_replace_eval (camelcase_matcher, input.c_str (),
170 input.length (), 0,
171 (GRegexMatchFlags) 0,
172 matcher_cb, NULL, NULL);
173
174 string ret (result);
175 g_free (result);
176 return ret;
177}
178
179#ifdef HAVE_DEE_ICU
180static DeeICUTermFilter *icu_filter = NULL;
181
182/**
183 * Use ascii folding filter on the input text and return folded version
184 * of the original string.
185 *
186 * Note that if the folded version is exactly the same as the original
187 * empty string will be returned.
188 */
189string AsciiFold (string const& input)
190{
191 if (icu_filter == NULL)
192 {
193 icu_filter = dee_icu_term_filter_new_ascii_folder ();
194 if (icu_filter == NULL) return "";
195 }
196
197 // FIXME: check first if the input contains any non-ascii chars?
198
199 gchar *folded = dee_icu_term_filter_apply (icu_filter, input.c_str ());
200 string result (folded);
201 g_free (folded);
202
203 return result == input ? "" : result;
204}
205#else
206string AsciiFold (string const& input)
207{
208 return "";
209}
210#endif
211
126} /* namespace StringUtils */212} /* namespace StringUtils */
127213
128} /* namespace ZeitgeistFTS */214} /* namespace ZeitgeistFTS */
129215
=== modified file 'extensions/fts++/stringutils.h'
--- extensions/fts++/stringutils.h 2012-02-09 09:32:33 +0000
+++ extensions/fts++/stringutils.h 2012-02-10 12:11:19 +0000
@@ -37,6 +37,14 @@
37 std::string &path,37 std::string &path,
38 std::string &basename);38 std::string &basename);
3939
40std::string RemoveUnderscores (std::string const &input);
41
42size_t CountDigits (std::string const &input);
43
44std::string UnCamelcase (std::string const &input);
45
46std::string AsciiFold (std::string const& input);
47
40} /* namespace StringUtils */48} /* namespace StringUtils */
4149
42} /* namespace ZeitgeistFTS */50} /* namespace ZeitgeistFTS */
4351
=== modified file 'extensions/fts++/test/Makefile.am'
--- extensions/fts++/test/Makefile.am 2012-02-08 18:54:58 +0000
+++ extensions/fts++/test/Makefile.am 2012-02-10 12:11:19 +0000
@@ -25,3 +25,8 @@
25 -lxapian \25 -lxapian \
26 $(NULL)26 $(NULL)
2727
28if HAVE_DEE_ICU
29AM_CPPFLAGS += $(DEE_ICU_CFLAGS)
30test_fts_LDADD += $(DEE_ICU_LIBS)
31endif
32
2833
=== modified file 'extensions/fts++/test/test-indexer.cpp'
--- extensions/fts++/test/test-indexer.cpp 2012-02-09 09:32:33 +0000
+++ extensions/fts++/test/test-indexer.cpp 2012-02-10 12:11:19 +0000
@@ -145,6 +145,26 @@
145 return event;145 return event;
146}146}
147147
148static ZeitgeistEvent* create_test_event5 (void)
149{
150 ZeitgeistEvent *event = zeitgeist_event_new ();
151 ZeitgeistSubject *subject = zeitgeist_subject_new ();
152
153 zeitgeist_subject_set_interpretation (subject, ZEITGEIST_NFO_SOURCE_CODE);
154 zeitgeist_subject_set_manifestation (subject, ZEITGEIST_NFO_FILE_DATA_OBJECT);
155 zeitgeist_subject_set_uri (subject, "file:///home/username/projects/GLibSignalImplementation.cpp");
156 zeitgeist_subject_set_text (subject, "Because c++ is awesome");
157 zeitgeist_subject_set_mimetype (subject, "text/x-c++src");
158
159 zeitgeist_event_set_interpretation (event, ZEITGEIST_ZG_CREATE_EVENT);
160 zeitgeist_event_set_manifestation (event, ZEITGEIST_ZG_USER_ACTIVITY);
161 zeitgeist_event_set_actor (event, "application://gedit.desktop");
162 zeitgeist_event_add_subject (event, subject);
163
164 g_object_unref (subject);
165 return event;
166}
167
148// Steals the event, ref it if you want to keep it168// Steals the event, ref it if you want to keep it
149static guint169static guint
150index_event (Fixture *fix, ZeitgeistEvent *event)170index_event (Fixture *fix, ZeitgeistEvent *event)
@@ -426,6 +446,71 @@
426}446}
427447
428static void448static void
449test_simple_underscores (Fixture *fix, gconstpointer data)
450{
451 guint matches;
452 guint event_id;
453 ZeitgeistEvent* event;
454 ZeitgeistSubject *subject;
455
456 // add test events to DBs
457 index_event (fix, create_test_event1 ());
458 index_event (fix, create_test_event2 ());
459 index_event (fix, create_test_event3 ());
460 event_id = index_event (fix, create_test_event4 ());
461
462 GPtrArray *results =
463 zeitgeist_indexer_search (fix->indexer,
464 "fabulo*",
465 zeitgeist_time_range_new_anytime (),
466 g_ptr_array_new (),
467 0,
468 10,
469 ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS,
470 &matches,
471 NULL);
472
473 g_assert_cmpuint (matches, >, 0);
474 g_assert_cmpuint (results->len, ==, 1);
475
476 event = (ZeitgeistEvent*) results->pdata[0];
477 g_assert_cmpuint (zeitgeist_event_get_id (event), ==, event_id);
478}
479
480static void
481test_simple_camelcase (Fixture *fix, gconstpointer data)
482{
483 guint matches;
484 guint event_id;
485 ZeitgeistEvent* event;
486 ZeitgeistSubject *subject;
487
488 // add test events to DBs
489 index_event (fix, create_test_event1 ());
490 index_event (fix, create_test_event2 ());
491 index_event (fix, create_test_event3 ());
492 index_event (fix, create_test_event4 ());
493 event_id = index_event (fix, create_test_event5 ());
494
495 GPtrArray *results =
496 zeitgeist_indexer_search (fix->indexer,
497 "signal",
498 zeitgeist_time_range_new_anytime (),
499 g_ptr_array_new (),
500 0,
501 10,
502 ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS,
503 &matches,
504 NULL);
505
506 g_assert_cmpuint (matches, >, 0);
507 g_assert_cmpuint (results->len, ==, 1);
508
509 event = (ZeitgeistEvent*) results->pdata[0];
510 g_assert_cmpuint (zeitgeist_event_get_id (event), ==, event_id);
511}
512
513static void
429test_simple_cjk (Fixture *fix, gconstpointer data)514test_simple_cjk (Fixture *fix, gconstpointer data)
430{515{
431 guint matches;516 guint matches;
@@ -517,6 +602,10 @@
517 setup, test_simple_noexpand, teardown);602 setup, test_simple_noexpand, teardown);
518 g_test_add ("/Zeitgeist/FTS/Indexer/SimpleNoexpandValid", Fixture, 0,603 g_test_add ("/Zeitgeist/FTS/Indexer/SimpleNoexpandValid", Fixture, 0,
519 setup, test_simple_noexpand_valid, teardown);604 setup, test_simple_noexpand_valid, teardown);
605 g_test_add ("/Zeitgeist/FTS/Indexer/SimpleUnderscores", Fixture, 0,
606 setup, test_simple_underscores, teardown);
607 g_test_add ("/Zeitgeist/FTS/Indexer/SimpleCamelcase", Fixture, 0,
608 setup, test_simple_camelcase, teardown);
520 g_test_add ("/Zeitgeist/FTS/Indexer/URLUnescape", Fixture, 0,609 g_test_add ("/Zeitgeist/FTS/Indexer/URLUnescape", Fixture, 0,
521 setup, test_simple_url_unescape, teardown);610 setup, test_simple_url_unescape, teardown);
522 g_test_add ("/Zeitgeist/FTS/Indexer/IDNSupport", Fixture, 0,611 g_test_add ("/Zeitgeist/FTS/Indexer/IDNSupport", Fixture, 0,
523612
=== modified file 'extensions/fts++/test/test-stringutils.cpp'
--- extensions/fts++/test/test-stringutils.cpp 2012-02-09 09:32:33 +0000
+++ extensions/fts++/test/test-stringutils.cpp 2012-02-10 12:11:19 +0000
@@ -163,6 +163,91 @@
163 g_assert_cmpstr ("type=A", ==, query.c_str ());163 g_assert_cmpstr ("type=A", ==, query.c_str ());
164}164}
165165
166static void
167test_ascii_fold (Fixture *fix, gconstpointer data)
168{
169 std::string folded;
170
171 folded = StringUtils::AsciiFold ("");
172 g_assert_cmpstr ("", ==, folded.c_str ());
173
174 // if the original matches the folded version, AsciiFold returns ""
175 folded = StringUtils::AsciiFold ("a");
176 g_assert_cmpstr ("", ==, folded.c_str ());
177
178 folded = StringUtils::AsciiFold ("abcdef");
179 g_assert_cmpstr ("", ==, folded.c_str ());
180
181 folded = StringUtils::AsciiFold ("å");
182 g_assert_cmpstr ("a", ==, folded.c_str ());
183
184 folded = StringUtils::AsciiFold ("åå");
185 g_assert_cmpstr ("aa", ==, folded.c_str ());
186
187 folded = StringUtils::AsciiFold ("aåaåa");
188 g_assert_cmpstr ("aaaaa", ==, folded.c_str ());
189}
190
191static void
192test_underscores (Fixture *fix, gconstpointer data)
193{
194 g_assert_cmpstr ("", ==, StringUtils::RemoveUnderscores ("").c_str ());
195
196 g_assert_cmpstr (" ", ==, StringUtils::RemoveUnderscores ("_").c_str ());
197
198 g_assert_cmpstr (" ", ==, StringUtils::RemoveUnderscores ("___").c_str ());
199
200 g_assert_cmpstr ("abcd", ==, StringUtils::RemoveUnderscores ("abcd").c_str ());
201
202 g_assert_cmpstr (" abcd ", ==, StringUtils::RemoveUnderscores ("_abcd_").c_str ());
203
204 g_assert_cmpstr ("a b c d", ==, StringUtils::RemoveUnderscores ("a_b_c_d").c_str ());
205}
206
207static void
208test_uncamelcase (Fixture *fix, gconstpointer data)
209{
210 g_assert_cmpstr ("", ==, StringUtils::UnCamelcase ("").c_str ());
211
212 g_assert_cmpstr ("abcd", ==, StringUtils::UnCamelcase ("abcd").c_str ());
213
214 g_assert_cmpstr ("Abcd", ==, StringUtils::UnCamelcase ("Abcd").c_str ());
215
216 g_assert_cmpstr ("ABCD", ==, StringUtils::UnCamelcase ("ABCD").c_str ());
217
218 g_assert_cmpstr ("ABcd", ==, StringUtils::UnCamelcase ("ABcd").c_str ());
219
220 g_assert_cmpstr ("Abcd Ef", ==, StringUtils::UnCamelcase ("AbcdEf").c_str ());
221
222 g_assert_cmpstr ("Text Editor", ==, StringUtils::UnCamelcase ("Text Editor").c_str ());
223
224 g_assert_cmpstr ("py Karaoke", ==, StringUtils::UnCamelcase ("pyKaraoke").c_str ());
225
226 g_assert_cmpstr ("Zeitgeist Project", ==, StringUtils::UnCamelcase ("ZeitgeistProject").c_str ());
227
228 g_assert_cmpstr ("Very Nice Camel Case Text", ==, StringUtils::UnCamelcase ("VeryNiceCamelCaseText").c_str ());
229
230 g_assert_cmpstr ("Ňeedš Ťo Wórk Óń Útf Čhářacters As WelL", ==,
231 StringUtils::UnCamelcase ("ŇeedšŤoWórkÓńÚtfČhářactersAsWelL").c_str ());
232}
233
234static void
235test_count_digits (Fixture *fix, gconstpointer data)
236{
237 g_assert_cmpuint (0, ==, StringUtils::CountDigits (""));
238
239 g_assert_cmpuint (0, ==, StringUtils::CountDigits ("abcdefghijklmnopqrstuvwxyz"));
240
241 g_assert_cmpuint (10, ==, StringUtils::CountDigits ("0123456789"));
242
243 g_assert_cmpuint (1, ==, StringUtils::CountDigits ("abc3"));
244
245 g_assert_cmpuint (3, ==, StringUtils::CountDigits ("::123__poa//weee"));
246
247 g_assert_cmpuint (5, ==, StringUtils::CountDigits ("PCN30129.JPG"));
248
249}
250
166G_BEGIN_DECLS251G_BEGIN_DECLS
167252
168void test_stringutils_create_suite (void)253void test_stringutils_create_suite (void)
@@ -173,6 +258,16 @@
173 setup, test_mangle, teardown);258 setup, test_mangle, teardown);
174 g_test_add ("/Zeitgeist/FTS/StringUtils/SplitUri", Fixture, 0,259 g_test_add ("/Zeitgeist/FTS/StringUtils/SplitUri", Fixture, 0,
175 setup, test_split, teardown);260 setup, test_split, teardown);
261 g_test_add ("/Zeitgeist/FTS/StringUtils/RemoveUnderscores", Fixture, 0,
262 setup, test_underscores, teardown);
263 g_test_add ("/Zeitgeist/FTS/StringUtils/UnCamelcase", Fixture, 0,
264 setup, test_uncamelcase, teardown);
265 g_test_add ("/Zeitgeist/FTS/StringUtils/CountDigits", Fixture, 0,
266 setup, test_count_digits, teardown);
267#ifdef HAVE_DEE_ICU
268 g_test_add ("/Zeitgeist/FTS/StringUtils/AsciiFold", Fixture, 0,
269 setup, test_ascii_fold, teardown);
270#endif
176}271}
177272
178G_END_DECLS273G_END_DECLS
179274
=== modified file 'extensions/fts++/zeitgeist-fts.vala'
--- extensions/fts++/zeitgeist-fts.vala 2012-02-09 09:32:33 +0000
+++ extensions/fts++/zeitgeist-fts.vala 2012-02-10 12:11:19 +0000
@@ -132,6 +132,23 @@
132 events = Events.to_variant (results);132 events = Events.to_variant (results);
133 }133 }
134134
/* Search entry point that also reports relevancies.
 * Converts the incoming variants (time_range, filter_templates) into a
 * TimeRange and event templates, delegates to
 * instance.indexer.search_with_relevancies, and returns the resulting
 * events re-packed as a Variant. relevancies and matches are filled in
 * directly by the indexer call. */
135 public async void search_with_relevancies (
136 string query_string, Variant time_range,
137 Variant filter_templates,
138 uint offset, uint count, uint result_type,
139 out Variant events, out double[] relevancies,
140 out uint matches)
141 throws Error
142 {
143 var tr = new TimeRange.from_variant (time_range);
144 var templates = Events.from_variant (filter_templates);
 // result_type arrives as a plain uint and is cast to ResultType here.
145 var results = instance.indexer.search_with_relevancies (
146 query_string, tr, templates, offset, count,
147 (ResultType) result_type, out relevancies, out matches);
148
149 events = Events.to_variant (results);
150 }
151
135 private static void name_acquired_callback (DBusConnection conn)152 private static void name_acquired_callback (DBusConnection conn)
136 {153 {
137 name_acquired = true;154 name_acquired = true;
138155
=== modified file 'extensions/fts.vala'
--- extensions/fts.vala 2012-02-07 12:47:44 +0000
+++ extensions/fts.vala 2012-02-10 12:11:19 +0000
@@ -31,6 +31,14 @@
31 uint offset, uint count, uint result_type,31 uint offset, uint count, uint result_type,
32 [DBus (signature = "a(asaasay)")] out Variant events,32 [DBus (signature = "a(asaasay)")] out Variant events,
33 out uint matches) throws Error;33 out uint matches) throws Error;
/* Abstract async declaration of the relevancy-aware search call.
 * The [DBus] attributes fix the wire signatures: (xx) for time_range and
 * a(asaasay) for both filter_templates and the returned events.
 * In addition to events and matches it yields a double[] of relevancies
 * (presumably one score per returned event - confirm against the
 * indexer implementation). */
34 public abstract async void search_with_relevancies (
35 string query_string,
36 [DBus (signature = "(xx)")] Variant time_range,
37 [DBus (signature = "a(asaasay)")] Variant filter_templates,
38 uint offset, uint count, uint result_type,
39 [DBus (signature = "a(asaasay)")] out Variant events,
40 out double[] relevancies,
41 out uint matches) throws Error;
34 }42 }
3543
36 /* Because of a Vala bug we have to define the proxy interface outside of44 /* Because of a Vala bug we have to define the proxy interface outside of
@@ -55,6 +63,7 @@
55 private const string INDEXER_NAME = "org.gnome.zeitgeist.SimpleIndexer";63 private const string INDEXER_NAME = "org.gnome.zeitgeist.SimpleIndexer";
5664
57 private RemoteSimpleIndexer siin;65 private RemoteSimpleIndexer siin;
66 private bool siin_connection_failed = false;
58 private uint registration_id;67 private uint registration_id;
59 private MonitorManager? notifier;68 private MonitorManager? notifier;
6069
@@ -67,6 +76,8 @@
67 {76 {
68 if (Utils.using_in_memory_database ()) return;77 if (Utils.using_in_memory_database ()) return;
6978
79 // FIXME: check dbus and see if fts is installed?
80
70 // installing a monitor from the daemon will ensure that we don't81 // installing a monitor from the daemon will ensure that we don't
71 // miss any notifications that would be emitted in between82 // miss any notifications that would be emitted in between
72 // zeitgeist start and fts daemon start83 // zeitgeist start and fts daemon start
@@ -109,23 +120,40 @@
109 try120 try
110 {121 {
111 siin = conn.get_proxy.end<RemoteSimpleIndexer> (res);122 siin = conn.get_proxy.end<RemoteSimpleIndexer> (res);
123 siin_connection_failed = false;
112 }124 }
113 catch (IOError err)125 catch (IOError err)
114 {126 {
127 siin_connection_failed = true;
115 warning ("%s", err.message);128 warning ("%s", err.message);
116 }129 }
117 }130 }
118131
119 public async void search (string query_string, Variant time_range,132 public async void wait_for_proxy () throws Error
120 Variant filter_templates, uint offset, uint count, uint result_type,
121 out Variant events, out uint matches) throws Error
122 {133 {
 // wait_for_proxy: give the indexer proxy a chance to appear before
 // failing. Re-checks up to 6 times, scheduling a 250 ms timeout at
 // DEFAULT_IDLE priority before each yield; stops early once siin is set
 // or a connection failure has been recorded in siin_connection_failed.
134 int i = 0;
135 while (this.siin == null && i < 6 && !siin_connection_failed)
136 {
 // The timeout resumes this async method through its own callback.
137 Timeout.add_full (Priority.DEFAULT_IDLE, 250,
138 wait_for_proxy.callback);
139 i++;
140 yield;
141 }
142
 // Still no usable proxy after waiting: report it to the caller.
123 if (siin == null || !(siin is DBusProxy))143 if (siin == null || !(siin is DBusProxy))
124 {144 {
125 // FIXME: queue until we have the proxy145 // FIXME: queue until we have the proxy
126 throw new EngineError.DATABASE_ERROR (146 throw new EngineError.DATABASE_ERROR (
127 "Not connected to SimpleIndexer");147 "Not connected to SimpleIndexer");
128 }148 }
149 }
150
151 public async void search (string query_string, Variant time_range,
152 Variant filter_templates, uint offset, uint count, uint result_type,
153 out Variant events, out uint matches) throws Error
154 {
155 if (siin == null) yield wait_for_proxy ();
156
129 var timer = new Timer ();157 var timer = new Timer ();
130 yield siin.search (query_string, time_range, filter_templates,158 yield siin.search (query_string, time_range, filter_templates,
131 offset, count, result_type,159 offset, count, result_type,
@@ -134,6 +162,24 @@
134 (uint) events.n_children (), matches, timer.elapsed ());162 (uint) events.n_children (), matches, timer.elapsed ());
135 }163 }
136164
/* Forwards a relevancy-aware search to the remote indexer proxy (siin).
 * If the proxy is not set yet, first awaits wait_for_proxy (), which
 * throws when no proxy becomes available. All arguments are passed
 * through unchanged; events, relevancies and matches are filled in by
 * the remote call. */
165 public async void search_with_relevancies (
166 string query_string, Variant time_range,
167 Variant filter_templates, uint offset, uint count, uint result_type,
168 out Variant events, out double[] relevancies, out uint matches)
169 throws Error
170 {
171 if (siin == null) yield wait_for_proxy ();
172
 // Timer is only used for the elapsed-time figure in the debug message.
173 var timer = new Timer ();
174 yield siin.search_with_relevancies (
175 query_string, time_range, filter_templates,
176 offset, count, result_type,
177 out events, out relevancies, out matches);
178
179 debug ("Got %u[/%u] results from indexer (in %f seconds)",
180 (uint) events.n_children (), matches, timer.elapsed ());
181 }
182
137 }183 }
138184
139 [ModuleInit]185 [ModuleInit]
140186
=== modified file 'src/remote.vala'
--- src/remote.vala 2012-02-05 14:52:13 +0000
+++ src/remote.vala 2012-02-10 12:11:19 +0000
@@ -121,6 +121,13 @@
121 uint offset, uint count, uint result_type,121 uint offset, uint count, uint result_type,
122 [DBus (signature = "a(asaasay)")] out Variant events,122 [DBus (signature = "a(asaasay)")] out Variant events,
123 out uint matches) throws Error;123 out uint matches) throws Error;
/* Abstract async declaration of the relevancy-aware search call.
 * The [DBus] attributes fix the wire signatures: (xx) for time_range and
 * a(asaasay) for both filter_templates and the returned events.
 * Parameter list matches the plain search declaration above it, plus
 * the extra double[] relevancies out-parameter. */
124 public abstract async void search_with_relevancies (
125 string query_string,
126 [DBus (signature = "(xx)")] Variant time_range,
127 [DBus (signature = "a(asaasay)")] Variant filter_templates,
128 uint offset, uint count, uint result_type,
129 [DBus (signature = "a(asaasay)")] out Variant events,
130 out double[] relevancies, out uint matches) throws Error;
124 }131 }
125 132
126 /* FIXME: Remove this! Only here because of a bug in Vala (see ext-fts) */133 /* FIXME: Remove this! Only here because of a bug in Vala (see ext-fts) */

Subscribers

People subscribed via source and target branches