Merge lp:~zeitgeist/zeitgeist/fts-origin-hashing into lp:~zeitgeist/zeitgeist/bluebird

Proposed by Siegfried Gevatter
Status: Merged
Merged at revision: 440
Proposed branch: lp:~zeitgeist/zeitgeist/fts-origin-hashing
Merge into: lp:~zeitgeist/zeitgeist/bluebird
Diff against target: 98 lines (+33/-12)
1 file modified
extensions/fts++/indexer.cpp (+33/-12)
To merge this branch: bzr merge lp:~zeitgeist/zeitgeist/fts-origin-hashing
Reviewer | Review Type | Date Requested | Status
Siegfried Gevatter | | | Approve
Review via email: mp+98281@code.launchpad.net
To post a comment you must log in.
Revision history for this message
Siegfried Gevatter (rainct) wrote :

Maybe move the md5 summing into a function? It's ugly :p

Revision history for this message
Siegfried Gevatter (rainct) wrote :

+ // Make sure the schemas of the URI and the origin is the same,
s/is/are

439. By Michal Hruby

FTS++: origin hashing

440. By Michal Hruby

Clean up

Revision history for this message
Siegfried Gevatter (rainct) wrote :

Lovely :)

review: Approve
441. By Michal Hruby

Bump index version number

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'extensions/fts++/indexer.cpp'
2--- extensions/fts++/indexer.cpp 2012-03-19 19:56:38 +0000
3+++ extensions/fts++/indexer.cpp 2012-03-19 21:37:38 +0000
4@@ -43,6 +43,7 @@
5 const Xapian::valueno VALUE_EVENT_ID = 0;
6 const Xapian::valueno VALUE_TIMESTAMP = 1;
7 const Xapian::valueno VALUE_URI_HASH = 2;
8+const Xapian::valueno VALUE_ORIGIN_HASH = 3;
9
10 #define QUERY_PARSER_FLAGS \
11 Xapian::QueryParser::FLAG_PHRASE | Xapian::QueryParser::FLAG_BOOLEAN | \
12@@ -763,7 +764,11 @@
13 result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS ||
14 result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_SUBJECTS ||
15 result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_SUBJECTS ||
16- result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_SUBJECTS)
17+ result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_SUBJECTS ||
18+ result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_ORIGIN ||
19+ result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_ORIGIN ||
20+ result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN ||
21+ result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_ORIGIN)
22 {
23 maxhits = count;
24 }
25@@ -795,8 +800,7 @@
26 result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN ||
27 result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_ORIGIN)
28 {
29- // FIXME: not really correct but close :)
30- enquire->set_collapse_key (VALUE_URI_HASH);
31+ enquire->set_collapse_key (VALUE_ORIGIN_HASH);
32 }
33 else if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS ||
34 result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_EVENTS)
35@@ -1137,10 +1141,8 @@
36 result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN ||
37 result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_ORIGIN)
38 {
39- // FIXME: not really correct but close :)
40 enquire->set_sort_by_relevance_then_value (VALUE_TIMESTAMP, reversed_sort);
41- enquire->set_collapse_key (VALUE_URI_HASH);
42- maxhits *= 3;
43+ enquire->set_collapse_key (VALUE_ORIGIN_HASH);
44 }
45 else
46 {
47@@ -1272,6 +1274,16 @@
48 return results;
49 }
50
51+static void
52+get_digest_for_uri (GChecksum *checksum, const gchar *uri,
53+ guint8 *digest, gsize *digest_size)
54+{
55+ g_checksum_update (checksum, (guchar *) uri, -1);
56+ g_checksum_get_digest (checksum, digest, digest_size);
57+ g_checksum_reset (checksum);
58+ g_assert (digest_size == NULL || *digest_size == HASH_LENGTH);
59+}
60+
61 void Indexer::IndexEvent (ZeitgeistEvent *event)
62 {
63 try
64@@ -1322,19 +1334,28 @@
65 return; // ignore this event completely...
66 }
67
68+ guint8 uri_hash[HASH_LENGTH + 1];
69+ gsize hash_size = HASH_LENGTH;
70+
71 // We need the subject URI so we can use Xapian's collapse key feature
72 // for *_SUBJECT grouping. However, to save space, we'll just save a hash.
73 // A better option would be using URI's id, but for that we'd need a SQL
74 // query that'd be subject to races.
75 // FIXME(?): This doesn't work for events with multiple subjects.
76- g_checksum_update (checksum, (guchar *) uri.c_str (), -1);
77- guint8 uri_hash[HASH_LENGTH + 1];
78- gsize hash_size = HASH_LENGTH;
79- g_checksum_get_digest (checksum, uri_hash, &hash_size);
80- g_checksum_reset (checksum);
81- g_assert (hash_size == HASH_LENGTH);
82+ get_digest_for_uri (checksum, uri.c_str (), uri_hash, &hash_size);
83 doc.add_value (VALUE_URI_HASH, std::string((char *) uri_hash, hash_size));
84
85+ size_t colon_pos = uri.find (':');
86+ // FIXME: current_origin once we have that
87+ val = zeitgeist_subject_get_origin (subject);
88+ // make sure the schemas of the URI and origin are the same
89+ if (val && colon_pos != std::string::npos && strncmp (uri.c_str (), val, colon_pos+1) == 0)
90+ {
91+ hash_size = HASH_LENGTH;
92+ get_digest_for_uri (checksum, val, uri_hash, &hash_size);
93+ doc.add_value (VALUE_ORIGIN_HASH, std::string((char *) uri_hash, hash_size));
94+ }
95+
96 val = zeitgeist_subject_get_text (subject);
97 if (val && val[0] != '\0')
98 {

Subscribers

People subscribed via source and target branches