Merge lp:~kamstrup/zeitgeist-extensions/fts-cap-term-length into lp:zeitgeist-extensions

Proposed by Mikkel Kamstrup Erlandsen
Status: Merged
Merged at revision: 73
Proposed branch: lp:~kamstrup/zeitgeist-extensions/fts-cap-term-length
Merge into: lp:zeitgeist-extensions
Diff against target: 114 lines (+41/-13)
2 files modified
fts/_tests.py (+1/-0)
fts/fts.py (+40/-13)
To merge this branch: bzr merge lp:~kamstrup/zeitgeist-extensions/fts-cap-term-length
Reviewer | Review Type | Date Requested | Status
Zeitgeist Extensions | | | Pending
Review via email: mp+74362@code.launchpad.net

Description of the change

See attached bug

To post a comment you must log in.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== modified file 'fts/_tests.py'
--- fts/_tests.py 2011-09-01 13:46:30 +0000
+++ fts/_tests.py 2011-09-07 08:45:23 +0000
@@ -104,3 +104,4 @@
104assert u"漢字" in results[0].subjects[0].text, results[0].subjects[0].uri104assert u"漢字" in results[0].subjects[0].text, results[0].subjects[0].uri
105105
106106
107
107108
=== modified file 'fts/fts.py'
--- fts/fts.py 2011-09-06 10:03:23 +0000
+++ fts/fts.py 2011-09-07 08:45:23 +0000
@@ -50,6 +50,7 @@
50import threading50import threading
51from urllib import quote as url_escape, unquote as url_unescape51from urllib import quote as url_escape, unquote as url_unescape
52import gobject, gio52import gobject, gio
53from cStringIO import StringIO
5354
54from zeitgeist.datamodel import Symbol, StorageState, ResultType, TimeRange, NULL_EVENT, NEGATION_OPERATOR55from zeitgeist.datamodel import Symbol, StorageState, ResultType, TimeRange, NULL_EVENT, NEGATION_OPERATOR
55from _zeitgeist.engine.datamodel import Event, Subject56from _zeitgeist.engine.datamodel import Event, Subject
@@ -93,6 +94,10 @@
93 ResultType.LeastPopularActor,94 ResultType.LeastPopularActor,
94]95]
9596
97# Xapian has a maximum term length of 245 bytes and Bad Things(TM) happen
98# if you bust that. We use the cap_string() function to control this.
99MAX_TERM_LENGTH = 245
100
96def synchronized(lock):101def synchronized(lock):
97 """ Synchronization decorator. """102 """ Synchronization decorator. """
98103
@@ -197,6 +202,31 @@
197 result += c202 result += c
198 return result203 return result
199204
205def cap_string (s, nbytes=MAX_TERM_LENGTH):
206 """
207 If s has more than nbytes bytes (not characters) then cap it off
208 after nbytes bytes in a way still producing a valid utf-8 string.
209
210 Assumes that s is a utf-8 string.
211
212 This function is useful for working with Xapian terms because Xapian has
213 a max term length of 245 (which is not very well documented, but see
214 http://xapian.org/docs/omega/termprefixes.html).
215 """
216 # Check if we can fast-path this string
217 if (len(s.encode("utf-8")) <= nbytes):
218 return s
219
220 # We use a StringIO here to avoid mem thrashing via naive
221 # string concatenation. See e.g. http://www.skymind.com/~ocrow/python_string/
222 buf = StringIO()
223 for char in s :
224 if buf.tell() >= nbytes - 1 :
225 return buf.getvalue()
226 buf.write(char.encode("utf-8"))
227
228 return unicode(buf.getvalue().decode("utf-8"))
229
200def expand_type (type_prefix, uri):230def expand_type (type_prefix, uri):
201 """231 """
202 Return a string with a Xapian query matching all child types of 'uri'232 Return a string with a Xapian query matching all child types of 'uri'
@@ -564,7 +594,7 @@
564 594
565 doc = self._tokenizer.get_document()595 doc = self._tokenizer.get_document()
566 for cat in desktop.getCategories():596 for cat in desktop.getCategories():
567 doc.add_boolean_term(FILTER_PREFIX_XDG_CATEGORY+cat.lower())597 doc.add_boolean_term(cap_string(FILTER_PREFIX_XDG_CATEGORY+cat.lower()))
568 else:598 else:
569 log.debug("Unable to look up app info for %s" % actor)599 log.debug("Unable to look up app info for %s" % actor)
570 600
@@ -649,25 +679,25 @@
649 """Adds the filtering rules to the doc. Filtering rules will679 """Adds the filtering rules to the doc. Filtering rules will
650 not affect the relevancy ranking of the event/doc"""680 not affect the relevancy ranking of the event/doc"""
651 if event.interpretation:681 if event.interpretation:
652 doc.add_boolean_term (FILTER_PREFIX_EVENT_INTERPRETATION+event.interpretation)682 doc.add_boolean_term (cap_string(FILTER_PREFIX_EVENT_INTERPRETATION+event.interpretation))
653 if event.manifestation:683 if event.manifestation:
654 doc.add_boolean_term (FILTER_PREFIX_EVENT_MANIFESTATION+event.manifestation)684 doc.add_boolean_term (cap_string(FILTER_PREFIX_EVENT_MANIFESTATION+event.manifestation))
655 if event.actor:685 if event.actor:
656 doc.add_boolean_term (FILTER_PREFIX_ACTOR+mangle_uri(event.actor))686 doc.add_boolean_term (cap_string(FILTER_PREFIX_ACTOR+mangle_uri(event.actor)))
657 687
658 for su in event.subjects:688 for su in event.subjects:
659 if su.uri:689 if su.uri:
660 doc.add_boolean_term (FILTER_PREFIX_SUBJECT_URI+mangle_uri(su.uri))690 doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_URI+mangle_uri(su.uri)))
661 if su.interpretation:691 if su.interpretation:
662 doc.add_boolean_term (FILTER_PREFIX_SUBJECT_INTERPRETATION+su.interpretation)692 doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_INTERPRETATION+su.interpretation))
663 if su.manifestation:693 if su.manifestation:
664 doc.add_boolean_term (FILTER_PREFIX_SUBJECT_MANIFESTATION+su.manifestation)694 doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_MANIFESTATION+su.manifestation))
665 if su.origin:695 if su.origin:
666 doc.add_boolean_term (FILTER_PREFIX_SUBJECT_ORIGIN+mangle_uri(su.origin))696 doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_ORIGIN+mangle_uri(su.origin)))
667 if su.mimetype:697 if su.mimetype:
668 doc.add_boolean_term (FILTER_PREFIX_SUBJECT_MIMETYPE+su.mimetype)698 doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_MIMETYPE+su.mimetype))
669 if su.storage:699 if su.storage:
670 doc.add_boolean_term (FILTER_PREFIX_SUBJECT_STORAGE+su.storage)700 doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_STORAGE+su.storage))
671 701
672 @synchronized (INDEX_LOCK)702 @synchronized (INDEX_LOCK)
673 def _index_event_real (self, event):703 def _index_event_real (self, event):
@@ -766,6 +796,3 @@
766 796
767 return "%s..%sms" % (time_range.begin, time_range.end)797 return "%s..%sms" % (time_range.begin, time_range.end)
768798
769if __name__ == "__main__":
770 indexer = Indexer(None)
771 print indexer._compile_filter_query([Event.new_for_values(subject_interpretation="http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#Document")])

Subscribers

People subscribed via source and target branches