Merge lp:~kamstrup/zeitgeist/query-expansion into lp:zeitgeist/0.1

Proposed by Mikkel Kamstrup Erlandsen on 2010-05-10
Status: Merged
Merge reported by: Mikkel Kamstrup Erlandsen
Merged at revision: not available
Proposed branch: lp:~kamstrup/zeitgeist/query-expansion
Merge into: lp:zeitgeist/0.1
Diff against target: 323 lines (+218/-15)
4 files modified
_zeitgeist/engine/main.py (+47/-11)
test/datamodel-test.py (+52/-0)
test/test-sql.py (+51/-0)
zeitgeist/datamodel.py (+68/-4)
To merge this branch: bzr merge lp:~kamstrup/zeitgeist/query-expansion
Reviewer Review Type Date Requested Status
Markus Korn 2010-05-10 Needs Fixing on 2010-05-12
Review via email: mp+25000@code.launchpad.net

Description of the change

Huzzah! Smackeroo! I have query expansion fully working now, with all unit tests passing, both on the SQL level and on our template matching level.

So what does "query expansion" mean? Consider a query for subjects with the interpretation nfo:Media. Right now that would only match stuff that has been explicitly identified as nfo:Media (which is not much, since we can usually identify whether stuff is Audio, Image, or Video data).

With query expansion we'll also match any children of nfo:Media, i.e. also nfo:Image, nfo:Audio, and nfo:Video, as well as recursively matching children of these, like nfo:RasterImage and nfo:VectorImage.

The way it's implemented is really simple. We simply expand the tree of children and compile a big OR query with everything.

To post a comment you must log in.
Markus Korn (thekorn) wrote :
Download full text (16.5 KiB)

Hey Mikkel,
thank you for your work, it is working fine for me.
Feel free to merge this branch into lp:zeitgeist once you thought about my three comments ;)

Markus

> === modified file '_zeitgeist/engine/main.py'
> --- _zeitgeist/engine/main.py   2010-05-03 16:32:00 +0000
> +++ _zeitgeist/engine/main.py   2010-05-12 19:32:33 +0000
> @@ -32,7 +32,7 @@
>  from collections import defaultdict
>
>  from zeitgeist.datamodel import Event as OrigEvent, StorageState, TimeRange, \
> -       ResultType, get_timestamp_for_now, Interpretation
> +       ResultType, get_timestamp_for_now, Interpretation, Symbol
>  from _zeitgeist.engine.datamodel import Event, Subject
>  from _zeitgeist.engine.extension import ExtensionsCollection, load_class
>  from _zeitgeist.engine import constants
> @@ -163,16 +163,51 @@
>                for (event_template, subject_template) in self._build_templates(templates):
>                        subwhere = WhereClause(WhereClause.AND)
>                        try:
> -                               for key in ("interpretation", "manifestation", "actor"):
> -                                       value = getattr(event_template, key)
> -                                       if value:
> -                                               subwhere.add("%s = ?" % key,
> -                                                       getattr(self, "_" + key).id(value))
> -                               for key in ("interpretation", "manifestation", "mimetype"):
> -                                       value = getattr(subject_template, key)
> -                                       if value:
> -                                               subwhere.add("subj_%s = ?" % key,
> -                                                       getattr(self, "_" + key).id(value))
> +                               # Expand event interpretation children
> +                               event_interp_where = WhereClause(WhereClause.OR)
> +                               for child_interp in
> (Symbol.find_child_uris_extended(event_template.interpretation)):
> +                                       if child_interp:
> +                                               event_interp_where.add("interpretation = ?",
> +                                                                      self._interpretation.id(child_interp))
> +                               if event_interp_where:
> +                                       subwhere.extend(event_interp_where)
> +
> +                               # Expand event manifestation children
> +                               event_manif_where = WhereClause(WhereClause.OR)
> +                               for child_manif in
> (Symbol.find_child_uris_extended(event_template.manifestation)):
> +                                       if child_manif:
> +                                               event_manif_where.add("manifestation = ?",
> +                                                                     self._manifestation.id(child_manif))
> +                               if event_manif_where:
> +                                       subwhere.extend(event_manif_where)
> +
> +                               # Expand subjec...

Markus Korn (thekorn) :
review: Needs Fixing

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file '_zeitgeist/engine/main.py'
2--- _zeitgeist/engine/main.py 2010-05-01 22:18:55 +0000
3+++ _zeitgeist/engine/main.py 2010-05-10 14:47:20 +0000
4@@ -32,7 +32,7 @@
5 from collections import defaultdict
6
7 from zeitgeist.datamodel import Event as OrigEvent, StorageState, TimeRange, \
8- ResultType, get_timestamp_for_now, Interpretation
9+ ResultType, get_timestamp_for_now, Interpretation, Symbol
10 from _zeitgeist.engine.datamodel import Event, Subject
11 from _zeitgeist.engine.extension import ExtensionsCollection, load_class
12 from _zeitgeist.engine import constants
13@@ -163,16 +163,51 @@
14 for (event_template, subject_template) in self._build_templates(templates):
15 subwhere = WhereClause(WhereClause.AND)
16 try:
17- for key in ("interpretation", "manifestation", "actor"):
18- value = getattr(event_template, key)
19- if value:
20- subwhere.add("%s = ?" % key,
21- getattr(self, "_" + key).id(value))
22- for key in ("interpretation", "manifestation", "mimetype"):
23- value = getattr(subject_template, key)
24- if value:
25- subwhere.add("subj_%s = ?" % key,
26- getattr(self, "_" + key).id(value))
27+ # Expand event interpretation children
28+ event_interp_where = WhereClause(WhereClause.OR)
29+ for child_interp in (Symbol.find_child_uris_extended(event_template.interpretation)):
30+ if child_interp:
31+ event_interp_where.add("interpretation = ?",
32+ self._interpretation.id(child_interp))
33+ if event_interp_where:
34+ subwhere.extend(event_interp_where)
35+
36+ # Expand event manifestation children
37+ event_manif_where = WhereClause(WhereClause.OR)
38+ for child_manif in (Symbol.find_child_uris_extended(event_template.manifestation)):
39+ if child_manif:
40+ event_manif_where.add("manifestation = ?",
41+ self._manifestation.id(child_manif))
42+ if event_manif_where:
43+ subwhere.extend(event_manif_where)
44+
45+ # Expand subject interpretation children
46+ su_interp_where = WhereClause(WhereClause.OR)
47+ for child_interp in (Symbol.find_child_uris_extended(subject_template.interpretation)):
48+ if child_interp:
49+ su_interp_where.add("subj_interpretation = ?",
50+ self._interpretation.id(child_interp))
51+ if su_interp_where:
52+ subwhere.extend(su_interp_where)
53+
54+ # Expand subject manifestation children
55+ su_manif_where = WhereClause(WhereClause.OR)
56+ for child_manif in (Symbol.find_child_uris_extended(subject_template.manifestation)):
57+ if child_manif:
58+ su_manif_where.add("subj_manifestation = ?",
59+ self._manifestation.id(child_manif))
60+ if su_manif_where:
61+ subwhere.extend(su_manif_where)
62+
63+ # FIXME: Expand mime children as well.
64+ # Right now we only do exact matching for mimetypes
65+ if subject_template.mimetype:
66+ subwhere.add("subj_mimetype = ?",
67+ self._mimetype.id(subject_tempalte.mimetype))
68+
69+ if event_template.actor:
70+ subwhere.add("actor = ?",
71+ self._actor.id(event_template.actor))
72 except KeyError:
73 # Value not in DB
74 where_or.register_no_result()
75@@ -183,6 +218,7 @@
76 subwhere.add("subj_%s = ?" % key, value)
77 where_or.extend(subwhere)
78
79+ print "SQL: ", where_or.sql, where_or.arguments
80 return where_or
81
82 def _build_sql_event_filter(self, time_range, templates, storage_state):
83
84=== modified file 'test/datamodel-test.py'
85--- test/datamodel-test.py 2010-04-26 19:42:07 +0000
86+++ test/datamodel-test.py 2010-05-10 14:47:20 +0000
87@@ -51,6 +51,47 @@
88 self.assertTrue(f.display_name != None)
89 self.assertTrue(f.doc != None)
90
91+class RelationshipTest (unittest.TestCase):
92+ """
93+ Tests for parent/child relationships in the loaded ontologies
94+ """
95+
96+ def testDirectParents (self):
97+ """
98+ Tests relationship tracking for immediate parents
99+ """
100+ self.assertTrue(Interpretation.AUDIO.is_a(Interpretation.MEDIA))
101+
102+ def testSecondLevelParents (self):
103+ """
104+ Tests relationship tracking for second level parents
105+ """
106+ self.assertTrue(Interpretation.VECTOR_IMAGE.is_a(Interpretation.MEDIA))
107+ self.assertTrue(Interpretation.VECTOR_IMAGE.is_a(Interpretation.IMAGE))
108+
109+ def testRootParents (self):
110+ """
111+ Tests relationship tracking for root nodes, ie Interpretation
112+ and Manifestation
113+ """
114+ self.assertTrue(Interpretation.VECTOR_IMAGE.is_a(Interpretation))
115+ self.assertTrue(Manifestation.FILE_DATA_OBJECT.is_a(Manifestation))
116+ self.assertTrue(Manifestation.USER_ACTIVITY.is_a(Manifestation))
117+
118+ def testReflecsive (self):
119+ """
120+ Assert that a symbol is a child of itself
121+ """
122+ self.assertTrue(Manifestation.USER_ACTIVITY.is_a(Manifestation.USER_ACTIVITY))
123+
124+ def testFindExtendedChildren (self):
125+ self.assertEquals(["foo://bar"], Symbol.find_child_uris_extended("foo://bar"))
126+ self.assertEquals(["http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#Icon",
127+ "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#VectorImage",
128+ "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#Cursor",
129+ "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#RasterImage",
130+ "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#Image"],
131+ Symbol.find_child_uris_extended("http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#Image"))
132
133 class EventTest (unittest.TestCase):
134 def setUp(self):
135@@ -116,6 +157,17 @@
136 e.manifestation="ILLEGAL SNAFU"
137 self.assertFalse(e.matches_template(template))
138
139+ def testTemplateParentMatching(self):
140+ template = Event.new_for_values(
141+ manifestation=Manifestation.EVENT_MANIFESTATION,
142+ subject_interpretation=Interpretation)
143+
144+ e = Event.new_for_values(
145+ manifestation=Manifestation.USER_ACTIVITY,
146+ subject_interpretation=Interpretation.TEXT_DOCUMENT,
147+ subject_text="Foo")
148+ self.assertTrue(e.matches_template(template))
149+
150 def testTemplateFiltering(self):
151 template = Event.new_for_values(interpretation="stfu:OpenEvent")
152 events = parse_events("test/data/five_events.js")
153
154=== added file 'test/test-sql.py'
155--- test/test-sql.py 1970-01-01 00:00:00 +0000
156+++ test/test-sql.py 2010-05-10 14:47:20 +0000
157@@ -0,0 +1,51 @@
158+#! /usr/bin/python
159+# -.- coding: utf-8 -.-
160+
161+# Zeitgeist
162+#
163+# Copyright © 2010 Mikkel Kamstrup Erlandsen <mikkel.kamstrup@gmail.com>
164+#
165+# This program is free software: you can redistribute it and/or modify
166+# it under the terms of the GNU Lesser General Public License as published by
167+# the Free Software Foundation, either version 3 of the License, or
168+# (at your option) any later version.
169+#
170+# This program is distributed in the hope that it will be useful,
171+# but WITHOUT ANY WARRANTY; without even the implied warranty of
172+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
173+# GNU Lesser General Public License for more details.
174+#
175+# You should have received a copy of the GNU Lesser General Public License
176+# along with this program. If not, see <http://www.gnu.org/licenses/>.
177+#
178+
179+import sys, os
180+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
181+
182+import unittest
183+from _zeitgeist.engine.sql import *
184+
185+class SQLTest (unittest.TestCase):
186+
187+ def testFlat (self):
188+ where = WhereClause(WhereClause.AND)
189+ where.add ("foo = %s", 10)
190+ where.add ("bar = %s", 27)
191+ self.assertEquals(where.sql % tuple(where.arguments),
192+ "(foo = 10 AND bar = 27)")
193+
194+ def testNested (self):
195+ where = WhereClause(WhereClause.AND)
196+ where.add ("foo = %s", 10)
197+
198+ subwhere = WhereClause(WhereClause.OR)
199+ subwhere.add ("subfoo = %s", 68)
200+ subwhere.add ("subbar = %s", 69)
201+ where.extend(subwhere)
202+ where.add ("bar = %s", 11)
203+
204+ self.assertEquals(where.sql % tuple(where.arguments),
205+ "(foo = 10 AND (subfoo = 68 OR subbar = 69) AND bar = 11)")
206+
207+if __name__ == "__main__":
208+ unittest.main()
209
210=== modified file 'zeitgeist/datamodel.py'
211--- zeitgeist/datamodel.py 2010-04-29 08:28:44 +0000
212+++ zeitgeist/datamodel.py 2010-05-10 14:47:20 +0000
213@@ -185,6 +185,22 @@
214 dikt[self.name] = self
215 for child in self._children.itervalues():
216 child._visit(dikt)
217+
218+ @staticmethod
219+ def find_child_uris_extended (uri):
220+ """
221+ Creates a list of all known child URIs of `uri`, including
222+ `uri` itself in the list. Hence the "extended". If `uri`
223+ is unknown a list containing only `uri` is returned.
224+ """
225+ try:
226+ symbol = _SYMBOLS_BY_URI[uri]
227+ children = [child.uri for child in symbol.get_all_children()]
228+ children.append(uri)
229+ return children
230+ except KeyError, e:
231+ return [uri]
232+
233
234 @property
235 def uri(self):
236@@ -236,7 +252,51 @@
237 Returns a list of immediate parent symbols
238 """
239 return frozenset(self._parents.itervalues())
240-
241+
242+ def is_a (self, parent):
243+ """
244+ Returns True if this symbol is a child of `parent`.
245+ """
246+ if not isinstance (parent, Symbol):
247+ try:
248+ parent = _SYMBOLS_BY_URI[parent]
249+ except KeyError, e:
250+ # Parent is not a known URI
251+ print 11111111111, self.uri, parent
252+ return self.uri == parent
253+
254+ # Invariant: parent is a Symbol
255+ if self.uri == parent.uri : return True
256+
257+ parent._ensure_all_children()
258+
259+ # FIXME: We should really check that child.uri is in there,
260+ # but that is not fast with the current code layout
261+ return self.name in parent._all_children
262+
263+ @staticmethod
264+ def uri_is_a (child, parent):
265+ """
266+ Returns True if `child` is a child of `parent`. Both `child`
267+ and `parent` arguments must be any combination of
268+ :class:`Symbol` and/or string.
269+ """
270+ if isinstance (child, basestring):
271+ try:
272+ child = _SYMBOLS_BY_URI[child]
273+ except KeyError, e:
274+ # Child is not a know URI
275+ if isinstance (parent, basestring):
276+ return child == parent
277+ elif isinstance (parent, Symbol):
278+ return child == parent.uri
279+ else:
280+ return False
281+
282+ if not isinstance (child, Symbol):
283+ raise ValueError("Child argument must be a Symbol or string. Got %s" % type(child))
284+
285+ return child.is_a(parent)
286
287 class TimeRange(list):
288 """
289@@ -463,11 +523,13 @@
290 """
291 Return True if this Subject matches *subject_template*. Empty
292 fields in the template are treated as wildcards.
293+ Interpretations and manifestations are also matched if they are
294+ children of the types specified in `subject_template`.
295
296 See also :meth:`Event.matches_template`
297 """
298 for m in Subject.Fields:
299- if subject_template[m] and subject_template[m] != self[m] :
300+ if subject_template[m] and not Symbol.uri_is_a (self[m], subject_template[m]):
301 return False
302 return True
303
304@@ -693,7 +755,9 @@
305 """
306 Return True if this event matches *event_template*. The
307 matching is done where unset fields in the template is
308- interpreted as wild cards. If the template has more than one
309+ interpreted as wild cards. Interpretations and manifestations
310+ are also matched if they are children of the types specified
311+ in `event_template`. If the template has more than one
312 subject, this event matches if at least one of the subjects
313 on this event matches any single one of the subjects on the
314 template.
315@@ -707,7 +771,7 @@
316 tdata = event_template[0]
317 for m in Event.Fields:
318 if m == Event.Timestamp : continue
319- if tdata[m] and tdata[m] != data[m] : return False
320+ if tdata[m] and not Symbol.uri_is_a (data[m], tdata[m]) : return False
321
322 # If template has no subjects we have a match
323 if len(event_template[1]) == 0 : return True