Merge lp:~seif/zeitgeist/rewrite-find-related-uris into lp:zeitgeist/0.1

Proposed by Seif Lotfy
Status: Merged
Approved by: Seif Lotfy
Approved revision: 1627
Merged at revision: 1619
Proposed branch: lp:~seif/zeitgeist/rewrite-find-related-uris
Merge into: lp:zeitgeist/0.1
Diff against target: 185 lines (+42/-85)
1 file modified
_zeitgeist/engine/main.py (+42/-85)
To merge this branch: bzr merge lp:~seif/zeitgeist/rewrite-find-related-uris
Reviewer Review Type Date Requested Status
Markus Korn code, tests Approve
Review via email: mp+38746@code.launchpad.net

Description of the change

I took the time to rewrite the find_related_uris to use a more or less graph structuring before pushing everything into one pot and going through a 1-step aproiri.
The results are the same except for one test case where [i1, i3, i5] is expected but [i3, i1, i5] is sent. However this results is also right since i3 and i2 both have the same count = 2 but were sorted differently whereas i5 count = 1.

The new code is documented and is much shorter allowing me to get rid of many custom helpers that were written for find_related_uris.

If this branch is ok with you i will change the last test case and merge it.
Cheers
Seif

To post a comment you must log in.
1620. By Seif Lotfy

remove prints and reduce 1 code line

1621. By Seif Lotfy

remove print

1622. By Seif Lotfy

fix printing time for calculation

1623. By Seif Lotfy

cleanup code

1624. By Seif Lotfy

fix defaultdict issue

1625. By Seif Lotfy

fix missing timestamps in dicts value

1626. By Seif Lotfy

assured that highest timestamp is taken into the leatest_uris dict

1627. By Seif Lotfy

clean up code and remove unneseccary imports

Revision history for this message
Markus Korn (thekorn) wrote :

I had some discussions with seif on irc about this changes, and I'm happy with the final result.

review: Approve (code, tests)

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file '_zeitgeist/engine/main.py'
2--- _zeitgeist/engine/main.py 2010-10-12 18:37:10 +0000
3+++ _zeitgeist/engine/main.py 2010-10-18 19:53:41 +0000
4@@ -25,8 +25,6 @@
5 import sys
6 import os
7 import logging
8-import operator
9-from itertools import islice
10 from collections import defaultdict
11
12 from zeitgeist.datamodel import Event as OrigEvent, StorageState, TimeRange, \
13@@ -36,8 +34,6 @@
14 from _zeitgeist.engine import constants
15 from _zeitgeist.engine.sql import get_default_cursor, unset_cursor, \
16 TableLookup, WhereClause
17-
18-WINDOW_SIZE = 7
19
20 logging.basicConfig(level=logging.DEBUG)
21 log = logging.getLogger("zeitgeist.engine")
22@@ -325,7 +321,6 @@
23 Return modes:
24 - 0: IDs.
25 - 1: Events.
26- - 2: (Timestamp, SubjectUri)
27 """
28 t = time.time()
29
30@@ -339,8 +334,6 @@
31 sql = "SELECT DISTINCT id FROM event_view"
32 elif return_mode == 1:
33 sql = "SELECT id FROM event_view"
34- elif return_mode == 2:
35- sql = "SELECT subj_uri, timestamp FROM event_view"
36 else:
37 raise NotImplementedError, "Unsupported return_mode."
38
39@@ -388,10 +381,7 @@
40 result = [row[0] for row in result]
41 elif return_mode == 1:
42 log.debug("Found %d events IDs in %fs" % (len(result), time.time()- t))
43- result = self.get_events(ids=[row[0] for row in result], sender=sender)
44- elif return_mode == 2:
45- log.debug("Found %d (uri,timestamp) tuples in %fs" % (len(result), time.time()- t))
46- result = map(lambda row: (row[0], row[1]), result)
47+ result = self.get_events(ids=[row[0] for row in result], sender=sender)
48 else:
49 raise Exception("%d" % return_mode)
50
51@@ -403,12 +393,6 @@
52 def find_events(self, *args):
53 return self._find_events(1, *args)
54
55- def __add_window(self, _set, assoc, landmarks, windows):
56- if _set & landmarks: # intersection != 0
57- windows.append(_set)
58- for i in _set.difference(landmarks):
59- assoc[i] += 1
60-
61 def find_related_uris(self, timerange, event_templates, result_event_templates,
62 result_storage_state, num_results, result_type):
63 """
64@@ -420,81 +404,54 @@
65 and `result_storage_state` are returned.
66 """
67
68- if result_type == 0 or result_type == 1:
69-
70+ if result_type in (0, 1):
71+ # Instead of using sliding windows we will be using a graph representation
72+
73 t1 = time.time()
74
75- if len(result_event_templates) == 0:
76- uris = self._find_events(2, timerange, result_event_templates,
77- result_storage_state, 0, ResultType.LeastRecentEvents)
78- else:
79- uris = self._find_events(2, timerange, result_event_templates + event_templates,
80- result_storage_state, 0, ResultType.LeastRecentEvents)
81-
82- assoc = defaultdict(int)
83-
84- landmarks = self._find_events(2, timerange, event_templates,
85- result_storage_state, 0, 4)
86- landmarks = set([unicode(event[0]) for event in landmarks])
87-
88- latest_uris = dict(uris)
89- events = [unicode(u[0]) for u in uris]
90-
91- furis = filter(lambda x: x[0] in landmarks, uris)
92- if len(furis) == 0:
93- return []
94-
95- _min = min(furis, key=operator.itemgetter(1))
96- _max = max(furis, key=operator.itemgetter(1))
97- min_index = uris.index(_min) - WINDOW_SIZE
98- max_index = uris.index(_max) + WINDOW_SIZE
99- _min = _min[1]
100- _max = _max[1]
101-
102- if min_index < 0:
103- min_index = 0
104- if max_index > len(events):
105- max_index = -1
106-
107- func = self.__add_window
108-
109- if len(events) == 0 or len(landmarks) == 0:
110- return []
111-
112- windows = []
113-
114- if len(events) <= WINDOW_SIZE:
115- #TODO bug! windows is not defined, seems the algorithm never touches these loop
116- func(events, assoc, landmarks, windows)
117- else:
118- events = events[min_index:max_index]
119- offset = WINDOW_SIZE/2
120-
121- for i in xrange(offset):
122- func(set(events[0: offset - i]), assoc, landmarks,
123- windows)
124- func(set(events[len(events) - offset + i: len(events)]),
125- assoc, landmarks, windows)
126-
127- it = iter(events)
128- result = tuple(islice(it, WINDOW_SIZE))
129- for elem in it:
130- result = result[1:] + (elem,)
131- func(set(result), assoc, landmarks, windows)
132-
133-
134- log.debug("FindRelatedUris: Finished sliding windows in %fs." % \
135+ # We pick out the ids for relational event so we can set them as roots
136+ # the ids are taken from the events that match the events_templates
137+ ids = self._find_events(0, timerange, event_templates,
138+ result_storage_state, 0, ResultType.LeastRecentEvents)
139+
140+ # Pick out the result_ids for the filtered results we would like to take into account
141+ # the ids are taken from the events that match the result_event_templates
142+ # if no result_event_templates are set we consider all results as allowed
143+ result_ids = []
144+ if len(result_event_templates) > 0:
145+ result_ids = self._find_events(0, timerange, result_event_templates,
146+ result_storage_state, 0, ResultType.LeastRecentEvents)
147+
148+ # From here we create several graphs with the maximum depth of 2
149+ # and push all the nodes and vertices (events) in one pot together
150+ # FIXME: the depth should be adaptable
151+ pot = []
152+ for id in ids:
153+ for x in xrange(id-2,id+3):
154+ if len(result_ids) == 0 or x in result_ids:
155+ pot.append(x)
156+
157+ # Out of the pot we get all respected events and count which uris occur most
158+ events = self.get_events(pot)
159+ subject_uri_counter = defaultdict(int)
160+ latest_uris = defaultdict(int)
161+ for event in events:
162+ if event and event.id not in ids:
163+ subj = event.subjects[0]
164+ subject_uri_counter[subj.uri] += 1
165+ if latest_uris[subj.uri] < event.timestamp:
166+ latest_uris[subj.uri] = event.timestamp
167+
168+ log.debug("FindRelatedUris: Finished ranking subjects %fs." % \
169 (time.time()-t1))
170
171 if result_type == 0:
172- sets = [[v, k] for k, v in assoc.iteritems()]
173+ sets = subject_uri_counter.iteritems()
174 elif result_type == 1:
175- sets = [[latest_uris[k], k] for k in assoc]
176+ sets = ((k, latest_uris[k]) for k in subject_uri_counter)
177
178- sets.sort(reverse = True)
179- sets = map(lambda result: result[1], sets[:num_results])
180-
181- return sets
182+ sets = sorted(sets, reverse=True, key=lambda x: x[1])[:num_results]
183+ return map(lambda result: result[0], sets)
184 else:
185 raise NotImplementedError, "Unsupported ResultType."
186

Subscribers

People subscribed via source and target branches