Merge lp:~thekorn/zeitgeist/db_schema_3 into lp:zeitgeist/0.1

Proposed by Markus Korn
Status: Merged
Merged at revision: 1635
Proposed branch: lp:~thekorn/zeitgeist/db_schema_3
Merge into: lp:zeitgeist/0.1
Diff against target: 123 lines (+32/-16)
5 files modified
_zeitgeist/engine/__init__.py (+1/-1)
_zeitgeist/engine/main.py (+9/-5)
_zeitgeist/engine/sql.py (+9/-8)
_zeitgeist/engine/upgrades/core_2_3.py (+11/-0)
test/sql-test.py (+2/-2)
To merge this branch: bzr merge lp:~thekorn/zeitgeist/db_schema_3
Reviewer Review Type Date Requested Status
Siegfried Gevatter Approve
Seif Lotfy Approve
Mikkel Kamstrup Erlandsen Pending
Review via email: mp+40877@code.launchpad.net

Description of the change

This branch introduces a new db schema (version 3) and bundles a few performance related fixes, [0] compares the performance of this branch lp:zeitgeist (each query is run ~10 times, the plot shows the average).
Changes in detail:
 * renamed event_view.subj_origin to event_view.subj_origin_uri and event_view.subj_origin_id to event_view.subj_origin, this makes "subj_origin" point to an id in the uri table, and "subj_origin_uri" the resolved value. Also we are not using the redundant "SELECT id FROM uri WHERE uri.id=event.subj_origin" query anymore. (LP: #673452)
 * removed event_view.subj_uri_id and added event.subj_id, both values are in fact the same, but we again sting on another "SELECT" query in the event_view.
 * Optimization in FindEvent queries: we are not ordering and grouping by resolved values anymore, we are doing it on integer ids, which is much faster (LP: #673394)

[0] https://dl.dropbox.com/u/174479/compare_queries.svg

To post a comment you must log in.
lp:~thekorn/zeitgeist/db_schema_3 updated
1637. By Markus Korn

fixed typo

1638. By Markus Korn

fixed another comment

Revision history for this message
Seif Lotfy (seif) wrote :

Looks good, makes sense, and works perfectly.
AWESOME JOB. +1

review: Approve
Revision history for this message
Siegfried Gevatter (rainct) wrote :

+1

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file '_zeitgeist/engine/__init__.py'
2--- _zeitgeist/engine/__init__.py 2010-10-19 13:54:12 +0000
3+++ _zeitgeist/engine/__init__.py 2010-11-15 15:32:17 +0000
4@@ -55,7 +55,7 @@
5
6 # Required version of DB schema
7 CORE_SCHEMA="core"
8- CORE_SCHEMA_VERSION = 2
9+ CORE_SCHEMA_VERSION = 3
10
11 USER_EXTENSION_PATH = os.path.join(DATA_PATH, "extensions")
12
13
14=== modified file '_zeitgeist/engine/main.py'
15--- _zeitgeist/engine/main.py 2010-11-12 17:39:28 +0000
16+++ _zeitgeist/engine/main.py 2010-11-15 15:32:17 +0000
17@@ -148,8 +148,9 @@
18
19 def _get_subject_from_row(self, row):
20 subject = Subject()
21- for field in ("uri", "origin", "text", "storage"):
22+ for field in ("uri", "text", "storage"):
23 setattr(subject, field, row["subj_" + field])
24+ setattr(subject, "origin", row["subj_origin_uri"])
25 for field in ("interpretation", "manifestation", "mimetype"):
26 setattr(subject, field,
27 getattr(self, "_" + field).value(row["subj_" + field]))
28@@ -353,10 +354,13 @@
29
30 sql += (" ORDER BY timestamp DESC",
31 " ORDER BY timestamp ASC",
32- " GROUP BY subj_uri ORDER BY timestamp DESC",
33- " GROUP BY subj_uri ORDER BY timestamp ASC",
34- " GROUP BY subj_uri ORDER BY COUNT(subj_uri) DESC, timestamp DESC",
35- " GROUP BY subj_uri ORDER BY COUNT(subj_uri) ASC, timestamp ASC",
36+ # thekorn: please note, event.subj_id == uri.id, as in
37+ # the subj_id points directly to an entry in the uri table,
38+ # so we are in fact grouping by subj_uris here
39+ " GROUP BY subj_id ORDER BY timestamp DESC",
40+ " GROUP BY subj_id ORDER BY timestamp ASC",
41+ " GROUP BY subj_id ORDER BY COUNT(subj_id) DESC, timestamp DESC",
42+ " GROUP BY subj_id ORDER BY COUNT(subj_id) ASC, timestamp ASC",
43 " GROUP BY actor ORDER BY COUNT(actor) DESC, timestamp DESC",
44 " GROUP BY actor ORDER BY COUNT(actor) ASC, timestamp ASC",
45 " GROUP BY actor ORDER BY timestamp DESC",
46
47=== modified file '_zeitgeist/engine/sql.py'
48--- _zeitgeist/engine/sql.py 2010-10-25 20:26:03 +0000
49+++ _zeitgeist/engine/sql.py 2010-11-15 15:32:17 +0000
50@@ -369,14 +369,12 @@
51 AS payload,
52 (SELECT value FROM uri WHERE uri.id=event.subj_id)
53 AS subj_uri,
54- (SELECT id FROM uri WHERE uri.id=event.subj_id)
55- AS subj_uri_id,
56+ event.subj_id, -- #this directly points to an id in the uri table
57 event.subj_interpretation,
58 event.subj_manifestation,
59+ event.subj_origin,
60 (SELECT value FROM uri WHERE uri.id=event.subj_origin)
61- AS subj_origin,
62- (SELECT id FROM uri WHERE uri.id=event.subj_origin)
63- AS subj_origin_id,
64+ AS subj_origin_uri,
65 event.subj_mimetype,
66 (SELECT value FROM text WHERE text.id = event.subj_text)
67 AS subj_text,
68@@ -528,14 +526,17 @@
69 def add_text_condition(self, column, value, like=False, negation=False, cache=None):
70 if like:
71 assert column in ("subj_uri", "subj_origin", "actor", "subj_mimetype"), \
72- "prefix search on the %r column is not supported by zeitgeist"
73- if column in ("subj_uri", "subj_origin"):
74- view_column = "%s_id" %column
75+ "prefix search on the %r column is not supported by zeitgeist" %column
76+ if column == "subj_uri":
77+ # subj_id directly points to the id of an uri entry
78+ view_column = "subj_id"
79 else:
80 view_column = column
81 optimized_glob, value = self.optimize_glob("id", TABLE_MAP.get(column, column), value)
82 sql = "%s %sIN (%s)" %(view_column, self.NOT if negation else "", optimized_glob)
83 else:
84+ if column == "subj_origin":
85+ column = "subj_origin_uri"
86 sql = "%s %s= ?" %(column, "!" if negation else "")
87 if cache is not None:
88 value = cache[value]
89
90=== added file '_zeitgeist/engine/upgrades/core_2_3.py'
91--- _zeitgeist/engine/upgrades/core_2_3.py 1970-01-01 00:00:00 +0000
92+++ _zeitgeist/engine/upgrades/core_2_3.py 2010-11-15 15:32:17 +0000
93@@ -0,0 +1,11 @@
94+# upgrading from db version 2 to 3
95+# this requires no update to the actual data in the database
96+# it is only a schema change of event_view. This change is done
97+# in sql.create_db()
98+
99+# the schema change is renaming 'subj_uri_id' column to 'subj_id', as
100+# both values are the same. Also 'subj_origin' gets renamed to
101+# 'subj_origin_uri' and 'subj_origin_id' to 'subj_origin'.
102+
103+def run(cursor):
104+ pass
105
106=== modified file 'test/sql-test.py'
107--- test/sql-test.py 2010-09-22 17:15:03 +0000
108+++ test/sql-test.py 2010-11-15 15:32:17 +0000
109@@ -97,12 +97,12 @@
110 where = WhereClause(WhereClause.AND)
111 where.add_text_condition("subj_uri", "bar", like=True)
112 self.assertEquals(where.sql.replace("?", "%s") % tuple(where.arguments),
113- "(subj_uri_id IN (SELECT id FROM uri WHERE (value >= bar AND value < bas)))")
114+ "(subj_id IN (SELECT id FROM uri WHERE (value >= bar AND value < bas)))")
115
116 where = WhereClause(WhereClause.AND)
117 where.add_text_condition("subj_origin", "bar", like=True)
118 self.assertEquals(where.sql.replace("?", "%s") % tuple(where.arguments),
119- "(subj_origin_id IN (SELECT id FROM uri WHERE (value >= bar AND value < bas)))")
120+ "(subj_origin IN (SELECT id FROM uri WHERE (value >= bar AND value < bas)))")
121
122 where = WhereClause(WhereClause.AND)
123 where.add_text_condition("actor", "bar", like=True, negation=True)

Subscribers

People subscribed via source and target branches