Merge lp:~wgrant/launchpad/bug-736005-trivialise into lp:launchpad

Proposed by William Grant
Status: Work in progress
Proposed branch: lp:~wgrant/launchpad/bug-736005-trivialise
Merge into: lp:launchpad
Diff against target: 242 lines (+65/-119)
1 file modified
lib/lp/translations/model/pofile.py (+65/-119)
To merge this branch: bzr merge lp:~wgrant/launchpad/bug-736005-trivialise
Reviewer Review Type Date Requested Status
Launchpad code reviewers Pending
Review via email: mp+250447@code.launchpad.net

Commit message

Rewrite the POFile:+translate text search query to perform adequately.

Description of the change

Rewrite translations text search to use the simplest query that can work, which happens to optimise best nowadays. The old code is littered with comments describing how simpler queries degrade to seqscans of POMsgID and POTranslation, but that's not the case in 9.3 with a sensible simple query.

https://pastebin.canonical.com/126018/ compares the new and old queries. Further optimisation will come soon once POTMsgSet.msgid_(singular|plural) are denormalised to TranslationTemplateItem, and the COUNT(*) queries are eliminated from large template views.

To post a comment you must log in.
17361. By William Grant

Drop duplicate join condition.

17362. By William Grant

Fix distinct by including TTI in the result.

17363. By William Grant

Don't double-join POMsgID and POTranslation.

Unmerged revisions

17363. By William Grant

Don't double-join POMsgID and POTranslation.

17362. By William Grant

Fix distinct by including TTI in the result.

17361. By William Grant

Drop duplicate join condition.

17360. By William Grant

Distinctify.

17359. By William Grant

Reimplement XPI translation search.

17358. By William Grant

Support multiple plural forms in search rewrite.

17357. By William Grant

Rewrite translation search to the simplest form that happens to perform a lot better than the old planner workarounds.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'lib/lp/translations/model/pofile.py'
2--- lib/lp/translations/model/pofile.py 2013-06-20 05:50:00 +0000
3+++ lib/lp/translations/model/pofile.py 2015-02-21 03:06:32 +0000
4@@ -13,6 +13,7 @@
5 ]
6
7 import datetime
8+from operator import itemgetter
9
10 import pytz
11 from sqlobject import (
12@@ -49,6 +50,7 @@
13 from lp.registry.interfaces.person import validate_public_person
14 from lp.services.database.constants import UTC_NOW
15 from lp.services.database.datetimecol import UtcDateTimeCol
16+from lp.services.database.decoratedresultset import DecoratedResultSet
17 from lp.services.database.interfaces import (
18 IMasterStore,
19 IStore,
20@@ -56,7 +58,6 @@
21 from lp.services.database.sqlbase import (
22 flush_database_updates,
23 quote,
24- quote_like,
25 SQLBase,
26 sqlvalues,
27 )
28@@ -96,6 +97,7 @@
29 credits_message_str,
30 POTMsgSet,
31 )
32+from lp.translations.model.potranslation import POTranslation
33 from lp.translations.model.translationimportqueue import collect_import_info
34 from lp.translations.model.translationmessage import (
35 make_plurals_sql_fragment,
36@@ -165,102 +167,6 @@
37 header.has_plural_forms = self.potemplate.hasPluralMessage()
38 return header
39
40- def _getTranslationSearchQuery(self, pofile, plural_form, text):
41- """Query to find `text` in `plural_form` translations of a `pofile`.
42-
43- This produces a list of clauses that can be used to search for
44- TranslationMessages containing `text` in their msgstr[`plural_form`].
45- Returned values are POTMsgSet ids containing them, expected to be
46- used in a UNION across all plural forms.
47- """
48- translation_match = """
49- -- Find translations containing `text`.
50- -- Like in findPOTMsgSetsContaining(), to avoid seqscans on
51- -- POTranslation table, we do ILIKE comparison on them in
52- -- a subselect which is first filtered by the POFile.
53- SELECT TranslationMessage.potmsgset
54- FROM TranslationMessage
55- JOIN TranslationTemplateItem
56- ON TranslationMessage.potmsgset
57- = TranslationTemplateItem.potmsgset
58- WHERE
59- TranslationTemplateItem.potemplate = %(potemplate)s AND
60- TranslationMessage.language = %(language)s AND
61- TranslationMessage.msgstr%(plural_form)d IN (
62- SELECT POTranslation.id FROM POTranslation WHERE
63- POTranslation.id IN (
64- SELECT DISTINCT(msgstr%(plural_form)d)
65- FROM TranslationMessage AS tm_ids
66- JOIN TranslationTemplateItem
67- ON tm_ids.potmsgset=TranslationTemplateItem.potmsgset
68- WHERE
69- TranslationTemplateItem.potemplate
70- = %(potemplate)s AND
71- TranslationTemplateItem.sequence > 0 AND
72- tm_ids.language=%(language)s
73- ) AND
74- POTranslation.translation
75- ILIKE '%%' || %(text)s || '%%')
76- """ % dict(potemplate=quote(pofile.potemplate),
77- language=quote(pofile.language),
78- plural_form=plural_form,
79- text=quote_like(text))
80- return translation_match
81-
82- def _getTemplateSearchQuery(self, text):
83- """Query for finding `text` in msgids of this POFile.
84- """
85- english_match = """
86- -- Step 1a: get POTMsgSets where msgid_singular contains `text`
87- -- To avoid seqscans on POMsgID table (what LIKE usually
88- -- does), we do ILIKE comparison on them in a subselect first
89- -- filtered by this POTemplate.
90- SELECT POTMsgSet.id
91- FROM POTMsgSet
92- JOIN TranslationTemplateItem
93- ON TranslationTemplateItem.potmsgset=POTMsgSet.id AND
94- TranslationTemplateItem.potemplate=%s
95- WHERE
96- (POTMsgSet.msgid_singular IS NOT NULL AND
97- POTMsgSet.msgid_singular IN (
98- SELECT POMsgID.id FROM POMsgID
99- WHERE id IN (
100- SELECT DISTINCT(msgid_singular)
101- FROM POTMsgSet
102- JOIN TranslationTemplateItem
103- ON TranslationTemplateItem.potmsgset = POTMsgSet.id
104- WHERE
105- TranslationTemplateItem.potemplate=%s AND
106- TranslationTemplateItem.sequence > 0
107- ) AND
108- msgid ILIKE '%%' || %s || '%%'))
109- UNION
110- -- Step 1b: like above, just on msgid_plural.
111- SELECT POTMsgSet.id
112- FROM POTMsgSet
113- JOIN TranslationTemplateItem
114- ON TranslationTemplateItem.potmsgset=POTMsgSet.id AND
115- TranslationTemplateItem.potemplate=%s
116- WHERE
117- (POTMsgSet.msgid_plural IS NOT NULL AND
118- POTMsgSet.msgid_plural IN (
119- SELECT POMsgID.id FROM POMsgID
120- WHERE id IN (
121- SELECT DISTINCT(msgid_plural)
122- FROM POTMsgSet
123- JOIN TranslationTemplateItem
124- ON TranslationTemplateItem.potmsgset = POTMsgSet.id
125- WHERE
126- TranslationTemplateItem.potemplate=%s AND
127- TranslationTemplateItem.sequence > 0
128- ) AND
129- msgid ILIKE '%%' || %s || '%%'))
130- """ % (quote(self.potemplate), quote(self.potemplate),
131- quote_like(text),
132- quote(self.potemplate), quote(self.potemplate),
133- quote_like(text))
134- return english_match
135-
136 def _getOrderedPOTMsgSets(self, origin_tables, query):
137 """Find all POTMsgSets matching `query` from `origin_tables`.
138
139@@ -271,41 +177,81 @@
140 POTMsgSet, SQL(query))
141 return results.order_by(TranslationTemplateItem.sequence)
142
143+ def _getTranslationSearchBits(self, pofile):
144+ ThisTM = ClassAlias(TranslationMessage)
145+ ThisPT = ClassAlias(POTranslation)
146+ str_id_cols = [
147+ getattr(ThisTM, 'msgstr%dID' % plural_form)
148+ for plural_form in range(pofile.plural_forms)]
149+ origin = [
150+ LeftJoin(
151+ ThisTM,
152+ And(
153+ ThisTM.potmsgsetID ==
154+ TranslationTemplateItem.potmsgsetID,
155+ ThisTM.languageID == pofile.languageID)),
156+ LeftJoin(
157+ ThisPT,
158+ Or(*(ThisPT.id == col for col in str_id_cols))),
159+ ]
160+ search_options = [ThisPT.translation]
161+ return origin, search_options
162+
163 def findPOTMsgSetsContaining(self, text):
164 """See `IPOFile`."""
165+ origin = [
166+ POTMsgSet,
167+ Join(
168+ TranslationTemplateItem,
169+ TranslationTemplateItem.potmsgsetID == POTMsgSet.id),
170+ ]
171 clauses = [
172- 'TranslationTemplateItem.potemplate = %s' % sqlvalues(
173- self.potemplate),
174- 'TranslationTemplateItem.potmsgset = POTMsgSet.id',
175- 'TranslationTemplateItem.sequence > 0',
176+ TranslationTemplateItem.potemplateID == self.potemplate.id,
177+ TranslationTemplateItem.sequence > 0,
178 ]
179
180 if text is not None:
181 assert len(text) > 1, (
182 "You can not search for strings shorter than 2 characters.")
183
184+ # A list of string columns, of which at least one must match
185+ # the search term.
186+ search_options = []
187+
188+ # Search the English strings.
189 if self.potemplate.uses_english_msgids:
190- english_match = self._getTemplateSearchQuery(text)
191+ origin.extend([
192+ Join(
193+ POMsgID,
194+ Or(
195+ POMsgID.id == POTMsgSet.msgid_singularID,
196+ POMsgID.id == POTMsgSet.msgid_pluralID)),
197+ ])
198+ search_options.extend([POMsgID.msgid])
199 else:
200- # If msgids are not in English, use English PO file
201- # to fetch original strings instead.
202- en_pofile = self.potemplate.getPOFileByLang('en')
203- english_match = self._getTranslationSearchQuery(
204- en_pofile, 0, text)
205+ en_po = self.potemplate.getPOFileByLang('en')
206+ en_origin, en_options = self._getTranslationSearchBits(en_po)
207+ origin.extend(en_origin)
208+ search_options.extend(en_options)
209
210- # Do not look for translations in a DummyPOFile.
211- search_clauses = [english_match]
212+ # Search the translations themselves, unless this is a
213+ # DummyPOFile.
214 if self.id is not None:
215- for plural_form in range(self.plural_forms):
216- translation_match = self._getTranslationSearchQuery(
217- self, plural_form, text)
218- search_clauses.append(translation_match)
219-
220- clauses.append(
221- "POTMsgSet.id IN (" + " UNION ".join(search_clauses) + ")")
222-
223- return self._getOrderedPOTMsgSets(
224- [POTMsgSet, TranslationTemplateItem], ' AND '.join(clauses))
225+ po_origin, po_options = self._getTranslationSearchBits(self)
226+ origin.extend(po_origin)
227+ search_options.extend(po_options)
228+
229+ # Case-insensitively substring-match the search term against
230+ # each searchable column.
231+ clauses.append(Or(*(
232+ str.like('%%%s%%' % text, case_sensitive=False)
233+ for str in search_options)))
234+
235+ result = IMasterStore(POTMsgSet).using(*origin).find(
236+ (POTMsgSet, TranslationTemplateItem),
237+ *clauses).config(distinct=True).order_by(
238+ TranslationTemplateItem.sequence)
239+ return DecoratedResultSet(result, itemgetter(0))
240
241 def getFullLanguageCode(self):
242 """See `IPOFile`."""