Merge lp:~adeuring/launchpad/bug-29713-db-dev into lp:launchpad/db-devel

Proposed by Abel Deuring
Status: Merged
Approved by: Abel Deuring
Approved revision: no longer in the source branch.
Merged at revision: 11713
Proposed branch: lp:~adeuring/launchpad/bug-29713-db-dev
Merge into: lp:launchpad/db-devel
Diff against target: 607 lines (+311/-117)
4 files modified
database/schema/patch-2209-24-1.sql (+125/-0)
lib/lp/answers/stories/question-browse-and-search.txt (+1/-1)
lib/lp/registry/doc/vocabularies.txt (+1/-2)
lib/lp/services/database/doc/textsearching.txt (+184/-114)
To merge this branch: bzr merge lp:~adeuring/launchpad/bug-29713-db-dev
Reviewer Review Type Date Requested Status
Abel Deuring (community) Approve
Review via email: mp+112043@code.launchpad.net

Commit message

Better handling of punctuation in full text search strings

Description of the change

See https://code.launchpad.net/~adeuring/launchpad/bug-29713/+merge/111212 . That branch was merged into devel, while it should have been merged into db-devel

To post a comment you must log in.
Revision history for this message
Abel Deuring (adeuring) wrote :

r=self: See the linked MP for the real review.

review: Approve
Revision history for this message
Stuart Bishop (stub) wrote :

db patch has been applied to production

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== added file 'database/schema/patch-2209-24-1.sql'
--- database/schema/patch-2209-24-1.sql 1970-01-01 00:00:00 +0000
+++ database/schema/patch-2209-24-1.sql 2012-06-26 10:15:26 +0000
@@ -0,0 +1,125 @@
1-- Copyright 2012 Canonical Ltd. This software is licensed under the
2-- GNU Affero General Public License version 3 (see the file LICENSE).
3
4SET client_min_messages=ERROR;
5
6CREATE OR REPLACE FUNCTION _ftq(text) RETURNS text
7 LANGUAGE plpythonu IMMUTABLE STRICT
8 AS $_$
9 import re
10
11 # I think this method would be more robust if we used a real
12 # tokenizer and parser to generate the query string, but we need
13 # something suitable for use as a stored procedure which currently
14 # means no external dependancies.
15
16 # Convert to Unicode
17 query = args[0].decode('utf8')
18 ## plpy.debug('1 query is %s' % repr(query))
19
20 # Normalize whitespace
21 query = re.sub("(?u)\s+"," ", query)
22
23 # Convert AND, OR, NOT and - to tsearch2 punctuation
24 query = re.sub(r"(?u)(?:^|\s)-([\w\(])", r" !\1", query)
25 query = re.sub(r"(?u)\bAND\b", "&", query)
26 query = re.sub(r"(?u)\bOR\b", "|", query)
27 query = re.sub(r"(?u)\bNOT\b", " !", query)
28 ## plpy.debug('2 query is %s' % repr(query))
29
30 # Deal with unwanted punctuation.
31 # ':' is used in queries to specify a weight of a word.
32 # '\' is treated differently in to_tsvector() and to_tsquery().
33 punctuation = r'[:\\]'
34 query = re.sub(r"(?u)%s+" % (punctuation,), " ", query)
35 ## plpy.debug('3 query is %s' % repr(query))
36
37 # Strip ! characters inside and at the end of a word
38 query = re.sub(r"(?u)(?<=\w)[\!]+", " ", query)
39
40 # Now that we have handle case sensitive booleans, convert to lowercase
41 query = query.lower()
42
43 # Remove unpartnered bracket on the left and right
44 query = re.sub(r"(?ux) ^ ( [^(]* ) \)", r"(\1)", query)
45 query = re.sub(r"(?ux) \( ( [^)]* ) $", r"(\1)", query)
46
47 # Remove spurious brackets
48 query = re.sub(r"(?u)\(([^\&\|]*?)\)", r" \1 ", query)
49 ## plpy.debug('5 query is %s' % repr(query))
50
51 # Insert & between tokens without an existing boolean operator
52 # ( not proceeded by (|&!
53 query = re.sub(r"(?u)(?<![\(\|\&\!])\s*\(", "&(", query)
54 ## plpy.debug('6 query is %s' % repr(query))
55 # ) not followed by )|&
56 query = re.sub(r"(?u)\)(?!\s*(\)|\||\&|\s*$))", ")&", query)
57 ## plpy.debug('6.1 query is %s' % repr(query))
58 # Whitespace not proceded by (|&! not followed by &|
59 query = re.sub(r"(?u)(?<![\(\|\&\!\s])\s+(?![\&\|\s])", "&", query)
60 ## plpy.debug('7 query is %s' % repr(query))
61
62 # Detect and repair syntax errors - we are lenient because
63 # this input is generally from users.
64
65 # Fix unbalanced brackets
66 openings = query.count("(")
67 closings = query.count(")")
68 if openings > closings:
69 query = query + " ) "*(openings-closings)
70 elif closings > openings:
71 query = " ( "*(closings-openings) + query
72 ## plpy.debug('8 query is %s' % repr(query))
73
74 # Strip ' character that do not have letters on both sides
75 query = re.sub(r"(?u)((?<!\w)'|'(?!\w))", "", query)
76
77 # Brackets containing nothing but whitespace and booleans, recursive
78 last = ""
79 while last != query:
80 last = query
81 query = re.sub(r"(?u)\([\s\&\|\!]*\)", "", query)
82 ## plpy.debug('9 query is %s' % repr(query))
83
84 # An & or | following a (
85 query = re.sub(r"(?u)(?<=\()[\&\|\s]+", "", query)
86 ## plpy.debug('10 query is %s' % repr(query))
87
88 # An &, | or ! immediatly before a )
89 query = re.sub(r"(?u)[\&\|\!\s]*[\&\|\!]+\s*(?=\))", "", query)
90 ## plpy.debug('11 query is %s' % repr(query))
91
92 # An &,| or ! followed by another boolean.
93 query = re.sub(r"(?ux) \s* ( [\&\|\!] ) [\s\&\|]+", r"\1", query)
94 ## plpy.debug('12 query is %s' % repr(query))
95
96 # Leading & or |
97 query = re.sub(r"(?u)^[\s\&\|]+", "", query)
98 ## plpy.debug('13 query is %s' % repr(query))
99
100 # Trailing &, | or !
101 query = re.sub(r"(?u)[\&\|\!\s]+$", "", query)
102 ## plpy.debug('14 query is %s' % repr(query))
103
104 # If we have nothing but whitespace and tsearch2 operators,
105 # return NULL.
106 if re.search(r"(?u)^[\&\|\!\s\(\)]*$", query) is not None:
107 return None
108
109 # Convert back to UTF-8
110 query = query.encode('utf8')
111 ## plpy.debug('15 query is %s' % repr(query))
112
113 return query or None
114 $_$;
115
116CREATE OR REPLACE FUNCTION ftq(text) RETURNS pg_catalog.tsquery
117 LANGUAGE plpythonu IMMUTABLE STRICT
118 AS $_$
119 p = plpy.prepare(
120 "SELECT to_tsquery('default', _ftq($1)) AS x", ["text"])
121 query = plpy.execute(p, args, 1)[0]["x"]
122 return query or None
123 $_$;
124
125INSERT INTO LaunchpadDatabaseRevision VALUES (2209, 24, 1);
0126
=== modified file 'lib/lp/answers/stories/question-browse-and-search.txt'
--- lib/lp/answers/stories/question-browse-and-search.txt 2012-06-22 16:17:48 +0000
+++ lib/lp/answers/stories/question-browse-and-search.txt 2012-06-26 10:15:26 +0000
@@ -311,7 +311,7 @@
311dealing with plugins problems, he always start by a search for such311dealing with plugins problems, he always start by a search for such
312problems:312problems:
313313
314 >>> browser.getControl(name='field.search_text').value = 'plug-in'314 >>> browser.getControl(name='field.search_text').value = 'plugin'
315 >>> browser.getControl('Search', index=0).click()315 >>> browser.getControl('Search', index=0).click()
316 >>> questions = find_tag_by_id(browser.contents, 'question-listing')316 >>> questions = find_tag_by_id(browser.contents, 'question-listing')
317 >>> for question in questions.fetch('td', 'questionTITLE'):317 >>> for question in questions.fetch('td', 'questionTITLE'):
318318
=== modified file 'lib/lp/registry/doc/vocabularies.txt'
--- lib/lp/registry/doc/vocabularies.txt 2012-06-22 16:17:48 +0000
+++ lib/lp/registry/doc/vocabularies.txt 2012-06-26 10:15:26 +0000
@@ -731,8 +731,7 @@
731731
732 >>> [(p.name, getattr(p.teamowner, 'name', None))732 >>> [(p.name, getattr(p.teamowner, 'name', None))
733 ... for p in vocab.search('ubuntu-team')]733 ... for p in vocab.search('ubuntu-team')]
734 [(u'doc', None), (u'name18', u'mark'),734 [(u'ubuntu-team', u'mark')]
735 (u'ubuntu-security', u'kamion'), (u'ubuntu-team', u'mark')]
736735
737But it doesn't include merged accounts:736But it doesn't include merged accounts:
738737
739738
=== modified file 'lib/lp/services/database/doc/textsearching.txt'
--- lib/lp/services/database/doc/textsearching.txt 2012-06-22 16:17:48 +0000
+++ lib/lp/services/database/doc/textsearching.txt 2012-06-26 10:15:26 +0000
@@ -138,7 +138,22 @@
138 ... compiled = compiled.decode('UTF-8')138 ... compiled = compiled.decode('UTF-8')
139 ... compiled = compiled.encode('US-ASCII', 'backslashreplace')139 ... compiled = compiled.encode('US-ASCII', 'backslashreplace')
140 ... print '%s <=> %s' % (uncompiled, compiled)140 ... print '%s <=> %s' % (uncompiled, compiled)
141141 >>>
142 >>> def search(text_to_search, search_phrase):
143 ... cur = cursor()
144 ... cur.execute("SELECT to_tsvector(%s)", (text_to_search, ))
145 ... ts_vector = cur.fetchall()[0][0]
146 ... cur.execute("SELECT ftq(%s)", (search_phrase, ))
147 ... ts_query = cur.fetchall()[0][0]
148 ... cur.execute(
149 ... "SELECT to_tsvector(%s) @@ ftq(%s)",
150 ... (text_to_search, search_phrase))
151 ... match = cur.fetchall()[0][0]
152 ... return "FTI data: %s query: %s match: %s" % (
153 ... ts_vector, ts_query, str(match))
154 >>>
155 >>> def search_same(text):
156 ... return search(text, text)
142157
143Queries are lowercased158Queries are lowercased
144159
@@ -225,127 +240,178 @@
225 (hi&ho|hoe)&work&go <=> ( 'hi' & 'ho' | 'hoe' ) & 'work' & 'go'240 (hi&ho|hoe)&work&go <=> ( 'hi' & 'ho' | 'hoe' ) & 'work' & 'go'
226241
227242
228Hypenation is handled specially. Note that the & operator has precidence243If a single '-' precedes a word, it is converted into the '!' operator.
229over the | operator and that tsearch2 removes the unnecessary branckets.244Note also that a trailing '-' is dropped by to_tsquery().
230
231 >>> ftq('foo-bar')
232 ((foo&bar)|foobar) <=> 'foo' & 'bar' | 'foobar'
233
234 >>> ftq('foo-bar-baz')
235 ((foo&bar&baz)|foobarbaz) <=> 'foo' & 'bar' & 'baz' | 'foobarbaz'
236
237 >>> ftq('foo & bar-baz')
238 foo&((bar&baz)|barbaz) <=> 'foo' & ( 'bar' & 'baz' | 'barbaz' )
239245
240 >>> ftq('-foo bar-')246 >>> ftq('-foo bar-')
241 !foo&bar <=> !'foo' & 'bar'247 !foo&bar- <=> !'foo' & 'bar'
248
249Repeated '-' are simply ignored by to_tsquery().
242250
243 >>> ftq('---foo--- ---bar---')251 >>> ftq('---foo--- ---bar---')
244 foo&bar <=> 'foo' & 'bar'252 ---foo---&---bar--- <=> 'foo' & 'bar'
245253
246 >>> ftq('foo-bar test')254Hyphens surrounded by two words are retained. This reflects the way
247 ((foo&bar)|foobar)&test <=> ( 'foo' & 'bar' | 'foobar' ) & 'test'255how to_tsquery() and to_tsvector() handle such strings.
248256
249 >>> ftq('foo-bar OR test')257 >>> print search_same('foo-bar')
250 ((foo&bar)|foobar)|test <=> ( 'foo' & 'bar' | 'foobar' ) | 'test'258 FTI data: 'bar':3 'foo':2 'foo-bar':1
251259 query: 'foo-bar' & 'foo' & 'bar'
252260 match: True
253Most punctuation characters are converted to whitespace outside of261
254words, or treated as a hypen inside words. The exceptions are the262
255operators ()!&|!.263Punctuation is handled consistently. If a string containing punctuation
256264appears in an FTI, it can also be passed to ftq(),and a search for this
257 >>> ftq(':100%')265string finds the indexed text.
258 100 <=> '100'266
259267 >>> punctuation = '\'"#$%*+,./:;<=>?@[\]^`{}~'
260 >>> ftq(r'foo\bar')268 >>> for symbol in punctuation:
261 ((foo&bar)|foobar) <=> 'foo' & 'bar' | 'foobar'269 ... print repr(symbol), search_same('foo%sbar' % symbol)
262270 "'" FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
263 >>> ftq('/dev/pmu')271 '"' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
264 ((dev&pmu)|devpmu) <=> 'dev' & 'pmu' | 'devpmu'272 '#' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
273 '$' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
274 '%' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
275 '*' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
276 '+' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
277 ',' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
278 '.' FTI data: 'foo.bar':1 query: 'foo.bar' match: True
279 '/' FTI data: 'foo/bar':1 query: 'foo/bar' match: True
280 ':' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
281 ';' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
282 '<' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
283 '=' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
284 '>' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
285 '?' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
286 '@' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
287 '[' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
288 '\\' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
289 ']' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
290 '^' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
291 '`' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
292 '{' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
293 '}' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
294 '~' FTI data: 'foo':1 '~bar':2 query: 'foo' & '~bar' match: True
295
296 >>> for symbol in punctuation:
297 ... print repr(symbol), search_same('aa %sbb%s cc' % (symbol, symbol))
298 "'" FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
299 '"' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
300 '#' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
301 '$' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
302 '%' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
303 '*' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
304 '+' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
305 ',' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
306 '.' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
307 '/' FTI data: '/bb':2 'aa':1 'cc':3 query: 'aa' & '/bb' & 'cc' match: True
308 ':' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
309 ';' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
310 '<' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
311 '=' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
312 '>' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
313 '?' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
314 '@' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
315 '[' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
316 '\\' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
317 ']' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
318 '^' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
319 '`' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
320 '{' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
321 '}' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
322 '~' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & '~bb' & 'cc' match: False
323
324XXX Abel Deuring 2012-06-20 bug=1015511: Note that the last line above
325shows a bug: The FTI data for the string "aa ~bb~ cc" contains the words
326'aa', 'bb', 'cc', while the ts_query object for the same text contains
327'aa', '~bb', 'cc', hence the query does not match the string. More details_
328
329XXX Abel Deuring 2012-06-20 bug=1015519: XML tags cannot be searched.
330
331 >>> print search_same('foo <bar> baz')
332 FTI data: 'baz':2 'foo':1 query: 'foo' & 'baz' match: True
333
334More specifically, tags are simply dropped from the FTI data and from
335search queries.
336
337 >>> print search('some text <div>whatever</div>', '<div>')
338 FTI data: 'text':2 'whatev':3 query: None match: None
339
340Of course, omitting '<' and '>'from the query does not help.
341
342 >>> print search('some text <div>whatever</div>', 'div')
343 FTI data: 'text':2 'whatev':3 query: 'div' match: False
344
345Treatment of characters that are used as operators in to_tsquery():
265346
266 >>> ftq('cool!')347 >>> ftq('cool!')
267 cool <=> 'cool'348 cool <=> 'cool'
268349
269 >>> ftq('foo@bar.com')350Email addresses are retained as a whole, both by to_tsvector() and by
270 ((foo&bar&com)|foobarcom) <=> 'foo' & 'bar' & 'com' | 'foobarcom'351ftq().
271352
353 >>> print search_same('foo@bar.com')
354 FTI data: 'foo@bar.com':1 query: 'foo@bar.com' match: True
355
356File names are retained as a whole.
357
358 >>> print search_same('foo-bar.txt')
359 FTI data: 'foo-bar.txt':1 query: 'foo-bar.txt' match: True
272360
273Some punctuation we pass through to tsearch2 for it to handle.361Some punctuation we pass through to tsearch2 for it to handle.
274362NB. This gets stemmed, see below.
275 >>> ftq("shouldn't") # NB. This gets stemmed, see below363
276 shouldn't <=> 'shouldn'364 >>> print search_same("shouldn't")
277365 FTI data: 'shouldn':1 query: 'shouldn' match: True
278It was noticed though in Bug #33920 that tsearch2 couldn't cope if the366
279apostrophe was not inside a word. So we strip it in these cases.367Bug #44913 - Unicode characters in the wrong place.
280368
281 >>> ftq("'cool")369 >>> search_same(u'abc-a\N{LATIN SMALL LETTER C WITH CEDILLA}')
282 cool <=> 'cool'370 "FTI data: 'abc':2 'abc-a\xc3\xa7':1 'a\xc3\xa7':3
283 >>> ftq("'shouldn't")371 query: 'abc-a\xc3\xa7' & 'abc' & 'a\xc3\xa7'
284 shouldn't <=> 'shouldn'372 match: True"
285 >>> ftq("' cool")373
286 cool <=> 'cool'374Cut & Paste of 'Smart' quotes. Note that the quotation mark is retained
287 >>> ftq("cool '")375in the FTI.
288 cool <=> 'cool'376
289 >>> ftq("' cool '")377 >>> print search_same(u'a-a\N{RIGHT DOUBLE QUOTATION MARK}')
290 cool <=> 'cool'378 FTI data: 'a-a”':1 'a”':3 query: 'a-a”' & 'a”' match: True
291 >>> ftq("'cool'")379
292 cool <=> 'cool'380 >>> print search_same(
293 >>> ftq("('cool' AND bananas)")381 ... u'\N{LEFT SINGLE QUOTATION MARK}a.a'
294 (cool&bananas) <=> 'cool' & 'banana'382 ... u'\N{RIGHT SINGLE QUOTATION MARK}')
295383 FTI data: 'a’':2 '‘a':1 query: '‘a' & 'a’' match: True
296It was also noticed through Bug #39828 that tsearch2 will not cope if the
297! character is embedded inside or found at the end of a word.
298
299 >>> ftq('cool!')
300 cool <=> 'cool'
301 >>> ftq('hi!mom')
302 hi&mom <=> 'hi' & 'mom'
303 >>> ftq('hi!!!!!mom')
304 hi&mom <=> 'hi' & 'mom'
305 >>> ftq('hi !mom')
306 hi&!mom <=> 'hi' & !'mom'
307
308
309Bug #44913 - Unicode characters in the wrong place
310
311 >>> ftq(u'a-a\N{LATIN SMALL LETTER C WITH CEDILLA}')
312 ((a&a\xe7)|aa\xe7) <=> 'a\xe7' | 'aa\xe7'
313
314 Cut & Paste of 'Smart' quotes
315
316 >>> ftq(u'a-a\N{RIGHT DOUBLE QUOTATION MARK}')
317 ((a&a)|aa) <=> 'aa'
318
319 >>> ftq(u'\N{LEFT SINGLE QUOTATION MARK}a.a\N{RIGHT SINGLE QUOTATION MARK}')
320 ((a&a)|aa) <=> 'aa'
321384
322385
323Bug #44913 - Nothing but stopwords in a query needing repair386Bug #44913 - Nothing but stopwords in a query needing repair
324387
325 >>> ftq('a)a')388 >>> print search_same('a)a')
326 a&a <=> None389 FTI data: query: None match: None
327390
328391
329Stop words (words deemed too common in English to search on) are removed392Stop words (words deemed too common in English to search on) are removed
330from queries by tsearch2.393from queries by tsearch2.
331394
332 >>> ftq("Don't do it harder!")395 >>> print search_same("Don't do it harder!")
333 don't&do&it&harder <=> 'harder'396 FTI data: 'harder':5 query: 'harder' match: True
334397
335398
336Note that some queries will return None after compilation, because they399Note that some queries will return None after compilation, because they
337contained nothing but stop words or punctuation.400contained nothing but stop words or punctuation.
338401
339 >>> ftq("don't do it!")402 >>> print search_same("don't do it!")
340 don't&do&it <=> None403 FTI data: query: None match: None
341404
342 >>> ftq(",,,")405 >>> print search_same(",,,")
343 None <=> None406 FTI data: query: None match: None
344407
345408
346Queries containing nothing except whitespace, boolean operators and409Queries containing nothing except whitespace, boolean operators and
347punctuation will just return None.410punctuation will just return None.
348411
412Note in the fourth example below that the '-' left in the query by _ftq()
413is ignored by to_tsquery().
414
349 >>> ftq(" ")415 >>> ftq(" ")
350 None <=> None416 None <=> None
351 >>> ftq("AND")417 >>> ftq("AND")
@@ -353,7 +419,7 @@
353 >>> ftq(" AND (!)")419 >>> ftq(" AND (!)")
354 None <=> None420 None <=> None
355 >>> ftq("-")421 >>> ftq("-")
356 None <=> None422 - <=> None
357423
358424
359Words are also stemmed by tsearch2 (using the English stemmer).425Words are also stemmed by tsearch2 (using the English stemmer).
@@ -381,7 +447,7 @@
381 (hi|!hello)&mom <=> ( 'hi' | !'hello' ) & 'mom'447 (hi|!hello)&mom <=> ( 'hi' | !'hello' ) & 'mom'
382448
383 >>> ftq('(hi OR - AND hello) AND mom')449 >>> ftq('(hi OR - AND hello) AND mom')
384 (hi|hello)&mom <=> ( 'hi' | 'hello' ) & 'mom'450 (hi|-&hello)&mom <=> ( 'hi' | 'hello' ) & 'mom'
385451
386 >>> ftq('hi AND mom AND')452 >>> ftq('hi AND mom AND')
387 hi&mom <=> 'hi' & 'mom'453 hi&mom <=> 'hi' & 'mom'
@@ -393,7 +459,7 @@
393 (hi|hello)&mom <=> ( 'hi' | 'hello' ) & 'mom'459 (hi|hello)&mom <=> ( 'hi' | 'hello' ) & 'mom'
394460
395 >>> ftq('() hi mom ( ) ((! |((&)))) :-)')461 >>> ftq('() hi mom ( ) ((! |((&)))) :-)')
396 (hi&mom) <=> 'hi' & 'mom'462 (hi&mom&-) <=> 'hi' & 'mom'
397463
398 >>> ftq("(hi mom")464 >>> ftq("(hi mom")
399 hi&mom <=> 'hi' & 'mom'465 hi&mom <=> 'hi' & 'mom'
@@ -414,15 +480,15 @@
414 hi&mom <=> 'hi' & 'mom'480 hi&mom <=> 'hi' & 'mom'
415481
416 >>> ftq("(foo .") # Bug 43245482 >>> ftq("(foo .") # Bug 43245
417 foo <=> 'foo'483 foo&. <=> 'foo'
418484
419 >>> ftq("(foo.")485 >>> ftq("(foo.")
420 foo <=> 'foo'486 foo. <=> 'foo'
421487
422 Bug #54972488 Bug #54972
423489
424 >>> ftq("a[a\n[a")490 >>> ftq("a[a\n[a")
425 ((a&a)|aa)&a <=> 'aa'491 a[a&[a <=> None
426492
427 Bug #96698493 Bug #96698
428494
@@ -437,10 +503,10 @@
437 Bug #160236503 Bug #160236
438504
439 >>> ftq("foo&&bar-baz")505 >>> ftq("foo&&bar-baz")
440 foo&((bar&baz)|barbaz) <=> 'foo' & ( 'bar' & 'baz' | 'barbaz' )506 foo&bar-baz <=> 'foo' & 'bar-baz' & 'bar' & 'baz'
441507
442 >>> ftq("foo||bar.baz")508 >>> ftq("foo||bar.baz")
443 foo|((bar&baz)|barbaz) <=> 'foo' | ( 'bar' & 'baz' | 'barbaz' )509 foo|bar.baz <=> 'foo' | 'bar.baz'
444510
445511
446Phrase Searching512Phrase Searching
@@ -482,7 +548,8 @@
482548
483 >>> runsql(r"""549 >>> runsql(r"""
484 ... SELECT title, max(ranking) FROM (550 ... SELECT title, max(ranking) FROM (
485 ... SELECT Bug.title,rank(Bug.fti||Message.fti,ftq('firefox')) AS ranking551 ... SELECT Bug.title,rank(Bug.fti||Message.fti,ftq('firefox'))
552 ... AS ranking
486 ... FROM Bug, BugMessage, Message553 ... FROM Bug, BugMessage, Message
487 ... WHERE Bug.id = BugMessage.bug AND Message.id = BugMessage.message554 ... WHERE Bug.id = BugMessage.bug AND Message.id = BugMessage.message
488 ... AND (Bug.fti @@ ftq('firefox') OR Message.fti @@ ftq('firefox'))555 ... AND (Bug.fti @@ ftq('firefox') OR Message.fti @@ ftq('firefox'))
@@ -499,7 +566,8 @@
499 ... AND BugTask.product = Product.id566 ... AND BugTask.product = Product.id
500 ... AND Product.name LIKE lower('%firefox%')567 ... AND Product.name LIKE lower('%firefox%')
501 ... UNION568 ... UNION
502 ... SELECT Bug.title, rank(Product.fti, ftq('firefox')) - 0.3 AS ranking569 ... SELECT Bug.title, rank(Product.fti, ftq('firefox')) - 0.3
570 ... AS ranking
503 ... FROM Bug, BugTask, Product571 ... FROM Bug, BugTask, Product
504 ... WHERE Bug.id = BugTask.bug572 ... WHERE Bug.id = BugTask.bug
505 ... AND BugTask.product = Product.id573 ... AND BugTask.product = Product.id
@@ -518,7 +586,8 @@
518 Printing doesn't work 0.70586 Printing doesn't work 0.70
519587
520588
521== Natural Language Phrase Query ==589Natural Language Phrase Query
590-----------------------------
522591
523The standard boolean searches of tsearch2 are fine, but sometime you592The standard boolean searches of tsearch2 are fine, but sometime you
524want more fuzzy searches.593want more fuzzy searches.
@@ -557,7 +626,8 @@
557on Ubuntu) - so we are disabling this and reworking from the ground up.626on Ubuntu) - so we are disabling this and reworking from the ground up.
558627
559628
560=== nl_term_candidates() ===629nl_term_candidates()
630~~~~~~~~~~~~~~~~~~~~
561631
562To find the terms in a search phrase that are canditates for the search,632To find the terms in a search phrase that are canditates for the search,
563we can use the nl_term_candidates() function. This function uses ftq()633we can use the nl_term_candidates() function. This function uses ftq()
@@ -574,19 +644,16 @@
574 >>> nl_term_candidates('how do I do this?')644 >>> nl_term_candidates('how do I do this?')
575 []645 []
576646
577We also handle expansion of hypenated words (like ftq does):
578
579 >>> nl_term_candidates('firefox foo-bar give me trouble')
580 [u'firefox', u'foo', u'bar', u'foobar', u'give', u'troubl']
581
582Except for the hyphenation character, all non-word caracters are ignored:647Except for the hyphenation character, all non-word caracters are ignored:
583648
584 >>> nl_term_candidates(649 >>> nl_term_candidates(
585 ... "Will the \'\'|\'\' character (inside a ''quoted'' string) work???")650 ... "Will the \'\'|\'\' character (inside a ''quoted'' string) "
651 ... "work???")
586 [u'charact', u'insid', u'quot', u'string', u'work']652 [u'charact', u'insid', u'quot', u'string', u'work']
587653
588654
589=== nl_phrase_search() ===655nl_phrase_search()
656~~~~~~~~~~~~~~~~~~
590657
591To get the actual tsearch2 query that should be run, you will use the658To get the actual tsearch2 query that should be run, you will use the
592nl_phrase_search() function. This one takes two mandatory parameters and659nl_phrase_search() function. This one takes two mandatory parameters and
@@ -637,7 +704,8 @@
637 u'slow|system'704 u'slow|system'
638705
639706
640==== Using other constraints ====707Using other constraints
708.......................
641709
642You can pass a third parameter to the function that will be use as710You can pass a third parameter to the function that will be use as
643an additional constraint to determine the total number of rows that711an additional constraint to determine the total number of rows that
@@ -659,7 +727,8 @@
659727
660 >>> nl_phrase_search(728 >>> nl_phrase_search(
661 ... 'firefox gets very slow on flickr', Question,729 ... 'firefox gets very slow on flickr', Question,
662 ... "Question.product = %s AND Product.active = 't'" % firefox_product.id,730 ... "Question.product = %s AND Product.active = 't'"
731 ... % firefox_product.id,
663 ... ['Product'], fast_enabled=False)732 ... ['Product'], fast_enabled=False)
664 u'slow|flickr'733 u'slow|flickr'
665734
@@ -679,7 +748,8 @@
679 u'(firefox&flickr&slow)|(flickr&slow)|(firefox&slow)|(firefox&flickr)'748 u'(firefox&flickr&slow)|(flickr&slow)|(firefox&slow)|(firefox&flickr)'
680749
681750
682==== No keywords filtering with few rows ====751No keywords filtering with few rows
752...................................
683753
684The 50% rule is really useful only when there are many rows. When there754The 50% rule is really useful only when there are many rows. When there
685only very few rows, that keyword elimination becomes a problem since755only very few rows, that keyword elimination becomes a problem since

Subscribers

People subscribed via source and target branches

to status/vote changes: