Merge lp:~adeuring/launchpad/bug-29713-db-dev into lp:launchpad/db-devel

Proposed by Abel Deuring
Status: Merged
Approved by: Abel Deuring
Approved revision: no longer in the source branch.
Merged at revision: 11713
Proposed branch: lp:~adeuring/launchpad/bug-29713-db-dev
Merge into: lp:launchpad/db-devel
Diff against target: 607 lines (+311/-117)
4 files modified
database/schema/patch-2209-24-1.sql (+125/-0)
lib/lp/answers/stories/question-browse-and-search.txt (+1/-1)
lib/lp/registry/doc/vocabularies.txt (+1/-2)
lib/lp/services/database/doc/textsearching.txt (+184/-114)
To merge this branch: bzr merge lp:~adeuring/launchpad/bug-29713-db-dev
Reviewer: Abel Deuring (community)
Status: Approve
Review via email: mp+112043@code.launchpad.net

Commit message

Better handling of punctuation in full text search strings

Description of the change

See https://code.launchpad.net/~adeuring/launchpad/bug-29713/+merge/111212. That branch was merged into devel, although it should have been merged into db-devel.
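
For illustration only (not part of this proposal), the sketch below shows the behaviour the patch is after. It assumes psycopg2 and a database in which the patched _ftq()/ftq() functions from patch-2209-24-1.sql have been installed; it mirrors the search() helper added to textsearching.txt in the diff below, and the connection string is a placeholder.

    # Illustrative sketch; assumes psycopg2 and the patched ftq()/_ftq()
    # functions from patch-2209-24-1.sql are present in the database.
    import psycopg2

    def search(conn, text_to_search, search_phrase):
        # Mirrors the search() helper added to textsearching.txt below.
        cur = conn.cursor()
        cur.execute("SELECT to_tsvector(%s)", (text_to_search,))
        ts_vector = cur.fetchone()[0]
        cur.execute("SELECT ftq(%s)", (search_phrase,))
        ts_query = cur.fetchone()[0]
        cur.execute(
            "SELECT to_tsvector(%s) @@ ftq(%s)",
            (text_to_search, search_phrase))
        match = cur.fetchone()[0]
        return "FTI data: %s query: %s match: %s" % (ts_vector, ts_query, match)

    conn = psycopg2.connect("dbname=launchpad_dev")  # placeholder DSN
    # Per the updated doctest expectations, both of these should report
    # match: True, because punctuated strings that appear in the FTI can
    # now be passed back to ftq() unchanged.
    print(search(conn, 'foo-bar', 'foo-bar'))
    print(search(conn, 'foo@bar.com', 'foo@bar.com'))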

Abel Deuring (adeuring) wrote:

r=self: See the linked MP for the real review.

review: Approve
Stuart Bishop (stub) wrote:

The db patch has been applied to production.

Preview Diff

1=== added file 'database/schema/patch-2209-24-1.sql'
2--- database/schema/patch-2209-24-1.sql 1970-01-01 00:00:00 +0000
3+++ database/schema/patch-2209-24-1.sql 2012-06-26 10:15:26 +0000
4@@ -0,0 +1,125 @@
5+-- Copyright 2012 Canonical Ltd. This software is licensed under the
6+-- GNU Affero General Public License version 3 (see the file LICENSE).
7+
8+SET client_min_messages=ERROR;
9+
10+CREATE OR REPLACE FUNCTION _ftq(text) RETURNS text
11+ LANGUAGE plpythonu IMMUTABLE STRICT
12+ AS $_$
13+ import re
14+
15+ # I think this method would be more robust if we used a real
16+ # tokenizer and parser to generate the query string, but we need
17+ # something suitable for use as a stored procedure which currently
18+ # means no external dependancies.
19+
20+ # Convert to Unicode
21+ query = args[0].decode('utf8')
22+ ## plpy.debug('1 query is %s' % repr(query))
23+
24+ # Normalize whitespace
25+ query = re.sub("(?u)\s+"," ", query)
26+
27+ # Convert AND, OR, NOT and - to tsearch2 punctuation
28+ query = re.sub(r"(?u)(?:^|\s)-([\w\(])", r" !\1", query)
29+ query = re.sub(r"(?u)\bAND\b", "&", query)
30+ query = re.sub(r"(?u)\bOR\b", "|", query)
31+ query = re.sub(r"(?u)\bNOT\b", " !", query)
32+ ## plpy.debug('2 query is %s' % repr(query))
33+
34+ # Deal with unwanted punctuation.
35+ # ':' is used in queries to specify a weight of a word.
36+ # '\' is treated differently in to_tsvector() and to_tsquery().
37+ punctuation = r'[:\\]'
38+ query = re.sub(r"(?u)%s+" % (punctuation,), " ", query)
39+ ## plpy.debug('3 query is %s' % repr(query))
40+
41+ # Strip ! characters inside and at the end of a word
42+ query = re.sub(r"(?u)(?<=\w)[\!]+", " ", query)
43+
44+ # Now that we have handle case sensitive booleans, convert to lowercase
45+ query = query.lower()
46+
47+ # Remove unpartnered bracket on the left and right
48+ query = re.sub(r"(?ux) ^ ( [^(]* ) \)", r"(\1)", query)
49+ query = re.sub(r"(?ux) \( ( [^)]* ) $", r"(\1)", query)
50+
51+ # Remove spurious brackets
52+ query = re.sub(r"(?u)\(([^\&\|]*?)\)", r" \1 ", query)
53+ ## plpy.debug('5 query is %s' % repr(query))
54+
55+ # Insert & between tokens without an existing boolean operator
56+ # ( not proceeded by (|&!
57+ query = re.sub(r"(?u)(?<![\(\|\&\!])\s*\(", "&(", query)
58+ ## plpy.debug('6 query is %s' % repr(query))
59+ # ) not followed by )|&
60+ query = re.sub(r"(?u)\)(?!\s*(\)|\||\&|\s*$))", ")&", query)
61+ ## plpy.debug('6.1 query is %s' % repr(query))
62+ # Whitespace not proceded by (|&! not followed by &|
63+ query = re.sub(r"(?u)(?<![\(\|\&\!\s])\s+(?![\&\|\s])", "&", query)
64+ ## plpy.debug('7 query is %s' % repr(query))
65+
66+ # Detect and repair syntax errors - we are lenient because
67+ # this input is generally from users.
68+
69+ # Fix unbalanced brackets
70+ openings = query.count("(")
71+ closings = query.count(")")
72+ if openings > closings:
73+ query = query + " ) "*(openings-closings)
74+ elif closings > openings:
75+ query = " ( "*(closings-openings) + query
76+ ## plpy.debug('8 query is %s' % repr(query))
77+
78+ # Strip ' character that do not have letters on both sides
79+ query = re.sub(r"(?u)((?<!\w)'|'(?!\w))", "", query)
80+
81+ # Brackets containing nothing but whitespace and booleans, recursive
82+ last = ""
83+ while last != query:
84+ last = query
85+ query = re.sub(r"(?u)\([\s\&\|\!]*\)", "", query)
86+ ## plpy.debug('9 query is %s' % repr(query))
87+
88+ # An & or | following a (
89+ query = re.sub(r"(?u)(?<=\()[\&\|\s]+", "", query)
90+ ## plpy.debug('10 query is %s' % repr(query))
91+
92+ # An &, | or ! immediatly before a )
93+ query = re.sub(r"(?u)[\&\|\!\s]*[\&\|\!]+\s*(?=\))", "", query)
94+ ## plpy.debug('11 query is %s' % repr(query))
95+
96+ # An &,| or ! followed by another boolean.
97+ query = re.sub(r"(?ux) \s* ( [\&\|\!] ) [\s\&\|]+", r"\1", query)
98+ ## plpy.debug('12 query is %s' % repr(query))
99+
100+ # Leading & or |
101+ query = re.sub(r"(?u)^[\s\&\|]+", "", query)
102+ ## plpy.debug('13 query is %s' % repr(query))
103+
104+ # Trailing &, | or !
105+ query = re.sub(r"(?u)[\&\|\!\s]+$", "", query)
106+ ## plpy.debug('14 query is %s' % repr(query))
107+
108+ # If we have nothing but whitespace and tsearch2 operators,
109+ # return NULL.
110+ if re.search(r"(?u)^[\&\|\!\s\(\)]*$", query) is not None:
111+ return None
112+
113+ # Convert back to UTF-8
114+ query = query.encode('utf8')
115+ ## plpy.debug('15 query is %s' % repr(query))
116+
117+ return query or None
118+ $_$;
119+
120+CREATE OR REPLACE FUNCTION ftq(text) RETURNS pg_catalog.tsquery
121+ LANGUAGE plpythonu IMMUTABLE STRICT
122+ AS $_$
123+ p = plpy.prepare(
124+ "SELECT to_tsquery('default', _ftq($1)) AS x", ["text"])
125+ query = plpy.execute(p, args, 1)[0]["x"]
126+ return query or None
127+ $_$;
128+
129+INSERT INTO LaunchpadDatabaseRevision VALUES (2209, 24, 1);
130
131=== modified file 'lib/lp/answers/stories/question-browse-and-search.txt'
132--- lib/lp/answers/stories/question-browse-and-search.txt 2012-06-22 16:17:48 +0000
133+++ lib/lp/answers/stories/question-browse-and-search.txt 2012-06-26 10:15:26 +0000
134@@ -311,7 +311,7 @@
135 dealing with plugins problems, he always start by a search for such
136 problems:
137
138- >>> browser.getControl(name='field.search_text').value = 'plug-in'
139+ >>> browser.getControl(name='field.search_text').value = 'plugin'
140 >>> browser.getControl('Search', index=0).click()
141 >>> questions = find_tag_by_id(browser.contents, 'question-listing')
142 >>> for question in questions.fetch('td', 'questionTITLE'):
143
144=== modified file 'lib/lp/registry/doc/vocabularies.txt'
145--- lib/lp/registry/doc/vocabularies.txt 2012-06-22 16:17:48 +0000
146+++ lib/lp/registry/doc/vocabularies.txt 2012-06-26 10:15:26 +0000
147@@ -731,8 +731,7 @@
148
149 >>> [(p.name, getattr(p.teamowner, 'name', None))
150 ... for p in vocab.search('ubuntu-team')]
151- [(u'doc', None), (u'name18', u'mark'),
152- (u'ubuntu-security', u'kamion'), (u'ubuntu-team', u'mark')]
153+ [(u'ubuntu-team', u'mark')]
154
155 But it doesn't include merged accounts:
156
157
158=== modified file 'lib/lp/services/database/doc/textsearching.txt'
159--- lib/lp/services/database/doc/textsearching.txt 2012-06-22 16:17:48 +0000
160+++ lib/lp/services/database/doc/textsearching.txt 2012-06-26 10:15:26 +0000
161@@ -138,7 +138,22 @@
162 ... compiled = compiled.decode('UTF-8')
163 ... compiled = compiled.encode('US-ASCII', 'backslashreplace')
164 ... print '%s <=> %s' % (uncompiled, compiled)
165-
166+ >>>
167+ >>> def search(text_to_search, search_phrase):
168+ ... cur = cursor()
169+ ... cur.execute("SELECT to_tsvector(%s)", (text_to_search, ))
170+ ... ts_vector = cur.fetchall()[0][0]
171+ ... cur.execute("SELECT ftq(%s)", (search_phrase, ))
172+ ... ts_query = cur.fetchall()[0][0]
173+ ... cur.execute(
174+ ... "SELECT to_tsvector(%s) @@ ftq(%s)",
175+ ... (text_to_search, search_phrase))
176+ ... match = cur.fetchall()[0][0]
177+ ... return "FTI data: %s query: %s match: %s" % (
178+ ... ts_vector, ts_query, str(match))
179+ >>>
180+ >>> def search_same(text):
181+ ... return search(text, text)
182
183 Queries are lowercased
184
185@@ -225,127 +240,178 @@
186 (hi&ho|hoe)&work&go <=> ( 'hi' & 'ho' | 'hoe' ) & 'work' & 'go'
187
188
189-Hypenation is handled specially. Note that the & operator has precidence
190-over the | operator and that tsearch2 removes the unnecessary branckets.
191-
192- >>> ftq('foo-bar')
193- ((foo&bar)|foobar) <=> 'foo' & 'bar' | 'foobar'
194-
195- >>> ftq('foo-bar-baz')
196- ((foo&bar&baz)|foobarbaz) <=> 'foo' & 'bar' & 'baz' | 'foobarbaz'
197-
198- >>> ftq('foo & bar-baz')
199- foo&((bar&baz)|barbaz) <=> 'foo' & ( 'bar' & 'baz' | 'barbaz' )
200+If a single '-' precedes a word, it is converted into the '!' operator.
201+Note also that a trailing '-' is dropped by to_tsquery().
202
203 >>> ftq('-foo bar-')
204- !foo&bar <=> !'foo' & 'bar'
205+ !foo&bar- <=> !'foo' & 'bar'
206+
207+Repeated '-' are simply ignored by to_tsquery().
208
209 >>> ftq('---foo--- ---bar---')
210- foo&bar <=> 'foo' & 'bar'
211-
212- >>> ftq('foo-bar test')
213- ((foo&bar)|foobar)&test <=> ( 'foo' & 'bar' | 'foobar' ) & 'test'
214-
215- >>> ftq('foo-bar OR test')
216- ((foo&bar)|foobar)|test <=> ( 'foo' & 'bar' | 'foobar' ) | 'test'
217-
218-
219-Most punctuation characters are converted to whitespace outside of
220-words, or treated as a hypen inside words. The exceptions are the
221-operators ()!&|!.
222-
223- >>> ftq(':100%')
224- 100 <=> '100'
225-
226- >>> ftq(r'foo\bar')
227- ((foo&bar)|foobar) <=> 'foo' & 'bar' | 'foobar'
228-
229- >>> ftq('/dev/pmu')
230- ((dev&pmu)|devpmu) <=> 'dev' & 'pmu' | 'devpmu'
231+ ---foo---&---bar--- <=> 'foo' & 'bar'
232+
233+Hyphens surrounded by two words are retained. This reflects the way
234+how to_tsquery() and to_tsvector() handle such strings.
235+
236+ >>> print search_same('foo-bar')
237+ FTI data: 'bar':3 'foo':2 'foo-bar':1
238+ query: 'foo-bar' & 'foo' & 'bar'
239+ match: True
240+
241+
242+Punctuation is handled consistently. If a string containing punctuation
243+appears in an FTI, it can also be passed to ftq(),and a search for this
244+string finds the indexed text.
245+
246+ >>> punctuation = '\'"#$%*+,./:;<=>?@[\]^`{}~'
247+ >>> for symbol in punctuation:
248+ ... print repr(symbol), search_same('foo%sbar' % symbol)
249+ "'" FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
250+ '"' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
251+ '#' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
252+ '$' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
253+ '%' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
254+ '*' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
255+ '+' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
256+ ',' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
257+ '.' FTI data: 'foo.bar':1 query: 'foo.bar' match: True
258+ '/' FTI data: 'foo/bar':1 query: 'foo/bar' match: True
259+ ':' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
260+ ';' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
261+ '<' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
262+ '=' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
263+ '>' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
264+ '?' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
265+ '@' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
266+ '[' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
267+ '\\' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
268+ ']' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
269+ '^' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
270+ '`' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
271+ '{' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
272+ '}' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
273+ '~' FTI data: 'foo':1 '~bar':2 query: 'foo' & '~bar' match: True
274+
275+ >>> for symbol in punctuation:
276+ ... print repr(symbol), search_same('aa %sbb%s cc' % (symbol, symbol))
277+ "'" FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
278+ '"' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
279+ '#' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
280+ '$' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
281+ '%' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
282+ '*' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
283+ '+' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
284+ ',' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
285+ '.' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
286+ '/' FTI data: '/bb':2 'aa':1 'cc':3 query: 'aa' & '/bb' & 'cc' match: True
287+ ':' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
288+ ';' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
289+ '<' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
290+ '=' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
291+ '>' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
292+ '?' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
293+ '@' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
294+ '[' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
295+ '\\' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
296+ ']' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
297+ '^' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
298+ '`' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
299+ '{' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
300+ '}' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
301+ '~' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & '~bb' & 'cc' match: False
302+
303+XXX Abel Deuring 2012-06-20 bug=1015511: Note that the last line above
304+shows a bug: The FTI data for the string "aa ~bb~ cc" contains the words
305+'aa', 'bb', 'cc', while the ts_query object for the same text contains
306+'aa', '~bb', 'cc', hence the query does not match the string. More details_
307+
308+XXX Abel Deuring 2012-06-20 bug=1015519: XML tags cannot be searched.
309+
310+ >>> print search_same('foo <bar> baz')
311+ FTI data: 'baz':2 'foo':1 query: 'foo' & 'baz' match: True
312+
313+More specifically, tags are simply dropped from the FTI data and from
314+search queries.
315+
316+ >>> print search('some text <div>whatever</div>', '<div>')
317+ FTI data: 'text':2 'whatev':3 query: None match: None
318+
319+Of course, omitting '<' and '>'from the query does not help.
320+
321+ >>> print search('some text <div>whatever</div>', 'div')
322+ FTI data: 'text':2 'whatev':3 query: 'div' match: False
323+
324+Treatment of characters that are used as operators in to_tsquery():
325
326 >>> ftq('cool!')
327 cool <=> 'cool'
328
329- >>> ftq('foo@bar.com')
330- ((foo&bar&com)|foobarcom) <=> 'foo' & 'bar' & 'com' | 'foobarcom'
331-
332+Email addresses are retained as a whole, both by to_tsvector() and by
333+ftq().
334+
335+ >>> print search_same('foo@bar.com')
336+ FTI data: 'foo@bar.com':1 query: 'foo@bar.com' match: True
337+
338+File names are retained as a whole.
339+
340+ >>> print search_same('foo-bar.txt')
341+ FTI data: 'foo-bar.txt':1 query: 'foo-bar.txt' match: True
342
343 Some punctuation we pass through to tsearch2 for it to handle.
344-
345- >>> ftq("shouldn't") # NB. This gets stemmed, see below
346- shouldn't <=> 'shouldn'
347-
348-It was noticed though in Bug #33920 that tsearch2 couldn't cope if the
349-apostrophe was not inside a word. So we strip it in these cases.
350-
351- >>> ftq("'cool")
352- cool <=> 'cool'
353- >>> ftq("'shouldn't")
354- shouldn't <=> 'shouldn'
355- >>> ftq("' cool")
356- cool <=> 'cool'
357- >>> ftq("cool '")
358- cool <=> 'cool'
359- >>> ftq("' cool '")
360- cool <=> 'cool'
361- >>> ftq("'cool'")
362- cool <=> 'cool'
363- >>> ftq("('cool' AND bananas)")
364- (cool&bananas) <=> 'cool' & 'banana'
365-
366-It was also noticed through Bug #39828 that tsearch2 will not cope if the
367-! character is embedded inside or found at the end of a word.
368-
369- >>> ftq('cool!')
370- cool <=> 'cool'
371- >>> ftq('hi!mom')
372- hi&mom <=> 'hi' & 'mom'
373- >>> ftq('hi!!!!!mom')
374- hi&mom <=> 'hi' & 'mom'
375- >>> ftq('hi !mom')
376- hi&!mom <=> 'hi' & !'mom'
377-
378-
379-Bug #44913 - Unicode characters in the wrong place
380-
381- >>> ftq(u'a-a\N{LATIN SMALL LETTER C WITH CEDILLA}')
382- ((a&a\xe7)|aa\xe7) <=> 'a\xe7' | 'aa\xe7'
383-
384- Cut & Paste of 'Smart' quotes
385-
386- >>> ftq(u'a-a\N{RIGHT DOUBLE QUOTATION MARK}')
387- ((a&a)|aa) <=> 'aa'
388-
389- >>> ftq(u'\N{LEFT SINGLE QUOTATION MARK}a.a\N{RIGHT SINGLE QUOTATION MARK}')
390- ((a&a)|aa) <=> 'aa'
391+NB. This gets stemmed, see below.
392+
393+ >>> print search_same("shouldn't")
394+ FTI data: 'shouldn':1 query: 'shouldn' match: True
395+
396+Bug #44913 - Unicode characters in the wrong place.
397+
398+ >>> search_same(u'abc-a\N{LATIN SMALL LETTER C WITH CEDILLA}')
399+ "FTI data: 'abc':2 'abc-a\xc3\xa7':1 'a\xc3\xa7':3
400+ query: 'abc-a\xc3\xa7' & 'abc' & 'a\xc3\xa7'
401+ match: True"
402+
403+Cut & Paste of 'Smart' quotes. Note that the quotation mark is retained
404+in the FTI.
405+
406+ >>> print search_same(u'a-a\N{RIGHT DOUBLE QUOTATION MARK}')
407+ FTI data: 'a-a”':1 'a”':3 query: 'a-a”' & 'a”' match: True
408+
409+ >>> print search_same(
410+ ... u'\N{LEFT SINGLE QUOTATION MARK}a.a'
411+ ... u'\N{RIGHT SINGLE QUOTATION MARK}')
412+ FTI data: 'a’':2 '‘a':1 query: '‘a' & 'a’' match: True
413
414
415 Bug #44913 - Nothing but stopwords in a query needing repair
416
417- >>> ftq('a)a')
418- a&a <=> None
419+ >>> print search_same('a)a')
420+ FTI data: query: None match: None
421
422
423 Stop words (words deemed too common in English to search on) are removed
424 from queries by tsearch2.
425
426- >>> ftq("Don't do it harder!")
427- don't&do&it&harder <=> 'harder'
428+ >>> print search_same("Don't do it harder!")
429+ FTI data: 'harder':5 query: 'harder' match: True
430
431
432 Note that some queries will return None after compilation, because they
433 contained nothing but stop words or punctuation.
434
435- >>> ftq("don't do it!")
436- don't&do&it <=> None
437+ >>> print search_same("don't do it!")
438+ FTI data: query: None match: None
439
440- >>> ftq(",,,")
441- None <=> None
442+ >>> print search_same(",,,")
443+ FTI data: query: None match: None
444
445
446 Queries containing nothing except whitespace, boolean operators and
447 punctuation will just return None.
448
449+Note in the fourth example below that the '-' left in the query by _ftq()
450+is ignored by to_tsquery().
451+
452 >>> ftq(" ")
453 None <=> None
454 >>> ftq("AND")
455@@ -353,7 +419,7 @@
456 >>> ftq(" AND (!)")
457 None <=> None
458 >>> ftq("-")
459- None <=> None
460+ - <=> None
461
462
463 Words are also stemmed by tsearch2 (using the English stemmer).
464@@ -381,7 +447,7 @@
465 (hi|!hello)&mom <=> ( 'hi' | !'hello' ) & 'mom'
466
467 >>> ftq('(hi OR - AND hello) AND mom')
468- (hi|hello)&mom <=> ( 'hi' | 'hello' ) & 'mom'
469+ (hi|-&hello)&mom <=> ( 'hi' | 'hello' ) & 'mom'
470
471 >>> ftq('hi AND mom AND')
472 hi&mom <=> 'hi' & 'mom'
473@@ -393,7 +459,7 @@
474 (hi|hello)&mom <=> ( 'hi' | 'hello' ) & 'mom'
475
476 >>> ftq('() hi mom ( ) ((! |((&)))) :-)')
477- (hi&mom) <=> 'hi' & 'mom'
478+ (hi&mom&-) <=> 'hi' & 'mom'
479
480 >>> ftq("(hi mom")
481 hi&mom <=> 'hi' & 'mom'
482@@ -414,15 +480,15 @@
483 hi&mom <=> 'hi' & 'mom'
484
485 >>> ftq("(foo .") # Bug 43245
486- foo <=> 'foo'
487+ foo&. <=> 'foo'
488
489 >>> ftq("(foo.")
490- foo <=> 'foo'
491+ foo. <=> 'foo'
492
493 Bug #54972
494
495 >>> ftq("a[a\n[a")
496- ((a&a)|aa)&a <=> 'aa'
497+ a[a&[a <=> None
498
499 Bug #96698
500
501@@ -437,10 +503,10 @@
502 Bug #160236
503
504 >>> ftq("foo&&bar-baz")
505- foo&((bar&baz)|barbaz) <=> 'foo' & ( 'bar' & 'baz' | 'barbaz' )
506+ foo&bar-baz <=> 'foo' & 'bar-baz' & 'bar' & 'baz'
507
508 >>> ftq("foo||bar.baz")
509- foo|((bar&baz)|barbaz) <=> 'foo' | ( 'bar' & 'baz' | 'barbaz' )
510+ foo|bar.baz <=> 'foo' | 'bar.baz'
511
512
513 Phrase Searching
514@@ -482,7 +548,8 @@
515
516 >>> runsql(r"""
517 ... SELECT title, max(ranking) FROM (
518- ... SELECT Bug.title,rank(Bug.fti||Message.fti,ftq('firefox')) AS ranking
519+ ... SELECT Bug.title,rank(Bug.fti||Message.fti,ftq('firefox'))
520+ ... AS ranking
521 ... FROM Bug, BugMessage, Message
522 ... WHERE Bug.id = BugMessage.bug AND Message.id = BugMessage.message
523 ... AND (Bug.fti @@ ftq('firefox') OR Message.fti @@ ftq('firefox'))
524@@ -499,7 +566,8 @@
525 ... AND BugTask.product = Product.id
526 ... AND Product.name LIKE lower('%firefox%')
527 ... UNION
528- ... SELECT Bug.title, rank(Product.fti, ftq('firefox')) - 0.3 AS ranking
529+ ... SELECT Bug.title, rank(Product.fti, ftq('firefox')) - 0.3
530+ ... AS ranking
531 ... FROM Bug, BugTask, Product
532 ... WHERE Bug.id = BugTask.bug
533 ... AND BugTask.product = Product.id
534@@ -518,7 +586,8 @@
535 Printing doesn't work 0.70
536
537
538-== Natural Language Phrase Query ==
539+Natural Language Phrase Query
540+-----------------------------
541
542 The standard boolean searches of tsearch2 are fine, but sometime you
543 want more fuzzy searches.
544@@ -557,7 +626,8 @@
545 on Ubuntu) - so we are disabling this and reworking from the ground up.
546
547
548-=== nl_term_candidates() ===
549+nl_term_candidates()
550+~~~~~~~~~~~~~~~~~~~~
551
552 To find the terms in a search phrase that are canditates for the search,
553 we can use the nl_term_candidates() function. This function uses ftq()
554@@ -574,19 +644,16 @@
555 >>> nl_term_candidates('how do I do this?')
556 []
557
558-We also handle expansion of hypenated words (like ftq does):
559-
560- >>> nl_term_candidates('firefox foo-bar give me trouble')
561- [u'firefox', u'foo', u'bar', u'foobar', u'give', u'troubl']
562-
563 Except for the hyphenation character, all non-word caracters are ignored:
564
565 >>> nl_term_candidates(
566- ... "Will the \'\'|\'\' character (inside a ''quoted'' string) work???")
567+ ... "Will the \'\'|\'\' character (inside a ''quoted'' string) "
568+ ... "work???")
569 [u'charact', u'insid', u'quot', u'string', u'work']
570
571
572-=== nl_phrase_search() ===
573+nl_phrase_search()
574+~~~~~~~~~~~~~~~~~~
575
576 To get the actual tsearch2 query that should be run, you will use the
577 nl_phrase_search() function. This one takes two mandatory parameters and
578@@ -637,7 +704,8 @@
579 u'slow|system'
580
581
582-==== Using other constraints ====
583+Using other constraints
584+.......................
585
586 You can pass a third parameter to the function that will be use as
587 an additional constraint to determine the total number of rows that
588@@ -659,7 +727,8 @@
589
590 >>> nl_phrase_search(
591 ... 'firefox gets very slow on flickr', Question,
592- ... "Question.product = %s AND Product.active = 't'" % firefox_product.id,
593+ ... "Question.product = %s AND Product.active = 't'"
594+ ... % firefox_product.id,
595 ... ['Product'], fast_enabled=False)
596 u'slow|flickr'
597
598@@ -679,7 +748,8 @@
599 u'(firefox&flickr&slow)|(flickr&slow)|(firefox&slow)|(firefox&flickr)'
600
601
602-==== No keywords filtering with few rows ====
603+No keywords filtering with few rows
604+...................................
605
606 The 50% rule is really useful only when there are many rows. When there
607 only very few rows, that keyword elimination becomes a problem since
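
As a companion to the stored procedure in patch-2209-24-1.sql above, here is a condensed, standalone Python restatement of the first few normalization steps of _ftq(). It is illustrative only, not the function shipped in the patch; the point is that only ':' and '\' are now stripped as unwanted punctuation, while other characters are left for to_tsquery() to handle.

    # Condensed, illustrative restatement of a few _ftq() steps from the
    # patch above; not the stored procedure itself.
    import re

    def normalize_query(query):
        # Normalize whitespace.
        query = re.sub(r"(?u)\s+", " ", query)
        # Convert AND, OR, NOT and a leading '-' to tsearch2 operators.
        query = re.sub(r"(?u)(?:^|\s)-([\w\(])", r" !\1", query)
        query = re.sub(r"(?u)\bAND\b", "&", query)
        query = re.sub(r"(?u)\bOR\b", "|", query)
        query = re.sub(r"(?u)\bNOT\b", " !", query)
        # Only ':' and '\' are treated as unwanted punctuation; everything
        # else passes through to to_tsquery().
        query = re.sub(r"(?u)[:\\]+", " ", query)
        # Strip '!' inside or at the end of a word.
        query = re.sub(r"(?u)(?<=\w)[\!]+", " ", query)
        return query.lower()

    print(normalize_query("foo:bar AND baz"))  # 'foo bar & baz'
    print(normalize_query("-foo bar"))         # ' !foo bar'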
