Merge lp:~adeuring/launchpad/bug-1020443-db-patch into lp:launchpad

Proposed by Abel Deuring
Status: Merged
Approved by: Stuart Bishop
Approved revision: no longer in the source branch.
Merged at revision: 15730
Proposed branch: lp:~adeuring/launchpad/bug-1020443-db-patch
Merge into: lp:launchpad
Prerequisite: lp:~adeuring/launchpad/bug-1020443-model
Diff against target: 338 lines (+197/-30)
2 files modified
database/schema/patch-2209-24-3.sql (+124/-0)
lib/lp/services/database/doc/textsearching.txt (+73/-30)
To merge this branch: bzr merge lp:~adeuring/launchpad/bug-1020443-db-patch
Reviewer Review Type Date Requested Status
Stuart Bishop (community) Approve
Review via email: mp+117077@code.launchpad.net

Commit message

Treat the characters "&|!" as whitespace in ftq()

Description of the change

This branch changes the stored procedure ftq() so that it no longer treats the characters "&|!" as logical operators but as white space. See also https://code.launchpad.net/~adeuring/launchpad/bug-1020443/+merge/115721 .

To post a comment you must log in.
Revision history for this message
Stuart Bishop (stub) wrote :

This all seems like an improvement.I suspect some tests will now be redundant, but identifying them would be a pain.

review: Approve
Revision history for this message
Abel Deuring (adeuring) wrote :

On 31.07.2012 16:09, Stuart Bishop wrote:
> Review: Approve
>
> This all seems like an improvement.I suspect some tests will now be redundant, but identifying them would be a pain.
>
 I tried to remove all duplicate tests from
lib/lp/services/database/doc/textsearching.txt but of course it is
possible that I missed some redundancies.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== added file 'database/schema/patch-2209-24-3.sql'
--- database/schema/patch-2209-24-3.sql 1970-01-01 00:00:00 +0000
+++ database/schema/patch-2209-24-3.sql 2012-07-27 15:05:26 +0000
@@ -0,0 +1,124 @@
1-- Copyright 2012 Canonical Ltd. This software is licensed under the
2-- GNU Affero General Public License version 3 (see the file LICENSE).
3
4SET client_min_messages=ERROR;
5
6CREATE OR REPLACE FUNCTION _ftq(text) RETURNS text
7 LANGUAGE plpythonu IMMUTABLE STRICT
8 AS $_$
9 import re
10
11 # I think this method would be more robust if we used a real
12 # tokenizer and parser to generate the query string, but we need
13 # something suitable for use as a stored procedure which currently
14 # means no external dependancies.
15
16 # Convert to Unicode
17 query = args[0].decode('utf8')
18 ## plpy.debug('1 query is %s' % repr(query))
19
20 # Replace tsquery operators with ' '.
21 query = re.sub('[|&!]', ' ', query)
22
23 # Normalize whitespace
24 query = re.sub("(?u)\s+"," ", query)
25
26 # Convert AND, OR, NOT to tsearch2 punctuation
27 query = re.sub(r"(?u)\bAND\b", "&", query)
28 query = re.sub(r"(?u)\bOR\b", "|", query)
29 query = re.sub(r"(?u)\bNOT\b", " !", query)
30 ## plpy.debug('2 query is %s' % repr(query))
31
32 # Deal with unwanted punctuation.
33 # ':' is used in queries to specify a weight of a word.
34 # '\' is treated differently in to_tsvector() and to_tsquery().
35 punctuation = r'[:\\]'
36 query = re.sub(r"(?u)%s+" % (punctuation,), " ", query)
37 ## plpy.debug('3 query is %s' % repr(query))
38
39 # Now that we have handle case sensitive booleans, convert to lowercase
40 query = query.lower()
41
42 # Remove unpartnered bracket on the left and right
43 query = re.sub(r"(?ux) ^ ( [^(]* ) \)", r"(\1)", query)
44 query = re.sub(r"(?ux) \( ( [^)]* ) $", r"(\1)", query)
45
46 # Remove spurious brackets
47 query = re.sub(r"(?u)\(([^\&\|]*?)\)", r" \1 ", query)
48 ## plpy.debug('5 query is %s' % repr(query))
49
50 # Insert & between tokens without an existing boolean operator
51 # ( not proceeded by (|&!
52 query = re.sub(r"(?u)(?<![\(\|\&\!])\s*\(", "&(", query)
53 ## plpy.debug('6 query is %s' % repr(query))
54 # ) not followed by )|&
55 query = re.sub(r"(?u)\)(?!\s*(\)|\||\&|\s*$))", ")&", query)
56 ## plpy.debug('6.1 query is %s' % repr(query))
57 # Whitespace not proceded by (|&! not followed by &|
58 query = re.sub(r"(?u)(?<![\(\|\&\!\s])\s+(?![\&\|\s])", "&", query)
59 ## plpy.debug('7 query is %s' % repr(query))
60
61 # Detect and repair syntax errors - we are lenient because
62 # this input is generally from users.
63
64 # Fix unbalanced brackets
65 openings = query.count("(")
66 closings = query.count(")")
67 if openings > closings:
68 query = query + " ) "*(openings-closings)
69 elif closings > openings:
70 query = " ( "*(closings-openings) + query
71 ## plpy.debug('8 query is %s' % repr(query))
72
73 # Strip ' character that do not have letters on both sides
74 query = re.sub(r"(?u)((?<!\w)'|'(?!\w))", "", query)
75
76 # Brackets containing nothing but whitespace and booleans, recursive
77 last = ""
78 while last != query:
79 last = query
80 query = re.sub(r"(?u)\([\s\&\|\!]*\)", "", query)
81 ## plpy.debug('9 query is %s' % repr(query))
82
83 # An & or | following a (
84 query = re.sub(r"(?u)(?<=\()[\&\|\s]+", "", query)
85 ## plpy.debug('10 query is %s' % repr(query))
86
87 # An &, | or ! immediatly before a )
88 query = re.sub(r"(?u)[\&\|\!\s]*[\&\|\!]+\s*(?=\))", "", query)
89 ## plpy.debug('11 query is %s' % repr(query))
90
91 # An &,| or ! followed by another boolean.
92 query = re.sub(r"(?ux) \s* ( [\&\|\!] ) [\s\&\|]+", r"\1", query)
93 ## plpy.debug('12 query is %s' % repr(query))
94
95 # Leading & or |
96 query = re.sub(r"(?u)^[\s\&\|]+", "", query)
97 ## plpy.debug('13 query is %s' % repr(query))
98
99 # Trailing &, | or !
100 query = re.sub(r"(?u)[\&\|\!\s]+$", "", query)
101 ## plpy.debug('14 query is %s' % repr(query))
102
103 # If we have nothing but whitespace and tsearch2 operators,
104 # return NULL.
105 if re.search(r"(?u)^[\&\|\!\s\(\)]*$", query) is not None:
106 return None
107
108 # Convert back to UTF-8
109 query = query.encode('utf8')
110 ## plpy.debug('15 query is %s' % repr(query))
111
112 return query or None
113 $_$;
114
115CREATE OR REPLACE FUNCTION ftq(text) RETURNS pg_catalog.tsquery
116 LANGUAGE plpythonu IMMUTABLE STRICT
117 AS $_$
118 p = plpy.prepare(
119 "SELECT to_tsquery('default', _ftq($1)) AS x", ["text"])
120 query = plpy.execute(p, args, 1)[0]["x"]
121 return query or None
122 $_$;
123
124INSERT INTO LaunchpadDatabaseRevision VALUES (2209, 24, 3);
0125
=== modified file 'lib/lp/services/database/doc/textsearching.txt'
--- lib/lp/services/database/doc/textsearching.txt 2012-07-27 00:25:42 +0000
+++ lib/lp/services/database/doc/textsearching.txt 2012-07-27 15:05:26 +0000
@@ -172,23 +172,16 @@
172 >>> ftq('hi AND mom')172 >>> ftq('hi AND mom')
173 hi&mom <=> 'hi' & 'mom'173 hi&mom <=> 'hi' & 'mom'
174174
175 >>> ftq('hi & mom')
176 hi&mom <=> 'hi' & 'mom'
177
178 >>> ftq('hi OR mom')175 >>> ftq('hi OR mom')
179 hi|mom <=> 'hi' | 'mom'176 hi|mom <=> 'hi' | 'mom'
180177
181 >>> ftq('hi | mom')178 >>> ftq('hi AND NOT dad')
182 hi|mom <=> 'hi' | 'mom'
183
184 >>> ftq('hi & -dad')
185 hi&!dad <=> 'hi' & !'dad'179 hi&!dad <=> 'hi' & !'dad'
186180
187181
188
189Brackets are allowed to specify precidence182Brackets are allowed to specify precidence
190183
191 >>> ftq('(HI OR HELLO) & mom')184 >>> ftq('(HI OR HELLO) AND mom')
192 (hi|hello)&mom <=> ( 'hi' | 'hello' ) & 'mom'185 (hi|hello)&mom <=> ( 'hi' | 'hello' ) & 'mom'
193186
194 >>> ftq('Hi(Mom)')187 >>> ftq('Hi(Mom)')
@@ -203,19 +196,16 @@
203 >>> ftq('foo(bar OR baz)') # Bug #32071196 >>> ftq('foo(bar OR baz)') # Bug #32071
204 foo&(bar|baz) <=> 'foo' & ( 'bar' | 'baz' )197 foo&(bar|baz) <=> 'foo' & ( 'bar' | 'baz' )
205198
206 >>> ftq('foo (bar OR baz)')
207 foo&(bar|baz) <=> 'foo' & ( 'bar' | 'baz' )
208
209199
210We also support negation200We also support negation
211201
212 >>> ftq('!Hi')202 >>> ftq('NOT Hi')
213 !hi <=> !'hi'203 !hi <=> !'hi'
214204
215 >>> ftq('-(Hi & Mom)')205 >>> ftq('NOT(Hi AND Mom)')
216 !(hi&mom) <=> !( 'hi' & 'mom' )206 !(hi&mom) <=> !( 'hi' & 'mom' )
217207
218 >>> ftq('Foo & ! Bar')208 >>> ftq('Foo AND NOT Bar')
219 foo&!bar <=> 'foo' & !'bar'209 foo&!bar <=> 'foo' & !'bar'
220210
221211
@@ -224,7 +214,7 @@
224 >>> ftq('Hi Mom')214 >>> ftq('Hi Mom')
225 hi&mom <=> 'hi' & 'mom'215 hi&mom <=> 'hi' & 'mom'
226216
227 >>> ftq('Hi -mom')217 >>> ftq('Hi NOT mom')
228 hi&!mom <=> 'hi' & !'mom'218 hi&!mom <=> 'hi' & !'mom'
229219
230 >>> ftq('hi (mom OR mum)')220 >>> ftq('hi (mom OR mum)')
@@ -233,18 +223,34 @@
233 >>> ftq('(hi OR hello) mom')223 >>> ftq('(hi OR hello) mom')
234 (hi|hello)&mom <=> ( 'hi' | 'hello' ) & 'mom'224 (hi|hello)&mom <=> ( 'hi' | 'hello' ) & 'mom'
235225
236 >>> ftq('(hi OR hello) -mom')226 >>> ftq('(hi OR hello) NOT mom')
237 (hi|hello)&!mom <=> ( 'hi' | 'hello' ) & !'mom'227 (hi|hello)&!mom <=> ( 'hi' | 'hello' ) & !'mom'
238228
239 >>> ftq('(hi ho OR hoe) work go')229 >>> ftq('(hi ho OR hoe) work go')
240 (hi&ho|hoe)&work&go <=> ( 'hi' & 'ho' | 'hoe' ) & 'work' & 'go'230 (hi&ho|hoe)&work&go <=> ( 'hi' & 'ho' | 'hoe' ) & 'work' & 'go'
241231
242232
243If a single '-' precedes a word, it is converted into the '!' operator.233'-' symbols are treated by the Postgres FTI parser context sensitive.
244Note also that a trailing '-' is dropped by to_tsquery().234If they precede a word, they are removed.
245235
246 >>> ftq('-foo bar-')236 >>> print search_same('foo -bar')
247 !foo&bar- <=> !'foo' & 'bar'237 FTI data: 'bar':2 'foo':1
238 query: 'foo' & 'bar'
239 match: True
240
241If a '-' precedes a number, it is retained.
242
243 >>> print search_same('123 -456')
244 FTI data: '-456':2 '123':1
245 query: '123' & '-456'
246 match: True
247
248Trailing '-' are always ignored.
249
250 >>> print search_same('bar- 123-')
251 FTI data: '123':2 'bar':1
252 query: 'bar' & '123'
253 match: True
248254
249Repeated '-' are simply ignored by to_tsquery().255Repeated '-' are simply ignored by to_tsquery().
250256
@@ -259,6 +265,12 @@
259 query: 'foo-bar' & 'foo' & 'bar'265 query: 'foo-bar' & 'foo' & 'bar'
260 match: True266 match: True
261267
268A '-' surrounded by numbers is treated as the sign of the right-hand number.
269
270 >>> print search_same('123-456')
271 FTI data: '-456':2 '123':1
272 query: '123' & '-456'
273 match: True
262274
263Punctuation is handled consistently. If a string containing punctuation275Punctuation is handled consistently. If a string containing punctuation
264appears in an FTI, it can also be passed to ftq(),and a search for this276appears in an FTI, it can also be passed to ftq(),and a search for this
@@ -342,11 +354,36 @@
342 >>> print search('some text <div>whatever</div>', 'div')354 >>> print search('some text <div>whatever</div>', 'div')
343 FTI data: 'text':2 'whatev':3 query: 'div' match: False355 FTI data: 'text':2 'whatev':3 query: 'div' match: False
344356
345Treatment of characters that are used as operators in to_tsquery():357The symbols '&', '|' and '!' are treated as operators by to_tsquery();
358to_tsvector() treats them as whitespace. ftq() converts the words 'AND',
359'OR', 'NOT' are into these operators expected by to_tsquery(), and it
360replaces the symbols '&', '|' and '!' with spaces. This avoids
361surprising search results when the operator symbols appear accidentally
362in search terms, e.g., by using a plain copy of a source code line as
363the search term.
346364
347 >>> ftq('cool!')365 >>> ftq('cool!')
348 cool <=> 'cool'366 cool <=> 'cool'
349367
368 >>> print search_same('Shell scripts usually start with #!/bin/sh.')
369 FTI data: '/bin/sh':6 'script':2 'shell':1 'start':4 'usual':3
370 query: 'shell' & 'script' & 'usual' & 'start' & '/bin/sh'
371 match: True
372
373 >>> print search_same('int foo = (bar & ! baz) | bla;')
374 FTI data: 'bar':3 'baz':4 'bla':5 'foo':2 'int':1
375 query: 'int' & 'foo' & 'bar' & 'baz' & 'bla'
376 match: True
377
378Queries containing only punctuation symbols yield an empty ts_query
379object. Note that _ftq() first replaces the '!' with a ' '; later on,
380_ftq() joins the two remaining terms '?' and '.' with the "AND"
381operator '&'. Finally, to_tsquery() detects the AND combination of
382two symbols that are not tokenized and returns null.
383
384 >>> ftq('?!.') # Bug 1020443
385 ?&. <=> None
386
350Email addresses are retained as a whole, both by to_tsvector() and by387Email addresses are retained as a whole, both by to_tsvector() and by
351ftq().388ftq().
352389
@@ -430,11 +467,17 @@
430 >>> ftq("administrate")467 >>> ftq("administrate")
431 administrate <=> 'administr'468 administrate <=> 'administr'
432469
470Note that stemming is not always idempotent:
471
472 >>> ftq('extension')
473 extension <=> 'extens'
474 >>> ftq('extens')
475 extens <=> 'exten'
433476
434Dud queries are 'repaired', such as doubled operators, trailing operators477Dud queries are 'repaired', such as doubled operators, trailing operators
435or invalid leading operators478or invalid leading operators
436479
437 >>> ftq('hi & OR mom')480 >>> ftq('hi AND OR mom')
438 hi&mom <=> 'hi' & 'mom'481 hi&mom <=> 'hi' & 'mom'
439482
440 >>> ftq('(hi OR OR hello) AND mom')483 >>> ftq('(hi OR OR hello) AND mom')
@@ -443,7 +486,7 @@
443 >>> ftq('(hi OR AND hello) AND mom')486 >>> ftq('(hi OR AND hello) AND mom')
444 (hi|hello)&mom <=> ( 'hi' | 'hello' ) & 'mom'487 (hi|hello)&mom <=> ( 'hi' | 'hello' ) & 'mom'
445488
446 >>> ftq('(hi OR -AND hello) AND mom')489 >>> ftq('(hi OR NOT AND hello) AND mom')
447 (hi|!hello)&mom <=> ( 'hi' | !'hello' ) & 'mom'490 (hi|!hello)&mom <=> ( 'hi' | !'hello' ) & 'mom'
448491
449 >>> ftq('(hi OR - AND hello) AND mom')492 >>> ftq('(hi OR - AND hello) AND mom')
@@ -452,13 +495,13 @@
452 >>> ftq('hi AND mom AND')495 >>> ftq('hi AND mom AND')
453 hi&mom <=> 'hi' & 'mom'496 hi&mom <=> 'hi' & 'mom'
454497
455 >>> ftq('& hi & mom')498 >>> ftq('AND hi AND mom')
456 hi&mom <=> 'hi' & 'mom'499 hi&mom <=> 'hi' & 'mom'
457500
458 >>> ftq('(& hi | hello) AND mom')501 >>> ftq('(AND hi OR hello) AND mom')
459 (hi|hello)&mom <=> ( 'hi' | 'hello' ) & 'mom'502 (hi|hello)&mom <=> ( 'hi' | 'hello' ) & 'mom'
460503
461 >>> ftq('() hi mom ( ) ((! |((&)))) :-)')504 >>> ftq('() hi mom ( ) ((NOT OR((AND)))) :-)')
462 (hi&mom&-) <=> 'hi' & 'mom'505 (hi&mom&-) <=> 'hi' & 'mom'
463506
464 >>> ftq("(hi mom")507 >>> ftq("(hi mom")
@@ -502,10 +545,10 @@
502545
503 Bug #160236546 Bug #160236
504547
505 >>> ftq("foo&&bar-baz")548 >>> ftq("foo AND AND bar-baz")
506 foo&bar-baz <=> 'foo' & 'bar-baz' & 'bar' & 'baz'549 foo&bar-baz <=> 'foo' & 'bar-baz' & 'bar' & 'baz'
507550
508 >>> ftq("foo||bar.baz")551 >>> ftq("foo OR OR bar.baz")
509 foo|bar.baz <=> 'foo' | 'bar.baz'552 foo|bar.baz <=> 'foo' | 'bar.baz'
510553
511554