Merge lp:~adeuring/launchpad/bug-1020443-db-patch into lp:launchpad

Proposed by Abel Deuring
Status: Merged
Approved by: Stuart Bishop
Approved revision: no longer in the source branch.
Merged at revision: 15730
Proposed branch: lp:~adeuring/launchpad/bug-1020443-db-patch
Merge into: lp:launchpad
Prerequisite: lp:~adeuring/launchpad/bug-1020443-model
Diff against target: 338 lines (+197/-30)
2 files modified
database/schema/patch-2209-24-3.sql (+124/-0)
lib/lp/services/database/doc/textsearching.txt (+73/-30)
To merge this branch: bzr merge lp:~adeuring/launchpad/bug-1020443-db-patch
Reviewer Review Type Date Requested Status
Stuart Bishop (community) Approve
Review via email: mp+117077@code.launchpad.net

Commit message

Treat the characters "&|!" as whitespace in ftq()

Description of the change

This branch changes the stored procedure ftq() so that it no longer treats the characters "&|!" as logical operators but as white space. See also https://code.launchpad.net/~adeuring/launchpad/bug-1020443/+merge/115721 .

To post a comment you must log in.
Revision history for this message
Stuart Bishop (stub) wrote :

This all seems like an improvement.I suspect some tests will now be redundant, but identifying them would be a pain.

review: Approve
Revision history for this message
Abel Deuring (adeuring) wrote :

On 31.07.2012 16:09, Stuart Bishop wrote:
> Review: Approve
>
> This all seems like an improvement.I suspect some tests will now be redundant, but identifying them would be a pain.
>
 I tried to remove all duplicate tests from
lib/lp/services/database/doc/textsearching.txt but of course it is
possible that I missed some redundancies.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== added file 'database/schema/patch-2209-24-3.sql'
2--- database/schema/patch-2209-24-3.sql 1970-01-01 00:00:00 +0000
3+++ database/schema/patch-2209-24-3.sql 2012-07-27 15:05:26 +0000
4@@ -0,0 +1,124 @@
5+-- Copyright 2012 Canonical Ltd. This software is licensed under the
6+-- GNU Affero General Public License version 3 (see the file LICENSE).
7+
8+SET client_min_messages=ERROR;
9+
10+CREATE OR REPLACE FUNCTION _ftq(text) RETURNS text
11+ LANGUAGE plpythonu IMMUTABLE STRICT
12+ AS $_$
13+ import re
14+
15+ # I think this method would be more robust if we used a real
16+ # tokenizer and parser to generate the query string, but we need
17+ # something suitable for use as a stored procedure which currently
18+ # means no external dependancies.
19+
20+ # Convert to Unicode
21+ query = args[0].decode('utf8')
22+ ## plpy.debug('1 query is %s' % repr(query))
23+
24+ # Replace tsquery operators with ' '.
25+ query = re.sub('[|&!]', ' ', query)
26+
27+ # Normalize whitespace
28+ query = re.sub("(?u)\s+"," ", query)
29+
30+ # Convert AND, OR, NOT to tsearch2 punctuation
31+ query = re.sub(r"(?u)\bAND\b", "&", query)
32+ query = re.sub(r"(?u)\bOR\b", "|", query)
33+ query = re.sub(r"(?u)\bNOT\b", " !", query)
34+ ## plpy.debug('2 query is %s' % repr(query))
35+
36+ # Deal with unwanted punctuation.
37+ # ':' is used in queries to specify a weight of a word.
38+ # '\' is treated differently in to_tsvector() and to_tsquery().
39+ punctuation = r'[:\\]'
40+ query = re.sub(r"(?u)%s+" % (punctuation,), " ", query)
41+ ## plpy.debug('3 query is %s' % repr(query))
42+
43+ # Now that we have handle case sensitive booleans, convert to lowercase
44+ query = query.lower()
45+
46+ # Remove unpartnered bracket on the left and right
47+ query = re.sub(r"(?ux) ^ ( [^(]* ) \)", r"(\1)", query)
48+ query = re.sub(r"(?ux) \( ( [^)]* ) $", r"(\1)", query)
49+
50+ # Remove spurious brackets
51+ query = re.sub(r"(?u)\(([^\&\|]*?)\)", r" \1 ", query)
52+ ## plpy.debug('5 query is %s' % repr(query))
53+
54+ # Insert & between tokens without an existing boolean operator
55+ # ( not proceeded by (|&!
56+ query = re.sub(r"(?u)(?<![\(\|\&\!])\s*\(", "&(", query)
57+ ## plpy.debug('6 query is %s' % repr(query))
58+ # ) not followed by )|&
59+ query = re.sub(r"(?u)\)(?!\s*(\)|\||\&|\s*$))", ")&", query)
60+ ## plpy.debug('6.1 query is %s' % repr(query))
61+ # Whitespace not proceded by (|&! not followed by &|
62+ query = re.sub(r"(?u)(?<![\(\|\&\!\s])\s+(?![\&\|\s])", "&", query)
63+ ## plpy.debug('7 query is %s' % repr(query))
64+
65+ # Detect and repair syntax errors - we are lenient because
66+ # this input is generally from users.
67+
68+ # Fix unbalanced brackets
69+ openings = query.count("(")
70+ closings = query.count(")")
71+ if openings > closings:
72+ query = query + " ) "*(openings-closings)
73+ elif closings > openings:
74+ query = " ( "*(closings-openings) + query
75+ ## plpy.debug('8 query is %s' % repr(query))
76+
77+ # Strip ' character that do not have letters on both sides
78+ query = re.sub(r"(?u)((?<!\w)'|'(?!\w))", "", query)
79+
80+ # Brackets containing nothing but whitespace and booleans, recursive
81+ last = ""
82+ while last != query:
83+ last = query
84+ query = re.sub(r"(?u)\([\s\&\|\!]*\)", "", query)
85+ ## plpy.debug('9 query is %s' % repr(query))
86+
87+ # An & or | following a (
88+ query = re.sub(r"(?u)(?<=\()[\&\|\s]+", "", query)
89+ ## plpy.debug('10 query is %s' % repr(query))
90+
91+ # An &, | or ! immediatly before a )
92+ query = re.sub(r"(?u)[\&\|\!\s]*[\&\|\!]+\s*(?=\))", "", query)
93+ ## plpy.debug('11 query is %s' % repr(query))
94+
95+ # An &,| or ! followed by another boolean.
96+ query = re.sub(r"(?ux) \s* ( [\&\|\!] ) [\s\&\|]+", r"\1", query)
97+ ## plpy.debug('12 query is %s' % repr(query))
98+
99+ # Leading & or |
100+ query = re.sub(r"(?u)^[\s\&\|]+", "", query)
101+ ## plpy.debug('13 query is %s' % repr(query))
102+
103+ # Trailing &, | or !
104+ query = re.sub(r"(?u)[\&\|\!\s]+$", "", query)
105+ ## plpy.debug('14 query is %s' % repr(query))
106+
107+ # If we have nothing but whitespace and tsearch2 operators,
108+ # return NULL.
109+ if re.search(r"(?u)^[\&\|\!\s\(\)]*$", query) is not None:
110+ return None
111+
112+ # Convert back to UTF-8
113+ query = query.encode('utf8')
114+ ## plpy.debug('15 query is %s' % repr(query))
115+
116+ return query or None
117+ $_$;
118+
119+CREATE OR REPLACE FUNCTION ftq(text) RETURNS pg_catalog.tsquery
120+ LANGUAGE plpythonu IMMUTABLE STRICT
121+ AS $_$
122+ p = plpy.prepare(
123+ "SELECT to_tsquery('default', _ftq($1)) AS x", ["text"])
124+ query = plpy.execute(p, args, 1)[0]["x"]
125+ return query or None
126+ $_$;
127+
128+INSERT INTO LaunchpadDatabaseRevision VALUES (2209, 24, 3);
129
130=== modified file 'lib/lp/services/database/doc/textsearching.txt'
131--- lib/lp/services/database/doc/textsearching.txt 2012-07-27 00:25:42 +0000
132+++ lib/lp/services/database/doc/textsearching.txt 2012-07-27 15:05:26 +0000
133@@ -172,23 +172,16 @@
134 >>> ftq('hi AND mom')
135 hi&mom <=> 'hi' & 'mom'
136
137- >>> ftq('hi & mom')
138- hi&mom <=> 'hi' & 'mom'
139-
140 >>> ftq('hi OR mom')
141 hi|mom <=> 'hi' | 'mom'
142
143- >>> ftq('hi | mom')
144- hi|mom <=> 'hi' | 'mom'
145-
146- >>> ftq('hi & -dad')
147+ >>> ftq('hi AND NOT dad')
148 hi&!dad <=> 'hi' & !'dad'
149
150
151-
152 Brackets are allowed to specify precidence
153
154- >>> ftq('(HI OR HELLO) & mom')
155+ >>> ftq('(HI OR HELLO) AND mom')
156 (hi|hello)&mom <=> ( 'hi' | 'hello' ) & 'mom'
157
158 >>> ftq('Hi(Mom)')
159@@ -203,19 +196,16 @@
160 >>> ftq('foo(bar OR baz)') # Bug #32071
161 foo&(bar|baz) <=> 'foo' & ( 'bar' | 'baz' )
162
163- >>> ftq('foo (bar OR baz)')
164- foo&(bar|baz) <=> 'foo' & ( 'bar' | 'baz' )
165-
166
167 We also support negation
168
169- >>> ftq('!Hi')
170+ >>> ftq('NOT Hi')
171 !hi <=> !'hi'
172
173- >>> ftq('-(Hi & Mom)')
174+ >>> ftq('NOT(Hi AND Mom)')
175 !(hi&mom) <=> !( 'hi' & 'mom' )
176
177- >>> ftq('Foo & ! Bar')
178+ >>> ftq('Foo AND NOT Bar')
179 foo&!bar <=> 'foo' & !'bar'
180
181
182@@ -224,7 +214,7 @@
183 >>> ftq('Hi Mom')
184 hi&mom <=> 'hi' & 'mom'
185
186- >>> ftq('Hi -mom')
187+ >>> ftq('Hi NOT mom')
188 hi&!mom <=> 'hi' & !'mom'
189
190 >>> ftq('hi (mom OR mum)')
191@@ -233,18 +223,34 @@
192 >>> ftq('(hi OR hello) mom')
193 (hi|hello)&mom <=> ( 'hi' | 'hello' ) & 'mom'
194
195- >>> ftq('(hi OR hello) -mom')
196+ >>> ftq('(hi OR hello) NOT mom')
197 (hi|hello)&!mom <=> ( 'hi' | 'hello' ) & !'mom'
198
199 >>> ftq('(hi ho OR hoe) work go')
200 (hi&ho|hoe)&work&go <=> ( 'hi' & 'ho' | 'hoe' ) & 'work' & 'go'
201
202
203-If a single '-' precedes a word, it is converted into the '!' operator.
204-Note also that a trailing '-' is dropped by to_tsquery().
205-
206- >>> ftq('-foo bar-')
207- !foo&bar- <=> !'foo' & 'bar'
208+'-' symbols are treated by the Postgres FTI parser context sensitive.
209+If they precede a word, they are removed.
210+
211+ >>> print search_same('foo -bar')
212+ FTI data: 'bar':2 'foo':1
213+ query: 'foo' & 'bar'
214+ match: True
215+
216+If a '-' precedes a number, it is retained.
217+
218+ >>> print search_same('123 -456')
219+ FTI data: '-456':2 '123':1
220+ query: '123' & '-456'
221+ match: True
222+
223+Trailing '-' are always ignored.
224+
225+ >>> print search_same('bar- 123-')
226+ FTI data: '123':2 'bar':1
227+ query: 'bar' & '123'
228+ match: True
229
230 Repeated '-' are simply ignored by to_tsquery().
231
232@@ -259,6 +265,12 @@
233 query: 'foo-bar' & 'foo' & 'bar'
234 match: True
235
236+A '-' surrounded by numbers is treated as the sign of the right-hand number.
237+
238+ >>> print search_same('123-456')
239+ FTI data: '-456':2 '123':1
240+ query: '123' & '-456'
241+ match: True
242
243 Punctuation is handled consistently. If a string containing punctuation
244 appears in an FTI, it can also be passed to ftq(),and a search for this
245@@ -342,11 +354,36 @@
246 >>> print search('some text <div>whatever</div>', 'div')
247 FTI data: 'text':2 'whatev':3 query: 'div' match: False
248
249-Treatment of characters that are used as operators in to_tsquery():
250+The symbols '&', '|' and '!' are treated as operators by to_tsquery();
251+to_tsvector() treats them as whitespace. ftq() converts the words 'AND',
252+'OR', 'NOT' are into these operators expected by to_tsquery(), and it
253+replaces the symbols '&', '|' and '!' with spaces. This avoids
254+surprising search results when the operator symbols appear accidentally
255+in search terms, e.g., by using a plain copy of a source code line as
256+the search term.
257
258 >>> ftq('cool!')
259 cool <=> 'cool'
260
261+ >>> print search_same('Shell scripts usually start with #!/bin/sh.')
262+ FTI data: '/bin/sh':6 'script':2 'shell':1 'start':4 'usual':3
263+ query: 'shell' & 'script' & 'usual' & 'start' & '/bin/sh'
264+ match: True
265+
266+ >>> print search_same('int foo = (bar & ! baz) | bla;')
267+ FTI data: 'bar':3 'baz':4 'bla':5 'foo':2 'int':1
268+ query: 'int' & 'foo' & 'bar' & 'baz' & 'bla'
269+ match: True
270+
271+Queries containing only punctuation symbols yield an empty ts_query
272+object. Note that _ftq() first replaces the '!' with a ' '; later on,
273+_ftq() joins the two remaining terms '?' and '.' with the "AND"
274+operator '&'. Finally, to_tsquery() detects the AND combination of
275+two symbols that are not tokenized and returns null.
276+
277+ >>> ftq('?!.') # Bug 1020443
278+ ?&. <=> None
279+
280 Email addresses are retained as a whole, both by to_tsvector() and by
281 ftq().
282
283@@ -430,11 +467,17 @@
284 >>> ftq("administrate")
285 administrate <=> 'administr'
286
287+Note that stemming is not always idempotent:
288+
289+ >>> ftq('extension')
290+ extension <=> 'extens'
291+ >>> ftq('extens')
292+ extens <=> 'exten'
293
294 Dud queries are 'repaired', such as doubled operators, trailing operators
295 or invalid leading operators
296
297- >>> ftq('hi & OR mom')
298+ >>> ftq('hi AND OR mom')
299 hi&mom <=> 'hi' & 'mom'
300
301 >>> ftq('(hi OR OR hello) AND mom')
302@@ -443,7 +486,7 @@
303 >>> ftq('(hi OR AND hello) AND mom')
304 (hi|hello)&mom <=> ( 'hi' | 'hello' ) & 'mom'
305
306- >>> ftq('(hi OR -AND hello) AND mom')
307+ >>> ftq('(hi OR NOT AND hello) AND mom')
308 (hi|!hello)&mom <=> ( 'hi' | !'hello' ) & 'mom'
309
310 >>> ftq('(hi OR - AND hello) AND mom')
311@@ -452,13 +495,13 @@
312 >>> ftq('hi AND mom AND')
313 hi&mom <=> 'hi' & 'mom'
314
315- >>> ftq('& hi & mom')
316+ >>> ftq('AND hi AND mom')
317 hi&mom <=> 'hi' & 'mom'
318
319- >>> ftq('(& hi | hello) AND mom')
320+ >>> ftq('(AND hi OR hello) AND mom')
321 (hi|hello)&mom <=> ( 'hi' | 'hello' ) & 'mom'
322
323- >>> ftq('() hi mom ( ) ((! |((&)))) :-)')
324+ >>> ftq('() hi mom ( ) ((NOT OR((AND)))) :-)')
325 (hi&mom&-) <=> 'hi' & 'mom'
326
327 >>> ftq("(hi mom")
328@@ -502,10 +545,10 @@
329
330 Bug #160236
331
332- >>> ftq("foo&&bar-baz")
333+ >>> ftq("foo AND AND bar-baz")
334 foo&bar-baz <=> 'foo' & 'bar-baz' & 'bar' & 'baz'
335
336- >>> ftq("foo||bar.baz")
337+ >>> ftq("foo OR OR bar.baz")
338 foo|bar.baz <=> 'foo' | 'bar.baz'
339
340