1
=== added file 'database/schema/patch-2209-24-1.sql'
2
--- database/schema/patch-2209-24-1.sql	1970-01-01 00:00:00 +0000
3
+++ database/schema/patch-2209-24-1.sql	2012-06-26 10:15:26 +0000
4
@@ -0,0 +1,125 @@
5
1
-- Copyright 2012 Canonical Ltd.  This software is licensed under the
6
2
-- GNU Affero General Public License version 3 (see the file LICENSE).
7
3
8
4
SET client_min_messages=ERROR;
9
5
10
6
CREATE OR REPLACE FUNCTION _ftq(text) RETURNS text
11
7
    LANGUAGE plpythonu IMMUTABLE STRICT
12
8
    AS $_$
13
9
        import re
14
10
15
11
        # I think this method would be more robust if we used a real
16
12
        # tokenizer and parser to generate the query string, but we need
17
13
        # something suitable for use as a stored procedure which currently
18
14
        # means no external dependancies.
19
15
20
16
        # Convert to Unicode
21
17
        query = args[0].decode('utf8')
22
18
        ## plpy.debug('1 query is %s' % repr(query))
23
19
24
20
        # Normalize whitespace
25
21
        query = re.sub("(?u)\s+"," ", query)
26
22
27
23
        # Convert AND, OR, NOT and - to tsearch2 punctuation
28
24
        query = re.sub(r"(?u)(?:^|\s)-([\w\(])", r" !\1", query)
29
25
        query = re.sub(r"(?u)\bAND\b", "&", query)
30
26
        query = re.sub(r"(?u)\bOR\b", "|", query)
31
27
        query = re.sub(r"(?u)\bNOT\b", " !", query)
32
28
        ## plpy.debug('2 query is %s' % repr(query))
33
29
34
30
        # Deal with unwanted punctuation.
35
31
        # ':' is used in queries to specify a weight of a word.
36
32
        # '\' is treated differently in to_tsvector() and to_tsquery().
37
33
        punctuation = r'[:\\]'
38
34
        query = re.sub(r"(?u)%s+" % (punctuation,), " ", query)
39
35
        ## plpy.debug('3 query is %s' % repr(query))
40
36
41
37
        # Strip ! characters inside and at the end of a word
42
38
        query = re.sub(r"(?u)(?<=\w)[\!]+", " ", query)
43
39
44
40
        # Now that we have handle case sensitive booleans, convert to lowercase
45
41
        query = query.lower()
46
42
47
43
        # Remove unpartnered bracket on the left and right
48
44
        query = re.sub(r"(?ux) ^ ( [^(]* ) \)", r"(\1)", query)
49
45
        query = re.sub(r"(?ux) \( ( [^)]* ) $", r"(\1)", query)
50
46
51
47
        # Remove spurious brackets
52
48
        query = re.sub(r"(?u)\(([^\&\|]*?)\)", r" \1 ", query)
53
49
        ## plpy.debug('5 query is %s' % repr(query))
54
50
55
51
        # Insert & between tokens without an existing boolean operator
56
52
        # ( not proceeded by (|&!
57
53
        query = re.sub(r"(?u)(?<![\(\|\&\!])\s*\(", "&(", query)
58
54
        ## plpy.debug('6 query is %s' % repr(query))
59
55
        # ) not followed by )|&
60
56
        query = re.sub(r"(?u)\)(?!\s*(\)|\||\&|\s*$))", ")&", query)
61
57
        ## plpy.debug('6.1 query is %s' % repr(query))
62
58
        # Whitespace not proceded by (|&! not followed by &|
63
59
        query = re.sub(r"(?u)(?<![\(\|\&\!\s])\s+(?![\&\|\s])", "&", query)
64
60
        ## plpy.debug('7 query is %s' % repr(query))
65
61
66
62
        # Detect and repair syntax errors - we are lenient because
67
63
        # this input is generally from users.
68
64
69
65
        # Fix unbalanced brackets
70
66
        openings = query.count("(")
71
67
        closings = query.count(")")
72
68
        if openings > closings:
73
69
            query = query + " ) "*(openings-closings)
74
70
        elif closings > openings:
75
71
            query = " ( "*(closings-openings) + query
76
72
        ## plpy.debug('8 query is %s' % repr(query))
77
73
78
74
        # Strip ' character that do not have letters on both sides
79
75
        query = re.sub(r"(?u)((?<!\w)'|'(?!\w))", "", query)
80
76
81
77
        # Brackets containing nothing but whitespace and booleans, recursive
82
78
        last = ""
83
79
        while last != query:
84
80
            last = query
85
81
            query = re.sub(r"(?u)\([\s\&\|\!]*\)", "", query)
86
82
        ## plpy.debug('9 query is %s' % repr(query))
87
83
88
84
        # An & or | following a (
89
85
        query = re.sub(r"(?u)(?<=\()[\&\|\s]+", "", query)
90
86
        ## plpy.debug('10 query is %s' % repr(query))
91
87
92
88
        # An &, | or ! immediatly before a )
93
89
        query = re.sub(r"(?u)[\&\|\!\s]*[\&\|\!]+\s*(?=\))", "", query)
94
90
        ## plpy.debug('11 query is %s' % repr(query))
95
91
96
92
        # An &,| or ! followed by another boolean.
97
93
        query = re.sub(r"(?ux) \s* ( [\&\|\!] ) [\s\&\|]+", r"\1", query)
98
94
        ## plpy.debug('12 query is %s' % repr(query))
99
95
100
96
        # Leading & or |
101
97
        query = re.sub(r"(?u)^[\s\&\|]+", "", query)
102
98
        ## plpy.debug('13 query is %s' % repr(query))
103
99
104
100
        # Trailing &, | or !
105
101
        query = re.sub(r"(?u)[\&\|\!\s]+$", "", query)
106
102
        ## plpy.debug('14 query is %s' % repr(query))
107
103
108
104
        # If we have nothing but whitespace and tsearch2 operators,
109
105
        # return NULL.
110
106
        if re.search(r"(?u)^[\&\|\!\s\(\)]*$", query) is not None:
111
107
            return None
112
108
113
109
        # Convert back to UTF-8
114
110
        query = query.encode('utf8')
115
111
        ## plpy.debug('15 query is %s' % repr(query))
116
112
117
113
        return query or None
118
114
        $_$;
119
115
120
116
CREATE OR REPLACE FUNCTION ftq(text) RETURNS pg_catalog.tsquery
121
117
    LANGUAGE plpythonu IMMUTABLE STRICT
122
118
    AS $_$
123
119
        p = plpy.prepare(
124
120
            "SELECT to_tsquery('default', _ftq($1)) AS x", ["text"])
125
121
        query = plpy.execute(p, args, 1)[0]["x"]
126
122
        return query or None
127
123
        $_$;
128
124
129
125
INSERT INTO LaunchpadDatabaseRevision VALUES (2209, 24, 1);
130
0
126
131
=== modified file 'lib/lp/answers/stories/question-browse-and-search.txt'
132
--- lib/lp/answers/stories/question-browse-and-search.txt	2012-06-22 16:17:48 +0000
133
+++ lib/lp/answers/stories/question-browse-and-search.txt	2012-06-26 10:15:26 +0000
134
@@ -311,7 +311,7 @@
135
311
dealing with plugins problems, he always start by a search for such
311
dealing with plugins problems, he always start by a search for such
136
312
problems:
312
problems:
137
313
313
139
314
    >>> browser.getControl(name='field.search_text').value = 'plug-in'
314
    >>> browser.getControl(name='field.search_text').value = 'plugin'
140
315
    >>> browser.getControl('Search', index=0).click()
315
    >>> browser.getControl('Search', index=0).click()
141
316
    >>> questions = find_tag_by_id(browser.contents, 'question-listing')
316
    >>> questions = find_tag_by_id(browser.contents, 'question-listing')
142
317
    >>> for question in questions.fetch('td', 'questionTITLE'):
317
    >>> for question in questions.fetch('td', 'questionTITLE'):
143
318
318
144
=== modified file 'lib/lp/registry/doc/vocabularies.txt'
145
--- lib/lp/registry/doc/vocabularies.txt	2012-06-22 16:17:48 +0000
146
+++ lib/lp/registry/doc/vocabularies.txt	2012-06-26 10:15:26 +0000
147
@@ -731,8 +731,7 @@
148
731
731
149
732
    >>> [(p.name, getattr(p.teamowner, 'name', None))
732
    >>> [(p.name, getattr(p.teamowner, 'name', None))
150
733
    ...  for p in vocab.search('ubuntu-team')]
733
    ...  for p in vocab.search('ubuntu-team')]
153
734
    [(u'doc', None), (u'name18', u'mark'),
734
    [(u'ubuntu-team', u'mark')]
152
735
     (u'ubuntu-security', u'kamion'), (u'ubuntu-team', u'mark')]
154
736
735
155
737
But it doesn't include merged accounts:
736
But it doesn't include merged accounts:
156
738
737
157
739
738
158
=== modified file 'lib/lp/services/database/doc/textsearching.txt'
159
--- lib/lp/services/database/doc/textsearching.txt	2012-06-22 16:17:48 +0000
160
+++ lib/lp/services/database/doc/textsearching.txt	2012-06-26 10:15:26 +0000
161
@@ -138,7 +138,22 @@
162
138
    ...         compiled = compiled.decode('UTF-8')
138
    ...         compiled = compiled.decode('UTF-8')
163
139
    ...         compiled = compiled.encode('US-ASCII', 'backslashreplace')
139
    ...         compiled = compiled.encode('US-ASCII', 'backslashreplace')
164
140
    ...     print '%s <=> %s' % (uncompiled, compiled)
140
    ...     print '%s <=> %s' % (uncompiled, compiled)
166
141
141
    >>>
167
142
    >>> def search(text_to_search, search_phrase):
168
143
    ...     cur = cursor()
169
144
    ...     cur.execute("SELECT to_tsvector(%s)", (text_to_search, ))
170
145
    ...     ts_vector = cur.fetchall()[0][0]
171
146
    ...     cur.execute("SELECT ftq(%s)", (search_phrase, ))
172
147
    ...     ts_query = cur.fetchall()[0][0]
173
148
    ...     cur.execute(
174
149
    ...         "SELECT to_tsvector(%s) @@ ftq(%s)",
175
150
    ...         (text_to_search, search_phrase))
176
151
    ...     match = cur.fetchall()[0][0]
177
152
    ...     return "FTI data: %s query: %s match: %s" % (
178
153
    ...         ts_vector, ts_query, str(match))
179
154
    >>>
180
155
    >>> def search_same(text):
181
156
    ...     return search(text, text)
182
142
157
183
143
Queries are lowercased
158
Queries are lowercased
184
144
159
185
@@ -225,127 +240,178 @@
186
225
    (hi&ho|hoe)&work&go <=> ( 'hi' & 'ho' | 'hoe' ) & 'work' & 'go'
240
    (hi&ho|hoe)&work&go <=> ( 'hi' & 'ho' | 'hoe' ) & 'work' & 'go'
187
226
241
188
227
242
200
228
Hypenation is handled specially. Note that the & operator has precidence
243
If a single '-' precedes a word, it is converted into the '!' operator.
201
229
over the | operator and that tsearch2 removes the unnecessary branckets.
244
Note also that a trailing '-' is dropped by to_tsquery().
191
230
192
231
    >>> ftq('foo-bar')
193
232
    ((foo&bar)|foobar) <=> 'foo' & 'bar' | 'foobar'
194
233
195
234
    >>> ftq('foo-bar-baz')
196
235
    ((foo&bar&baz)|foobarbaz) <=> 'foo' & 'bar' & 'baz' | 'foobarbaz'
197
236
198
237
    >>> ftq('foo & bar-baz')
199
238
    foo&((bar&baz)|barbaz) <=> 'foo' & ( 'bar' & 'baz' | 'barbaz' )
202
239
245
203
240
    >>> ftq('-foo bar-')
246
    >>> ftq('-foo bar-')
205
241
    !foo&bar <=> !'foo' & 'bar'
247
    !foo&bar- <=> !'foo' & 'bar'
206
248
207
249
Repeated '-' are simply ignored by to_tsquery().
208
242
250
209
243
    >>> ftq('---foo--- ---bar---')
251
    >>> ftq('---foo--- ---bar---')
231
244
    foo&bar <=> 'foo' & 'bar'
252
    ---foo---&---bar--- <=> 'foo' & 'bar'
232
245
253
233
246
    >>> ftq('foo-bar test')
254
Hyphens surrounded by two words are retained. This reflects the way
234
247
    ((foo&bar)|foobar)&test <=> ( 'foo' & 'bar' | 'foobar' ) & 'test'
255
how to_tsquery() and to_tsvector() handle such strings.
235
248
256
236
249
    >>> ftq('foo-bar OR test')
257
    >>> print search_same('foo-bar')
237
250
    ((foo&bar)|foobar)|test <=> ( 'foo' & 'bar' | 'foobar' ) | 'test'
258
    FTI data: 'bar':3 'foo':2 'foo-bar':1
238
251
259
    query: 'foo-bar' & 'foo' & 'bar'
239
252
260
    match: True
240
253
Most punctuation characters are converted to whitespace outside of
261
241
254
words, or treated as a hypen inside words. The exceptions are the
262
242
255
operators ()!&|!.
263
Punctuation is handled consistently. If a string containing punctuation
243
256
264
appears in an FTI, it can also be passed to ftq(),and a search for this
244
257
    >>> ftq(':100%')
265
string finds the indexed text.
245
258
    100 <=> '100'
266
246
259
267
    >>> punctuation = '\'"#$%*+,./:;<=>?@[\]^`{}~'
247
260
    >>> ftq(r'foo\bar')
268
    >>> for symbol in punctuation:
248
261
    ((foo&bar)|foobar) <=> 'foo' & 'bar' | 'foobar'
269
    ...     print repr(symbol), search_same('foo%sbar' % symbol)
249
262
270
    "'" FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
250
263
    >>> ftq('/dev/pmu')
271
    '"' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
251
264
    ((dev&pmu)|devpmu) <=> 'dev' & 'pmu' | 'devpmu'
272
    '#' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
252
273
    '$' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
253
274
    '%' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
254
275
    '*' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
255
276
    '+' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
256
277
    ',' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
257
278
    '.' FTI data: 'foo.bar':1 query: 'foo.bar' match: True
258
279
    '/' FTI data: 'foo/bar':1 query: 'foo/bar' match: True
259
280
    ':' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
260
281
    ';' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
261
282
    '<' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
262
283
    '=' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
263
284
    '>' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
264
285
    '?' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
265
286
    '@' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
266
287
    '[' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
267
288
    '\\' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
268
289
    ']' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
269
290
    '^' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
270
291
    '`' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
271
292
    '{' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
272
293
    '}' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True
273
294
    '~' FTI data: 'foo':1 '~bar':2 query: 'foo' & '~bar' match: True
274
295
275
296
    >>> for symbol in punctuation:
276
297
    ...     print repr(symbol), search_same('aa %sbb%s cc' % (symbol, symbol))
277
298
    "'" FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
278
299
    '"' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
279
300
    '#' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
280
301
    '$' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
281
302
    '%' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
282
303
    '*' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
283
304
    '+' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
284
305
    ',' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
285
306
    '.' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
286
307
    '/' FTI data: '/bb':2 'aa':1 'cc':3 query: 'aa' & '/bb' & 'cc' match: True
287
308
    ':' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
288
309
    ';' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
289
310
    '<' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
290
311
    '=' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
291
312
    '>' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
292
313
    '?' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
293
314
    '@' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
294
315
    '[' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
295
316
    '\\' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
296
317
    ']' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
297
318
    '^' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
298
319
    '`' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
299
320
    '{' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
300
321
    '}' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True
301
322
    '~' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & '~bb' & 'cc' match: False
302
323
303
324
XXX Abel Deuring 2012-06-20 bug=1015511: Note that the last line above
304
325
shows a bug: The FTI data for the string "aa ~bb~ cc" contains the words
305
326
'aa', 'bb', 'cc', while the ts_query object for the same text contains
306
327
'aa', '~bb', 'cc', hence the query does not match the string. More details_
307
328
308
329
XXX Abel Deuring 2012-06-20 bug=1015519: XML tags cannot be searched.
309
330
310
331
    >>> print search_same('foo <bar> baz')
311
332
    FTI data: 'baz':2 'foo':1 query: 'foo' & 'baz' match: True
312
333
313
334
More specifically, tags are simply dropped from the FTI data and from
314
335
search queries.
315
336
316
337
    >>> print search('some text <div>whatever</div>', '<div>')
317
338
    FTI data: 'text':2 'whatev':3 query: None match: None
318
339
319
340
Of course, omitting '<' and '>'from the query does not help.
320
341
321
342
    >>> print search('some text <div>whatever</div>', 'div')
322
343
    FTI data: 'text':2 'whatev':3 query: 'div' match: False
323
344
324
345
Treatment of characters that are used as operators in to_tsquery():
325
265
346
326
266
    >>> ftq('cool!')
347
    >>> ftq('cool!')
327
267
    cool <=> 'cool'
348
    cool <=> 'cool'
328
268
349
332
269
    >>> ftq('foo@bar.com')
350
Email addresses are retained as a whole, both by to_tsvector() and by
333
270
    ((foo&bar&com)|foobarcom) <=> 'foo' & 'bar' & 'com' | 'foobarcom'
351
ftq().
334
271
352
335
353
    >>> print search_same('foo@bar.com')
336
354
    FTI data: 'foo@bar.com':1 query: 'foo@bar.com' match: True
337
355
338
356
File names are retained as a whole.
339
357
340
358
    >>> print search_same('foo-bar.txt')
341
359
    FTI data: 'foo-bar.txt':1 query: 'foo-bar.txt' match: True
342
272
360
343
273
Some punctuation we pass through to tsearch2 for it to handle.
361
Some punctuation we pass through to tsearch2 for it to handle.
391
274
362
NB. This gets stemmed, see below.
392
275
    >>> ftq("shouldn't") # NB. This gets stemmed, see below
363
393
276
    shouldn't <=> 'shouldn'
364
    >>> print search_same("shouldn't")
394
277
365
    FTI data: 'shouldn':1 query: 'shouldn' match: True
395
278
It was noticed though in Bug #33920 that tsearch2 couldn't cope if the
366
396
279
apostrophe was not inside a word. So we strip it in these cases.
367
Bug #44913 - Unicode characters in the wrong place.
397
280
368
398
281
    >>> ftq("'cool")
369
    >>> search_same(u'abc-a\N{LATIN SMALL LETTER C WITH CEDILLA}')
399
282
    cool <=> 'cool'
370
    "FTI data: 'abc':2 'abc-a\xc3\xa7':1 'a\xc3\xa7':3
400
283
    >>> ftq("'shouldn't")
371
    query: 'abc-a\xc3\xa7' & 'abc' & 'a\xc3\xa7'
401
284
    shouldn't <=> 'shouldn'
372
    match: True"
402
285
    >>> ftq("' cool")
373
403
286
    cool <=> 'cool'
374
Cut & Paste of 'Smart' quotes. Note that the quotation mark is retained
404
287
    >>> ftq("cool '")
375
in the FTI.
405
288
    cool <=> 'cool'
376
406
289
    >>> ftq("' cool '")
377
    >>> print search_same(u'a-a\N{RIGHT DOUBLE QUOTATION MARK}')
407
290
    cool <=> 'cool'
378
    FTI data: 'a-a”':1 'a”':3 query: 'a-a”' & 'a”' match: True
408
291
    >>> ftq("'cool'")
379
409
292
    cool <=> 'cool'
380
    >>> print search_same(
410
293
    >>> ftq("('cool' AND bananas)")
381
    ...     u'\N{LEFT SINGLE QUOTATION MARK}a.a'
411
294
    (cool&bananas) <=> 'cool' & 'banana'
382
    ...     u'\N{RIGHT SINGLE QUOTATION MARK}')
412
295
383
    FTI data: 'a’':2 '‘a':1 query: '‘a' & 'a’' match: True
366
296
It was also noticed through Bug #39828 that tsearch2 will not cope if the
367
297
! character is embedded inside or found at the end of a word.
368
298
369
299
    >>> ftq('cool!')
370
300
    cool <=> 'cool'
371
301
    >>> ftq('hi!mom')
372
302
    hi&mom <=> 'hi' & 'mom'
373
303
    >>> ftq('hi!!!!!mom')
374
304
    hi&mom <=> 'hi' & 'mom'
375
305
    >>> ftq('hi !mom')
376
306
    hi&!mom <=> 'hi' & !'mom'
377
307
378
308
379
309
Bug #44913 - Unicode characters in the wrong place
380
310
381
311
    >>> ftq(u'a-a\N{LATIN SMALL LETTER C WITH CEDILLA}')
382
312
    ((a&a\xe7)|aa\xe7) <=> 'a\xe7' | 'aa\xe7'
383
313
384
314
    Cut & Paste of 'Smart' quotes
385
315
386
316
    >>> ftq(u'a-a\N{RIGHT DOUBLE QUOTATION MARK}')
387
317
    ((a&a)|aa) <=> 'aa'
388
318
389
319
    >>> ftq(u'\N{LEFT SINGLE QUOTATION MARK}a.a\N{RIGHT SINGLE QUOTATION MARK}')
390
320
    ((a&a)|aa) <=> 'aa'
413
321
384
414
322
385
415
323
Bug #44913 - Nothing but stopwords in a query needing repair
386
Bug #44913 - Nothing but stopwords in a query needing repair
416
324
387
419
325
    >>> ftq('a)a')
388
    >>> print search_same('a)a')
420
326
    a&a <=> None
389
    FTI data:  query: None match: None
421
327
390
422
328
391
423
329
Stop words (words deemed too common in English to search on) are removed
392
Stop words (words deemed too common in English to search on) are removed
424
330
from queries by tsearch2.
393
from queries by tsearch2.
425
331
394
428
332
    >>> ftq("Don't do it harder!")
395
    >>> print search_same("Don't do it harder!")
429
333
    don't&do&it&harder <=> 'harder'
396
    FTI data: 'harder':5 query: 'harder' match: True
430
334
397
431
335
398
432
336
Note that some queries will return None after compilation, because they
399
Note that some queries will return None after compilation, because they
433
337
contained nothing but stop words or punctuation.
400
contained nothing but stop words or punctuation.
434
338
401
437
339
    >>> ftq("don't do it!")
402
    >>> print search_same("don't do it!")
438
340
    don't&do&it <=> None
403
    FTI data:  query: None match: None
439
341
404
442
342
    >>> ftq(",,,")
405
    >>> print search_same(",,,")
443
343
    None <=> None
406
    FTI data:  query: None match: None
444
344
407
445
345
408
446
346
Queries containing nothing except whitespace, boolean operators and
409
Queries containing nothing except whitespace, boolean operators and
447
347
punctuation will just return None.
410
punctuation will just return None.
448
348
411
449
412
Note in the fourth example below that the '-' left in the query by _ftq()
450
413
is ignored by to_tsquery().
451
414
452
349
    >>> ftq(" ")
415
    >>> ftq(" ")
453
350
    None <=> None
416
    None <=> None
454
351
    >>> ftq("AND")
417
    >>> ftq("AND")
455
@@ -353,7 +419,7 @@
456
353
    >>> ftq(" AND (!)")
419
    >>> ftq(" AND (!)")
457
354
    None <=> None
420
    None <=> None
458
355
    >>> ftq("-")
421
    >>> ftq("-")
460
356
    None <=> None
422
    - <=> None
461
357
423
462
358
424
463
359
Words are also stemmed by tsearch2 (using the English stemmer).
425
Words are also stemmed by tsearch2 (using the English stemmer).
464
@@ -381,7 +447,7 @@
465
381
    (hi|!hello)&mom <=> ( 'hi' | !'hello' ) & 'mom'
447
    (hi|!hello)&mom <=> ( 'hi' | !'hello' ) & 'mom'
466
382
448
467
383
    >>> ftq('(hi OR - AND hello) AND mom')
449
    >>> ftq('(hi OR - AND hello) AND mom')
469
384
    (hi|hello)&mom <=> ( 'hi' | 'hello' ) & 'mom'
450
    (hi|-&hello)&mom <=> ( 'hi' | 'hello' ) & 'mom'
470
385
451
471
386
    >>> ftq('hi AND mom AND')
452
    >>> ftq('hi AND mom AND')
472
387
    hi&mom <=> 'hi' & 'mom'
453
    hi&mom <=> 'hi' & 'mom'
473
@@ -393,7 +459,7 @@
474
393
    (hi|hello)&mom <=> ( 'hi' | 'hello' ) & 'mom'
459
    (hi|hello)&mom <=> ( 'hi' | 'hello' ) & 'mom'
475
394
460
476
395
    >>> ftq('() hi mom ( ) ((! |((&)))) :-)')
461
    >>> ftq('() hi mom ( ) ((! |((&)))) :-)')
478
396
    (hi&mom) <=> 'hi' & 'mom'
462
    (hi&mom&-) <=> 'hi' & 'mom'
479
397
463
480
398
    >>> ftq("(hi mom")
464
    >>> ftq("(hi mom")
481
399
    hi&mom <=> 'hi' & 'mom'
465
    hi&mom <=> 'hi' & 'mom'
482
@@ -414,15 +480,15 @@
483
414
    hi&mom <=> 'hi' & 'mom'
480
    hi&mom <=> 'hi' & 'mom'
484
415
481
485
416
    >>> ftq("(foo .") # Bug 43245
482
    >>> ftq("(foo .") # Bug 43245
487
417
    foo <=> 'foo'
483
    foo&. <=> 'foo'
488
418
484
489
419
    >>> ftq("(foo.")
485
    >>> ftq("(foo.")
491
420
    foo <=> 'foo'
486
    foo. <=> 'foo'
492
421
487
493
422
    Bug #54972
488
    Bug #54972
494
423
489
495
424
    >>> ftq("a[a\n[a")
490
    >>> ftq("a[a\n[a")
497
425
    ((a&a)|aa)&a <=> 'aa'
491
    a[a&[a <=> None
498
426
492
499
427
    Bug #96698
493
    Bug #96698
500
428
494
501
@@ -437,10 +503,10 @@
502
437
    Bug #160236
503
    Bug #160236
503
438
504
504
439
    >>> ftq("foo&&bar-baz")
505
    >>> ftq("foo&&bar-baz")
506
440
    foo&((bar&baz)|barbaz) <=> 'foo' & ( 'bar' & 'baz' | 'barbaz' )
506
    foo&bar-baz <=> 'foo' & 'bar-baz' & 'bar' & 'baz'
507
441
507
508
442
    >>> ftq("foo||bar.baz")
508
    >>> ftq("foo||bar.baz")
510
443
    foo|((bar&baz)|barbaz) <=> 'foo' | ( 'bar' & 'baz' | 'barbaz' )
509
    foo|bar.baz <=> 'foo' | 'bar.baz'
511
444
510
512
445
511
513
446
Phrase Searching
512
Phrase Searching
514
@@ -482,7 +548,8 @@
515
482
548
516
483
    >>> runsql(r"""
549
    >>> runsql(r"""
517
484
    ...   SELECT title, max(ranking) FROM (
550
    ...   SELECT title, max(ranking) FROM (
519
485
    ...    SELECT Bug.title,rank(Bug.fti||Message.fti,ftq('firefox')) AS ranking
551
    ...    SELECT Bug.title,rank(Bug.fti||Message.fti,ftq('firefox'))
520
552
    ...    AS ranking
521
486
    ...    FROM Bug, BugMessage, Message
553
    ...    FROM Bug, BugMessage, Message
522
487
    ...    WHERE Bug.id = BugMessage.bug AND Message.id = BugMessage.message
554
    ...    WHERE Bug.id = BugMessage.bug AND Message.id = BugMessage.message
523
488
    ...       AND (Bug.fti @@ ftq('firefox') OR Message.fti @@ ftq('firefox'))
555
    ...       AND (Bug.fti @@ ftq('firefox') OR Message.fti @@ ftq('firefox'))
524
@@ -499,7 +566,8 @@
525
499
    ...       AND BugTask.product = Product.id
566
    ...       AND BugTask.product = Product.id
526
500
    ...       AND Product.name LIKE lower('%firefox%')
567
    ...       AND Product.name LIKE lower('%firefox%')
527
501
    ...    UNION
568
    ...    UNION
529
502
    ...    SELECT Bug.title, rank(Product.fti, ftq('firefox')) - 0.3 AS ranking
569
    ...    SELECT Bug.title, rank(Product.fti, ftq('firefox')) - 0.3
530
570
    ...    AS ranking
531
503
    ...    FROM Bug, BugTask, Product
571
    ...    FROM Bug, BugTask, Product
532
504
    ...    WHERE Bug.id = BugTask.bug
572
    ...    WHERE Bug.id = BugTask.bug
533
505
    ...       AND BugTask.product = Product.id
573
    ...       AND BugTask.product = Product.id
534
@@ -518,7 +586,8 @@
535
518
    Printing doesn't work     0.70
586
    Printing doesn't work     0.70
536
519
587
537
520
588
539
521
== Natural Language Phrase Query ==
589
Natural Language Phrase Query
540
590
-----------------------------
541
522
591
542
523
The standard boolean searches of tsearch2 are fine, but sometime you
592
The standard boolean searches of tsearch2 are fine, but sometime you
543
524
want more fuzzy searches.
593
want more fuzzy searches.
544
@@ -557,7 +626,8 @@
545
557
on Ubuntu) - so we are disabling this and reworking from the ground up.
626
on Ubuntu) - so we are disabling this and reworking from the ground up.
546
558
627
547
559
628
549
560
=== nl_term_candidates() ===
629
nl_term_candidates()
550
630
~~~~~~~~~~~~~~~~~~~~
551
561
631
552
562
To find the terms in a search phrase that are canditates for the search,
632
To find the terms in a search phrase that are canditates for the search,
553
563
we can use the nl_term_candidates() function. This function uses ftq()
633
we can use the nl_term_candidates() function. This function uses ftq()
554
@@ -574,19 +644,16 @@
555
574
    >>> nl_term_candidates('how do I do this?')
644
    >>> nl_term_candidates('how do I do this?')
556
575
    []
645
    []
557
576
646
558
577
We also handle expansion of hypenated words (like ftq does):
559
578
560
579
    >>> nl_term_candidates('firefox foo-bar give me trouble')
561
580
    [u'firefox', u'foo', u'bar', u'foobar', u'give', u'troubl']
562
581
563
582
Except for the hyphenation character, all non-word caracters are ignored:
647
Except for the hyphenation character, all non-word caracters are ignored:
564
583
648
565
584
    >>> nl_term_candidates(
649
    >>> nl_term_candidates(
567
585
    ...     "Will the \'\'|\'\' character (inside a ''quoted'' string) work???")
650
    ...     "Will the \'\'|\'\' character (inside a ''quoted'' string) "
568
651
    ...     "work???")
569
586
    [u'charact', u'insid', u'quot', u'string', u'work']
652
    [u'charact', u'insid', u'quot', u'string', u'work']
570
587
653
571
588
654
573
589
=== nl_phrase_search() ===
655
nl_phrase_search()
574
656
~~~~~~~~~~~~~~~~~~
575
590
657
576
591
To get the actual tsearch2 query that should be run, you will use the
658
To get the actual tsearch2 query that should be run, you will use the
577
592
nl_phrase_search() function. This one takes two mandatory parameters and
659
nl_phrase_search() function. This one takes two mandatory parameters and
578
@@ -637,7 +704,8 @@
579
637
    u'slow|system'
704
    u'slow|system'
580
638
705
581
639
706
583
640
==== Using other constraints ====
707
Using other constraints
584
708
.......................
585
641
709
586
642
You can pass a third parameter to the function that will be use as
710
You can pass a third parameter to the function that will be use as
587
643
an additional constraint to determine the total number of rows that
711
an additional constraint to determine the total number of rows that
588
@@ -659,7 +727,8 @@
589
659
727
590
660
    >>> nl_phrase_search(
728
    >>> nl_phrase_search(
591
661
    ...     'firefox gets very slow on flickr', Question,
729
    ...     'firefox gets very slow on flickr', Question,
593
662
    ...     "Question.product = %s AND Product.active = 't'" % firefox_product.id,
730
    ...     "Question.product = %s AND Product.active = 't'"
594
731
    ...     % firefox_product.id,
595
663
    ...     ['Product'], fast_enabled=False)
732
    ...     ['Product'], fast_enabled=False)
596
664
    u'slow|flickr'
733
    u'slow|flickr'
597
665
734
598
@@ -679,7 +748,8 @@
599
679
    u'(firefox&flickr&slow)|(flickr&slow)|(firefox&slow)|(firefox&flickr)'
748
    u'(firefox&flickr&slow)|(flickr&slow)|(firefox&slow)|(firefox&flickr)'
600
680
749
601
681
750
603
682
==== No keywords filtering with few rows ====
751
No keywords filtering with few rows
604
752
...................................
605
683
753
606
684
The 50% rule is really useful only when there are many rows. When there
754
The 50% rule is really useful only when there are many rows. When there
607
685
only very few rows, that keyword elimination becomes a problem since
755
only very few rows, that keyword elimination becomes a problem since
Reviewer	Review Type	Date Requested	Status
Abel Deuring (community)			Approve on 2012-06-26
Review via email: mp+112043@code.launchpad.net