Merge lp:~adeuring/launchpad/bug-29713-db-dev into lp:launchpad/db-devel
- bug-29713-db-dev
- Merge into db-devel
Proposed by
Abel Deuring
Status: | Merged |
---|---|
Approved by: | Abel Deuring |
Approved revision: | no longer in the source branch. |
Merged at revision: | 11713 |
Proposed branch: | lp:~adeuring/launchpad/bug-29713-db-dev |
Merge into: | lp:launchpad/db-devel |
Diff against target: |
607 lines (+311/-117) 4 files modified
database/schema/patch-2209-24-1.sql (+125/-0) lib/lp/answers/stories/question-browse-and-search.txt (+1/-1) lib/lp/registry/doc/vocabularies.txt (+1/-2) lib/lp/services/database/doc/textsearching.txt (+184/-114) |
To merge this branch: | bzr merge lp:~adeuring/launchpad/bug-29713-db-dev |
Related bugs: |
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
Abel Deuring (community) | Approve | ||
Review via email: mp+112043@code.launchpad.net |
Commit message
Better handling of punctuation in full text search strings
Description of the change
See https:/
To post a comment you must log in.
Revision history for this message
Stuart Bishop (stub) wrote : | # |
db patch has been applied to production
Preview Diff
[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1 | === added file 'database/schema/patch-2209-24-1.sql' |
2 | --- database/schema/patch-2209-24-1.sql 1970-01-01 00:00:00 +0000 |
3 | +++ database/schema/patch-2209-24-1.sql 2012-06-26 10:15:26 +0000 |
4 | @@ -0,0 +1,125 @@ |
5 | +-- Copyright 2012 Canonical Ltd. This software is licensed under the |
6 | +-- GNU Affero General Public License version 3 (see the file LICENSE). |
7 | + |
8 | +SET client_min_messages=ERROR; |
9 | + |
10 | +CREATE OR REPLACE FUNCTION _ftq(text) RETURNS text |
11 | + LANGUAGE plpythonu IMMUTABLE STRICT |
12 | + AS $_$ |
13 | + import re |
14 | + |
15 | + # I think this method would be more robust if we used a real |
16 | + # tokenizer and parser to generate the query string, but we need |
17 | + # something suitable for use as a stored procedure which currently |
18 | + # means no external dependancies. |
19 | + |
20 | + # Convert to Unicode |
21 | + query = args[0].decode('utf8') |
22 | + ## plpy.debug('1 query is %s' % repr(query)) |
23 | + |
24 | + # Normalize whitespace |
25 | + query = re.sub("(?u)\s+"," ", query) |
26 | + |
27 | + # Convert AND, OR, NOT and - to tsearch2 punctuation |
28 | + query = re.sub(r"(?u)(?:^|\s)-([\w\(])", r" !\1", query) |
29 | + query = re.sub(r"(?u)\bAND\b", "&", query) |
30 | + query = re.sub(r"(?u)\bOR\b", "|", query) |
31 | + query = re.sub(r"(?u)\bNOT\b", " !", query) |
32 | + ## plpy.debug('2 query is %s' % repr(query)) |
33 | + |
34 | + # Deal with unwanted punctuation. |
35 | + # ':' is used in queries to specify a weight of a word. |
36 | + # '\' is treated differently in to_tsvector() and to_tsquery(). |
37 | + punctuation = r'[:\\]' |
38 | + query = re.sub(r"(?u)%s+" % (punctuation,), " ", query) |
39 | + ## plpy.debug('3 query is %s' % repr(query)) |
40 | + |
41 | + # Strip ! characters inside and at the end of a word |
42 | + query = re.sub(r"(?u)(?<=\w)[\!]+", " ", query) |
43 | + |
44 | + # Now that we have handle case sensitive booleans, convert to lowercase |
45 | + query = query.lower() |
46 | + |
47 | + # Remove unpartnered bracket on the left and right |
48 | + query = re.sub(r"(?ux) ^ ( [^(]* ) \)", r"(\1)", query) |
49 | + query = re.sub(r"(?ux) \( ( [^)]* ) $", r"(\1)", query) |
50 | + |
51 | + # Remove spurious brackets |
52 | + query = re.sub(r"(?u)\(([^\&\|]*?)\)", r" \1 ", query) |
53 | + ## plpy.debug('5 query is %s' % repr(query)) |
54 | + |
55 | + # Insert & between tokens without an existing boolean operator |
56 | + # ( not proceeded by (|&! |
57 | + query = re.sub(r"(?u)(?<![\(\|\&\!])\s*\(", "&(", query) |
58 | + ## plpy.debug('6 query is %s' % repr(query)) |
59 | + # ) not followed by )|& |
60 | + query = re.sub(r"(?u)\)(?!\s*(\)|\||\&|\s*$))", ")&", query) |
61 | + ## plpy.debug('6.1 query is %s' % repr(query)) |
62 | + # Whitespace not proceded by (|&! not followed by &| |
63 | + query = re.sub(r"(?u)(?<![\(\|\&\!\s])\s+(?![\&\|\s])", "&", query) |
64 | + ## plpy.debug('7 query is %s' % repr(query)) |
65 | + |
66 | + # Detect and repair syntax errors - we are lenient because |
67 | + # this input is generally from users. |
68 | + |
69 | + # Fix unbalanced brackets |
70 | + openings = query.count("(") |
71 | + closings = query.count(")") |
72 | + if openings > closings: |
73 | + query = query + " ) "*(openings-closings) |
74 | + elif closings > openings: |
75 | + query = " ( "*(closings-openings) + query |
76 | + ## plpy.debug('8 query is %s' % repr(query)) |
77 | + |
78 | + # Strip ' character that do not have letters on both sides |
79 | + query = re.sub(r"(?u)((?<!\w)'|'(?!\w))", "", query) |
80 | + |
81 | + # Brackets containing nothing but whitespace and booleans, recursive |
82 | + last = "" |
83 | + while last != query: |
84 | + last = query |
85 | + query = re.sub(r"(?u)\([\s\&\|\!]*\)", "", query) |
86 | + ## plpy.debug('9 query is %s' % repr(query)) |
87 | + |
88 | + # An & or | following a ( |
89 | + query = re.sub(r"(?u)(?<=\()[\&\|\s]+", "", query) |
90 | + ## plpy.debug('10 query is %s' % repr(query)) |
91 | + |
92 | + # An &, | or ! immediatly before a ) |
93 | + query = re.sub(r"(?u)[\&\|\!\s]*[\&\|\!]+\s*(?=\))", "", query) |
94 | + ## plpy.debug('11 query is %s' % repr(query)) |
95 | + |
96 | + # An &,| or ! followed by another boolean. |
97 | + query = re.sub(r"(?ux) \s* ( [\&\|\!] ) [\s\&\|]+", r"\1", query) |
98 | + ## plpy.debug('12 query is %s' % repr(query)) |
99 | + |
100 | + # Leading & or | |
101 | + query = re.sub(r"(?u)^[\s\&\|]+", "", query) |
102 | + ## plpy.debug('13 query is %s' % repr(query)) |
103 | + |
104 | + # Trailing &, | or ! |
105 | + query = re.sub(r"(?u)[\&\|\!\s]+$", "", query) |
106 | + ## plpy.debug('14 query is %s' % repr(query)) |
107 | + |
108 | + # If we have nothing but whitespace and tsearch2 operators, |
109 | + # return NULL. |
110 | + if re.search(r"(?u)^[\&\|\!\s\(\)]*$", query) is not None: |
111 | + return None |
112 | + |
113 | + # Convert back to UTF-8 |
114 | + query = query.encode('utf8') |
115 | + ## plpy.debug('15 query is %s' % repr(query)) |
116 | + |
117 | + return query or None |
118 | + $_$; |
119 | + |
120 | +CREATE OR REPLACE FUNCTION ftq(text) RETURNS pg_catalog.tsquery |
121 | + LANGUAGE plpythonu IMMUTABLE STRICT |
122 | + AS $_$ |
123 | + p = plpy.prepare( |
124 | + "SELECT to_tsquery('default', _ftq($1)) AS x", ["text"]) |
125 | + query = plpy.execute(p, args, 1)[0]["x"] |
126 | + return query or None |
127 | + $_$; |
128 | + |
129 | +INSERT INTO LaunchpadDatabaseRevision VALUES (2209, 24, 1); |
130 | |
131 | === modified file 'lib/lp/answers/stories/question-browse-and-search.txt' |
132 | --- lib/lp/answers/stories/question-browse-and-search.txt 2012-06-22 16:17:48 +0000 |
133 | +++ lib/lp/answers/stories/question-browse-and-search.txt 2012-06-26 10:15:26 +0000 |
134 | @@ -311,7 +311,7 @@ |
135 | dealing with plugins problems, he always start by a search for such |
136 | problems: |
137 | |
138 | - >>> browser.getControl(name='field.search_text').value = 'plug-in' |
139 | + >>> browser.getControl(name='field.search_text').value = 'plugin' |
140 | >>> browser.getControl('Search', index=0).click() |
141 | >>> questions = find_tag_by_id(browser.contents, 'question-listing') |
142 | >>> for question in questions.fetch('td', 'questionTITLE'): |
143 | |
144 | === modified file 'lib/lp/registry/doc/vocabularies.txt' |
145 | --- lib/lp/registry/doc/vocabularies.txt 2012-06-22 16:17:48 +0000 |
146 | +++ lib/lp/registry/doc/vocabularies.txt 2012-06-26 10:15:26 +0000 |
147 | @@ -731,8 +731,7 @@ |
148 | |
149 | >>> [(p.name, getattr(p.teamowner, 'name', None)) |
150 | ... for p in vocab.search('ubuntu-team')] |
151 | - [(u'doc', None), (u'name18', u'mark'), |
152 | - (u'ubuntu-security', u'kamion'), (u'ubuntu-team', u'mark')] |
153 | + [(u'ubuntu-team', u'mark')] |
154 | |
155 | But it doesn't include merged accounts: |
156 | |
157 | |
158 | === modified file 'lib/lp/services/database/doc/textsearching.txt' |
159 | --- lib/lp/services/database/doc/textsearching.txt 2012-06-22 16:17:48 +0000 |
160 | +++ lib/lp/services/database/doc/textsearching.txt 2012-06-26 10:15:26 +0000 |
161 | @@ -138,7 +138,22 @@ |
162 | ... compiled = compiled.decode('UTF-8') |
163 | ... compiled = compiled.encode('US-ASCII', 'backslashreplace') |
164 | ... print '%s <=> %s' % (uncompiled, compiled) |
165 | - |
166 | + >>> |
167 | + >>> def search(text_to_search, search_phrase): |
168 | + ... cur = cursor() |
169 | + ... cur.execute("SELECT to_tsvector(%s)", (text_to_search, )) |
170 | + ... ts_vector = cur.fetchall()[0][0] |
171 | + ... cur.execute("SELECT ftq(%s)", (search_phrase, )) |
172 | + ... ts_query = cur.fetchall()[0][0] |
173 | + ... cur.execute( |
174 | + ... "SELECT to_tsvector(%s) @@ ftq(%s)", |
175 | + ... (text_to_search, search_phrase)) |
176 | + ... match = cur.fetchall()[0][0] |
177 | + ... return "FTI data: %s query: %s match: %s" % ( |
178 | + ... ts_vector, ts_query, str(match)) |
179 | + >>> |
180 | + >>> def search_same(text): |
181 | + ... return search(text, text) |
182 | |
183 | Queries are lowercased |
184 | |
185 | @@ -225,127 +240,178 @@ |
186 | (hi&ho|hoe)&work&go <=> ( 'hi' & 'ho' | 'hoe' ) & 'work' & 'go' |
187 | |
188 | |
189 | -Hypenation is handled specially. Note that the & operator has precidence |
190 | -over the | operator and that tsearch2 removes the unnecessary branckets. |
191 | - |
192 | - >>> ftq('foo-bar') |
193 | - ((foo&bar)|foobar) <=> 'foo' & 'bar' | 'foobar' |
194 | - |
195 | - >>> ftq('foo-bar-baz') |
196 | - ((foo&bar&baz)|foobarbaz) <=> 'foo' & 'bar' & 'baz' | 'foobarbaz' |
197 | - |
198 | - >>> ftq('foo & bar-baz') |
199 | - foo&((bar&baz)|barbaz) <=> 'foo' & ( 'bar' & 'baz' | 'barbaz' ) |
200 | +If a single '-' precedes a word, it is converted into the '!' operator. |
201 | +Note also that a trailing '-' is dropped by to_tsquery(). |
202 | |
203 | >>> ftq('-foo bar-') |
204 | - !foo&bar <=> !'foo' & 'bar' |
205 | + !foo&bar- <=> !'foo' & 'bar' |
206 | + |
207 | +Repeated '-' are simply ignored by to_tsquery(). |
208 | |
209 | >>> ftq('---foo--- ---bar---') |
210 | - foo&bar <=> 'foo' & 'bar' |
211 | - |
212 | - >>> ftq('foo-bar test') |
213 | - ((foo&bar)|foobar)&test <=> ( 'foo' & 'bar' | 'foobar' ) & 'test' |
214 | - |
215 | - >>> ftq('foo-bar OR test') |
216 | - ((foo&bar)|foobar)|test <=> ( 'foo' & 'bar' | 'foobar' ) | 'test' |
217 | - |
218 | - |
219 | -Most punctuation characters are converted to whitespace outside of |
220 | -words, or treated as a hypen inside words. The exceptions are the |
221 | -operators ()!&|!. |
222 | - |
223 | - >>> ftq(':100%') |
224 | - 100 <=> '100' |
225 | - |
226 | - >>> ftq(r'foo\bar') |
227 | - ((foo&bar)|foobar) <=> 'foo' & 'bar' | 'foobar' |
228 | - |
229 | - >>> ftq('/dev/pmu') |
230 | - ((dev&pmu)|devpmu) <=> 'dev' & 'pmu' | 'devpmu' |
231 | + ---foo---&---bar--- <=> 'foo' & 'bar' |
232 | + |
233 | +Hyphens surrounded by two words are retained. This reflects the way |
234 | +how to_tsquery() and to_tsvector() handle such strings. |
235 | + |
236 | + >>> print search_same('foo-bar') |
237 | + FTI data: 'bar':3 'foo':2 'foo-bar':1 |
238 | + query: 'foo-bar' & 'foo' & 'bar' |
239 | + match: True |
240 | + |
241 | + |
242 | +Punctuation is handled consistently. If a string containing punctuation |
243 | +appears in an FTI, it can also be passed to ftq(),and a search for this |
244 | +string finds the indexed text. |
245 | + |
246 | + >>> punctuation = '\'"#$%*+,./:;<=>?@[\]^`{}~' |
247 | + >>> for symbol in punctuation: |
248 | + ... print repr(symbol), search_same('foo%sbar' % symbol) |
249 | + "'" FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True |
250 | + '"' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True |
251 | + '#' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True |
252 | + '$' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True |
253 | + '%' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True |
254 | + '*' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True |
255 | + '+' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True |
256 | + ',' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True |
257 | + '.' FTI data: 'foo.bar':1 query: 'foo.bar' match: True |
258 | + '/' FTI data: 'foo/bar':1 query: 'foo/bar' match: True |
259 | + ':' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True |
260 | + ';' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True |
261 | + '<' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True |
262 | + '=' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True |
263 | + '>' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True |
264 | + '?' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True |
265 | + '@' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True |
266 | + '[' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True |
267 | + '\\' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True |
268 | + ']' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True |
269 | + '^' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True |
270 | + '`' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True |
271 | + '{' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True |
272 | + '}' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True |
273 | + '~' FTI data: 'foo':1 '~bar':2 query: 'foo' & '~bar' match: True |
274 | + |
275 | + >>> for symbol in punctuation: |
276 | + ... print repr(symbol), search_same('aa %sbb%s cc' % (symbol, symbol)) |
277 | + "'" FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True |
278 | + '"' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True |
279 | + '#' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True |
280 | + '$' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True |
281 | + '%' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True |
282 | + '*' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True |
283 | + '+' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True |
284 | + ',' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True |
285 | + '.' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True |
286 | + '/' FTI data: '/bb':2 'aa':1 'cc':3 query: 'aa' & '/bb' & 'cc' match: True |
287 | + ':' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True |
288 | + ';' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True |
289 | + '<' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True |
290 | + '=' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True |
291 | + '>' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True |
292 | + '?' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True |
293 | + '@' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True |
294 | + '[' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True |
295 | + '\\' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True |
296 | + ']' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True |
297 | + '^' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True |
298 | + '`' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True |
299 | + '{' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True |
300 | + '}' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True |
301 | + '~' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & '~bb' & 'cc' match: False |
302 | + |
303 | +XXX Abel Deuring 2012-06-20 bug=1015511: Note that the last line above |
304 | +shows a bug: The FTI data for the string "aa ~bb~ cc" contains the words |
305 | +'aa', 'bb', 'cc', while the ts_query object for the same text contains |
306 | +'aa', '~bb', 'cc', hence the query does not match the string. More details_ |
307 | + |
308 | +XXX Abel Deuring 2012-06-20 bug=1015519: XML tags cannot be searched. |
309 | + |
310 | + >>> print search_same('foo <bar> baz') |
311 | + FTI data: 'baz':2 'foo':1 query: 'foo' & 'baz' match: True |
312 | + |
313 | +More specifically, tags are simply dropped from the FTI data and from |
314 | +search queries. |
315 | + |
316 | + >>> print search('some text <div>whatever</div>', '<div>') |
317 | + FTI data: 'text':2 'whatev':3 query: None match: None |
318 | + |
319 | +Of course, omitting '<' and '>'from the query does not help. |
320 | + |
321 | + >>> print search('some text <div>whatever</div>', 'div') |
322 | + FTI data: 'text':2 'whatev':3 query: 'div' match: False |
323 | + |
324 | +Treatment of characters that are used as operators in to_tsquery(): |
325 | |
326 | >>> ftq('cool!') |
327 | cool <=> 'cool' |
328 | |
329 | - >>> ftq('foo@bar.com') |
330 | - ((foo&bar&com)|foobarcom) <=> 'foo' & 'bar' & 'com' | 'foobarcom' |
331 | - |
332 | +Email addresses are retained as a whole, both by to_tsvector() and by |
333 | +ftq(). |
334 | + |
335 | + >>> print search_same('foo@bar.com') |
336 | + FTI data: 'foo@bar.com':1 query: 'foo@bar.com' match: True |
337 | + |
338 | +File names are retained as a whole. |
339 | + |
340 | + >>> print search_same('foo-bar.txt') |
341 | + FTI data: 'foo-bar.txt':1 query: 'foo-bar.txt' match: True |
342 | |
343 | Some punctuation we pass through to tsearch2 for it to handle. |
344 | - |
345 | - >>> ftq("shouldn't") # NB. This gets stemmed, see below |
346 | - shouldn't <=> 'shouldn' |
347 | - |
348 | -It was noticed though in Bug #33920 that tsearch2 couldn't cope if the |
349 | -apostrophe was not inside a word. So we strip it in these cases. |
350 | - |
351 | - >>> ftq("'cool") |
352 | - cool <=> 'cool' |
353 | - >>> ftq("'shouldn't") |
354 | - shouldn't <=> 'shouldn' |
355 | - >>> ftq("' cool") |
356 | - cool <=> 'cool' |
357 | - >>> ftq("cool '") |
358 | - cool <=> 'cool' |
359 | - >>> ftq("' cool '") |
360 | - cool <=> 'cool' |
361 | - >>> ftq("'cool'") |
362 | - cool <=> 'cool' |
363 | - >>> ftq("('cool' AND bananas)") |
364 | - (cool&bananas) <=> 'cool' & 'banana' |
365 | - |
366 | -It was also noticed through Bug #39828 that tsearch2 will not cope if the |
367 | -! character is embedded inside or found at the end of a word. |
368 | - |
369 | - >>> ftq('cool!') |
370 | - cool <=> 'cool' |
371 | - >>> ftq('hi!mom') |
372 | - hi&mom <=> 'hi' & 'mom' |
373 | - >>> ftq('hi!!!!!mom') |
374 | - hi&mom <=> 'hi' & 'mom' |
375 | - >>> ftq('hi !mom') |
376 | - hi&!mom <=> 'hi' & !'mom' |
377 | - |
378 | - |
379 | -Bug #44913 - Unicode characters in the wrong place |
380 | - |
381 | - >>> ftq(u'a-a\N{LATIN SMALL LETTER C WITH CEDILLA}') |
382 | - ((a&a\xe7)|aa\xe7) <=> 'a\xe7' | 'aa\xe7' |
383 | - |
384 | - Cut & Paste of 'Smart' quotes |
385 | - |
386 | - >>> ftq(u'a-a\N{RIGHT DOUBLE QUOTATION MARK}') |
387 | - ((a&a)|aa) <=> 'aa' |
388 | - |
389 | - >>> ftq(u'\N{LEFT SINGLE QUOTATION MARK}a.a\N{RIGHT SINGLE QUOTATION MARK}') |
390 | - ((a&a)|aa) <=> 'aa' |
391 | +NB. This gets stemmed, see below. |
392 | + |
393 | + >>> print search_same("shouldn't") |
394 | + FTI data: 'shouldn':1 query: 'shouldn' match: True |
395 | + |
396 | +Bug #44913 - Unicode characters in the wrong place. |
397 | + |
398 | + >>> search_same(u'abc-a\N{LATIN SMALL LETTER C WITH CEDILLA}') |
399 | + "FTI data: 'abc':2 'abc-a\xc3\xa7':1 'a\xc3\xa7':3 |
400 | + query: 'abc-a\xc3\xa7' & 'abc' & 'a\xc3\xa7' |
401 | + match: True" |
402 | + |
403 | +Cut & Paste of 'Smart' quotes. Note that the quotation mark is retained |
404 | +in the FTI. |
405 | + |
406 | + >>> print search_same(u'a-a\N{RIGHT DOUBLE QUOTATION MARK}') |
407 | + FTI data: 'a-a”':1 'a”':3 query: 'a-a”' & 'a”' match: True |
408 | + |
409 | + >>> print search_same( |
410 | + ... u'\N{LEFT SINGLE QUOTATION MARK}a.a' |
411 | + ... u'\N{RIGHT SINGLE QUOTATION MARK}') |
412 | + FTI data: 'a’':2 '‘a':1 query: '‘a' & 'a’' match: True |
413 | |
414 | |
415 | Bug #44913 - Nothing but stopwords in a query needing repair |
416 | |
417 | - >>> ftq('a)a') |
418 | - a&a <=> None |
419 | + >>> print search_same('a)a') |
420 | + FTI data: query: None match: None |
421 | |
422 | |
423 | Stop words (words deemed too common in English to search on) are removed |
424 | from queries by tsearch2. |
425 | |
426 | - >>> ftq("Don't do it harder!") |
427 | - don't&do&it&harder <=> 'harder' |
428 | + >>> print search_same("Don't do it harder!") |
429 | + FTI data: 'harder':5 query: 'harder' match: True |
430 | |
431 | |
432 | Note that some queries will return None after compilation, because they |
433 | contained nothing but stop words or punctuation. |
434 | |
435 | - >>> ftq("don't do it!") |
436 | - don't&do&it <=> None |
437 | + >>> print search_same("don't do it!") |
438 | + FTI data: query: None match: None |
439 | |
440 | - >>> ftq(",,,") |
441 | - None <=> None |
442 | + >>> print search_same(",,,") |
443 | + FTI data: query: None match: None |
444 | |
445 | |
446 | Queries containing nothing except whitespace, boolean operators and |
447 | punctuation will just return None. |
448 | |
449 | +Note in the fourth example below that the '-' left in the query by _ftq() |
450 | +is ignored by to_tsquery(). |
451 | + |
452 | >>> ftq(" ") |
453 | None <=> None |
454 | >>> ftq("AND") |
455 | @@ -353,7 +419,7 @@ |
456 | >>> ftq(" AND (!)") |
457 | None <=> None |
458 | >>> ftq("-") |
459 | - None <=> None |
460 | + - <=> None |
461 | |
462 | |
463 | Words are also stemmed by tsearch2 (using the English stemmer). |
464 | @@ -381,7 +447,7 @@ |
465 | (hi|!hello)&mom <=> ( 'hi' | !'hello' ) & 'mom' |
466 | |
467 | >>> ftq('(hi OR - AND hello) AND mom') |
468 | - (hi|hello)&mom <=> ( 'hi' | 'hello' ) & 'mom' |
469 | + (hi|-&hello)&mom <=> ( 'hi' | 'hello' ) & 'mom' |
470 | |
471 | >>> ftq('hi AND mom AND') |
472 | hi&mom <=> 'hi' & 'mom' |
473 | @@ -393,7 +459,7 @@ |
474 | (hi|hello)&mom <=> ( 'hi' | 'hello' ) & 'mom' |
475 | |
476 | >>> ftq('() hi mom ( ) ((! |((&)))) :-)') |
477 | - (hi&mom) <=> 'hi' & 'mom' |
478 | + (hi&mom&-) <=> 'hi' & 'mom' |
479 | |
480 | >>> ftq("(hi mom") |
481 | hi&mom <=> 'hi' & 'mom' |
482 | @@ -414,15 +480,15 @@ |
483 | hi&mom <=> 'hi' & 'mom' |
484 | |
485 | >>> ftq("(foo .") # Bug 43245 |
486 | - foo <=> 'foo' |
487 | + foo&. <=> 'foo' |
488 | |
489 | >>> ftq("(foo.") |
490 | - foo <=> 'foo' |
491 | + foo. <=> 'foo' |
492 | |
493 | Bug #54972 |
494 | |
495 | >>> ftq("a[a\n[a") |
496 | - ((a&a)|aa)&a <=> 'aa' |
497 | + a[a&[a <=> None |
498 | |
499 | Bug #96698 |
500 | |
501 | @@ -437,10 +503,10 @@ |
502 | Bug #160236 |
503 | |
504 | >>> ftq("foo&&bar-baz") |
505 | - foo&((bar&baz)|barbaz) <=> 'foo' & ( 'bar' & 'baz' | 'barbaz' ) |
506 | + foo&bar-baz <=> 'foo' & 'bar-baz' & 'bar' & 'baz' |
507 | |
508 | >>> ftq("foo||bar.baz") |
509 | - foo|((bar&baz)|barbaz) <=> 'foo' | ( 'bar' & 'baz' | 'barbaz' ) |
510 | + foo|bar.baz <=> 'foo' | 'bar.baz' |
511 | |
512 | |
513 | Phrase Searching |
514 | @@ -482,7 +548,8 @@ |
515 | |
516 | >>> runsql(r""" |
517 | ... SELECT title, max(ranking) FROM ( |
518 | - ... SELECT Bug.title,rank(Bug.fti||Message.fti,ftq('firefox')) AS ranking |
519 | + ... SELECT Bug.title,rank(Bug.fti||Message.fti,ftq('firefox')) |
520 | + ... AS ranking |
521 | ... FROM Bug, BugMessage, Message |
522 | ... WHERE Bug.id = BugMessage.bug AND Message.id = BugMessage.message |
523 | ... AND (Bug.fti @@ ftq('firefox') OR Message.fti @@ ftq('firefox')) |
524 | @@ -499,7 +566,8 @@ |
525 | ... AND BugTask.product = Product.id |
526 | ... AND Product.name LIKE lower('%firefox%') |
527 | ... UNION |
528 | - ... SELECT Bug.title, rank(Product.fti, ftq('firefox')) - 0.3 AS ranking |
529 | + ... SELECT Bug.title, rank(Product.fti, ftq('firefox')) - 0.3 |
530 | + ... AS ranking |
531 | ... FROM Bug, BugTask, Product |
532 | ... WHERE Bug.id = BugTask.bug |
533 | ... AND BugTask.product = Product.id |
534 | @@ -518,7 +586,8 @@ |
535 | Printing doesn't work 0.70 |
536 | |
537 | |
538 | -== Natural Language Phrase Query == |
539 | +Natural Language Phrase Query |
540 | +----------------------------- |
541 | |
542 | The standard boolean searches of tsearch2 are fine, but sometime you |
543 | want more fuzzy searches. |
544 | @@ -557,7 +626,8 @@ |
545 | on Ubuntu) - so we are disabling this and reworking from the ground up. |
546 | |
547 | |
548 | -=== nl_term_candidates() === |
549 | +nl_term_candidates() |
550 | +~~~~~~~~~~~~~~~~~~~~ |
551 | |
552 | To find the terms in a search phrase that are canditates for the search, |
553 | we can use the nl_term_candidates() function. This function uses ftq() |
554 | @@ -574,19 +644,16 @@ |
555 | >>> nl_term_candidates('how do I do this?') |
556 | [] |
557 | |
558 | -We also handle expansion of hypenated words (like ftq does): |
559 | - |
560 | - >>> nl_term_candidates('firefox foo-bar give me trouble') |
561 | - [u'firefox', u'foo', u'bar', u'foobar', u'give', u'troubl'] |
562 | - |
563 | Except for the hyphenation character, all non-word caracters are ignored: |
564 | |
565 | >>> nl_term_candidates( |
566 | - ... "Will the \'\'|\'\' character (inside a ''quoted'' string) work???") |
567 | + ... "Will the \'\'|\'\' character (inside a ''quoted'' string) " |
568 | + ... "work???") |
569 | [u'charact', u'insid', u'quot', u'string', u'work'] |
570 | |
571 | |
572 | -=== nl_phrase_search() === |
573 | +nl_phrase_search() |
574 | +~~~~~~~~~~~~~~~~~~ |
575 | |
576 | To get the actual tsearch2 query that should be run, you will use the |
577 | nl_phrase_search() function. This one takes two mandatory parameters and |
578 | @@ -637,7 +704,8 @@ |
579 | u'slow|system' |
580 | |
581 | |
582 | -==== Using other constraints ==== |
583 | +Using other constraints |
584 | +....................... |
585 | |
586 | You can pass a third parameter to the function that will be use as |
587 | an additional constraint to determine the total number of rows that |
588 | @@ -659,7 +727,8 @@ |
589 | |
590 | >>> nl_phrase_search( |
591 | ... 'firefox gets very slow on flickr', Question, |
592 | - ... "Question.product = %s AND Product.active = 't'" % firefox_product.id, |
593 | + ... "Question.product = %s AND Product.active = 't'" |
594 | + ... % firefox_product.id, |
595 | ... ['Product'], fast_enabled=False) |
596 | u'slow|flickr' |
597 | |
598 | @@ -679,7 +748,8 @@ |
599 | u'(firefox&flickr&slow)|(flickr&slow)|(firefox&slow)|(firefox&flickr)' |
600 | |
601 | |
602 | -==== No keywords filtering with few rows ==== |
603 | +No keywords filtering with few rows |
604 | +................................... |
605 | |
606 | The 50% rule is really useful only when there are many rows. When there |
607 | only very few rows, that keyword elimination becomes a problem since |
r=self: See the linked MP for the real review.