Merge lp:~adeuring/launchpad/bug-29713-db-dev into lp:launchpad/db-devel
- bug-29713-db-dev
- Merge into db-devel
Proposed by
Abel Deuring
Status: | Merged |
---|---|
Approved by: | Abel Deuring |
Approved revision: | no longer in the source branch. |
Merged at revision: | 11713 |
Proposed branch: | lp:~adeuring/launchpad/bug-29713-db-dev |
Merge into: | lp:launchpad/db-devel |
Diff against target: |
607 lines (+311/-117) 4 files modified
database/schema/patch-2209-24-1.sql (+125/-0) lib/lp/answers/stories/question-browse-and-search.txt (+1/-1) lib/lp/registry/doc/vocabularies.txt (+1/-2) lib/lp/services/database/doc/textsearching.txt (+184/-114) |
To merge this branch: | bzr merge lp:~adeuring/launchpad/bug-29713-db-dev |
Related bugs: |
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
Abel Deuring (community) | Approve | ||
Review via email: mp+112043@code.launchpad.net |
Commit message
Better handling of punctuation in full text search strings
Description of the change
See https:/
To post a comment you must log in.
Revision history for this message
Stuart Bishop (stub) wrote : | # |
db patch has been applied to production
Preview Diff
[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1 | === added file 'database/schema/patch-2209-24-1.sql' | |||
2 | --- database/schema/patch-2209-24-1.sql 1970-01-01 00:00:00 +0000 | |||
3 | +++ database/schema/patch-2209-24-1.sql 2012-06-26 10:15:26 +0000 | |||
4 | @@ -0,0 +1,125 @@ | |||
5 | 1 | -- Copyright 2012 Canonical Ltd. This software is licensed under the | ||
6 | 2 | -- GNU Affero General Public License version 3 (see the file LICENSE). | ||
7 | 3 | |||
8 | 4 | SET client_min_messages=ERROR; | ||
9 | 5 | |||
10 | 6 | CREATE OR REPLACE FUNCTION _ftq(text) RETURNS text | ||
11 | 7 | LANGUAGE plpythonu IMMUTABLE STRICT | ||
12 | 8 | AS $_$ | ||
13 | 9 | import re | ||
14 | 10 | |||
15 | 11 | # I think this method would be more robust if we used a real | ||
16 | 12 | # tokenizer and parser to generate the query string, but we need | ||
17 | 13 | # something suitable for use as a stored procedure which currently | ||
18 | 14 | # means no external dependancies. | ||
19 | 15 | |||
20 | 16 | # Convert to Unicode | ||
21 | 17 | query = args[0].decode('utf8') | ||
22 | 18 | ## plpy.debug('1 query is %s' % repr(query)) | ||
23 | 19 | |||
24 | 20 | # Normalize whitespace | ||
25 | 21 | query = re.sub("(?u)\s+"," ", query) | ||
26 | 22 | |||
27 | 23 | # Convert AND, OR, NOT and - to tsearch2 punctuation | ||
28 | 24 | query = re.sub(r"(?u)(?:^|\s)-([\w\(])", r" !\1", query) | ||
29 | 25 | query = re.sub(r"(?u)\bAND\b", "&", query) | ||
30 | 26 | query = re.sub(r"(?u)\bOR\b", "|", query) | ||
31 | 27 | query = re.sub(r"(?u)\bNOT\b", " !", query) | ||
32 | 28 | ## plpy.debug('2 query is %s' % repr(query)) | ||
33 | 29 | |||
34 | 30 | # Deal with unwanted punctuation. | ||
35 | 31 | # ':' is used in queries to specify a weight of a word. | ||
36 | 32 | # '\' is treated differently in to_tsvector() and to_tsquery(). | ||
37 | 33 | punctuation = r'[:\\]' | ||
38 | 34 | query = re.sub(r"(?u)%s+" % (punctuation,), " ", query) | ||
39 | 35 | ## plpy.debug('3 query is %s' % repr(query)) | ||
40 | 36 | |||
41 | 37 | # Strip ! characters inside and at the end of a word | ||
42 | 38 | query = re.sub(r"(?u)(?<=\w)[\!]+", " ", query) | ||
43 | 39 | |||
44 | 40 | # Now that we have handle case sensitive booleans, convert to lowercase | ||
45 | 41 | query = query.lower() | ||
46 | 42 | |||
47 | 43 | # Remove unpartnered bracket on the left and right | ||
48 | 44 | query = re.sub(r"(?ux) ^ ( [^(]* ) \)", r"(\1)", query) | ||
49 | 45 | query = re.sub(r"(?ux) \( ( [^)]* ) $", r"(\1)", query) | ||
50 | 46 | |||
51 | 47 | # Remove spurious brackets | ||
52 | 48 | query = re.sub(r"(?u)\(([^\&\|]*?)\)", r" \1 ", query) | ||
53 | 49 | ## plpy.debug('5 query is %s' % repr(query)) | ||
54 | 50 | |||
55 | 51 | # Insert & between tokens without an existing boolean operator | ||
56 | 52 | # ( not proceeded by (|&! | ||
57 | 53 | query = re.sub(r"(?u)(?<![\(\|\&\!])\s*\(", "&(", query) | ||
58 | 54 | ## plpy.debug('6 query is %s' % repr(query)) | ||
59 | 55 | # ) not followed by )|& | ||
60 | 56 | query = re.sub(r"(?u)\)(?!\s*(\)|\||\&|\s*$))", ")&", query) | ||
61 | 57 | ## plpy.debug('6.1 query is %s' % repr(query)) | ||
62 | 58 | # Whitespace not proceded by (|&! not followed by &| | ||
63 | 59 | query = re.sub(r"(?u)(?<![\(\|\&\!\s])\s+(?![\&\|\s])", "&", query) | ||
64 | 60 | ## plpy.debug('7 query is %s' % repr(query)) | ||
65 | 61 | |||
66 | 62 | # Detect and repair syntax errors - we are lenient because | ||
67 | 63 | # this input is generally from users. | ||
68 | 64 | |||
69 | 65 | # Fix unbalanced brackets | ||
70 | 66 | openings = query.count("(") | ||
71 | 67 | closings = query.count(")") | ||
72 | 68 | if openings > closings: | ||
73 | 69 | query = query + " ) "*(openings-closings) | ||
74 | 70 | elif closings > openings: | ||
75 | 71 | query = " ( "*(closings-openings) + query | ||
76 | 72 | ## plpy.debug('8 query is %s' % repr(query)) | ||
77 | 73 | |||
78 | 74 | # Strip ' character that do not have letters on both sides | ||
79 | 75 | query = re.sub(r"(?u)((?<!\w)'|'(?!\w))", "", query) | ||
80 | 76 | |||
81 | 77 | # Brackets containing nothing but whitespace and booleans, recursive | ||
82 | 78 | last = "" | ||
83 | 79 | while last != query: | ||
84 | 80 | last = query | ||
85 | 81 | query = re.sub(r"(?u)\([\s\&\|\!]*\)", "", query) | ||
86 | 82 | ## plpy.debug('9 query is %s' % repr(query)) | ||
87 | 83 | |||
88 | 84 | # An & or | following a ( | ||
89 | 85 | query = re.sub(r"(?u)(?<=\()[\&\|\s]+", "", query) | ||
90 | 86 | ## plpy.debug('10 query is %s' % repr(query)) | ||
91 | 87 | |||
92 | 88 | # An &, | or ! immediatly before a ) | ||
93 | 89 | query = re.sub(r"(?u)[\&\|\!\s]*[\&\|\!]+\s*(?=\))", "", query) | ||
94 | 90 | ## plpy.debug('11 query is %s' % repr(query)) | ||
95 | 91 | |||
96 | 92 | # An &,| or ! followed by another boolean. | ||
97 | 93 | query = re.sub(r"(?ux) \s* ( [\&\|\!] ) [\s\&\|]+", r"\1", query) | ||
98 | 94 | ## plpy.debug('12 query is %s' % repr(query)) | ||
99 | 95 | |||
100 | 96 | # Leading & or | | ||
101 | 97 | query = re.sub(r"(?u)^[\s\&\|]+", "", query) | ||
102 | 98 | ## plpy.debug('13 query is %s' % repr(query)) | ||
103 | 99 | |||
104 | 100 | # Trailing &, | or ! | ||
105 | 101 | query = re.sub(r"(?u)[\&\|\!\s]+$", "", query) | ||
106 | 102 | ## plpy.debug('14 query is %s' % repr(query)) | ||
107 | 103 | |||
108 | 104 | # If we have nothing but whitespace and tsearch2 operators, | ||
109 | 105 | # return NULL. | ||
110 | 106 | if re.search(r"(?u)^[\&\|\!\s\(\)]*$", query) is not None: | ||
111 | 107 | return None | ||
112 | 108 | |||
113 | 109 | # Convert back to UTF-8 | ||
114 | 110 | query = query.encode('utf8') | ||
115 | 111 | ## plpy.debug('15 query is %s' % repr(query)) | ||
116 | 112 | |||
117 | 113 | return query or None | ||
118 | 114 | $_$; | ||
119 | 115 | |||
120 | 116 | CREATE OR REPLACE FUNCTION ftq(text) RETURNS pg_catalog.tsquery | ||
121 | 117 | LANGUAGE plpythonu IMMUTABLE STRICT | ||
122 | 118 | AS $_$ | ||
123 | 119 | p = plpy.prepare( | ||
124 | 120 | "SELECT to_tsquery('default', _ftq($1)) AS x", ["text"]) | ||
125 | 121 | query = plpy.execute(p, args, 1)[0]["x"] | ||
126 | 122 | return query or None | ||
127 | 123 | $_$; | ||
128 | 124 | |||
129 | 125 | INSERT INTO LaunchpadDatabaseRevision VALUES (2209, 24, 1); | ||
130 | 0 | 126 | ||
131 | === modified file 'lib/lp/answers/stories/question-browse-and-search.txt' | |||
132 | --- lib/lp/answers/stories/question-browse-and-search.txt 2012-06-22 16:17:48 +0000 | |||
133 | +++ lib/lp/answers/stories/question-browse-and-search.txt 2012-06-26 10:15:26 +0000 | |||
134 | @@ -311,7 +311,7 @@ | |||
135 | 311 | dealing with plugins problems, he always start by a search for such | 311 | dealing with plugins problems, he always start by a search for such |
136 | 312 | problems: | 312 | problems: |
137 | 313 | 313 | ||
139 | 314 | >>> browser.getControl(name='field.search_text').value = 'plug-in' | 314 | >>> browser.getControl(name='field.search_text').value = 'plugin' |
140 | 315 | >>> browser.getControl('Search', index=0).click() | 315 | >>> browser.getControl('Search', index=0).click() |
141 | 316 | >>> questions = find_tag_by_id(browser.contents, 'question-listing') | 316 | >>> questions = find_tag_by_id(browser.contents, 'question-listing') |
142 | 317 | >>> for question in questions.fetch('td', 'questionTITLE'): | 317 | >>> for question in questions.fetch('td', 'questionTITLE'): |
143 | 318 | 318 | ||
144 | === modified file 'lib/lp/registry/doc/vocabularies.txt' | |||
145 | --- lib/lp/registry/doc/vocabularies.txt 2012-06-22 16:17:48 +0000 | |||
146 | +++ lib/lp/registry/doc/vocabularies.txt 2012-06-26 10:15:26 +0000 | |||
147 | @@ -731,8 +731,7 @@ | |||
148 | 731 | 731 | ||
149 | 732 | >>> [(p.name, getattr(p.teamowner, 'name', None)) | 732 | >>> [(p.name, getattr(p.teamowner, 'name', None)) |
150 | 733 | ... for p in vocab.search('ubuntu-team')] | 733 | ... for p in vocab.search('ubuntu-team')] |
153 | 734 | [(u'doc', None), (u'name18', u'mark'), | 734 | [(u'ubuntu-team', u'mark')] |
152 | 735 | (u'ubuntu-security', u'kamion'), (u'ubuntu-team', u'mark')] | ||
154 | 736 | 735 | ||
155 | 737 | But it doesn't include merged accounts: | 736 | But it doesn't include merged accounts: |
156 | 738 | 737 | ||
157 | 739 | 738 | ||
158 | === modified file 'lib/lp/services/database/doc/textsearching.txt' | |||
159 | --- lib/lp/services/database/doc/textsearching.txt 2012-06-22 16:17:48 +0000 | |||
160 | +++ lib/lp/services/database/doc/textsearching.txt 2012-06-26 10:15:26 +0000 | |||
161 | @@ -138,7 +138,22 @@ | |||
162 | 138 | ... compiled = compiled.decode('UTF-8') | 138 | ... compiled = compiled.decode('UTF-8') |
163 | 139 | ... compiled = compiled.encode('US-ASCII', 'backslashreplace') | 139 | ... compiled = compiled.encode('US-ASCII', 'backslashreplace') |
164 | 140 | ... print '%s <=> %s' % (uncompiled, compiled) | 140 | ... print '%s <=> %s' % (uncompiled, compiled) |
166 | 141 | 141 | >>> | |
167 | 142 | >>> def search(text_to_search, search_phrase): | ||
168 | 143 | ... cur = cursor() | ||
169 | 144 | ... cur.execute("SELECT to_tsvector(%s)", (text_to_search, )) | ||
170 | 145 | ... ts_vector = cur.fetchall()[0][0] | ||
171 | 146 | ... cur.execute("SELECT ftq(%s)", (search_phrase, )) | ||
172 | 147 | ... ts_query = cur.fetchall()[0][0] | ||
173 | 148 | ... cur.execute( | ||
174 | 149 | ... "SELECT to_tsvector(%s) @@ ftq(%s)", | ||
175 | 150 | ... (text_to_search, search_phrase)) | ||
176 | 151 | ... match = cur.fetchall()[0][0] | ||
177 | 152 | ... return "FTI data: %s query: %s match: %s" % ( | ||
178 | 153 | ... ts_vector, ts_query, str(match)) | ||
179 | 154 | >>> | ||
180 | 155 | >>> def search_same(text): | ||
181 | 156 | ... return search(text, text) | ||
182 | 142 | 157 | ||
183 | 143 | Queries are lowercased | 158 | Queries are lowercased |
184 | 144 | 159 | ||
185 | @@ -225,127 +240,178 @@ | |||
186 | 225 | (hi&ho|hoe)&work&go <=> ( 'hi' & 'ho' | 'hoe' ) & 'work' & 'go' | 240 | (hi&ho|hoe)&work&go <=> ( 'hi' & 'ho' | 'hoe' ) & 'work' & 'go' |
187 | 226 | 241 | ||
188 | 227 | 242 | ||
200 | 228 | Hypenation is handled specially. Note that the & operator has precidence | 243 | If a single '-' precedes a word, it is converted into the '!' operator. |
201 | 229 | over the | operator and that tsearch2 removes the unnecessary branckets. | 244 | Note also that a trailing '-' is dropped by to_tsquery(). |
191 | 230 | |||
192 | 231 | >>> ftq('foo-bar') | ||
193 | 232 | ((foo&bar)|foobar) <=> 'foo' & 'bar' | 'foobar' | ||
194 | 233 | |||
195 | 234 | >>> ftq('foo-bar-baz') | ||
196 | 235 | ((foo&bar&baz)|foobarbaz) <=> 'foo' & 'bar' & 'baz' | 'foobarbaz' | ||
197 | 236 | |||
198 | 237 | >>> ftq('foo & bar-baz') | ||
199 | 238 | foo&((bar&baz)|barbaz) <=> 'foo' & ( 'bar' & 'baz' | 'barbaz' ) | ||
202 | 239 | 245 | ||
203 | 240 | >>> ftq('-foo bar-') | 246 | >>> ftq('-foo bar-') |
205 | 241 | !foo&bar <=> !'foo' & 'bar' | 247 | !foo&bar- <=> !'foo' & 'bar' |
206 | 248 | |||
207 | 249 | Repeated '-' are simply ignored by to_tsquery(). | ||
208 | 242 | 250 | ||
209 | 243 | >>> ftq('---foo--- ---bar---') | 251 | >>> ftq('---foo--- ---bar---') |
231 | 244 | foo&bar <=> 'foo' & 'bar' | 252 | ---foo---&---bar--- <=> 'foo' & 'bar' |
232 | 245 | 253 | ||
233 | 246 | >>> ftq('foo-bar test') | 254 | Hyphens surrounded by two words are retained. This reflects the way |
234 | 247 | ((foo&bar)|foobar)&test <=> ( 'foo' & 'bar' | 'foobar' ) & 'test' | 255 | how to_tsquery() and to_tsvector() handle such strings. |
235 | 248 | 256 | ||
236 | 249 | >>> ftq('foo-bar OR test') | 257 | >>> print search_same('foo-bar') |
237 | 250 | ((foo&bar)|foobar)|test <=> ( 'foo' & 'bar' | 'foobar' ) | 'test' | 258 | FTI data: 'bar':3 'foo':2 'foo-bar':1 |
238 | 251 | 259 | query: 'foo-bar' & 'foo' & 'bar' | |
239 | 252 | 260 | match: True | |
240 | 253 | Most punctuation characters are converted to whitespace outside of | 261 | |
241 | 254 | words, or treated as a hypen inside words. The exceptions are the | 262 | |
242 | 255 | operators ()!&|!. | 263 | Punctuation is handled consistently. If a string containing punctuation |
243 | 256 | 264 | appears in an FTI, it can also be passed to ftq(),and a search for this | |
244 | 257 | >>> ftq(':100%') | 265 | string finds the indexed text. |
245 | 258 | 100 <=> '100' | 266 | |
246 | 259 | 267 | >>> punctuation = '\'"#$%*+,./:;<=>?@[\]^`{}~' | |
247 | 260 | >>> ftq(r'foo\bar') | 268 | >>> for symbol in punctuation: |
248 | 261 | ((foo&bar)|foobar) <=> 'foo' & 'bar' | 'foobar' | 269 | ... print repr(symbol), search_same('foo%sbar' % symbol) |
249 | 262 | 270 | "'" FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True | |
250 | 263 | >>> ftq('/dev/pmu') | 271 | '"' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True |
251 | 264 | ((dev&pmu)|devpmu) <=> 'dev' & 'pmu' | 'devpmu' | 272 | '#' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True |
252 | 273 | '$' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True | ||
253 | 274 | '%' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True | ||
254 | 275 | '*' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True | ||
255 | 276 | '+' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True | ||
256 | 277 | ',' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True | ||
257 | 278 | '.' FTI data: 'foo.bar':1 query: 'foo.bar' match: True | ||
258 | 279 | '/' FTI data: 'foo/bar':1 query: 'foo/bar' match: True | ||
259 | 280 | ':' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True | ||
260 | 281 | ';' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True | ||
261 | 282 | '<' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True | ||
262 | 283 | '=' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True | ||
263 | 284 | '>' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True | ||
264 | 285 | '?' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True | ||
265 | 286 | '@' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True | ||
266 | 287 | '[' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True | ||
267 | 288 | '\\' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True | ||
268 | 289 | ']' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True | ||
269 | 290 | '^' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True | ||
270 | 291 | '`' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True | ||
271 | 292 | '{' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True | ||
272 | 293 | '}' FTI data: 'bar':2 'foo':1 query: 'foo' & 'bar' match: True | ||
273 | 294 | '~' FTI data: 'foo':1 '~bar':2 query: 'foo' & '~bar' match: True | ||
274 | 295 | |||
275 | 296 | >>> for symbol in punctuation: | ||
276 | 297 | ... print repr(symbol), search_same('aa %sbb%s cc' % (symbol, symbol)) | ||
277 | 298 | "'" FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True | ||
278 | 299 | '"' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True | ||
279 | 300 | '#' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True | ||
280 | 301 | '$' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True | ||
281 | 302 | '%' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True | ||
282 | 303 | '*' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True | ||
283 | 304 | '+' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True | ||
284 | 305 | ',' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True | ||
285 | 306 | '.' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True | ||
286 | 307 | '/' FTI data: '/bb':2 'aa':1 'cc':3 query: 'aa' & '/bb' & 'cc' match: True | ||
287 | 308 | ':' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True | ||
288 | 309 | ';' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True | ||
289 | 310 | '<' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True | ||
290 | 311 | '=' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True | ||
291 | 312 | '>' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True | ||
292 | 313 | '?' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True | ||
293 | 314 | '@' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True | ||
294 | 315 | '[' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True | ||
295 | 316 | '\\' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True | ||
296 | 317 | ']' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True | ||
297 | 318 | '^' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True | ||
298 | 319 | '`' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True | ||
299 | 320 | '{' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True | ||
300 | 321 | '}' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & 'bb' & 'cc' match: True | ||
301 | 322 | '~' FTI data: 'aa':1 'bb':2 'cc':3 query: 'aa' & '~bb' & 'cc' match: False | ||
302 | 323 | |||
303 | 324 | XXX Abel Deuring 2012-06-20 bug=1015511: Note that the last line above | ||
304 | 325 | shows a bug: The FTI data for the string "aa ~bb~ cc" contains the words | ||
305 | 326 | 'aa', 'bb', 'cc', while the ts_query object for the same text contains | ||
306 | 327 | 'aa', '~bb', 'cc', hence the query does not match the string. More details_ | ||
307 | 328 | |||
308 | 329 | XXX Abel Deuring 2012-06-20 bug=1015519: XML tags cannot be searched. | ||
309 | 330 | |||
310 | 331 | >>> print search_same('foo <bar> baz') | ||
311 | 332 | FTI data: 'baz':2 'foo':1 query: 'foo' & 'baz' match: True | ||
312 | 333 | |||
313 | 334 | More specifically, tags are simply dropped from the FTI data and from | ||
314 | 335 | search queries. | ||
315 | 336 | |||
316 | 337 | >>> print search('some text <div>whatever</div>', '<div>') | ||
317 | 338 | FTI data: 'text':2 'whatev':3 query: None match: None | ||
318 | 339 | |||
319 | 340 | Of course, omitting '<' and '>'from the query does not help. | ||
320 | 341 | |||
321 | 342 | >>> print search('some text <div>whatever</div>', 'div') | ||
322 | 343 | FTI data: 'text':2 'whatev':3 query: 'div' match: False | ||
323 | 344 | |||
324 | 345 | Treatment of characters that are used as operators in to_tsquery(): | ||
325 | 265 | 346 | ||
326 | 266 | >>> ftq('cool!') | 347 | >>> ftq('cool!') |
327 | 267 | cool <=> 'cool' | 348 | cool <=> 'cool' |
328 | 268 | 349 | ||
332 | 269 | >>> ftq('foo@bar.com') | 350 | Email addresses are retained as a whole, both by to_tsvector() and by |
333 | 270 | ((foo&bar&com)|foobarcom) <=> 'foo' & 'bar' & 'com' | 'foobarcom' | 351 | ftq(). |
334 | 271 | 352 | ||
335 | 353 | >>> print search_same('foo@bar.com') | ||
336 | 354 | FTI data: 'foo@bar.com':1 query: 'foo@bar.com' match: True | ||
337 | 355 | |||
338 | 356 | File names are retained as a whole. | ||
339 | 357 | |||
340 | 358 | >>> print search_same('foo-bar.txt') | ||
341 | 359 | FTI data: 'foo-bar.txt':1 query: 'foo-bar.txt' match: True | ||
342 | 272 | 360 | ||
343 | 273 | Some punctuation we pass through to tsearch2 for it to handle. | 361 | Some punctuation we pass through to tsearch2 for it to handle. |
391 | 274 | 362 | NB. This gets stemmed, see below. | |
392 | 275 | >>> ftq("shouldn't") # NB. This gets stemmed, see below | 363 | |
393 | 276 | shouldn't <=> 'shouldn' | 364 | >>> print search_same("shouldn't") |
394 | 277 | 365 | FTI data: 'shouldn':1 query: 'shouldn' match: True | |
395 | 278 | It was noticed though in Bug #33920 that tsearch2 couldn't cope if the | 366 | |
396 | 279 | apostrophe was not inside a word. So we strip it in these cases. | 367 | Bug #44913 - Unicode characters in the wrong place. |
397 | 280 | 368 | ||
398 | 281 | >>> ftq("'cool") | 369 | >>> search_same(u'abc-a\N{LATIN SMALL LETTER C WITH CEDILLA}') |
399 | 282 | cool <=> 'cool' | 370 | "FTI data: 'abc':2 'abc-a\xc3\xa7':1 'a\xc3\xa7':3 |
400 | 283 | >>> ftq("'shouldn't") | 371 | query: 'abc-a\xc3\xa7' & 'abc' & 'a\xc3\xa7' |
401 | 284 | shouldn't <=> 'shouldn' | 372 | match: True" |
402 | 285 | >>> ftq("' cool") | 373 | |
403 | 286 | cool <=> 'cool' | 374 | Cut & Paste of 'Smart' quotes. Note that the quotation mark is retained |
404 | 287 | >>> ftq("cool '") | 375 | in the FTI. |
405 | 288 | cool <=> 'cool' | 376 | |
406 | 289 | >>> ftq("' cool '") | 377 | >>> print search_same(u'a-a\N{RIGHT DOUBLE QUOTATION MARK}') |
407 | 290 | cool <=> 'cool' | 378 | FTI data: 'a-a”':1 'a”':3 query: 'a-a”' & 'a”' match: True |
408 | 291 | >>> ftq("'cool'") | 379 | |
409 | 292 | cool <=> 'cool' | 380 | >>> print search_same( |
410 | 293 | >>> ftq("('cool' AND bananas)") | 381 | ... u'\N{LEFT SINGLE QUOTATION MARK}a.a' |
411 | 294 | (cool&bananas) <=> 'cool' & 'banana' | 382 | ... u'\N{RIGHT SINGLE QUOTATION MARK}') |
412 | 295 | 383 | FTI data: 'a’':2 '‘a':1 query: '‘a' & 'a’' match: True | |
366 | 296 | It was also noticed through Bug #39828 that tsearch2 will not cope if the | ||
367 | 297 | ! character is embedded inside or found at the end of a word. | ||
368 | 298 | |||
369 | 299 | >>> ftq('cool!') | ||
370 | 300 | cool <=> 'cool' | ||
371 | 301 | >>> ftq('hi!mom') | ||
372 | 302 | hi&mom <=> 'hi' & 'mom' | ||
373 | 303 | >>> ftq('hi!!!!!mom') | ||
374 | 304 | hi&mom <=> 'hi' & 'mom' | ||
375 | 305 | >>> ftq('hi !mom') | ||
376 | 306 | hi&!mom <=> 'hi' & !'mom' | ||
377 | 307 | |||
378 | 308 | |||
379 | 309 | Bug #44913 - Unicode characters in the wrong place | ||
380 | 310 | |||
381 | 311 | >>> ftq(u'a-a\N{LATIN SMALL LETTER C WITH CEDILLA}') | ||
382 | 312 | ((a&a\xe7)|aa\xe7) <=> 'a\xe7' | 'aa\xe7' | ||
383 | 313 | |||
384 | 314 | Cut & Paste of 'Smart' quotes | ||
385 | 315 | |||
386 | 316 | >>> ftq(u'a-a\N{RIGHT DOUBLE QUOTATION MARK}') | ||
387 | 317 | ((a&a)|aa) <=> 'aa' | ||
388 | 318 | |||
389 | 319 | >>> ftq(u'\N{LEFT SINGLE QUOTATION MARK}a.a\N{RIGHT SINGLE QUOTATION MARK}') | ||
390 | 320 | ((a&a)|aa) <=> 'aa' | ||
413 | 321 | 384 | ||
414 | 322 | 385 | ||
415 | 323 | Bug #44913 - Nothing but stopwords in a query needing repair | 386 | Bug #44913 - Nothing but stopwords in a query needing repair |
416 | 324 | 387 | ||
419 | 325 | >>> ftq('a)a') | 388 | >>> print search_same('a)a') |
420 | 326 | a&a <=> None | 389 | FTI data: query: None match: None |
421 | 327 | 390 | ||
422 | 328 | 391 | ||
423 | 329 | Stop words (words deemed too common in English to search on) are removed | 392 | Stop words (words deemed too common in English to search on) are removed |
424 | 330 | from queries by tsearch2. | 393 | from queries by tsearch2. |
425 | 331 | 394 | ||
428 | 332 | >>> ftq("Don't do it harder!") | 395 | >>> print search_same("Don't do it harder!") |
429 | 333 | don't&do&it&harder <=> 'harder' | 396 | FTI data: 'harder':5 query: 'harder' match: True |
430 | 334 | 397 | ||
431 | 335 | 398 | ||
432 | 336 | Note that some queries will return None after compilation, because they | 399 | Note that some queries will return None after compilation, because they |
433 | 337 | contained nothing but stop words or punctuation. | 400 | contained nothing but stop words or punctuation. |
434 | 338 | 401 | ||
437 | 339 | >>> ftq("don't do it!") | 402 | >>> print search_same("don't do it!") |
438 | 340 | don't&do&it <=> None | 403 | FTI data: query: None match: None |
439 | 341 | 404 | ||
442 | 342 | >>> ftq(",,,") | 405 | >>> print search_same(",,,") |
443 | 343 | None <=> None | 406 | FTI data: query: None match: None |
444 | 344 | 407 | ||
445 | 345 | 408 | ||
446 | 346 | Queries containing nothing except whitespace, boolean operators and | 409 | Queries containing nothing except whitespace, boolean operators and |
447 | 347 | punctuation will just return None. | 410 | punctuation will just return None. |
448 | 348 | 411 | ||
449 | 412 | Note in the fourth example below that the '-' left in the query by _ftq() | ||
450 | 413 | is ignored by to_tsquery(). | ||
451 | 414 | |||
452 | 349 | >>> ftq(" ") | 415 | >>> ftq(" ") |
453 | 350 | None <=> None | 416 | None <=> None |
454 | 351 | >>> ftq("AND") | 417 | >>> ftq("AND") |
455 | @@ -353,7 +419,7 @@ | |||
456 | 353 | >>> ftq(" AND (!)") | 419 | >>> ftq(" AND (!)") |
457 | 354 | None <=> None | 420 | None <=> None |
458 | 355 | >>> ftq("-") | 421 | >>> ftq("-") |
460 | 356 | None <=> None | 422 | - <=> None |
461 | 357 | 423 | ||
462 | 358 | 424 | ||
463 | 359 | Words are also stemmed by tsearch2 (using the English stemmer). | 425 | Words are also stemmed by tsearch2 (using the English stemmer). |
464 | @@ -381,7 +447,7 @@ | |||
465 | 381 | (hi|!hello)&mom <=> ( 'hi' | !'hello' ) & 'mom' | 447 | (hi|!hello)&mom <=> ( 'hi' | !'hello' ) & 'mom' |
466 | 382 | 448 | ||
467 | 383 | >>> ftq('(hi OR - AND hello) AND mom') | 449 | >>> ftq('(hi OR - AND hello) AND mom') |
469 | 384 | (hi|hello)&mom <=> ( 'hi' | 'hello' ) & 'mom' | 450 | (hi|-&hello)&mom <=> ( 'hi' | 'hello' ) & 'mom' |
470 | 385 | 451 | ||
471 | 386 | >>> ftq('hi AND mom AND') | 452 | >>> ftq('hi AND mom AND') |
472 | 387 | hi&mom <=> 'hi' & 'mom' | 453 | hi&mom <=> 'hi' & 'mom' |
473 | @@ -393,7 +459,7 @@ | |||
474 | 393 | (hi|hello)&mom <=> ( 'hi' | 'hello' ) & 'mom' | 459 | (hi|hello)&mom <=> ( 'hi' | 'hello' ) & 'mom' |
475 | 394 | 460 | ||
476 | 395 | >>> ftq('() hi mom ( ) ((! |((&)))) :-)') | 461 | >>> ftq('() hi mom ( ) ((! |((&)))) :-)') |
478 | 396 | (hi&mom) <=> 'hi' & 'mom' | 462 | (hi&mom&-) <=> 'hi' & 'mom' |
479 | 397 | 463 | ||
480 | 398 | >>> ftq("(hi mom") | 464 | >>> ftq("(hi mom") |
481 | 399 | hi&mom <=> 'hi' & 'mom' | 465 | hi&mom <=> 'hi' & 'mom' |
482 | @@ -414,15 +480,15 @@ | |||
483 | 414 | hi&mom <=> 'hi' & 'mom' | 480 | hi&mom <=> 'hi' & 'mom' |
484 | 415 | 481 | ||
485 | 416 | >>> ftq("(foo .") # Bug 43245 | 482 | >>> ftq("(foo .") # Bug 43245 |
487 | 417 | foo <=> 'foo' | 483 | foo&. <=> 'foo' |
488 | 418 | 484 | ||
489 | 419 | >>> ftq("(foo.") | 485 | >>> ftq("(foo.") |
491 | 420 | foo <=> 'foo' | 486 | foo. <=> 'foo' |
492 | 421 | 487 | ||
493 | 422 | Bug #54972 | 488 | Bug #54972 |
494 | 423 | 489 | ||
495 | 424 | >>> ftq("a[a\n[a") | 490 | >>> ftq("a[a\n[a") |
497 | 425 | ((a&a)|aa)&a <=> 'aa' | 491 | a[a&[a <=> None |
498 | 426 | 492 | ||
499 | 427 | Bug #96698 | 493 | Bug #96698 |
500 | 428 | 494 | ||
501 | @@ -437,10 +503,10 @@ | |||
502 | 437 | Bug #160236 | 503 | Bug #160236 |
503 | 438 | 504 | ||
504 | 439 | >>> ftq("foo&&bar-baz") | 505 | >>> ftq("foo&&bar-baz") |
506 | 440 | foo&((bar&baz)|barbaz) <=> 'foo' & ( 'bar' & 'baz' | 'barbaz' ) | 506 | foo&bar-baz <=> 'foo' & 'bar-baz' & 'bar' & 'baz' |
507 | 441 | 507 | ||
508 | 442 | >>> ftq("foo||bar.baz") | 508 | >>> ftq("foo||bar.baz") |
510 | 443 | foo|((bar&baz)|barbaz) <=> 'foo' | ( 'bar' & 'baz' | 'barbaz' ) | 509 | foo|bar.baz <=> 'foo' | 'bar.baz' |
511 | 444 | 510 | ||
512 | 445 | 511 | ||
513 | 446 | Phrase Searching | 512 | Phrase Searching |
514 | @@ -482,7 +548,8 @@ | |||
515 | 482 | 548 | ||
516 | 483 | >>> runsql(r""" | 549 | >>> runsql(r""" |
517 | 484 | ... SELECT title, max(ranking) FROM ( | 550 | ... SELECT title, max(ranking) FROM ( |
519 | 485 | ... SELECT Bug.title,rank(Bug.fti||Message.fti,ftq('firefox')) AS ranking | 551 | ... SELECT Bug.title,rank(Bug.fti||Message.fti,ftq('firefox')) |
520 | 552 | ... AS ranking | ||
521 | 486 | ... FROM Bug, BugMessage, Message | 553 | ... FROM Bug, BugMessage, Message |
522 | 487 | ... WHERE Bug.id = BugMessage.bug AND Message.id = BugMessage.message | 554 | ... WHERE Bug.id = BugMessage.bug AND Message.id = BugMessage.message |
523 | 488 | ... AND (Bug.fti @@ ftq('firefox') OR Message.fti @@ ftq('firefox')) | 555 | ... AND (Bug.fti @@ ftq('firefox') OR Message.fti @@ ftq('firefox')) |
524 | @@ -499,7 +566,8 @@ | |||
525 | 499 | ... AND BugTask.product = Product.id | 566 | ... AND BugTask.product = Product.id |
526 | 500 | ... AND Product.name LIKE lower('%firefox%') | 567 | ... AND Product.name LIKE lower('%firefox%') |
527 | 501 | ... UNION | 568 | ... UNION |
529 | 502 | ... SELECT Bug.title, rank(Product.fti, ftq('firefox')) - 0.3 AS ranking | 569 | ... SELECT Bug.title, rank(Product.fti, ftq('firefox')) - 0.3 |
530 | 570 | ... AS ranking | ||
531 | 503 | ... FROM Bug, BugTask, Product | 571 | ... FROM Bug, BugTask, Product |
532 | 504 | ... WHERE Bug.id = BugTask.bug | 572 | ... WHERE Bug.id = BugTask.bug |
533 | 505 | ... AND BugTask.product = Product.id | 573 | ... AND BugTask.product = Product.id |
534 | @@ -518,7 +586,8 @@ | |||
535 | 518 | Printing doesn't work 0.70 | 586 | Printing doesn't work 0.70 |
536 | 519 | 587 | ||
537 | 520 | 588 | ||
539 | 521 | == Natural Language Phrase Query == | 589 | Natural Language Phrase Query |
540 | 590 | ----------------------------- | ||
541 | 522 | 591 | ||
542 | 523 | The standard boolean searches of tsearch2 are fine, but sometime you | 592 | The standard boolean searches of tsearch2 are fine, but sometime you |
543 | 524 | want more fuzzy searches. | 593 | want more fuzzy searches. |
544 | @@ -557,7 +626,8 @@ | |||
545 | 557 | on Ubuntu) - so we are disabling this and reworking from the ground up. | 626 | on Ubuntu) - so we are disabling this and reworking from the ground up. |
546 | 558 | 627 | ||
547 | 559 | 628 | ||
549 | 560 | === nl_term_candidates() === | 629 | nl_term_candidates() |
550 | 630 | ~~~~~~~~~~~~~~~~~~~~ | ||
551 | 561 | 631 | ||
552 | 562 | To find the terms in a search phrase that are canditates for the search, | 632 | To find the terms in a search phrase that are canditates for the search, |
553 | 563 | we can use the nl_term_candidates() function. This function uses ftq() | 633 | we can use the nl_term_candidates() function. This function uses ftq() |
554 | @@ -574,19 +644,16 @@ | |||
555 | 574 | >>> nl_term_candidates('how do I do this?') | 644 | >>> nl_term_candidates('how do I do this?') |
556 | 575 | [] | 645 | [] |
557 | 576 | 646 | ||
558 | 577 | We also handle expansion of hypenated words (like ftq does): | ||
559 | 578 | |||
560 | 579 | >>> nl_term_candidates('firefox foo-bar give me trouble') | ||
561 | 580 | [u'firefox', u'foo', u'bar', u'foobar', u'give', u'troubl'] | ||
562 | 581 | |||
563 | 582 | Except for the hyphenation character, all non-word caracters are ignored: | 647 | Except for the hyphenation character, all non-word caracters are ignored: |
564 | 583 | 648 | ||
565 | 584 | >>> nl_term_candidates( | 649 | >>> nl_term_candidates( |
567 | 585 | ... "Will the \'\'|\'\' character (inside a ''quoted'' string) work???") | 650 | ... "Will the \'\'|\'\' character (inside a ''quoted'' string) " |
568 | 651 | ... "work???") | ||
569 | 586 | [u'charact', u'insid', u'quot', u'string', u'work'] | 652 | [u'charact', u'insid', u'quot', u'string', u'work'] |
570 | 587 | 653 | ||
571 | 588 | 654 | ||
573 | 589 | === nl_phrase_search() === | 655 | nl_phrase_search() |
574 | 656 | ~~~~~~~~~~~~~~~~~~ | ||
575 | 590 | 657 | ||
576 | 591 | To get the actual tsearch2 query that should be run, you will use the | 658 | To get the actual tsearch2 query that should be run, you will use the |
577 | 592 | nl_phrase_search() function. This one takes two mandatory parameters and | 659 | nl_phrase_search() function. This one takes two mandatory parameters and |
578 | @@ -637,7 +704,8 @@ | |||
579 | 637 | u'slow|system' | 704 | u'slow|system' |
580 | 638 | 705 | ||
581 | 639 | 706 | ||
583 | 640 | ==== Using other constraints ==== | 707 | Using other constraints |
584 | 708 | ....................... | ||
585 | 641 | 709 | ||
586 | 642 | You can pass a third parameter to the function that will be use as | 710 | You can pass a third parameter to the function that will be use as |
587 | 643 | an additional constraint to determine the total number of rows that | 711 | an additional constraint to determine the total number of rows that |
588 | @@ -659,7 +727,8 @@ | |||
589 | 659 | 727 | ||
590 | 660 | >>> nl_phrase_search( | 728 | >>> nl_phrase_search( |
591 | 661 | ... 'firefox gets very slow on flickr', Question, | 729 | ... 'firefox gets very slow on flickr', Question, |
593 | 662 | ... "Question.product = %s AND Product.active = 't'" % firefox_product.id, | 730 | ... "Question.product = %s AND Product.active = 't'" |
594 | 731 | ... % firefox_product.id, | ||
595 | 663 | ... ['Product'], fast_enabled=False) | 732 | ... ['Product'], fast_enabled=False) |
596 | 664 | u'slow|flickr' | 733 | u'slow|flickr' |
597 | 665 | 734 | ||
598 | @@ -679,7 +748,8 @@ | |||
599 | 679 | u'(firefox&flickr&slow)|(flickr&slow)|(firefox&slow)|(firefox&flickr)' | 748 | u'(firefox&flickr&slow)|(flickr&slow)|(firefox&slow)|(firefox&flickr)' |
600 | 680 | 749 | ||
601 | 681 | 750 | ||
603 | 682 | ==== No keywords filtering with few rows ==== | 751 | No keywords filtering with few rows |
604 | 752 | ................................... | ||
605 | 683 | 753 | ||
606 | 684 | The 50% rule is really useful only when there are many rows. When there | 754 | The 50% rule is really useful only when there are many rows. When there |
607 | 685 | only very few rows, that keyword elimination becomes a problem since | 755 | only very few rows, that keyword elimination becomes a problem since |
r=self: See the linked MP for the real review.