Merge lp:~cyphermox/ubuntu/oneiric/xapian-core/lp833172 into lp:ubuntu/oneiric/xapian-core

Proposed by Mathieu Trudel-Lapierre
Status: Merged
Merged at revision: 21
Proposed branch: lp:~cyphermox/ubuntu/oneiric/xapian-core/lp833172
Merge into: lp:ubuntu/oneiric/xapian-core
Diff against target: 726 lines (+689/-2)
4 files modified
debian/changelog (+7/-0)
debian/control (+2/-1)
debian/control.in (+2/-1)
debian/patches/cjk-ngram-applied-to-1.2-branch.patch (+678/-0)
To merge this branch: bzr merge lp:~cyphermox/ubuntu/oneiric/xapian-core/lp833172
Reviewer Review Type Date Requested Status
Ubuntu Sponsors Pending
Review via email: mp+72821@code.launchpad.net
To post a comment you must log in.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== modified file 'debian/changelog'
--- debian/changelog 2011-04-06 02:19:10 +0000
+++ debian/changelog 2011-08-25 03:14:24 +0000
@@ -1,3 +1,10 @@
1xapian-core (1.2.5-1ubuntu1) UNRELEASED; urgency=low
2
3 * debian/patches/cjk-ngram-applied-to-1.2-branch.patch: add support for CJK
4 input methods by adding a tokenizer for CJK. (LP: #833172)
5
6 -- Mathieu Trudel-Lapierre <mathieu-tl@ubuntu.com> Wed, 24 Aug 2011 19:29:01 -0400
7
1xapian-core (1.2.5-1) unstable; urgency=low8xapian-core (1.2.5-1) unstable; urgency=low
29
3 * New upstream release.10 * New upstream release.
411
=== modified file 'debian/control'
--- debian/control 2010-08-24 11:18:50 +0000
+++ debian/control 2011-08-25 03:14:24 +0000
@@ -1,7 +1,8 @@
1Source: xapian-core1Source: xapian-core
2Section: libs2Section: libs
3Priority: optional3Priority: optional
4Maintainer: Olly Betts <olly@survex.com>4Maintainer: Ubuntu Developers <ubuntu-devel-discuss@lists.ubuntu.com>
5XSBC-Original-Maintainer: Olly Betts <olly@survex.com>
5Standards-Version: 3.9.16Standards-Version: 3.9.1
6Build-Depends: debhelper (>= 7), autotools-dev, zlib1g-dev, uuid-dev7Build-Depends: debhelper (>= 7), autotools-dev, zlib1g-dev, uuid-dev
7Homepage: http://xapian.org/8Homepage: http://xapian.org/
89
=== modified file 'debian/control.in'
--- debian/control.in 2010-08-24 11:18:50 +0000
+++ debian/control.in 2011-08-25 03:14:24 +0000
@@ -1,7 +1,8 @@
1Source: xapian-core1Source: xapian-core
2Section: libs2Section: libs
3Priority: optional3Priority: optional
4Maintainer: Olly Betts <olly@survex.com>4Maintainer: Ubuntu Developers <ubuntu-devel-discuss@lists.ubuntu.com>
5XSBC-Original-Maintainer: Olly Betts <olly@survex.com>
5Standards-Version: 3.9.16Standards-Version: 3.9.1
6Build-Depends: @BUILD_DEPS@ autotools-dev, zlib1g-dev, uuid-dev7Build-Depends: @BUILD_DEPS@ autotools-dev, zlib1g-dev, uuid-dev
7Homepage: http://xapian.org/8Homepage: http://xapian.org/
89
=== added directory 'debian/patches'
=== added file 'debian/patches/cjk-ngram-applied-to-1.2-branch.patch'
--- debian/patches/cjk-ngram-applied-to-1.2-branch.patch 1970-01-01 00:00:00 +0000
+++ debian/patches/cjk-ngram-applied-to-1.2-branch.patch 2011-08-25 03:14:24 +0000
@@ -0,0 +1,678 @@
1Origin: http://trac.xapian.org/attachment/ticket/180/cjk-ngram-applied-to-1.2-branch.patch
2Subject: Add support for CJK text to queryparser and termgenerator
3Bug-Ubuntu: https://bugs.launchpad.net/ubuntu/+source/xapian-core/+bug/833172
4Bug: http://trac.xapian.org/ticket/180
5Last-Update: 2011-08-24
6
7Index: xapian-core/queryparser/Makefile.mk
8===================================================================
9--- xapian-core.orig/queryparser/Makefile.mk 2011-08-24 19:09:38.000000000 -0400
10+++ xapian-core/queryparser/Makefile.mk 2011-08-24 19:39:30.756055473 -0400
11@@ -5,6 +5,7 @@
12 endif
13
14 noinst_HEADERS +=\
15+ queryparser/cjk-tokenizer.h\
16 queryparser/queryparser_internal.h\
17 queryparser/queryparser_token.h\
18 queryparser/termgenerator_internal.h
19@@ -57,6 +58,7 @@
20 endif
21
22 lib_src +=\
23+ queryparser/cjk-tokenizer.cc\
24 queryparser/queryparser.cc\
25 queryparser/queryparser_internal.cc\
26 queryparser/termgenerator.cc\
27Index: xapian-core/queryparser/cjk-tokenizer.cc
28===================================================================
29--- /dev/null 1970-01-01 00:00:00.000000000 +0000
30+++ xapian-core/queryparser/cjk-tokenizer.cc 2011-08-24 19:39:30.756055473 -0400
31@@ -0,0 +1,124 @@
32+/** @file cjk-tokenizer.cc
33+ * @brief Tokenise CJK text as n-grams
34+ */
35+/* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com)
36+ * Copyright (c) 2011 Richard Boulton (richard@tartarus.org)
37+ * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com)
38+ * Copyright (c) 2011 Olly Betts
39+ *
40+ * Permission is hereby granted, free of charge, to any person obtaining a copy
41+ * of this software and associated documentation files (the "Software"), to deal
42+ * deal in the Software without restriction, including without limitation the
43+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
44+ * sell copies of the Software, and to permit persons to whom the Software is
45+ * furnished to do so, subject to the following conditions:
46+ *
47+ * The above copyright notice and this permission notice shall be included in
48+ * all copies or substantial portions of the Software.
49+ *
50+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
51+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
52+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
53+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
54+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
55+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
56+ * IN THE SOFTWARE.
57+ */
58+
59+#include <config.h>
60+
61+#include "cjk-tokenizer.h"
62+
63+#include "omassert.h"
64+#include "xapian/unicode.h"
65+
66+#include <cstdlib>
67+#include <string>
68+
69+using namespace std;
70+
71+static unsigned NGRAM_SIZE = 2;
72+
73+bool
74+CJK::is_cjk_enabled()
75+{
76+ const char * p;
77+ static bool result = ((p = getenv("XAPIAN_CJK_NGRAM")) != NULL && *p);
78+ return result;
79+}
80+
81+// 2E80..2EFF; CJK Radicals Supplement
82+// 3000..303F; CJK Symbols and Punctuation
83+// 3040..309F; Hiragana
84+// 30A0..30FF; Katakana
85+// 3100..312F; Bopomofo
86+// 3130..318F; Hangul Compatibility Jamo
87+// 3190..319F; Kanbun
88+// 31A0..31BF; Bopomofo Extended
89+// 31C0..31EF; CJK Strokes
90+// 31F0..31FF; Katakana Phonetic Extensions
91+// 3200..32FF; Enclosed CJK Letters and Months
92+// 3300..33FF; CJK Compatibility
93+// 3400..4DBF; CJK Unified Ideographs Extension A
94+// 4DC0..4DFF; Yijing Hexagram Symbols
95+// 4E00..9FFF; CJK Unified Ideographs
96+// A700..A71F; Modifier Tone Letters
97+// AC00..D7AF; Hangul Syllables
98+// F900..FAFF; CJK Compatibility Ideographs
99+// FE30..FE4F; CJK Compatibility Forms
100+// FF00..FFEF; Halfwidth and Fullwidth Forms
101+// 20000..2A6DF; CJK Unified Ideographs Extension B
102+// 2F800..2FA1F; CJK Compatibility Ideographs Supplement
103+bool
104+CJK::codepoint_is_cjk(unsigned p)
105+{
106+ if (p < 0x2E80) return false;
107+ return ((p >= 0x2E80 && p <= 0x2EFF) ||
108+ (p >= 0x3000 && p <= 0x9FFF) ||
109+ (p >= 0xA700 && p <= 0xA71F) ||
110+ (p >= 0xAC00 && p <= 0xD7AF) ||
111+ (p >= 0xF900 && p <= 0xFAFF) ||
112+ (p >= 0xFE30 && p <= 0xFE4F) ||
113+ (p >= 0xFF00 && p <= 0xFFEF) ||
114+ (p >= 0x20000 && p <= 0x2A6DF) ||
115+ (p >= 0x2F800 && p <= 0x2FA1F));
116+}
117+
118+string
119+CJK::get_cjk(Xapian::Utf8Iterator &it)
120+{
121+ string str;
122+ while (it != Xapian::Utf8Iterator() && codepoint_is_cjk(*it)) {
123+ Xapian::Unicode::append_utf8(str, *it);
124+ ++it;
125+ }
126+ return str;
127+}
128+
129+const string &
130+CJKTokenIterator::operator*() const
131+{
132+ if (current_token.empty()) {
133+ Assert(it != Xapian::Utf8Iterator());
134+ p = it;
135+ Xapian::Unicode::append_utf8(current_token, *p);
136+ ++p;
137+ len = 1;
138+ }
139+ return current_token;
140+}
141+
142+CJKTokenIterator &
143+CJKTokenIterator::operator++()
144+{
145+ if (len < NGRAM_SIZE && p != Xapian::Utf8Iterator()) {
146+ Xapian::Unicode::append_utf8(current_token, *p);
147+ ++p;
148+ ++len;
149+ } else {
150+ Assert(it != Xapian::Utf8Iterator());
151+ ++it;
152+ current_token.resize(0);
153+ }
154+ return *this;
155+}
156Index: xapian-core/queryparser/queryparser.lemony
157===================================================================
158--- xapian-core.orig/queryparser/queryparser.lemony 2011-08-24 19:09:38.000000000 -0400
159+++ xapian-core/queryparser/queryparser.lemony 2011-08-24 19:39:30.756055473 -0400
160@@ -31,6 +31,8 @@
161 // Include the list of token values lemon generates.
162 #include "queryparser_token.h"
163
164+#include "cjk-tokenizer.h"
165+
166 #include <algorithm>
167 #include <list>
168 #include <string>
169@@ -133,6 +135,8 @@
170 }
171 };
172
173+class Terms;
174+
175 /** Class used to pass information about a token from lexer to parser.
176 *
177 * Generally an instance of this class carries term information, but it can be
178@@ -189,6 +193,12 @@
179 */
180 Query * as_partial_query(State * state_) const;
181
182+ /** Build a query for a string of CJK characters. */
183+ Query * as_cjk_query() const;
184+
185+ /** Handle a CJK character string in a positional context. */
186+ void as_positional_cjk_term(Terms * terms) const;
187+
188 /// Value range query.
189 Query as_value_range_query() const;
190
191@@ -413,6 +423,24 @@
192 return q;
193 }
194
195+Query *
196+Term::as_cjk_query() const
197+{
198+ vector<Query> prefix_cjk;
199+ const list<string> & prefixes = prefix_info->prefixes;
200+ list<string>::const_iterator piter;
201+ for (CJKTokenIterator tk(name); tk != CJKTokenIterator(); ++tk) {
202+ for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
203+ string cjk = *piter;
204+ cjk += *tk;
205+ prefix_cjk.push_back(Query(cjk, 1, pos));
206+ }
207+ }
208+ Query * q = new Query(Query::OP_AND, prefix_cjk.begin(), prefix_cjk.end());
209+ delete this;
210+ return q;
211+}
212+
213 Query
214 Term::as_value_range_query() const
215 {
216@@ -520,6 +548,7 @@
217
218 string
219 QueryParser::Internal::parse_term(Utf8Iterator &it, const Utf8Iterator &end,
220+ bool cjk_ngram, bool & is_cjk_term,
221 bool &was_acronym)
222 {
223 string term;
224@@ -545,10 +574,16 @@
225 }
226 was_acronym = !term.empty();
227
228+ if (cjk_ngram && term.empty() && CJK::codepoint_is_cjk(*it)) {
229+ term = CJK::get_cjk(it);
230+ is_cjk_term = true;
231+ }
232+
233 if (term.empty()) {
234 unsigned prevch = *it;
235 Unicode::append_utf8(term, prevch);
236 while (++it != end) {
237+ if (cjk_ngram && CJK::codepoint_is_cjk(*it)) break;
238 unsigned ch = *it;
239 if (!is_wordchar(ch)) {
240 // Treat a single embedded '&' or "'" or similar as a word
241@@ -617,6 +652,8 @@
242 QueryParser::Internal::parse_query(const string &qs, unsigned flags,
243 const string &default_prefix)
244 {
245+ bool cjk_ngram = CJK::is_cjk_enabled();
246+
247 // Set value_ranges if we may have to handle value ranges in the query.
248 bool value_ranges;
249 value_ranges = !valrangeprocs.empty() && (qs.find("..") != string::npos);
250@@ -958,7 +995,8 @@
251
252 phrased_term:
253 bool was_acronym;
254- string term = parse_term(it, end, was_acronym);
255+ bool is_cjk_term = false;
256+ string term = parse_term(it, end, cjk_ngram, is_cjk_term, was_acronym);
257
258 // Boolean operators.
259 if ((mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) &&
260@@ -1058,6 +1096,12 @@
261 Term * term_obj = new Term(&state, term, prefix_info,
262 unstemmed_term, stem_term, term_pos++);
263
264+ if (is_cjk_term) {
265+ Parse(pParser, CJKTERM, term_obj, &state);
266+ if (it == end) break;
267+ continue;
268+ }
269+
270 if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) {
271 if (it != end) {
272 if ((flags & FLAG_WILDCARD) && *it == '*') {
273@@ -1526,6 +1570,23 @@
274 }
275 };
276
277+void
278+Term::as_positional_cjk_term(Terms * terms) const
279+{
280+ // Add each individual CJK character to the phrase.
281+ string t;
282+ for (Utf8Iterator it(name); it != Utf8Iterator(); ++it) {
283+ Unicode::append_utf8(t, *it);
284+ Term * c = new Term(state, t, prefix_info, unstemmed, stem, pos);
285+ terms->add_positional_term(c);
286+ t.resize(0);
287+ }
288+
289+ // FIXME: we want to add the n-grams as filters too for efficiency.
290+
291+ delete this;
292+}
293+
294 // Helper macro for converting a boolean operation into a Xapian::Query.
295 #define BOOL_OP_TO_QUERY(E, A, OP, B, OP_TXT) \
296 do {\
297@@ -1909,6 +1970,10 @@
298 delete U;
299 }
300
301+compound_term(T) ::= CJKTERM(U). {
302+ { T = U->as_cjk_query(); }
303+}
304+
305 // phrase - The "inside the quotes" part of a double-quoted phrase.
306
307 %type phrase {Terms *}
308@@ -1920,11 +1985,21 @@
309 P->add_positional_term(T);
310 }
311
312+phrase(P) ::= CJKTERM(T). {
313+ P = new Terms;
314+ T->as_positional_cjk_term(P);
315+}
316+
317 phrase(P) ::= phrase(Q) TERM(T). {
318 P = Q;
319 P->add_positional_term(T);
320 }
321
322+phrase(P) ::= phrase(Q) CJKTERM(T). {
323+ P = Q;
324+ T->as_positional_cjk_term(P);
325+}
326+
327 // phrased_term - A phrased term works like a single term, but is actually
328 // 2 or more terms linked together into a phrase by punctuation. There must be
329 // at least 2 terms in order to be able to have punctuation between the terms!
330Index: xapian-core/queryparser/queryparser_internal.h
331===================================================================
332--- xapian-core.orig/queryparser/queryparser_internal.h 2011-08-24 19:09:38.000000000 -0400
333+++ xapian-core/queryparser/queryparser_internal.h 2011-08-24 19:40:08.916055546 -0400
334@@ -1,7 +1,7 @@
335 /* queryparser_internal.h: The non-lemon-generated parts of the QueryParser
336 * class.
337 *
338- * Copyright (C) 2005,2006,2007,2010 Olly Betts
339+ * Copyright (C) 2005,2006,2007,2010,2011 Olly Betts
340 *
341 * This program is free software; you can redistribute it and/or
342 * modify it under the terms of the GNU General Public License as
343@@ -80,6 +80,7 @@
344 filter_type type);
345
346 std::string parse_term(Utf8Iterator &it, const Utf8Iterator &end,
347+ bool cjk_ngram, bool &is_cjk_term,
348 bool &was_acronym);
349
350 public:
351Index: xapian-core/queryparser/cjk-tokenizer.h
352===================================================================
353--- /dev/null 1970-01-01 00:00:00.000000000 +0000
354+++ xapian-core/queryparser/cjk-tokenizer.h 2011-08-24 19:39:30.756055473 -0400
355@@ -0,0 +1,94 @@
356+/** @file cjk-tokenizer.h
357+ * @brief Tokenise CJK text as n-grams
358+ */
359+/* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com)
360+ * Copyright (c) 2011 Richard Boulton (richard@tartarus.org)
361+ * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com)
362+ * Copyright (c) 2011 Olly Betts
363+ *
364+ * Permission is hereby granted, free of charge, to any person obtaining a copy
365+ * of this software and associated documentation files (the "Software"), to deal
366+ * deal in the Software without restriction, including without limitation the
367+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
368+ * sell copies of the Software, and to permit persons to whom the Software is
369+ * furnished to do so, subject to the following conditions:
370+ *
371+ * The above copyright notice and this permission notice shall be included in
372+ * all copies or substantial portions of the Software.
373+ *
374+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
375+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
376+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
377+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
378+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
379+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
380+ * IN THE SOFTWARE.
381+ */
382+
383+#ifndef XAPIAN_INCLUDED_CJK_TOKENIZER_H
384+#define XAPIAN_INCLUDED_CJK_TOKENIZER_H
385+
386+#include "xapian/unicode.h"
387+
388+#include <string>
389+
390+namespace CJK {
391+
392+/** Should we use the CJK n-gram code?
393+ *
394+ * The first time this is called it reads the environment variable
395+ * XAPIAN_CJK_NGRAM and caches whether it is set to a non-empty value.
396+ * That call and all subsequent calls return the cached result.
397+ */
398+bool is_cjk_enabled();
399+
400+bool codepoint_is_cjk(unsigned codepoint);
401+
402+std::string get_cjk(Xapian::Utf8Iterator &it);
403+
404+}
405+
406+class CJKTokenIterator {
407+ Xapian::Utf8Iterator it;
408+
409+ mutable Xapian::Utf8Iterator p;
410+
411+ mutable unsigned len;
412+
413+ mutable std::string current_token;
414+
415+ public:
416+ CJKTokenIterator(const std::string & s)
417+ : it(s) { }
418+
419+ CJKTokenIterator(const Xapian::Utf8Iterator & it_)
420+ : it(it_) { }
421+
422+ CJKTokenIterator()
423+ : it() { }
424+
425+ const std::string & operator*() const;
426+
427+ CJKTokenIterator & operator++();
428+
429+ /// Get the length of the current token in Unicode characters.
430+ unsigned get_length() const { return len; }
431+
432+ friend bool operator==(const CJKTokenIterator &, const CJKTokenIterator &);
433+};
434+
435+inline bool
436+operator==(const CJKTokenIterator & a, const CJKTokenIterator & b)
437+{
438+ // We only really care about comparisons where one or the other is an end
439+ // iterator.
440+ return a.it == b.it;
441+}
442+
443+inline bool
444+operator!=(const CJKTokenIterator & a, const CJKTokenIterator & b)
445+{
446+ return !(a == b);
447+}
448+
449+#endif // XAPIAN_INCLUDED_CJK_TOKENIZER_H
450Index: xapian-core/queryparser/termgenerator_internal.cc
451===================================================================
452--- xapian-core.orig/queryparser/termgenerator_internal.cc 2011-08-24 19:09:38.000000000 -0400
453+++ xapian-core/queryparser/termgenerator_internal.cc 2011-08-24 19:39:30.766055473 -0400
454@@ -1,7 +1,7 @@
455 /** @file termgenerator_internal.cc
456 * @brief TermGenerator class internals
457 */
458-/* Copyright (C) 2007,2010 Olly Betts
459+/* Copyright (C) 2007,2010,2011 Olly Betts
460 *
461 * This program is free software; you can redistribute it and/or modify
462 * it under the terms of the GNU General Public License as published by
463@@ -30,6 +30,8 @@
464
465 #include <string>
466
467+#include "cjk-tokenizer.h"
468+
469 using namespace std;
470
471 namespace Xapian {
472@@ -126,6 +128,8 @@
473 TermGenerator::Internal::index_text(Utf8Iterator itor, termcount weight,
474 const string & prefix, bool with_positions)
475 {
476+ bool cjk_ngram = CJK::is_cjk_enabled();
477+
478 int stop_mode = STOPWORDS_INDEX_UNSTEMMED_ONLY;
479
480 if (!stopper) stop_mode = STOPWORDS_NONE;
481@@ -163,11 +167,53 @@
482 }
483
484 while (true) {
485+ if (cjk_ngram && CJK::codepoint_is_cjk(*itor)) {
486+ const string & cjk = CJK::get_cjk(itor);
487+ for (CJKTokenIterator tk(cjk); tk != CJKTokenIterator(); ++tk) {
488+ const string & cjk_token = *tk;
489+ if (cjk_token.size() > MAX_PROB_TERM_LENGTH) continue;
490+
491+ if (stop_mode == STOPWORDS_IGNORE && (*stopper)(cjk_token))
492+ continue;
493+
494+ if (with_positions && tk.get_length() == 1) {
495+ doc.add_posting(prefix + cjk_token, ++termpos, wdf_inc);
496+ } else {
497+ doc.add_term(prefix + cjk_token, wdf_inc);
498+ }
499+ if ((flags & FLAG_SPELLING) && prefix.empty())
500+ db.add_spelling(cjk_token);
501+
502+ if (!stemmer.internal.get()) continue;
503+
504+ if (stop_mode == STOPWORDS_INDEX_UNSTEMMED_ONLY &&
505+ (*stopper)(cjk_token))
506+ continue;
507+
508+ // Note, this uses the lowercased term, but that's OK as we
509+ // only want to avoid stemming terms starting with a digit.
510+ if (!should_stem(cjk_token)) continue;
511+
512+ // Add stemmed form without positional information.
513+ string stem("Z");
514+ stem += prefix;
515+ stem += stemmer(cjk_token);
516+ doc.add_term(stem, wdf_inc);
517+ }
518+ while (true) {
519+ if (itor == Utf8Iterator()) return;
520+ ch = check_wordchar(*itor);
521+ if (ch) break;
522+ ++itor;
523+ }
524+ }
525 unsigned prevch;
526 do {
527 Unicode::append_utf8(term, ch);
528 prevch = ch;
529- if (++itor == Utf8Iterator()) goto endofterm;
530+ if (++itor == Utf8Iterator() ||
531+ (cjk_ngram && CJK::codepoint_is_cjk(*itor)))
532+ goto endofterm;
533 ch = check_wordchar(*itor);
534 } while (ch);
535
536Index: xapian-core/tests/termgentest.cc
537===================================================================
538--- xapian-core.orig/tests/termgentest.cc 2011-08-24 19:09:38.000000000 -0400
539+++ xapian-core/tests/termgentest.cc 2011-08-24 19:39:30.766055473 -0400
540@@ -31,6 +31,8 @@
541 #include "testutils.h"
542 #include "utils.h"
543
544+#include <stdlib.h> // For setenv() or putenv()
545+
546 using namespace std;
547
548 #define TESTCASE(S) {#S, test_##S}
549@@ -106,12 +108,26 @@
550 "Z\xe1\x80\x9d\xe1\x80\xae\xe1\x80\x80\xe1\x80\xae\xe1\x80\x95\xe1\x80\xad\xe1\x80\x9e\xe1\x80\xaf\xe1\x80\xb6\xe1\x80\xb8\xe1\x80\x85\xe1\x80\xbd\xe1\x80\xb2\xe1\x80\x9e\xe1\x80\xb0\xe1\x80\x99\xe1\x80\xbb\xe1\x80\xac\xe1\x80\xb8\xe1\x80\x80:1 \xe1\x80\x9d\xe1\x80\xae\xe1\x80\x80\xe1\x80\xae\xe1\x80\x95\xe1\x80\xad\xe1\x80\x9e\xe1\x80\xaf\xe1\x80\xb6\xe1\x80\xb8\xe1\x80\x85\xe1\x80\xbd\xe1\x80\xb2\xe1\x80\x9e\xe1\x80\xb0\xe1\x80\x99\xe1\x80\xbb\xe1\x80\xac\xe1\x80\xb8\xe1\x80\x80[1]" },
551
552 { "", "fish+chips", "Zchip:1 Zfish:1 chips[2] fish[1]" },
553+
554+ // Basic CJK tests:
555+ { "stem=", "久有归天", "久[1] 久有:1 天[4] 归[3] 归天:1 有[2] 有归:1" },
556+ { "", "극지라", "극[1] 극지:1 라[3] 지[2] 지라:1" },
557+ { "", "ウルス アップ", "ア[4] ウ[1] ウル:1 ス[3] ッ[5] ップ:1 プ[6] ル[2] ルス:1" },
558+
559+ // CJK with prefix:
560+ { "prefix=XA", "发送从", "XA从[3] XA发[1] XA发送:1 XA送[2] XA送从:1" },
561+ { "prefix=XA", "点卡思考", "XA卡[2] XA卡思:1 XA思[3] XA思考:1 XA点[1] XA点卡:1 XA考[4]" },
562+
563+ // CJK mixed with non-CJK:
564+ { "prefix=", "インtestタ", "test[3] イ[1] イン:1 タ[4] ン[2]" },
565+ { "", "配this is合a个 test!", "a[5] is[3] test[7] this[2] 个[6] 合[4] 配[1]" },
566+
567 // All following tests are for things which we probably don't really want to
568 // behave as they currently do, but we haven't found a sufficiently general
569 // way to implement them yet.
570
571 // Test number like things
572- { "", "11:59", "11[1] 59[2]" },
573+ { "stem=en", "11:59", "11[1] 59[2]" },
574 { "", "11:59am", "11[1] 59am[2]" },
575
576 { NULL, NULL, NULL }
577@@ -770,6 +786,14 @@
578
579 int main(int argc, char **argv)
580 try {
581+ // FIXME: It would be better to test with and without XAPIAN_CJK_NGRAM set.
582+#ifdef __WIN32__
583+ _putenv_s("XAPIAN_CJK_NGRAM", "1");
584+#elif defined HAVE_SETENV
585+ setenv("XAPIAN_CJK_NGRAM", "1", 1);
586+#else
587+ putenv(const_cast<char*>("XAPIAN_CJK_NGRAM=1"));
588+#endif
589 test_driver::parse_command_line(argc, argv);
590 return test_driver::run(tests);
591 } catch (const char * e) {
592Index: xapian-core/tests/queryparsertest.cc
593===================================================================
594--- xapian-core.orig/tests/queryparsertest.cc 2011-08-24 19:09:38.000000000 -0400
595+++ xapian-core/tests/queryparsertest.cc 2011-08-24 19:39:30.766055473 -0400
596@@ -33,6 +33,8 @@
597 #include <string>
598 #include <vector>
599
600+#include <stdlib.h> // For setenv() or putenv()
601+
602 using namespace std;
603
604 #define TESTCASE(S) {#S, test_##S}
605@@ -639,6 +641,17 @@
606 { "multisite:xapian.org site:www.xapian.org author:richard authortitle:richard", "((ZArichard:(pos=1) OR ZArichard:(pos=2) OR ZXTrichard:(pos=2)) FILTER (Hwww.xapian.org AND (Hxapian.org OR Jxapian.org)))"},
607 { "authortitle:richard-boulton", "((Arichard:(pos=1) PHRASE 2 Aboulton:(pos=2)) OR (XTrichard:(pos=1) PHRASE 2 XTboulton:(pos=2)))"},
608 { "authortitle:\"richard boulton\"", "((Arichard:(pos=1) PHRASE 2 Aboulton:(pos=2)) OR (XTrichard:(pos=1) PHRASE 2 XTboulton:(pos=2)))"},
609+ // Some CJK tests.
610+ { "久有归天愿", "(久:(pos=1) AND 久有:(pos=1) AND 有:(pos=1) AND 有归:(pos=1) AND 归:(pos=1) AND 归天:(pos=1) AND 天:(pos=1) AND 天愿:(pos=1) AND 愿:(pos=1))" },
611+ { "title:久有 归 天愿", "((XT久:(pos=1) AND XT久有:(pos=1) AND XT有:(pos=1)) OR 归:(pos=2) OR (天:(pos=3) AND 天愿:(pos=3) AND 愿:(pos=3)))" },
612+ { "h众ello万众", "(Zh:(pos=1) OR 众:(pos=2) OR Zello:(pos=3) OR (万:(pos=4) AND 万众:(pos=4) AND 众:(pos=4)))" },
613+ { "世(の中)TEST_tm", "(世:(pos=1) OR (の:(pos=2) AND の中:(pos=2) AND 中:(pos=2)) OR test_tm:(pos=3))" },
614+ { "다녀 AND 와야", "(다:(pos=1) AND 다녀:(pos=1) AND 녀:(pos=1) AND 와:(pos=2) AND 와야:(pos=2) AND 야:(pos=2))" },
615+ { "authortitle:학술 OR 연구를", "((A학:(pos=1) AND XT학:(pos=1) AND A학술:(pos=1) AND XT학술:(pos=1) AND A술:(pos=1) AND XT술:(pos=1)) OR (연:(pos=2) AND 연구:(pos=2) AND 구:(pos=2) AND 구를:(pos=2) AND 를:(pos=2)))" },
616+ // FIXME: These should really filter by bigrams to accelerate:
617+ { "\"久有归\"", "(久:(pos=1) PHRASE 3 有:(pos=1) PHRASE 3 归:(pos=1))" },
618+ { "\"久有test归\"", "(久:(pos=1) PHRASE 4 有:(pos=1) PHRASE 4 test:(pos=2) PHRASE 4 归:(pos=3))" },
619+ // FIXME: this should work: { "久 NEAR 有", "(久:(pos=1) NEAR 11 有:(pos=2))" },
620 { NULL, NULL }
621 };
622
623@@ -709,6 +722,9 @@
624 // Add coverage for other cases similar to the above.
625 { "a b site:xapian.org", "((Za:(pos=1) AND Zb:(pos=2)) FILTER Hxapian.org)" },
626 { "site:xapian.org a b", "((Za:(pos=1) AND Zb:(pos=2)) FILTER Hxapian.org)" },
627+ // Some CJK tests.
628+ { "author:험가 OR subject:万众 hello world!", "((A험:(pos=1) AND A험가:(pos=1) AND A가:(pos=1)) OR (XT万:(pos=2) AND XT万众:(pos=2) AND XT众:(pos=2) AND Zhello:(pos=3) AND Zworld:(pos=4)))" },
629+ { "洛伊one儿差点two脸three", "(洛:(pos=1) AND 洛伊:(pos=1) AND 伊:(pos=1) AND Zone:(pos=2) AND 儿:(pos=3) AND 儿差:(pos=3) AND 差:(pos=3) AND 差点:(pos=3) AND 点:(pos=3) AND Ztwo:(pos=4) AND 脸:(pos=5) AND Zthree:(pos=6))" },
630 { NULL, NULL }
631 };
632
633@@ -761,6 +777,8 @@
634 TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((ZAme:(pos=1) OR ZXTstuff:(pos=2)))");
635 qobj = qp.parse_query("title:(stuff) me", Xapian::QueryParser::FLAG_BOOLEAN, "A");
636 TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((ZXTstuff:(pos=1) OR ZAme:(pos=2)))");
637+ qobj = qp.parse_query("英国 title:文森hello", 0, "A");
638+ TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query(((A英:(pos=1) AND A英国:(pos=1) AND A国:(pos=1)) OR (XT文:(pos=2) AND XT文森:(pos=2) AND XT森:(pos=2)) OR ZAhello:(pos=3)))");
639 return true;
640 }
641
642@@ -2441,6 +2459,14 @@
643
644 int main(int argc, char **argv)
645 try {
646+ // FIXME: It would be better to test with and without XAPIAN_CJK_NGRAM set.
647+#ifdef __WIN32__
648+ _putenv_s("XAPIAN_CJK_NGRAM", "1");
649+#elif defined HAVE_SETENV
650+ setenv("XAPIAN_CJK_NGRAM", "1", 1);
651+#else
652+ putenv(const_cast<char*>("XAPIAN_CJK_NGRAM=1"));
653+#endif
654 test_driver::parse_command_line(argc, argv);
655 return test_driver::run(tests);
656 } catch (const char * e) {
657Index: xapian-core/ChangeLog
658===================================================================
659--- xapian-core.orig/ChangeLog 2011-08-24 19:09:38.000000000 -0400
660+++ xapian-core/ChangeLog 2011-08-24 19:42:18.056055791 -0400
661@@ -1,3 +1,17 @@
662+Wed Aug 24 14:25:21 GMT 2011 Olly Betts <olly@survex.com>
663+
664+ * Backport change from trunk:
665+ * queryparser/queryparser.lemony: Fix memory leak (caught by existing
666+ testcase queryparser1 when run under valgrind).
667+
668+Wed Aug 24 14:13:24 GMT 2011 Olly Betts <olly@survex.com>
669+
670+ * Backport change from trunk:
671+ * queryparser/,tests/queryparsertest.cc,tests/termgentest.cc: Add
672+ support for indexing and searching CJK text using n-grams. Currently
673+ this is only enabled if the environment variable XAPIAN_CJK_NGRAM is
674+ set to a non-empty value.
675+
676 Mon Apr 04 14:41:33 GMT 2011 Olly Betts <olly@survex.com>
677
678 * NEWS: Final update for 1.2.5.

Subscribers

People subscribed via source and target branches