Merge lp:~cyphermox/ubuntu/oneiric/xapian-core/lp833172 into lp:ubuntu/oneiric/xapian-core

Proposed by Mathieu Trudel-Lapierre
Status: Merged
Merged at revision: 21
Proposed branch: lp:~cyphermox/ubuntu/oneiric/xapian-core/lp833172
Merge into: lp:ubuntu/oneiric/xapian-core
Diff against target: 726 lines (+689/-2)
4 files modified
debian/changelog (+7/-0)
debian/control (+2/-1)
debian/control.in (+2/-1)
debian/patches/cjk-ngram-applied-to-1.2-branch.patch (+678/-0)
To merge this branch: bzr merge lp:~cyphermox/ubuntu/oneiric/xapian-core/lp833172
Reviewer Review Type Date Requested Status
Ubuntu Sponsors Pending
Review via email: mp+72821@code.launchpad.net
To post a comment you must log in.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'debian/changelog'
2--- debian/changelog 2011-04-06 02:19:10 +0000
3+++ debian/changelog 2011-08-25 03:14:24 +0000
4@@ -1,3 +1,10 @@
5+xapian-core (1.2.5-1ubuntu1) UNRELEASED; urgency=low
6+
7+ * debian/patches/cjk-ngram-applied-to-1.2-branch.patch: add support for CJK
8+ input methods by adding a tokenizer for CJK. (LP: #833172)
9+
10+ -- Mathieu Trudel-Lapierre <mathieu-tl@ubuntu.com> Wed, 24 Aug 2011 19:29:01 -0400
11+
12 xapian-core (1.2.5-1) unstable; urgency=low
13
14 * New upstream release.
15
16=== modified file 'debian/control'
17--- debian/control 2010-08-24 11:18:50 +0000
18+++ debian/control 2011-08-25 03:14:24 +0000
19@@ -1,7 +1,8 @@
20 Source: xapian-core
21 Section: libs
22 Priority: optional
23-Maintainer: Olly Betts <olly@survex.com>
24+Maintainer: Ubuntu Developers <ubuntu-devel-discuss@lists.ubuntu.com>
25+XSBC-Original-Maintainer: Olly Betts <olly@survex.com>
26 Standards-Version: 3.9.1
27 Build-Depends: debhelper (>= 7), autotools-dev, zlib1g-dev, uuid-dev
28 Homepage: http://xapian.org/
29
30=== modified file 'debian/control.in'
31--- debian/control.in 2010-08-24 11:18:50 +0000
32+++ debian/control.in 2011-08-25 03:14:24 +0000
33@@ -1,7 +1,8 @@
34 Source: xapian-core
35 Section: libs
36 Priority: optional
37-Maintainer: Olly Betts <olly@survex.com>
38+Maintainer: Ubuntu Developers <ubuntu-devel-discuss@lists.ubuntu.com>
39+XSBC-Original-Maintainer: Olly Betts <olly@survex.com>
40 Standards-Version: 3.9.1
41 Build-Depends: @BUILD_DEPS@ autotools-dev, zlib1g-dev, uuid-dev
42 Homepage: http://xapian.org/
43
44=== added directory 'debian/patches'
45=== added file 'debian/patches/cjk-ngram-applied-to-1.2-branch.patch'
46--- debian/patches/cjk-ngram-applied-to-1.2-branch.patch 1970-01-01 00:00:00 +0000
47+++ debian/patches/cjk-ngram-applied-to-1.2-branch.patch 2011-08-25 03:14:24 +0000
48@@ -0,0 +1,678 @@
49+Origin: http://trac.xapian.org/attachment/ticket/180/cjk-ngram-applied-to-1.2-branch.patch
50+Subject: Add support for CJK text to queryparser and termgenerator
51+Bug-Ubuntu: https://bugs.launchpad.net/ubuntu/+source/xapian-core/+bug/833172
52+Bug: http://trac.xapian.org/ticket/180
53+Last-Update: 2011-08-24
54+
55+Index: xapian-core/queryparser/Makefile.mk
56+===================================================================
57+--- xapian-core.orig/queryparser/Makefile.mk 2011-08-24 19:09:38.000000000 -0400
58++++ xapian-core/queryparser/Makefile.mk 2011-08-24 19:39:30.756055473 -0400
59+@@ -5,6 +5,7 @@
60+ endif
61+
62+ noinst_HEADERS +=\
63++ queryparser/cjk-tokenizer.h\
64+ queryparser/queryparser_internal.h\
65+ queryparser/queryparser_token.h\
66+ queryparser/termgenerator_internal.h
67+@@ -57,6 +58,7 @@
68+ endif
69+
70+ lib_src +=\
71++ queryparser/cjk-tokenizer.cc\
72+ queryparser/queryparser.cc\
73+ queryparser/queryparser_internal.cc\
74+ queryparser/termgenerator.cc\
75+Index: xapian-core/queryparser/cjk-tokenizer.cc
76+===================================================================
77+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
78++++ xapian-core/queryparser/cjk-tokenizer.cc 2011-08-24 19:39:30.756055473 -0400
79+@@ -0,0 +1,124 @@
80++/** @file cjk-tokenizer.cc
81++ * @brief Tokenise CJK text as n-grams
82++ */
83++/* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com)
84++ * Copyright (c) 2011 Richard Boulton (richard@tartarus.org)
85++ * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com)
86++ * Copyright (c) 2011 Olly Betts
87++ *
88++ * Permission is hereby granted, free of charge, to any person obtaining a copy
89++ * of this software and associated documentation files (the "Software"), to deal
90++ * in the Software without restriction, including without limitation the
91++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
92++ * sell copies of the Software, and to permit persons to whom the Software is
93++ * furnished to do so, subject to the following conditions:
94++ *
95++ * The above copyright notice and this permission notice shall be included in
96++ * all copies or substantial portions of the Software.
97++ *
98++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
99++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
100++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
101++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
102++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
103++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
104++ * IN THE SOFTWARE.
105++ */
106++
107++#include <config.h>
108++
109++#include "cjk-tokenizer.h"
110++
111++#include "omassert.h"
112++#include "xapian/unicode.h"
113++
114++#include <cstdlib>
115++#include <string>
116++
117++using namespace std;
118++
119++static unsigned NGRAM_SIZE = 2;
120++
121++bool
122++CJK::is_cjk_enabled()
123++{
124++ const char * p;
125++ static bool result = ((p = getenv("XAPIAN_CJK_NGRAM")) != NULL && *p);
126++ return result;
127++}
128++
129++// 2E80..2EFF; CJK Radicals Supplement
130++// 3000..303F; CJK Symbols and Punctuation
131++// 3040..309F; Hiragana
132++// 30A0..30FF; Katakana
133++// 3100..312F; Bopomofo
134++// 3130..318F; Hangul Compatibility Jamo
135++// 3190..319F; Kanbun
136++// 31A0..31BF; Bopomofo Extended
137++// 31C0..31EF; CJK Strokes
138++// 31F0..31FF; Katakana Phonetic Extensions
139++// 3200..32FF; Enclosed CJK Letters and Months
140++// 3300..33FF; CJK Compatibility
141++// 3400..4DBF; CJK Unified Ideographs Extension A
142++// 4DC0..4DFF; Yijing Hexagram Symbols
143++// 4E00..9FFF; CJK Unified Ideographs
144++// A700..A71F; Modifier Tone Letters
145++// AC00..D7AF; Hangul Syllables
146++// F900..FAFF; CJK Compatibility Ideographs
147++// FE30..FE4F; CJK Compatibility Forms
148++// FF00..FFEF; Halfwidth and Fullwidth Forms
149++// 20000..2A6DF; CJK Unified Ideographs Extension B
150++// 2F800..2FA1F; CJK Compatibility Ideographs Supplement
151++bool
152++CJK::codepoint_is_cjk(unsigned p)
153++{
154++ if (p < 0x2E80) return false;
155++ return ((p >= 0x2E80 && p <= 0x2EFF) ||
156++ (p >= 0x3000 && p <= 0x9FFF) ||
157++ (p >= 0xA700 && p <= 0xA71F) ||
158++ (p >= 0xAC00 && p <= 0xD7AF) ||
159++ (p >= 0xF900 && p <= 0xFAFF) ||
160++ (p >= 0xFE30 && p <= 0xFE4F) ||
161++ (p >= 0xFF00 && p <= 0xFFEF) ||
162++ (p >= 0x20000 && p <= 0x2A6DF) ||
163++ (p >= 0x2F800 && p <= 0x2FA1F));
164++}
165++
166++string
167++CJK::get_cjk(Xapian::Utf8Iterator &it)
168++{
169++ string str;
170++ while (it != Xapian::Utf8Iterator() && codepoint_is_cjk(*it)) {
171++ Xapian::Unicode::append_utf8(str, *it);
172++ ++it;
173++ }
174++ return str;
175++}
176++
177++const string &
178++CJKTokenIterator::operator*() const
179++{
180++ if (current_token.empty()) {
181++ Assert(it != Xapian::Utf8Iterator());
182++ p = it;
183++ Xapian::Unicode::append_utf8(current_token, *p);
184++ ++p;
185++ len = 1;
186++ }
187++ return current_token;
188++}
189++
190++CJKTokenIterator &
191++CJKTokenIterator::operator++()
192++{
193++ if (len < NGRAM_SIZE && p != Xapian::Utf8Iterator()) {
194++ Xapian::Unicode::append_utf8(current_token, *p);
195++ ++p;
196++ ++len;
197++ } else {
198++ Assert(it != Xapian::Utf8Iterator());
199++ ++it;
200++ current_token.resize(0);
201++ }
202++ return *this;
203++}
204+Index: xapian-core/queryparser/queryparser.lemony
205+===================================================================
206+--- xapian-core.orig/queryparser/queryparser.lemony 2011-08-24 19:09:38.000000000 -0400
207++++ xapian-core/queryparser/queryparser.lemony 2011-08-24 19:39:30.756055473 -0400
208+@@ -31,6 +31,8 @@
209+ // Include the list of token values lemon generates.
210+ #include "queryparser_token.h"
211+
212++#include "cjk-tokenizer.h"
213++
214+ #include <algorithm>
215+ #include <list>
216+ #include <string>
217+@@ -133,6 +135,8 @@
218+ }
219+ };
220+
221++class Terms;
222++
223+ /** Class used to pass information about a token from lexer to parser.
224+ *
225+ * Generally an instance of this class carries term information, but it can be
226+@@ -189,6 +193,12 @@
227+ */
228+ Query * as_partial_query(State * state_) const;
229+
230++ /** Build a query for a string of CJK characters. */
231++ Query * as_cjk_query() const;
232++
233++ /** Handle a CJK character string in a positional context. */
234++ void as_positional_cjk_term(Terms * terms) const;
235++
236+ /// Value range query.
237+ Query as_value_range_query() const;
238+
239+@@ -413,6 +423,24 @@
240+ return q;
241+ }
242+
243++Query *
244++Term::as_cjk_query() const
245++{
246++ vector<Query> prefix_cjk;
247++ const list<string> & prefixes = prefix_info->prefixes;
248++ list<string>::const_iterator piter;
249++ for (CJKTokenIterator tk(name); tk != CJKTokenIterator(); ++tk) {
250++ for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
251++ string cjk = *piter;
252++ cjk += *tk;
253++ prefix_cjk.push_back(Query(cjk, 1, pos));
254++ }
255++ }
256++ Query * q = new Query(Query::OP_AND, prefix_cjk.begin(), prefix_cjk.end());
257++ delete this;
258++ return q;
259++}
260++
261+ Query
262+ Term::as_value_range_query() const
263+ {
264+@@ -520,6 +548,7 @@
265+
266+ string
267+ QueryParser::Internal::parse_term(Utf8Iterator &it, const Utf8Iterator &end,
268++ bool cjk_ngram, bool & is_cjk_term,
269+ bool &was_acronym)
270+ {
271+ string term;
272+@@ -545,10 +574,16 @@
273+ }
274+ was_acronym = !term.empty();
275+
276++ if (cjk_ngram && term.empty() && CJK::codepoint_is_cjk(*it)) {
277++ term = CJK::get_cjk(it);
278++ is_cjk_term = true;
279++ }
280++
281+ if (term.empty()) {
282+ unsigned prevch = *it;
283+ Unicode::append_utf8(term, prevch);
284+ while (++it != end) {
285++ if (cjk_ngram && CJK::codepoint_is_cjk(*it)) break;
286+ unsigned ch = *it;
287+ if (!is_wordchar(ch)) {
288+ // Treat a single embedded '&' or "'" or similar as a word
289+@@ -617,6 +652,8 @@
290+ QueryParser::Internal::parse_query(const string &qs, unsigned flags,
291+ const string &default_prefix)
292+ {
293++ bool cjk_ngram = CJK::is_cjk_enabled();
294++
295+ // Set value_ranges if we may have to handle value ranges in the query.
296+ bool value_ranges;
297+ value_ranges = !valrangeprocs.empty() && (qs.find("..") != string::npos);
298+@@ -958,7 +995,8 @@
299+
300+ phrased_term:
301+ bool was_acronym;
302+- string term = parse_term(it, end, was_acronym);
303++ bool is_cjk_term = false;
304++ string term = parse_term(it, end, cjk_ngram, is_cjk_term, was_acronym);
305+
306+ // Boolean operators.
307+ if ((mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) &&
308+@@ -1058,6 +1096,12 @@
309+ Term * term_obj = new Term(&state, term, prefix_info,
310+ unstemmed_term, stem_term, term_pos++);
311+
312++ if (is_cjk_term) {
313++ Parse(pParser, CJKTERM, term_obj, &state);
314++ if (it == end) break;
315++ continue;
316++ }
317++
318+ if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) {
319+ if (it != end) {
320+ if ((flags & FLAG_WILDCARD) && *it == '*') {
321+@@ -1526,6 +1570,23 @@
322+ }
323+ };
324+
325++void
326++Term::as_positional_cjk_term(Terms * terms) const
327++{
328++ // Add each individual CJK character to the phrase.
329++ string t;
330++ for (Utf8Iterator it(name); it != Utf8Iterator(); ++it) {
331++ Unicode::append_utf8(t, *it);
332++ Term * c = new Term(state, t, prefix_info, unstemmed, stem, pos);
333++ terms->add_positional_term(c);
334++ t.resize(0);
335++ }
336++
337++ // FIXME: we want to add the n-grams as filters too for efficiency.
338++
339++ delete this;
340++}
341++
342+ // Helper macro for converting a boolean operation into a Xapian::Query.
343+ #define BOOL_OP_TO_QUERY(E, A, OP, B, OP_TXT) \
344+ do {\
345+@@ -1909,6 +1970,10 @@
346+ delete U;
347+ }
348+
349++compound_term(T) ::= CJKTERM(U). {
350++ { T = U->as_cjk_query(); }
351++}
352++
353+ // phrase - The "inside the quotes" part of a double-quoted phrase.
354+
355+ %type phrase {Terms *}
356+@@ -1920,11 +1985,21 @@
357+ P->add_positional_term(T);
358+ }
359+
360++phrase(P) ::= CJKTERM(T). {
361++ P = new Terms;
362++ T->as_positional_cjk_term(P);
363++}
364++
365+ phrase(P) ::= phrase(Q) TERM(T). {
366+ P = Q;
367+ P->add_positional_term(T);
368+ }
369+
370++phrase(P) ::= phrase(Q) CJKTERM(T). {
371++ P = Q;
372++ T->as_positional_cjk_term(P);
373++}
374++
375+ // phrased_term - A phrased term works like a single term, but is actually
376+ // 2 or more terms linked together into a phrase by punctuation. There must be
377+ // at least 2 terms in order to be able to have punctuation between the terms!
378+Index: xapian-core/queryparser/queryparser_internal.h
379+===================================================================
380+--- xapian-core.orig/queryparser/queryparser_internal.h 2011-08-24 19:09:38.000000000 -0400
381++++ xapian-core/queryparser/queryparser_internal.h 2011-08-24 19:40:08.916055546 -0400
382+@@ -1,7 +1,7 @@
383+ /* queryparser_internal.h: The non-lemon-generated parts of the QueryParser
384+ * class.
385+ *
386+- * Copyright (C) 2005,2006,2007,2010 Olly Betts
387++ * Copyright (C) 2005,2006,2007,2010,2011 Olly Betts
388+ *
389+ * This program is free software; you can redistribute it and/or
390+ * modify it under the terms of the GNU General Public License as
391+@@ -80,6 +80,7 @@
392+ filter_type type);
393+
394+ std::string parse_term(Utf8Iterator &it, const Utf8Iterator &end,
395++ bool cjk_ngram, bool &is_cjk_term,
396+ bool &was_acronym);
397+
398+ public:
399+Index: xapian-core/queryparser/cjk-tokenizer.h
400+===================================================================
401+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
402++++ xapian-core/queryparser/cjk-tokenizer.h 2011-08-24 19:39:30.756055473 -0400
403+@@ -0,0 +1,94 @@
404++/** @file cjk-tokenizer.h
405++ * @brief Tokenise CJK text as n-grams
406++ */
407++/* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com)
408++ * Copyright (c) 2011 Richard Boulton (richard@tartarus.org)
409++ * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com)
410++ * Copyright (c) 2011 Olly Betts
411++ *
412++ * Permission is hereby granted, free of charge, to any person obtaining a copy
413++ * of this software and associated documentation files (the "Software"), to deal
414++ * in the Software without restriction, including without limitation the
415++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
416++ * sell copies of the Software, and to permit persons to whom the Software is
417++ * furnished to do so, subject to the following conditions:
418++ *
419++ * The above copyright notice and this permission notice shall be included in
420++ * all copies or substantial portions of the Software.
421++ *
422++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
423++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
424++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
425++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
426++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
427++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
428++ * IN THE SOFTWARE.
429++ */
430++
431++#ifndef XAPIAN_INCLUDED_CJK_TOKENIZER_H
432++#define XAPIAN_INCLUDED_CJK_TOKENIZER_H
433++
434++#include "xapian/unicode.h"
435++
436++#include <string>
437++
438++namespace CJK {
439++
440++/** Should we use the CJK n-gram code?
441++ *
442++ * The first time this is called it reads the environment variable
443++ * XAPIAN_CJK_NGRAM and returns true if it is set to a non-empty value.
444++ * The result is cached, so subsequent calls return the same value.
445++ */
446++bool is_cjk_enabled();
447++
448++bool codepoint_is_cjk(unsigned codepoint);
449++
450++std::string get_cjk(Xapian::Utf8Iterator &it);
451++
452++}
453++
454++class CJKTokenIterator {
455++ Xapian::Utf8Iterator it;
456++
457++ mutable Xapian::Utf8Iterator p;
458++
459++ mutable unsigned len;
460++
461++ mutable std::string current_token;
462++
463++ public:
464++ CJKTokenIterator(const std::string & s)
465++ : it(s) { }
466++
467++ CJKTokenIterator(const Xapian::Utf8Iterator & it_)
468++ : it(it_) { }
469++
470++ CJKTokenIterator()
471++ : it() { }
472++
473++ const std::string & operator*() const;
474++
475++ CJKTokenIterator & operator++();
476++
477++ /// Get the length of the current token in Unicode characters.
478++ unsigned get_length() const { return len; }
479++
480++ friend bool operator==(const CJKTokenIterator &, const CJKTokenIterator &);
481++};
482++
483++inline bool
484++operator==(const CJKTokenIterator & a, const CJKTokenIterator & b)
485++{
486++ // We only really care about comparisons where one or other is an end
487++ // iterator.
488++ return a.it == b.it;
489++}
490++
491++inline bool
492++operator!=(const CJKTokenIterator & a, const CJKTokenIterator & b)
493++{
494++ return !(a == b);
495++}
496++
497++#endif // XAPIAN_INCLUDED_CJK_TOKENIZER_H
498+Index: xapian-core/queryparser/termgenerator_internal.cc
499+===================================================================
500+--- xapian-core.orig/queryparser/termgenerator_internal.cc 2011-08-24 19:09:38.000000000 -0400
501++++ xapian-core/queryparser/termgenerator_internal.cc 2011-08-24 19:39:30.766055473 -0400
502+@@ -1,7 +1,7 @@
503+ /** @file termgenerator_internal.cc
504+ * @brief TermGenerator class internals
505+ */
506+-/* Copyright (C) 2007,2010 Olly Betts
507++/* Copyright (C) 2007,2010,2011 Olly Betts
508+ *
509+ * This program is free software; you can redistribute it and/or modify
510+ * it under the terms of the GNU General Public License as published by
511+@@ -30,6 +30,8 @@
512+
513+ #include <string>
514+
515++#include "cjk-tokenizer.h"
516++
517+ using namespace std;
518+
519+ namespace Xapian {
520+@@ -126,6 +128,8 @@
521+ TermGenerator::Internal::index_text(Utf8Iterator itor, termcount weight,
522+ const string & prefix, bool with_positions)
523+ {
524++ bool cjk_ngram = CJK::is_cjk_enabled();
525++
526+ int stop_mode = STOPWORDS_INDEX_UNSTEMMED_ONLY;
527+
528+ if (!stopper) stop_mode = STOPWORDS_NONE;
529+@@ -163,11 +167,53 @@
530+ }
531+
532+ while (true) {
533++ if (cjk_ngram && CJK::codepoint_is_cjk(*itor)) {
534++ const string & cjk = CJK::get_cjk(itor);
535++ for (CJKTokenIterator tk(cjk); tk != CJKTokenIterator(); ++tk) {
536++ const string & cjk_token = *tk;
537++ if (cjk_token.size() > MAX_PROB_TERM_LENGTH) continue;
538++
539++ if (stop_mode == STOPWORDS_IGNORE && (*stopper)(cjk_token))
540++ continue;
541++
542++ if (with_positions && tk.get_length() == 1) {
543++ doc.add_posting(prefix + cjk_token, ++termpos, wdf_inc);
544++ } else {
545++ doc.add_term(prefix + cjk_token, wdf_inc);
546++ }
547++ if ((flags & FLAG_SPELLING) && prefix.empty())
548++ db.add_spelling(cjk_token);
549++
550++ if (!stemmer.internal.get()) continue;
551++
552++ if (stop_mode == STOPWORDS_INDEX_UNSTEMMED_ONLY &&
553++ (*stopper)(cjk_token))
554++ continue;
555++
556++ // Note, this uses the lowercased term, but that's OK as we
557++ // only want to avoid stemming terms starting with a digit.
558++ if (!should_stem(cjk_token)) continue;
559++
560++ // Add stemmed form without positional information.
561++ string stem("Z");
562++ stem += prefix;
563++ stem += stemmer(cjk_token);
564++ doc.add_term(stem, wdf_inc);
565++ }
566++ while (true) {
567++ if (itor == Utf8Iterator()) return;
568++ ch = check_wordchar(*itor);
569++ if (ch) break;
570++ ++itor;
571++ }
572++ }
573+ unsigned prevch;
574+ do {
575+ Unicode::append_utf8(term, ch);
576+ prevch = ch;
577+- if (++itor == Utf8Iterator()) goto endofterm;
578++ if (++itor == Utf8Iterator() ||
579++ (cjk_ngram && CJK::codepoint_is_cjk(*itor)))
580++ goto endofterm;
581+ ch = check_wordchar(*itor);
582+ } while (ch);
583+
584+Index: xapian-core/tests/termgentest.cc
585+===================================================================
586+--- xapian-core.orig/tests/termgentest.cc 2011-08-24 19:09:38.000000000 -0400
587++++ xapian-core/tests/termgentest.cc 2011-08-24 19:39:30.766055473 -0400
588+@@ -31,6 +31,8 @@
589+ #include "testutils.h"
590+ #include "utils.h"
591+
592++#include <stdlib.h> // For setenv() or putenv()
593++
594+ using namespace std;
595+
596+ #define TESTCASE(S) {#S, test_##S}
597+@@ -106,12 +108,26 @@
598+ "Z\xe1\x80\x9d\xe1\x80\xae\xe1\x80\x80\xe1\x80\xae\xe1\x80\x95\xe1\x80\xad\xe1\x80\x9e\xe1\x80\xaf\xe1\x80\xb6\xe1\x80\xb8\xe1\x80\x85\xe1\x80\xbd\xe1\x80\xb2\xe1\x80\x9e\xe1\x80\xb0\xe1\x80\x99\xe1\x80\xbb\xe1\x80\xac\xe1\x80\xb8\xe1\x80\x80:1 \xe1\x80\x9d\xe1\x80\xae\xe1\x80\x80\xe1\x80\xae\xe1\x80\x95\xe1\x80\xad\xe1\x80\x9e\xe1\x80\xaf\xe1\x80\xb6\xe1\x80\xb8\xe1\x80\x85\xe1\x80\xbd\xe1\x80\xb2\xe1\x80\x9e\xe1\x80\xb0\xe1\x80\x99\xe1\x80\xbb\xe1\x80\xac\xe1\x80\xb8\xe1\x80\x80[1]" },
599+
600+ { "", "fish+chips", "Zchip:1 Zfish:1 chips[2] fish[1]" },
601++
602++ // Basic CJK tests:
603++ { "stem=", "久有归天", "久[1] 久有:1 天[4] 归[3] 归天:1 有[2] 有归:1" },
604++ { "", "극지라", "극[1] 극지:1 라[3] 지[2] 지라:1" },
605++ { "", "ウルス アップ", "ア[4] ウ[1] ウル:1 ス[3] ッ[5] ップ:1 プ[6] ル[2] ルス:1" },
606++
607++ // CJK with prefix:
608++ { "prefix=XA", "发送从", "XA从[3] XA发[1] XA发送:1 XA送[2] XA送从:1" },
609++ { "prefix=XA", "点卡思考", "XA卡[2] XA卡思:1 XA思[3] XA思考:1 XA点[1] XA点卡:1 XA考[4]" },
610++
611++ // CJK mixed with non-CJK:
612++ { "prefix=", "インtestタ", "test[3] イ[1] イン:1 タ[4] ン[2]" },
613++ { "", "配this is合a个 test!", "a[5] is[3] test[7] this[2] 个[6] 合[4] 配[1]" },
614++
615+ // All following tests are for things which we probably don't really want to
616+ // behave as they currently do, but we haven't found a sufficiently general
617+ // way to implement them yet.
618+
619+ // Test number like things
620+- { "", "11:59", "11[1] 59[2]" },
621++ { "stem=en", "11:59", "11[1] 59[2]" },
622+ { "", "11:59am", "11[1] 59am[2]" },
623+
624+ { NULL, NULL, NULL }
625+@@ -770,6 +786,14 @@
626+
627+ int main(int argc, char **argv)
628+ try {
629++ // FIXME: It would be better to test with and without XAPIAN_CJK_NGRAM set.
630++#ifdef __WIN32__
631++ _putenv_s("XAPIAN_CJK_NGRAM", "1");
632++#elif defined HAVE_SETENV
633++ setenv("XAPIAN_CJK_NGRAM", "1", 1);
634++#else
635++ putenv(const_cast<char*>("XAPIAN_CJK_NGRAM=1"));
636++#endif
637+ test_driver::parse_command_line(argc, argv);
638+ return test_driver::run(tests);
639+ } catch (const char * e) {
640+Index: xapian-core/tests/queryparsertest.cc
641+===================================================================
642+--- xapian-core.orig/tests/queryparsertest.cc 2011-08-24 19:09:38.000000000 -0400
643++++ xapian-core/tests/queryparsertest.cc 2011-08-24 19:39:30.766055473 -0400
644+@@ -33,6 +33,8 @@
645+ #include <string>
646+ #include <vector>
647+
648++#include <stdlib.h> // For setenv() or putenv()
649++
650+ using namespace std;
651+
652+ #define TESTCASE(S) {#S, test_##S}
653+@@ -639,6 +641,17 @@
654+ { "multisite:xapian.org site:www.xapian.org author:richard authortitle:richard", "((ZArichard:(pos=1) OR ZArichard:(pos=2) OR ZXTrichard:(pos=2)) FILTER (Hwww.xapian.org AND (Hxapian.org OR Jxapian.org)))"},
655+ { "authortitle:richard-boulton", "((Arichard:(pos=1) PHRASE 2 Aboulton:(pos=2)) OR (XTrichard:(pos=1) PHRASE 2 XTboulton:(pos=2)))"},
656+ { "authortitle:\"richard boulton\"", "((Arichard:(pos=1) PHRASE 2 Aboulton:(pos=2)) OR (XTrichard:(pos=1) PHRASE 2 XTboulton:(pos=2)))"},
657++ // Some CJK tests.
658++ { "久有归天愿", "(久:(pos=1) AND 久有:(pos=1) AND 有:(pos=1) AND 有归:(pos=1) AND 归:(pos=1) AND 归天:(pos=1) AND 天:(pos=1) AND 天愿:(pos=1) AND 愿:(pos=1))" },
659++ { "title:久有 归 天愿", "((XT久:(pos=1) AND XT久有:(pos=1) AND XT有:(pos=1)) OR 归:(pos=2) OR (天:(pos=3) AND 天愿:(pos=3) AND 愿:(pos=3)))" },
660++ { "h众ello万众", "(Zh:(pos=1) OR 众:(pos=2) OR Zello:(pos=3) OR (万:(pos=4) AND 万众:(pos=4) AND 众:(pos=4)))" },
661++ { "世(の中)TEST_tm", "(世:(pos=1) OR (の:(pos=2) AND の中:(pos=2) AND 中:(pos=2)) OR test_tm:(pos=3))" },
662++ { "다녀 AND 와야", "(다:(pos=1) AND 다녀:(pos=1) AND 녀:(pos=1) AND 와:(pos=2) AND 와야:(pos=2) AND 야:(pos=2))" },
663++ { "authortitle:학술 OR 연구를", "((A학:(pos=1) AND XT학:(pos=1) AND A학술:(pos=1) AND XT학술:(pos=1) AND A술:(pos=1) AND XT술:(pos=1)) OR (연:(pos=2) AND 연구:(pos=2) AND 구:(pos=2) AND 구를:(pos=2) AND 를:(pos=2)))" },
664++ // FIXME: These should really filter by bigrams to accelerate:
665++ { "\"久有归\"", "(久:(pos=1) PHRASE 3 有:(pos=1) PHRASE 3 归:(pos=1))" },
666++ { "\"久有test归\"", "(久:(pos=1) PHRASE 4 有:(pos=1) PHRASE 4 test:(pos=2) PHRASE 4 归:(pos=3))" },
667++ // FIXME: this should work: { "久 NEAR 有", "(久:(pos=1) NEAR 11 有:(pos=2))" },
668+ { NULL, NULL }
669+ };
670+
671+@@ -709,6 +722,9 @@
672+ // Add coverage for other cases similar to the above.
673+ { "a b site:xapian.org", "((Za:(pos=1) AND Zb:(pos=2)) FILTER Hxapian.org)" },
674+ { "site:xapian.org a b", "((Za:(pos=1) AND Zb:(pos=2)) FILTER Hxapian.org)" },
675++ // Some CJK tests.
676++ { "author:험가 OR subject:万众 hello world!", "((A험:(pos=1) AND A험가:(pos=1) AND A가:(pos=1)) OR (XT万:(pos=2) AND XT万众:(pos=2) AND XT众:(pos=2) AND Zhello:(pos=3) AND Zworld:(pos=4)))" },
677++ { "洛伊one儿差点two脸three", "(洛:(pos=1) AND 洛伊:(pos=1) AND 伊:(pos=1) AND Zone:(pos=2) AND 儿:(pos=3) AND 儿差:(pos=3) AND 差:(pos=3) AND 差点:(pos=3) AND 点:(pos=3) AND Ztwo:(pos=4) AND 脸:(pos=5) AND Zthree:(pos=6))" },
678+ { NULL, NULL }
679+ };
680+
681+@@ -761,6 +777,8 @@
682+ TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((ZAme:(pos=1) OR ZXTstuff:(pos=2)))");
683+ qobj = qp.parse_query("title:(stuff) me", Xapian::QueryParser::FLAG_BOOLEAN, "A");
684+ TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((ZXTstuff:(pos=1) OR ZAme:(pos=2)))");
685++ qobj = qp.parse_query("英国 title:文森hello", 0, "A");
686++ TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query(((A英:(pos=1) AND A英国:(pos=1) AND A国:(pos=1)) OR (XT文:(pos=2) AND XT文森:(pos=2) AND XT森:(pos=2)) OR ZAhello:(pos=3)))");
687+ return true;
688+ }
689+
690+@@ -2441,6 +2459,14 @@
691+
692+ int main(int argc, char **argv)
693+ try {
694++ // FIXME: It would be better to test with and without XAPIAN_CJK_NGRAM set.
695++#ifdef __WIN32__
696++ _putenv_s("XAPIAN_CJK_NGRAM", "1");
697++#elif defined HAVE_SETENV
698++ setenv("XAPIAN_CJK_NGRAM", "1", 1);
699++#else
700++ putenv(const_cast<char*>("XAPIAN_CJK_NGRAM=1"));
701++#endif
702+ test_driver::parse_command_line(argc, argv);
703+ return test_driver::run(tests);
704+ } catch (const char * e) {
705+Index: xapian-core/ChangeLog
706+===================================================================
707+--- xapian-core.orig/ChangeLog 2011-08-24 19:09:38.000000000 -0400
708++++ xapian-core/ChangeLog 2011-08-24 19:42:18.056055791 -0400
709+@@ -1,3 +1,17 @@
710++Wed Aug 24 14:25:21 GMT 2011 Olly Betts <olly@survex.com>
711++
712++ * Backport change from trunk:
713++ * queryparser/queryparser.lemony: Fix memory leak (caught by existing
714++ testcase queryparser1 when run under valgrind).
715++
716++Wed Aug 24 14:13:24 GMT 2011 Olly Betts <olly@survex.com>
717++
718++ * Backport change from trunk:
719++ * queryparser/,tests/queryparsertest.cc,tests/termgentest.cc: Add
720++ support for indexing and searching CJK text using n-grams. Currently
721++ this is only enabled if the environment variable XAPIAN_CJK_NGRAM is
722++ set to a non-empty value.
723++
724+ Mon Apr 04 14:41:33 GMT 2011 Olly Betts <olly@survex.com>
725+
726+ * NEWS: Final update for 1.2.5.

Subscribers

People subscribed via source and target branches