Merge lp:~jpakkane/libcolumbus/dev into lp:libcolumbus

Proposed by Jussi Pakkanen
Status: Merged
Merged at revision: 452
Proposed branch: lp:~jpakkane/libcolumbus/dev
Merge into: lp:libcolumbus
Diff against target: 1720 lines (+538/-229)
52 files modified
CMakeLists.txt (+2/-1)
cmake/isclang.cc (+0/-26)
cmake/pch.cmake (+4/-2)
cmake/python.cmake (+10/-7)
coding style.txt (+2/-1)
debian/control (+3/-0)
debian/rules (+1/-1)
include/ColumbusHelpers.hh (+2/-2)
include/Corpus.hh (+3/-3)
include/Document.hh (+3/-1)
include/ErrorMatrix.hh (+4/-1)
include/ErrorValues.hh (+9/-3)
include/IndexMatches.hh (+3/-5)
include/IndexWeights.hh (+2/-1)
include/LevenshteinIndex.hh (+3/-5)
include/MatchResults.hh (+6/-1)
include/Matcher.hh (+14/-6)
include/MatcherStatistics.hh (+1/-1)
include/ResultFilter.hh (+3/-1)
include/SearchParameters.hh (+53/-0)
include/Trie.hh (+4/-1)
include/Word.hh (+1/-1)
include/WordList.hh (+3/-1)
include/WordStore.hh (+4/-1)
include/columbus.h (+1/-1)
python/CMakeLists.txt (+9/-10)
python/columbus.cc (+8/-6)
python/columbus.py (+0/-28)
share/CMakeLists.txt (+1/-0)
share/greekAccentedLetterGroups.txt (+7/-0)
src/CMakeLists.txt (+1/-0)
src/ColumbusCAPI.cc (+5/-3)
src/ColumbusHelpers.cc (+6/-4)
src/Document.cc (+5/-3)
src/ErrorValues.cc (+10/-2)
src/MatchResults.cc (+26/-0)
src/Matcher.cc (+57/-56)
src/SearchParameters.cc (+86/-0)
src/WordList.cc (+13/-0)
test/CAPITest.c (+2/-2)
test/CMakeLists.txt (+1/-0)
test/ErrorValuesTest.cc (+20/-2)
test/HelpersTest.cc (+2/-4)
test/MatchResultsTest.cc (+21/-0)
test/MatcherTest.cc (+3/-3)
test/ResultFilterTest.cc (+15/-19)
test/SearchParametersTest.cc (+86/-0)
test/pythontest.py (+6/-7)
tools/hudtest.cc (+3/-3)
tools/numberpad.cc (+1/-1)
tools/queryapp.cc (+1/-1)
tools/sctest.cc (+2/-2)
To merge this branch: bzr merge lp:~jpakkane/libcolumbus/dev
Reviewer Review Type Date Requested Status
PS Jenkins bot (community) continuous-integration Approve
Unity Team Pending
Review via email: mp+179100@code.launchpad.net

Commit message

New API and ABI breaking version.

Description of the change

New API and ABI breaking version. Not mergeable as it stands, as package naming and the like need to be finalised first.

To post a comment you must log in.
Revision history for this message
PS Jenkins bot (ps-jenkins) wrote :
review: Needs Fixing (continuous-integration)
lp:~jpakkane/libcolumbus/dev updated
475. By Jussi Pakkanen

Merged trunk.

Revision history for this message
PS Jenkins bot (ps-jenkins) wrote :
review: Needs Fixing (continuous-integration)
Revision history for this message
Jussi Pakkanen (jpakkane) wrote :

The failure is caused by CMake not finding boost::python:

-- Could NOT find Boost
-- checking for one of the modules 'python3'
-- Boost not found, not building Python bindings.

but the package is installed:

Setting up libboost-python1.53-dev (1.53.0-6+exp2ubuntu1) ...
Setting up libboost-python-dev (1.53.0.0ubuntu2) ...

This compilation works fine on my machine (raring + Boost 1.53).

lp:~jpakkane/libcolumbus/dev updated
476. By Jussi Pakkanen

Detect Boost.Python manually, because CMake's Boost module is horribly broken.

Revision history for this message
PS Jenkins bot (ps-jenkins) wrote :
review: Approve (continuous-integration)
lp:~jpakkane/libcolumbus/dev updated
477. By Jussi Pakkanen

Removed stray debug statement.

Revision history for this message
PS Jenkins bot (ps-jenkins) wrote :
review: Approve (continuous-integration)
lp:~jpakkane/libcolumbus/dev updated
478. By Jussi Pakkanen

Changed Python API so it takes plain strings, which is easier for everyone involved.

Revision history for this message
PS Jenkins bot (ps-jenkins) wrote :
review: Approve (continuous-integration)
lp:~jpakkane/libcolumbus/dev updated
479. By Jussi Pakkanen

Exposed end deletion error functions to Python.

Revision history for this message
PS Jenkins bot (ps-jenkins) wrote :
review: Approve (continuous-integration)

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'CMakeLists.txt'
2--- CMakeLists.txt 2013-04-23 11:05:57 +0000
3+++ CMakeLists.txt 2013-08-09 12:23:15 +0000
4@@ -49,8 +49,9 @@
5 # http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html#AEN135
6 set(ABI_VERSION 0)
7
8+include(GNUInstallDirs)
9+set(LIBDIR ${CMAKE_INSTALL_LIBDIR})
10 # Set as cache variable so packaging can override.
11-set(LIBDIR "lib" CACHE PATH "Destination install dir for the library")
12 set(PYTHONDIR "lib/python3/dist-packages" CACHE PATH "Destination install dir for Python module")
13
14 include(TestBigEndian)
15
16=== removed file 'cmake/isclang.cc'
17--- cmake/isclang.cc 2012-12-07 11:01:33 +0000
18+++ cmake/isclang.cc 1970-01-01 00:00:00 +0000
19@@ -1,26 +0,0 @@
20-/*
21- * Copyright (C) 2012 Canonical, Ltd.
22- *
23- * Authors:
24- * Jussi Pakkanen <jussi.pakkanen@canonical.com>
25- *
26- * This library is free software; you can redistribute it and/or modify it under
27- * the terms of version 3 of the GNU Lesser General Public License as published
28- * by the Free Software Foundation.
29- *
30- * This library is distributed in the hope that it will be useful, but WITHOUT
31- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
32- * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
33- * details.
34- *
35- * You should have received a copy of the GNU Lesser General Public License
36- * along with this program. If not, see <http://www.gnu.org/licenses/>.
37- */
38-
39-int main(int argc, char **argv) {
40-#ifdef __clang__
41- return 1; // This gets assigned to a CMake variable so "1" means "true".
42-#else
43- return 0;
44-#endif
45-}
46
47=== modified file 'cmake/pch.cmake'
48--- cmake/pch.cmake 2012-12-21 10:32:38 +0000
49+++ cmake/pch.cmake 2013-08-09 12:23:15 +0000
50@@ -65,7 +65,8 @@
51 separate_arguments(compile_args)
52 add_custom_command(OUTPUT ${gch_filename}
53 COMMAND ${CMAKE_CXX_COMPILER} ${compile_args}
54- DEPENDS ${header_filename})
55+ DEPENDS ${header_filename}
56+ VERBATIM)
57 add_custom_target(${gch_target_name} DEPENDS ${gch_filename})
58 add_dependencies(${target_name} ${gch_target_name})
59
60@@ -77,7 +78,8 @@
61
62 endfunction()
63
64-try_run(IS_CLANG did_build ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_SOURCE_DIR}/cmake/isclang.cc)
65+include(CheckCXXSourceCompiles)
66+CHECK_CXX_SOURCE_COMPILES("#ifdef __clang__\n#else\n#error \"Not clang.\"\n#endif\nint main(int argc, char **argv) { return 0; }" IS_CLANG)
67
68 if(UNIX)
69 if(NOT APPLE)
70
71=== modified file 'cmake/python.cmake'
72--- cmake/python.cmake 2013-01-24 09:25:50 +0000
73+++ cmake/python.cmake 2013-08-09 12:23:15 +0000
74@@ -1,14 +1,20 @@
75 set(build_python FALSE)
76
77-find_package(Boost 1.49.0 COMPONENTS python)
78+# CMake's Boost.Python detector is completely and utterly
79+# broken. We have to do this manually.
80+#
81+# Upstream bug:
82+# http://public.kitware.com/Bug/view.php?id=12955
83+find_file(BP_HEADER boost/python.hpp)
84+
85 if(use_python2)
86 pkg_search_module(PYTHONLIBS python)
87 else()
88 pkg_search_module(PYTHONLIBS python3)
89 endif()
90
91-if(NOT Boost_FOUND)
92- message(STATUS "Boost not found, not building Python bindings.")
93+if(NOT BP_HEADER)
94+ message(STATUS "Boost.Python not found, not building Python bindings.")
95 else()
96 if(NOT PYTHONLIBS_FOUND)
97 message(STATUS "Python dev libraries not found, not building Python bindings.")
98@@ -19,11 +25,8 @@
99 if(NOT use_python2)
100 execute_process(COMMAND ${CMAKE_SOURCE_DIR}/cmake/pysoabi.py OUTPUT_VARIABLE pysoabi OUTPUT_STRIP_TRAILING_WHITESPACE)
101 endif()
102-
103- # Linking against libboost_python does not work with Python 3.
104- # Working around this bug:
105- # http://public.kitware.com/Bug/view.php?id=12955
106 find_library(BOOST_PYTHON_HACK boost_python-py${PYTHON_MAJOR}${PYTHON_MINOR})
107+
108 if(NOT BOOST_PYTHON_HACK)
109 message(STATUS "Boost.Python hack library not found, not building Python bindings")
110 else()
111
112=== modified file 'coding style.txt'
113--- coding style.txt 2012-06-07 11:48:28 +0000
114+++ coding style.txt 2013-08-09 12:23:15 +0000
115@@ -4,5 +4,6 @@
116 - indentation is 4 spaces, tabs are forbidden
117 - opening brace always on the same line
118 - class header files must be minimal
119- - no STL #includes because they slow down compilation massively
120+ - no STL #includes because they slow down compilation massively,
121+ the only exception is string, which is necessary for interoperation
122 - forward declarations instead of #includes
123
124=== modified file 'debian/control'
125--- debian/control 2013-02-11 20:12:45 +0000
126+++ debian/control 2013-08-09 12:23:15 +0000
127@@ -16,6 +16,7 @@
128 Package: libcolumbus0-0
129 Section: libs
130 Architecture: any
131+Multi-Arch: same
132 Pre-Depends: ${misc:Pre-Depends}
133 Depends: libcolumbus0-0-common (= ${source:Version}),
134 ${shlibs:Depends},
135@@ -26,6 +27,7 @@
136 Package: libcolumbus0-0-common
137 Section: libs
138 Architecture: all
139+Multi-Arch: foreign
140 Depends: ${shlibs:Depends},
141 ${misc:Depends},
142 Description: error tolerant matching engine - common files
143@@ -36,6 +38,7 @@
144 Package: libcolumbus0-dev
145 Section: libdevel
146 Architecture: any
147+Multi-Arch: same
148 Pre-Depends: ${misc:Pre-Depends}
149 Depends: libcolumbus0-0 (= ${binary:Version}),
150 ${misc:Depends},
151
152=== modified file 'debian/rules'
153--- debian/rules 2013-06-05 14:29:17 +0000
154+++ debian/rules 2013-08-09 12:23:15 +0000
155@@ -16,7 +16,7 @@
156 dh $@ --parallel
157
158 override_dh_auto_configure:
159- dh_auto_configure -- -DLIBDIR=/usr/lib/$(DEB_HOST_MULTIARCH) -DCMAKE_BUILD_TYPE=''
160+ dh_auto_configure -- -DCMAKE_BUILD_TYPE=''
161
162 override_dh_install:
163 dh_install --fail-missing
164
165=== modified file 'include/ColumbusHelpers.hh'
166--- include/ColumbusHelpers.hh 2012-12-07 11:01:33 +0000
167+++ include/ColumbusHelpers.hh 2013-08-09 12:23:15 +0000
168@@ -30,8 +30,8 @@
169 Letter* utf8ToInternal(const char *utf8Text, unsigned int &resultStringSize);
170 void internalToUtf8(const Letter *source, unsigned int characters, char *buf, unsigned int bufsize);
171 COL_PUBLIC COL_PUBLIC double hiresTimestamp();
172-COL_PUBLIC void splitToWords(const char *utf8Text, WordList &list);
173-COL_PUBLIC void split(const char *utf8Text, WordList &list, const Letter *splitChars, int numChars);
174+COL_PUBLIC WordList splitToWords(const char *utf8Text);
175+COL_PUBLIC WordList split(const char *utf8Text, const Letter *splitChars, int numChars);
176 COL_PUBLIC bool isWhitespace(Letter l);
177
178 COL_NAMESPACE_END
179
180=== modified file 'include/Corpus.hh'
181--- include/Corpus.hh 2012-12-07 11:01:33 +0000
182+++ include/Corpus.hh 2013-08-09 12:23:15 +0000
183@@ -27,15 +27,15 @@
184 struct CorpusPrivate;
185 class Document;
186
187-class COL_PUBLIC Corpus {
188+class COL_PUBLIC Corpus final {
189 private:
190 CorpusPrivate *p;
191- Corpus(const Corpus &c);
192- const Corpus& operator=(const Corpus &c);
193
194 public:
195 Corpus();
196 ~Corpus();
197+ Corpus(const Corpus &c) = delete;
198+ const Corpus& operator=(const Corpus &c) = delete;
199
200 void addDocument(const Document &d);
201 size_t size() const;
202
203=== modified file 'include/Document.hh'
204--- include/Document.hh 2012-12-07 11:01:33 +0000
205+++ include/Document.hh 2013-08-09 12:23:15 +0000
206@@ -21,6 +21,7 @@
207 #define DOCUMENT_HH_
208
209 #include "ColumbusCore.hh"
210+#include<string>
211
212 COL_NAMESPACE_START
213
214@@ -29,7 +30,7 @@
215
216 struct DocumentPrivate;
217
218-class COL_PUBLIC Document {
219+class COL_PUBLIC Document final {
220 private:
221 DocumentPrivate *p;
222
223@@ -41,6 +42,7 @@
224 const Document& operator=(const Document &d);
225 void addText(const Word &field, const WordList &words);
226 void addText(const Word &field, const char *textAsUtf8);
227+ void addText(const Word &field, const std::string &textAsUtf8);
228 const WordList& getText(const Word &field) const;
229 size_t fieldCount() const;
230 DocumentID getID() const;
231
232=== modified file 'include/ErrorMatrix.hh'
233--- include/ErrorMatrix.hh 2012-12-07 11:01:33 +0000
234+++ include/ErrorMatrix.hh 2013-08-09 12:23:15 +0000
235@@ -34,13 +34,16 @@
236
237 COL_NAMESPACE_START
238
239-class ErrorMatrix {
240+class ErrorMatrix final {
241 size_t rows, columns;
242 int **m;
243
244 public:
245 ErrorMatrix(const size_t rows_, const size_t columns_, const int insertError, const int deletionError);
246 ~ErrorMatrix();
247+ ErrorMatrix(const ErrorMatrix &em) = delete;
248+ const ErrorMatrix & operator=(const ErrorMatrix &other) = delete;
249+
250
251 void set(const size_t rowNum, const size_t colNum, const int error);
252 // No bounds checking because this is in the hot path.
253
254=== modified file 'include/ErrorValues.hh'
255--- include/ErrorValues.hh 2012-12-07 11:01:33 +0000
256+++ include/ErrorValues.hh 2013-08-09 12:23:15 +0000
257@@ -24,10 +24,15 @@
258
259 COL_NAMESPACE_START
260
261+enum accentGroups {
262+ latinAccentGroup,
263+ greekAccentGroup,
264+};
265+
266 struct ErrorValuesPrivate;
267 class Word;
268
269-class COL_PUBLIC ErrorValues {
270+class COL_PUBLIC ErrorValues final {
271 private:
272 static const int DEFAULT_ERROR = 100;
273 static const int DEFAULT_GROUP_ERROR = 30;
274@@ -56,6 +61,7 @@
275
276 ErrorValues();
277 ~ErrorValues();
278+ const ErrorValues& operator=(const ErrorValues &other) = delete;
279
280 int getInsertionError() const { return insertionError; }
281 int getDeletionError() const { return deletionError; }
282@@ -81,10 +87,10 @@
283
284 void setError(Letter l1, Letter l2, const int error);
285 void setGroupError(const Word &groupLetters, const int error);
286- void addLatinAccents();
287+ void addAccents(accentGroups group);
288 void addKeyboardErrors();
289 void addNumberpadErrors();
290- void addStandardErrors() { addLatinAccents(); addKeyboardErrors(); }
291+ void addStandardErrors();
292 bool isInGroup(Letter l);
293 void clearErrors();
294 void setSubstringMode();
295
296=== modified file 'include/IndexMatches.hh'
297--- include/IndexMatches.hh 2012-12-07 11:01:33 +0000
298+++ include/IndexMatches.hh 2013-08-09 12:23:15 +0000
299@@ -34,7 +34,7 @@
300 * in growing error order.
301 *
302 */
303-class COL_PUBLIC IndexMatches {
304+class COL_PUBLIC IndexMatches final {
305 friend class LevenshteinIndex;
306
307 private:
308@@ -44,13 +44,11 @@
309 void addMatch(const Word &queryWord, const WordID matchedWord, int error);
310 void sort();
311
312- // Disable copy and assignment.
313- IndexMatches(const IndexMatches &other);
314- const IndexMatches & operator=(const IndexMatches &other);
315-
316 public:
317 IndexMatches();
318 ~IndexMatches();
319+ IndexMatches(const IndexMatches &other) = delete;
320+ const IndexMatches & operator=(const IndexMatches &other) = delete;
321
322 size_t size() const;
323 const WordID& getMatch(size_t num) const;
324
325=== modified file 'include/IndexWeights.hh'
326--- include/IndexWeights.hh 2012-12-07 11:01:33 +0000
327+++ include/IndexWeights.hh 2013-08-09 12:23:15 +0000
328@@ -27,11 +27,12 @@
329 struct IndexWeightsPrivate;
330 class Word;
331
332-class COL_PUBLIC IndexWeights {
333+class COL_PUBLIC IndexWeights final {
334 IndexWeightsPrivate *p;
335 public:
336 IndexWeights();
337 ~IndexWeights();
338+ const IndexWeights & operator=(const IndexWeights &other) = delete;
339
340 void setWeight(const Word &w, double weigth);
341 double getWeight(const Word &w) const;
342
343=== modified file 'include/LevenshteinIndex.hh'
344--- include/LevenshteinIndex.hh 2013-01-29 09:36:25 +0000
345+++ include/LevenshteinIndex.hh 2013-08-09 12:23:15 +0000
346@@ -20,7 +20,6 @@
347 #ifndef LEVENSHTEININDEX_HH
348 #define LEVENSHTEININDEX_HH
349
350-#include <vector>
351 #include "ColumbusCore.hh"
352 #include "IndexMatches.hh"
353
354@@ -32,7 +31,7 @@
355 class Word;
356 class ErrorValues;
357
358-class COL_PUBLIC LevenshteinIndex {
359+class COL_PUBLIC LevenshteinIndex final {
360 private:
361 LevenshteinIndexPrivate *p;
362
363@@ -40,15 +39,14 @@
364 const Letter letter, const Letter previousLetter, const size_t depth, ErrorMatrix &em,
365 IndexMatches &matches, const int max_error) const;
366
367- // Disable copy and move.
368- LevenshteinIndex(const LevenshteinIndex &other);
369- LevenshteinIndex& operator=(const LevenshteinIndex &other);
370 int findOptimalError(const Letter letter, const Letter previousLetter, const Word &query,
371 const size_t i, const size_t depth, const ErrorMatrix &em, const ErrorValues &e) const;
372
373 public:
374 LevenshteinIndex();
375 ~LevenshteinIndex();
376+ LevenshteinIndex(const LevenshteinIndex &other) = delete;
377+ const LevenshteinIndex & operator=(const LevenshteinIndex &other) = delete;
378
379 static int getDefaultError();
380
381
382=== modified file 'include/MatchResults.hh'
383--- include/MatchResults.hh 2012-12-07 11:01:33 +0000
384+++ include/MatchResults.hh 2013-08-09 12:23:15 +0000
385@@ -27,7 +27,7 @@
386 struct MatchResultsPrivate;
387 class Word;
388
389-class COL_PUBLIC MatchResults {
390+class COL_PUBLIC MatchResults final {
391 MatchResultsPrivate *p;
392
393 void sortIfRequired() const;
394@@ -35,6 +35,11 @@
395 public:
396 MatchResults();
397 ~MatchResults();
398+ MatchResults(const MatchResults &other);
399+ MatchResults(MatchResults &&other);
400+
401+ const MatchResults& operator=(MatchResults &&other);
402+ const MatchResults& operator=(const MatchResults &other);
403
404 void addResult(DocumentID docID, double relevancy);
405 void addResults(const MatchResults &r);
406
407=== modified file 'include/Matcher.hh'
408--- include/Matcher.hh 2013-01-30 14:17:07 +0000
409+++ include/Matcher.hh 2013-08-09 12:23:15 +0000
410@@ -21,6 +21,7 @@
411 #define MATCHER_HH_
412
413 #include "ColumbusCore.hh"
414+#include<string>
415
416 COL_NAMESPACE_START
417
418@@ -33,22 +34,29 @@
419 class ErrorValues;
420 class IndexWeights;
421 class ResultFilter;
422+class SearchParameters;
423
424-class COL_PUBLIC Matcher {
425+class COL_PUBLIC Matcher final {
426 private:
427 MatcherPrivate *p;
428
429 void buildIndexes(const Corpus &c);
430 void addToIndex(const Word &word, const WordID wordID, const WordID indexID);
431- void matchWithRelevancy(const WordList &query, const bool dynamicError, const int extraError, MatchResults &matchedDocuments);
432+ void matchWithRelevancy(const WordList &query, const SearchParameters &params, const int extraError, MatchResults &matchedDocuments);
433
434 public:
435 Matcher();
436 ~Matcher();
437-
438- void match(const WordList &query, MatchResults &matchedDocuments);
439- void match(const char *queryAsUtf8, MatchResults &matchedDocuments);
440- void match(const char *queryAsUtf8, MatchResults &matchedDocuments, const ResultFilter &filter);
441+ Matcher& operator=(const Matcher &m) = delete;
442+
443+ // The simple API
444+ MatchResults match(const char *queryAsUtf8);
445+ MatchResults match(const WordList &query);
446+ MatchResults match(const std::string &queryAsUtf8);
447+
448+ // When you want to specify search parameters exactly.
449+ MatchResults match(const char *queryAsUtf8, const SearchParameters &params);
450+ MatchResults match(const WordList &query, const SearchParameters &params);
451 void index(const Corpus &c);
452 ErrorValues& getErrorValues();
453 IndexWeights& getIndexWeights();
454
455=== modified file 'include/MatcherStatistics.hh'
456--- include/MatcherStatistics.hh 2012-12-07 11:01:33 +0000
457+++ include/MatcherStatistics.hh 2013-08-09 12:23:15 +0000
458@@ -27,7 +27,7 @@
459 struct MatcherStatisticsPrivate;
460 class Word;
461
462-class MatcherStatistics {
463+class MatcherStatistics final {
464 private:
465
466 MatcherStatisticsPrivate *p;
467
468=== modified file 'include/ResultFilter.hh'
469--- include/ResultFilter.hh 2012-12-07 11:01:33 +0000
470+++ include/ResultFilter.hh 2013-08-09 12:23:15 +0000
471@@ -27,7 +27,7 @@
472 struct ResultFilterPrivate;
473 class Word;
474
475-class COL_PUBLIC ResultFilter {
476+class COL_PUBLIC ResultFilter final {
477 private:
478
479 ResultFilterPrivate *p;
480@@ -35,6 +35,8 @@
481 public:
482 ResultFilter();
483 ~ResultFilter();
484+ ResultFilter(const ResultFilter &rf) = delete;
485+ const ResultFilter & operator=(const ResultFilter &other) = delete;
486
487 void addNewTerm();
488 void addNewSubTerm(const Word &field, const Word &word);
489
490=== added file 'include/SearchParameters.hh'
491--- include/SearchParameters.hh 1970-01-01 00:00:00 +0000
492+++ include/SearchParameters.hh 2013-08-09 12:23:15 +0000
493@@ -0,0 +1,53 @@
494+/*
495+ * Copyright (C) 2013 Canonical, Ltd.
496+ *
497+ * Authors:
498+ * Jussi Pakkanen <jussi.pakkanen@canonical.com>
499+ *
500+ * This library is free software; you can redistribute it and/or modify it under
501+ * the terms of version 3 of the GNU Lesser General Public License as published
502+ * by the Free Software Foundation.
503+ *
504+ * This library is distributed in the hope that it will be useful, but WITHOUT
505+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
506+ * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
507+ * details.
508+ *
509+ * You should have received a copy of the GNU Lesser General Public License
510+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
511+ */
512+
513+#ifndef SEARCHPARAMETERS_H_
514+#define SEARCHPARAMETERS_H_
515+
516+#include "ColumbusCore.hh"
517+
518+COL_NAMESPACE_START
519+
520+struct SearchParametersPrivate;
521+class Word;
522+class ResultFilter;
523+
524+class COL_PUBLIC SearchParameters final {
525+private:
526+ SearchParametersPrivate *p;
527+
528+public:
529+ SearchParameters();
530+ ~SearchParameters();
531+ SearchParameters & operator=(const SearchParameters &other) = delete;
532+
533+ bool isDynamic() const;
534+ void setDynamic(bool dyn);
535+ int getDynamicError(const Word &w) const;
536+ ResultFilter& getResultFilter();
537+ const ResultFilter& getResultFilter() const;
538+
539+ void addNonsearchingField(const Word &w);
540+ bool isNonsearchingField(const Word &w) const;
541+
542+ int looseningIterations() const;
543+};
544+
545+COL_NAMESPACE_END
546+#endif
547
548=== modified file 'include/Trie.hh'
549--- include/Trie.hh 2013-01-31 10:23:45 +0000
550+++ include/Trie.hh 2013-08-09 12:23:15 +0000
551@@ -27,7 +27,7 @@
552 struct TriePrivate;
553 class Word;
554
555-class COL_PUBLIC Trie {
556+class COL_PUBLIC Trie final {
557 private:
558 TriePrivate *p;
559 void expand();
560@@ -38,6 +38,9 @@
561 public:
562 Trie();
563 ~Trie();
564+ Trie(const Trie &other) = delete;
565+ const Trie & operator=(const Trie &other) = delete;
566+
567
568 bool hasWord(const Word &word) const;
569 TrieOffset findWord(const Word &word) const;
570
571=== modified file 'include/Word.hh'
572--- include/Word.hh 2013-01-31 09:26:44 +0000
573+++ include/Word.hh 2013-08-09 12:23:15 +0000
574@@ -31,7 +31,7 @@
575 *
576 * A word's contents are immutable.
577 */
578-class COL_PUBLIC Word {
579+class COL_PUBLIC Word final {
580 private:
581
582 Letter *text; // Change this to a shared pointer to save memory.
583
584=== modified file 'include/WordList.hh'
585--- include/WordList.hh 2012-12-07 11:01:33 +0000
586+++ include/WordList.hh 2013-08-09 12:23:15 +0000
587@@ -27,18 +27,20 @@
588 struct WordListPrivate;
589 class Word;
590
591-class COL_PUBLIC WordList {
592+class COL_PUBLIC WordList final {
593 private:
594 WordListPrivate *p;
595
596 public:
597 WordList();
598 WordList(const WordList &wl);
599+ WordList(WordList &&wl);
600 ~WordList();
601
602 size_t size() const;
603 const Word& operator[](const size_t i) const;
604 const WordList& operator=(const WordList &l);
605+ const WordList& operator=(WordList &&wl);
606 bool operator==(const WordList &l) const;
607 bool operator!=(const WordList &l) const;
608 void addWord(const Word &w); // This is more of an implementation detail and should not be exposed in a base class or interface.
609
610=== modified file 'include/WordStore.hh'
611--- include/WordStore.hh 2013-01-31 10:23:45 +0000
612+++ include/WordStore.hh 2013-08-09 12:23:15 +0000
613@@ -36,7 +36,7 @@
614 struct WordStorePrivate;
615 class Word;
616
617-class COL_PUBLIC WordStore {
618+class COL_PUBLIC WordStore final {
619 private:
620
621 WordStorePrivate *p;
622@@ -44,6 +44,9 @@
623 public:
624 WordStore();
625 ~WordStore();
626+ WordStore(const WordStore &other) = delete;
627+ const WordStore & operator=(const WordStore &other) = delete;
628+
629
630 WordID getID(const Word &w);
631 Word getWord(const WordID id) const;
632
633=== modified file 'include/columbus.h'
634--- include/columbus.h 2013-01-08 12:55:36 +0000
635+++ include/columbus.h 2013-08-09 12:23:15 +0000
636@@ -56,7 +56,7 @@
637 COL_PUBLIC ColMatcher col_matcher_new();
638 COL_PUBLIC void col_matcher_delete(ColMatcher m);
639 COL_PUBLIC void col_matcher_index(ColMatcher m, ColCorpus c);
640-COL_PUBLIC void col_matcher_match(ColMatcher m, const char *query_as_utf8, ColMatchResults mr);
641+COL_PUBLIC ColMatchResults col_matcher_match(ColMatcher m, const char *query_as_utf8);
642 COL_PUBLIC ColErrorValues col_matcher_get_error_values(ColMatcher m);
643 COL_PUBLIC ColIndexWeights col_matcher_get_index_weights(ColMatcher m);
644
645
646=== modified file 'python/CMakeLists.txt'
647--- python/CMakeLists.txt 2013-01-24 09:25:50 +0000
648+++ python/CMakeLists.txt 2013-08-09 12:23:15 +0000
649@@ -2,16 +2,15 @@
650 include_directories(${PYTHONLIBS_INCLUDE_DIRS})
651
652 if(use_python2)
653- set(python_lib_name "_columbus")
654+ set(python_lib_name "columbus")
655 else()
656- set(python_lib_name "_columbus.${pysoabi}")
657+ set(python_lib_name "columbus.${pysoabi}")
658 endif()
659
660-add_library(_columbus_ext SHARED _columbus.cc)
661-target_link_libraries(_columbus_ext ${COL_LIB_BASENAME} ${BOOST_PYTHON_HACK} ${PYTHONLIBS_LIBRARIES})
662-set_target_properties(_columbus_ext PROPERTIES OUTPUT_NAME ${python_lib_name} PREFIX "")
663-
664-add_pch(pch/colpython_pch.hh _columbus_ext)
665-
666-install(TARGETS _columbus_ext DESTINATION ${PYTHONDIR})
667-install(FILES columbus.py DESTINATION ${PYTHONDIR})
668+add_library(columbus_ext SHARED columbus.cc)
669+target_link_libraries(columbus_ext ${COL_LIB_BASENAME} ${BOOST_PYTHON_HACK} ${PYTHONLIBS_LIBRARIES})
670+set_target_properties(columbus_ext PROPERTIES OUTPUT_NAME ${python_lib_name} PREFIX "")
671+
672+add_pch(pch/colpython_pch.hh columbus_ext)
673+
674+install(TARGETS columbus_ext DESTINATION ${PYTHONDIR})
675
676=== renamed file 'python/_columbus.cc' => 'python/columbus.cc'
677--- python/_columbus.cc 2013-01-23 13:50:20 +0000
678+++ python/columbus.cc 2013-08-09 12:23:15 +0000
679@@ -24,11 +24,10 @@
680 using namespace Columbus;
681
682
683-void (Document::*addAdaptor) (const Word &, const WordList &) = &Document::addText;
684-void (Matcher::*queryAdaptor) (const WordList &, MatchResults &) = &Matcher::match;
685+void (Document::*addAdaptor) (const Word &, const std::string &) = &Document::addText;
686+MatchResults (Matcher::*queryAdaptor) (const std::string &) = &Matcher::match;
687
688-BOOST_PYTHON_MODULE(_columbus)
689-{
690+BOOST_PYTHON_MODULE(columbus) {
691 class_<Corpus, boost::noncopyable>("Corpus", init<>())
692 .def("size", &Corpus::size)
693 .def("add_document", &Corpus::addDocument)
694@@ -48,7 +47,7 @@
695 .def("add_word", &WordList::addWord)
696 ;
697
698- def("_split_to_words", splitToWords);
699+ def("split_to_words", splitToWords);
700
701 class_<Document>("Document", init<DocumentID>())
702 .def(init<const Document&>())
703@@ -75,13 +74,16 @@
704 return_internal_reference<>())
705 ;
706
707- class_<ErrorValues>("ErrorValues")
708+ class_<ErrorValues>("ErrorValues", init<>())
709 .def("add_standard_errors", &ErrorValues::addStandardErrors)
710 .def("set_substring_mode", &ErrorValues::setSubstringMode)
711+ .def("set_end_deletion_error", &ErrorValues::setEndDeletionError)
712 .def("set_error", &ErrorValues::setError)
713 .def("get_substitute_error", &ErrorValues::getSubstituteError)
714 .def("get_default_error", &ErrorValues::getDefaultError)
715 .staticmethod("get_default_error")
716+ .def("get_substring_default_end_deletion_error", &ErrorValues::getSubstringDefaultEndDeletionError)
717+ .staticmethod("get_substring_default_end_deletion_error")
718 .def("clear_errors", &ErrorValues::clearErrors)
719 ;
720
721
722=== removed file 'python/columbus.py'
723--- python/columbus.py 2012-12-11 14:45:07 +0000
724+++ python/columbus.py 1970-01-01 00:00:00 +0000
725@@ -1,28 +0,0 @@
726-#!/usr/bin/python3 -tt
727-# -*- coding: utf-8 -*-
728-
729-# Copyright (C) 2012 Canonical, Ltd.
730-
731-# Authors:
732-# Jussi Pakkanen <jussi.pakkanen@canonical.com>
733-
734-# This library is free software; you can redistribute it and/or modify it under
735-# the terms of version 3 of the GNU Lesser General Public License as published
736-# by the Free Software Foundation.
737-
738-# This library is distributed in the hope that it will be useful, but WITHOUT
739-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
740-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
741-# details.
742-
743-# You should have received a copy of the GNU Lesser General Public License
744-# along with this program. If not, see <http://www.gnu.org/licenses/>.
745-
746-from _columbus import Corpus, Word, WordList, _split_to_words, Document, \
747-MatchResults, Matcher, ErrorValues, IndexWeights
748-
749-def split_to_words(text):
750- list = WordList()
751- _split_to_words(text, list)
752- return list
753-
754
755=== modified file 'share/CMakeLists.txt'
756--- share/CMakeLists.txt 2012-11-26 10:25:57 +0000
757+++ share/CMakeLists.txt 2013-08-09 12:23:15 +0000
758@@ -1,3 +1,4 @@
759 install(FILES
760 latinAccentedLetterGroups.txt
761+greekAccentedLetterGroups.txt
762 DESTINATION share/${COL_LIB_BASENAME})
763
764=== added file 'share/greekAccentedLetterGroups.txt'
765--- share/greekAccentedLetterGroups.txt 1970-01-01 00:00:00 +0000
766+++ share/greekAccentedLetterGroups.txt 2013-08-09 12:23:15 +0000
767@@ -0,0 +1,7 @@
768+αά
769+εέ
770+ηή
771+ιίϊΐ
772+οό
773+υύϋΰ
774+ωώ
775
776=== modified file 'src/CMakeLists.txt'
777--- src/CMakeLists.txt 2013-01-28 11:21:16 +0000
778+++ src/CMakeLists.txt 2013-08-09 12:23:15 +0000
779@@ -17,6 +17,7 @@
780 ErrorMatrix.cc
781 ResultFilter.cc
782 Trie.cc
783+SearchParameters.cc
784 )
785
786 if(ICONV_LIBRARIES)
787
788=== modified file 'src/ColumbusCAPI.cc'
789--- src/ColumbusCAPI.cc 2013-01-17 09:17:28 +0000
790+++ src/ColumbusCAPI.cc 2013-08-09 12:23:15 +0000
791@@ -133,13 +133,15 @@
792 }
793 }
794
795-void col_matcher_match(ColMatcher m, const char *query_as_utf8, ColMatchResults mr) {
796+ColMatchResults col_matcher_match(ColMatcher m, const char *query_as_utf8) {
797 try {
798 Matcher *matcher = reinterpret_cast<Matcher*>(m);
799- MatchResults *results = reinterpret_cast<MatchResults*>(mr);
800- matcher->match(query_as_utf8, *results);
801+ MatchResults *results =
802+ new MatchResults(matcher->match(query_as_utf8));
803+ return reinterpret_cast<ColMatchResults>(results);
804 } catch(exception &e) {
805 fprintf(stderr, "Exception when matching: %s\n", e.what());
806+ return nullptr;
807 }
808 }
809
810
811=== modified file 'src/ColumbusHelpers.cc'
812--- src/ColumbusHelpers.cc 2013-01-21 14:22:59 +0000
813+++ src/ColumbusHelpers.cc 2013-08-09 12:23:15 +0000
814@@ -125,8 +125,8 @@
815
816 }
817
818-void splitToWords(const char *utf8Text, WordList &list) {
819- split(utf8Text, list, whitespaceLetters, numWhitespaceLetters);
820+WordList splitToWords(const char *utf8Text) {
821+ return split(utf8Text, whitespaceLetters, numWhitespaceLetters);
822 }
823
824 static bool isInList(const Letter l, const Letter *chars, int numChars) {
825@@ -136,7 +136,8 @@
826 return false;
827 }
828
829-void split(const char *utf8Text, WordList &list, const Letter *splitChars, int numChars) {
830+WordList split(const char *utf8Text, const Letter *splitChars, int numChars) {
831+ WordList list;
832 unsigned int strSize = strlen(utf8Text);
833 size_t begin, end;
834 end = 0;
835@@ -150,7 +151,7 @@
836 }
837 if(begin >= strSize) {
838 delete []word;
839- return;
840+ return list;
841 }
842 end = begin+1;
843 while(!isInList(utf8Text[end], splitChars, numChars) && end < strSize) {
844@@ -174,6 +175,7 @@
845 }
846 } while(end < strSize);
847 delete []word;
848+ return list;
849 }
850
851 bool isWhitespace(Letter l) {
852
853=== modified file 'src/Document.cc'
854--- src/Document.cc 2012-12-07 11:01:33 +0000
855+++ src/Document.cc 2013-08-09 12:23:15 +0000
856@@ -55,9 +55,11 @@
857 }
858
859 void Document::addText(const Word &field, const char *textAsUtf8) {
860- WordList l;
861- splitToWords(textAsUtf8, l);
862- addText(field, l);
863+ addText(field, splitToWords(textAsUtf8));
864+}
865+
866+void Document::addText(const Word &field, const std::string &textAsUtf8) {
867+ addText(field, textAsUtf8.c_str());
868 }
869
870 const WordList& Document::getText(const Word &field) const {
871
872=== modified file 'src/ErrorValues.cc'
873--- src/ErrorValues.cc 2013-01-11 15:18:30 +0000
874+++ src/ErrorValues.cc 2013-08-09 12:23:15 +0000
875@@ -35,6 +35,9 @@
876 COL_NAMESPACE_START
877 using namespace std;
878
879+static const char *accentGroupDataFile[] = {"latinAccentedLetterGroups.txt",
880+ "greekAccentedLetterGroups.txt"};
881+
882 const int LUT_BITS = 9;
883 const int LUT_LETTERS = 1 << LUT_BITS;
884 const int LUT_SIZE = (LUT_LETTERS*LUT_LETTERS);
885@@ -161,8 +164,8 @@
886 return p->groupMap.find(l) != p->groupMap.end();
887 }
888
889-void ErrorValues::addLatinAccents() {
890- const char *baseName = "latinAccentedLetterGroups.txt";
891+void ErrorValues::addAccents(accentGroups group) {
892+ const char *baseName = accentGroupDataFile[group];
893 string dataFile = findDataFile(baseName);
894 string line;
895 if(dataFile.length() == 0) {
896@@ -257,6 +260,11 @@
897 }
898 }
899
900+void ErrorValues::addStandardErrors() {
901+ addAccents(latinAccentGroup);
902+ addAccents(greekAccentGroup);
903+ addKeyboardErrors();
904+}
905
906 void ErrorValues::addToLUT(Letter l1, Letter l2, int value) {
907 if(l1 < LUT_LETTERS && l2 < LUT_LETTERS) {
908
909=== modified file 'src/MatchResults.cc'
910--- src/MatchResults.cc 2012-12-07 11:01:33 +0000
911+++ src/MatchResults.cc 2013-08-09 12:23:15 +0000
912@@ -36,10 +36,36 @@
913 p->sorted = true;;
914 }
915
916+MatchResults::MatchResults(const MatchResults &other) {
917+ p = new MatchResultsPrivate();
918+ *p = *other.p;
919+}
920+
921+MatchResults::MatchResults(MatchResults &&other) {
922+ p = other.p;
923+ other.p = nullptr;
924+}
925+
926 MatchResults::~MatchResults() {
927 delete p;
928 }
929
930+const MatchResults& MatchResults::operator=(MatchResults &&other) {
931+ if(this != &other) {
932+ delete p;
933+ p = other.p;
934+ other.p = nullptr;
935+ }
936+ return *this;
937+}
938+
939+const MatchResults& MatchResults::operator=(const MatchResults &other) {
940+ if(this != &other) {
941+ *p = *other.p;
942+ }
943+ return *this;
944+}
945+
946 void MatchResults::addResult(DocumentID id, double relevancy) {
947 pair<double, DocumentID> n;
948 n.first = relevancy;
949
950=== modified file 'src/Matcher.cc'
951--- src/Matcher.cc 2013-04-16 09:02:17 +0000
952+++ src/Matcher.cc 2013-08-09 12:23:15 +0000
953@@ -31,6 +31,7 @@
954 #include "MatcherStatistics.hh"
955 #include "WordStore.hh"
956 #include "ResultFilter.hh"
957+#include "SearchParameters.hh"
958 #include <cassert>
959 #include <stdexcept>
960 #include <map>
961@@ -138,20 +139,6 @@
962 }
963
964 /*
965- * Long words should allow for more error than short ones.
966- * This is a simple function which is meant to be strict
967- * so there won't be too many matches.
968- */
969-
970-static int getDynamicError(const Word &w) {
971- size_t len = w.length();
972- if(len < 2)
973- return LevenshteinIndex::getDefaultError();
974- else
975- return 2*LevenshteinIndex::getDefaultError();
976-}
977-
978-/*
979 * These are helper functions for Matcher. They are not member functions to avoid polluting the header
980 * with STL includes.
981 */
982@@ -197,17 +184,20 @@
983 }
984
985
986-static void matchIndexes(MatcherPrivate *p, const WordList &query, const bool dynamicError, const int extraError, BestIndexMatches &bestIndexMatches) {
987+static void matchIndexes(MatcherPrivate *p, const WordList &query, const SearchParameters &params, const int extraError, BestIndexMatches &bestIndexMatches) {
988 for(size_t i=0; i<query.size(); i++) {
989 const Word &w = query[i];
990 int maxError;
991- if(dynamicError)
992- maxError = getDynamicError(w);
993+ if(params.isDynamic())
994+ maxError = params.getDynamicError(w);
995 else
996 maxError = 2*LevenshteinIndex::getDefaultError();
997 maxError += extraError;
998
999 for(IndIterator it = p->indexes.begin(); it != p->indexes.end(); it++) {
1000+ if(params.isNonsearchingField(p->store.getWord(it->first))) {
1001+ continue;
1002+ }
1003 IndexMatches m;
1004 it->second->findWords(w, p->e, maxError, m);
1005 addMatches(p, bestIndexMatches, w, it->first, m);
1006@@ -249,6 +239,19 @@
1007 }
1008 }
1009
1010+static bool subtermsMatch(MatcherPrivate *p, const ResultFilter &filter, size_t term, DocumentID id) {
1011+ for(size_t subTerm=0; subTerm < filter.numSubTerms(term); subTerm++) {
1012+ const Word &filterName = filter.getField(term, subTerm);
1013+ const Word &value = filter.getWord(term, subTerm);
1014+ bool termFound = p->reverseIndex.documentHasTerm(
1015+ p->store.getID(value), p->store.getID(filterName), id);
1016+ if(!termFound) {
1017+ return false;
1018+ }
1019+ }
1020+ return true;
1021+}
1022+
1023 Matcher::Matcher() {
1024 p = new MatcherPrivate();
1025 }
1026@@ -308,13 +311,13 @@
1027 }
1028
1029
1030-void Matcher::matchWithRelevancy(const WordList &query, const bool dynamicError, const int extraError, MatchResults &matchedDocuments) {
1031+void Matcher::matchWithRelevancy(const WordList &query, const SearchParameters &params, const int extraError, MatchResults &matchedDocuments) {
1032 map<DocumentID, double> docs;
1033 BestIndexMatches bestIndexMatches;
1034 double start, indexMatchEnd, gatherEnd, finish;
1035
1036 start = hiresTimestamp();
1037- matchIndexes(p, query, dynamicError, extraError, bestIndexMatches);
1038+ matchIndexes(p, query, params, extraError, bestIndexMatches);
1039 indexMatchEnd = hiresTimestamp();
1040 // Now we know all matched words in all indexes. Gather up the corresponding documents.
1041 gatherMatchedDocuments(p, bestIndexMatches, docs);
1042@@ -328,54 +331,29 @@
1043 indexMatchEnd - start, gatherEnd - indexMatchEnd, finish - gatherEnd);
1044 }
1045
1046-void Matcher::match(const WordList &query, MatchResults &matchedDocuments) {
1047+MatchResults Matcher::match(const WordList &query, const SearchParameters &params) {
1048+ MatchResults matchedDocuments;
1049 const int maxIterations = 1;
1050 const int increment = LevenshteinIndex::getDefaultError();
1051 const size_t minMatches = 10;
1052 WordList expandedQuery;
1053+ MatchResults allMatches;
1054
1055 if(query.size() == 0)
1056- return;
1057+ return matchedDocuments;
1058 expandQuery(query, expandedQuery);
1059 // Try to search with ever growing error until we find enough matches.
1060 for(int i=0; i<maxIterations; i++) {
1061 MatchResults matches;
1062- matchWithRelevancy(expandedQuery, true, i*increment, matches);
1063+ matchWithRelevancy(expandedQuery, params, i*increment, matches);
1064 if(matches.size() >= minMatches || i == maxIterations-1) {
1065- matchedDocuments.addResults(matches);
1066- return;
1067- }
1068- }
1069-
1070-}
1071-
1072-void Matcher::match(const char *queryAsUtf8, MatchResults &matchedDocuments) {
1073- WordList l;
1074- splitToWords(queryAsUtf8, l);
1075- match(l, matchedDocuments);
1076-}
1077-
1078-ErrorValues& Matcher::getErrorValues() {
1079- return p->e;
1080-}
1081-
1082-static bool subtermsMatch(MatcherPrivate *p, const ResultFilter &filter, size_t term, DocumentID id) {
1083- for(size_t subTerm=0; subTerm < filter.numSubTerms(term); subTerm++) {
1084- const Word &filterName = filter.getField(term, subTerm);
1085- const Word &value = filter.getWord(term, subTerm);
1086- bool termFound = p->reverseIndex.documentHasTerm(
1087- p->store.getID(value), p->store.getID(filterName), id);
1088- if(!termFound) {
1089- return false;
1090- }
1091- }
1092- return true;
1093-
1094-}
1095-
1096-void Matcher::match(const char *queryAsUtf8, MatchResults &matchedDocuments, const ResultFilter &filter) {
1097- MatchResults allMatches;
1098- match(queryAsUtf8, allMatches);
1099+ allMatches.addResults(matches);
1100+ break;
1101+ }
1102+ }
1103+
1104+ /* Filter results into final set. */
1105+ auto &filter = params.getResultFilter();
1106 for(size_t i=0; i<allMatches.size(); i++) {
1107 DocumentID id = allMatches.getDocumentID(i);
1108 for(size_t term=0; term < filter.numTerms(); term++) {
1109@@ -385,6 +363,29 @@
1110 }
1111 }
1112 }
1113+ return matchedDocuments;
1114+}
1115+
1116+MatchResults Matcher::match(const char *queryAsUtf8) {
1117+ return match(splitToWords(queryAsUtf8));
1118+}
1119+
1120+MatchResults Matcher::match(const std::string &queryAsUtf8) {
1121+ return match(queryAsUtf8.c_str());
1122+}
1123+
1124+
1125+MatchResults Matcher::match(const WordList &query) {
1126+ SearchParameters defaults;
1127+ return match(query, defaults);
1128+}
1129+
1130+ErrorValues& Matcher::getErrorValues() {
1131+ return p->e;
1132+}
1133+
1134+MatchResults Matcher::match(const char *queryAsUtf8, const SearchParameters &params) {
1135+ return match(splitToWords(queryAsUtf8), params);
1136 }
1137
1138 IndexWeights& Matcher::getIndexWeights() {
1139
1140=== added file 'src/SearchParameters.cc'
1141--- src/SearchParameters.cc 1970-01-01 00:00:00 +0000
1142+++ src/SearchParameters.cc 2013-08-09 12:23:15 +0000
1143@@ -0,0 +1,86 @@
1144+/*
1145+ * Copyright (C) 2013 Canonical, Ltd.
1146+ *
1147+ * Authors:
1148+ * Jussi Pakkanen <jussi.pakkanen@canonical.com>
1149+ *
1150+ * This library is free software; you can redistribute it and/or modify it under
1151+ * the terms of version 3 of the GNU Lesser General Public License as published
1152+ * by the Free Software Foundation.
1153+ *
1154+ * This library is distributed in the hope that it will be useful, but WITHOUT
1155+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
1156+ * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
1157+ * details.
1158+ *
1159+ * You should have received a copy of the GNU Lesser General Public License
1160+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
1161+ */
1162+
1163+#include"SearchParameters.hh"
1164+#include"Word.hh"
1165+#include"LevenshteinIndex.hh"
1166+#include"ResultFilter.hh"
1167+#include<set>
1168+
1169+COL_NAMESPACE_START
1170+
1171+using namespace std;
1172+struct SearchParametersPrivate {
1173+ bool dynamic;
1174+ ResultFilter filter;
1175+ set<Word> nosearchFields;
1176+};
1177+
1178+SearchParameters::SearchParameters() {
1179+ p = new SearchParametersPrivate();
1180+ p->dynamic = true;
1181+}
1182+
1183+SearchParameters::~SearchParameters() {
1184+ delete p;
1185+}
1186+
1187+bool SearchParameters::isDynamic() const {
1188+ return p->dynamic;
1189+}
1190+void SearchParameters::setDynamic(bool dyn) {
1191+ p->dynamic = dyn;
1192+}
1193+
1194+/*
1195+ * Long words should allow for more error than short ones.
1196+ * This is a simple function which is meant to be strict
1197+ * so there won't be too many matches.
1198+ */
1199+
1200+int SearchParameters::getDynamicError(const Word &w) const {
1201+ size_t len = w.length();
1202+ if(len < 2)
1203+ return LevenshteinIndex::getDefaultError();
1204+ else
1205+ return 2*LevenshteinIndex::getDefaultError();
1206+}
1207+
1208+ResultFilter& SearchParameters::getResultFilter() {
1209+ return p->filter;
1210+}
1211+
1212+const ResultFilter& SearchParameters::getResultFilter() const {
1213+ return p->filter;
1214+}
1215+
1216+void SearchParameters::addNonsearchingField(const Word &w) {
1217+ p->nosearchFields.insert(w);
1218+}
1219+
1220+bool SearchParameters::isNonsearchingField(const Word &w) const {
1221+ return p->nosearchFields.find(w) != p->nosearchFields.end();
1222+}
1223+
1224+int SearchParameters::looseningIterations() const {
1225+ return 1;
1226+}
1227+
1228+COL_NAMESPACE_END
1229+
1230
1231=== modified file 'src/WordList.cc'
1232--- src/WordList.cc 2012-12-07 11:01:33 +0000
1233+++ src/WordList.cc 2013-08-09 12:23:15 +0000
1234@@ -39,6 +39,10 @@
1235 p->words = wl.p->words;
1236 }
1237
1238+WordList::WordList(WordList &&wl) {
1239+ p = wl.p;
1240+ wl.p = nullptr;
1241+}
1242
1243 WordList::~WordList() {
1244 delete p;
1245@@ -64,6 +68,15 @@
1246 return *this;
1247 }
1248
1249+const WordList& WordList::operator=(WordList &&wl) {
1250+ if(this != &wl) {
1251+ delete p;
1252+ p = wl.p;
1253+ wl.p = nullptr;
1254+ }
1255+ return *this;
1256+}
1257+
1258 bool WordList::operator==(const WordList &l) const {
1259 return p->words == l.p->words;
1260 }
1261
1262=== modified file 'test/CAPITest.c'
1263--- test/CAPITest.c 2013-04-03 13:50:54 +0000
1264+++ test/CAPITest.c 2013-08-09 12:23:15 +0000
1265@@ -98,7 +98,7 @@
1266 void testMatching() {
1267 ColCorpus c = buildCorpus();
1268 ColMatcher m = col_matcher_new();
1269- ColMatchResults matches = col_match_results_new();
1270+ ColMatchResults matches;
1271 DocumentID dFarName = 1000;
1272 DocumentID name1 = 0;
1273 DocumentID name2 = 10;
1274@@ -106,7 +106,7 @@
1275 col_matcher_index(m, c);
1276 col_corpus_delete(c);
1277
1278- col_matcher_match(m, "abe", matches);
1279+ matches = col_matcher_match(m, "abe");
1280 assert(col_match_results_size(matches) == 2);
1281 assert(col_match_results_get_id(matches, 0) != dFarName);
1282 assert(col_match_results_get_id(matches, 1) != dFarName);
1283
1284=== modified file 'test/CMakeLists.txt'
1285--- test/CMakeLists.txt 2013-01-31 10:01:17 +0000
1286+++ test/CMakeLists.txt 2013-08-09 12:23:15 +0000
1287@@ -19,6 +19,7 @@
1288 coltest(indexweights IndexWeightsTest.cc)
1289 coltest(wordstore WordStoreTest.cc)
1290 coltest(filtering ResultFilterTest.cc)
1291+coltest(searchparameters SearchParametersTest.cc)
1292 coltest(capi CAPITest.c)
1293
1294 add_executable(lev_scalability LevScalabilityTest.cc)
1295
1296=== modified file 'test/ErrorValuesTest.cc'
1297--- test/ErrorValuesTest.cc 2013-04-03 13:50:54 +0000
1298+++ test/ErrorValuesTest.cc 2013-08-09 12:23:15 +0000
1299@@ -61,7 +61,7 @@
1300 assert(ev.getSubstituteError(a, aacute) == defaultError);
1301 assert(ev.getSubstituteError(e, aacute) == defaultError);
1302
1303- ev.addLatinAccents();
1304+ ev.addAccents(latinAccentGroup);
1305 assert(ev.isInGroup(e));
1306 assert(ev.isInGroup(eacute));
1307 assert(ev.isInGroup(ebreve));
1308@@ -69,7 +69,6 @@
1309 assert(ev.isInGroup(aacute));
1310 assert(ev.isInGroup(abreve));
1311
1312-
1313 assert(ev.getSubstituteError(e, eacute) == defaultGroupError);
1314 assert(ev.getSubstituteError(eacute, e) == defaultGroupError);
1315 assert(ev.getSubstituteError(eacute, ebreve) == defaultGroupError);
1316@@ -106,12 +105,31 @@
1317 assert(ev.getSubstituteError('j', '6') < ErrorValues::getDefaultError());
1318 }
1319
1320+void testBigError() {
1321+ ErrorValues ev;
1322+ Letter l1 = 1000; // Big values, so they are guaranteed to be outside of the LUT.
1323+ Letter l2 = 10000;
1324+ int smallError = 1;
1325+
1326+ assert(smallError < ErrorValues::getDefaultError());
1327+ assert(ev.getSubstituteError(l1, l2) == ErrorValues::getDefaultError());
1328+ assert(ev.getSubstituteError(l2, l1) == ErrorValues::getDefaultError());
1329+ assert(ev.getSubstituteError(l2, l2) == 0);
1330+
1331+ ev.setError(l1, l2, smallError);
1332+ assert(ev.getSubstituteError(l1, l2) == smallError);
1333+ assert(ev.getSubstituteError(l2, l1) == smallError);
1334+ assert(ev.getSubstituteError(l2, l2) == 0);
1335+
1336+}
1337+
1338 int main(int /*argc*/, char **/*argv*/) {
1339 try {
1340 testError();
1341 testGroupError();
1342 testKeyboardErrors();
1343 testNumberpadErrors();
1344+ testBigError();
1345 } catch(const std::exception &e) {
1346 fprintf(stderr, "Fail: %s\n", e.what());
1347 return 666;
1348
1349=== modified file 'test/HelpersTest.cc'
1350--- test/HelpersTest.cc 2013-04-03 13:50:54 +0000
1351+++ test/HelpersTest.cc 2013-08-09 12:23:15 +0000
1352@@ -25,8 +25,7 @@
1353 using namespace Columbus;
1354
1355 bool splitCorrectly(const char *txt, const WordList &l) {
1356- WordList result;
1357- splitToWords(txt, result);
1358+ WordList result = splitToWords(txt);
1359 return result == l;
1360 }
1361
1362@@ -57,8 +56,7 @@
1363 void testWeirdWord() {
1364 const unsigned char txt[] = {0x42, 0x6c, 0x75, 0x65, 0x73, 0x20, 0xe2, 0x80, 0x9a, 0xc3, 0x84, 0xc3, 0xb2, 0x6e, 0xe2, 0x80,
1365 0x9a, 0xc3, 0x84, 0xc3, 0xb4, 0x20, 0x54, 0x72, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x0d, 0x0a, 0};
1366- WordList l;
1367- splitToWords((const char*)txt, l);
1368+ WordList l = splitToWords((const char*)txt);
1369 assert(l.size() == 3);
1370 }
1371
1372
1373=== modified file 'test/MatchResultsTest.cc'
1374--- test/MatchResultsTest.cc 2013-04-03 13:50:54 +0000
1375+++ test/MatchResultsTest.cc 2013-08-09 12:23:15 +0000
1376@@ -46,9 +46,30 @@
1377 assert(r.getRelevancy(0) == r2);
1378 }
1379
1380+MatchResults gimme() {
1381+ MatchResults m;
1382+ m.addResult(1, 1);
1383+ m.addResult(2, 2);
1384+ return m;
1385+}
1386+
1387+/*
1388+ * For great Valgrind justice.
1389+ */
1390+void testAssignments() {
1391+ MatchResults m1, m2;
1392+ m1.addResult(3, 4);
1393+ m2 = m1;
1394+ MatchResults m3(m1);
1395+ MatchResults m4(m3);
1396+ MatchResults m5(gimme());
1397+ MatchResults m6 = gimme();
1398+}
1399+
1400 int main(int /*argc*/, char **/*argv*/) {
1401 try {
1402 testMatchResult();
1403+ testAssignments();
1404 } catch(const std::exception &e) {
1405 fprintf(stderr, "Fail: %s\n", e.what());
1406 return 666;
1407
1408=== modified file 'test/MatcherTest.cc'
1409--- test/MatcherTest.cc 2013-04-03 13:50:54 +0000
1410+++ test/MatcherTest.cc 2013-08-09 12:23:15 +0000
1411@@ -76,7 +76,7 @@
1412 delete(c);
1413
1414 queryList.addWord(w1);
1415- m.match(queryList, matches);
1416+ matches = m.match(queryList);
1417 assert(matches.size() == 2);
1418 assert(matches.getDocumentID(0) != dFarName);
1419 assert(matches.getDocumentID(1) != dFarName);
1420@@ -99,7 +99,7 @@
1421 delete c;
1422
1423 queryList.addWord(w1);
1424- m.match(queryList, matches);
1425+ matches = m.match(queryList);
1426 assert(matches.size() == 2);
1427 // Document doc1 has an exact match, so it should be the best match.
1428 assert(matches.getRelevancy(0) > matches.getRelevancy(1));
1429@@ -123,7 +123,7 @@
1430 c.addDocument(d2);
1431 m.index(c);
1432
1433- m.match("Sara Michell Geller", matches);
1434+ matches = m.match("Sara Michell Geller");
1435 assert(matches.getDocumentID(0) == correct);
1436 }
1437
1438
1439=== modified file 'test/ResultFilterTest.cc'
1440--- test/ResultFilterTest.cc 2013-04-03 13:50:54 +0000
1441+++ test/ResultFilterTest.cc 2013-08-09 12:23:15 +0000
1442@@ -17,6 +17,7 @@
1443 * along with this program. If not, see <http://www.gnu.org/licenses/>.
1444 */
1445
1446+#include "SearchParameters.hh"
1447 #include "ResultFilter.hh"
1448 #include "Word.hh"
1449 #include "Document.hh"
1450@@ -44,8 +45,8 @@
1451 Document d2(2);
1452 Corpus c;
1453 Matcher m;
1454- ResultFilter emptyFilter;
1455- ResultFilter onlyTakeFirst, onlyTakeSecond, orTest, andTest;
1456+ SearchParameters emptyFilter;
1457+ SearchParameters onlyTakeFirst, onlyTakeSecond, orTest, andTest;
1458
1459 d1.addText(textField, txt);
1460 d1.addText(filterField1, val1str);
1461@@ -57,33 +58,28 @@
1462 c.addDocument(d2);
1463
1464 m.index(c);
1465- MatchResults r1;
1466- m.match(txt, r1, emptyFilter);
1467+ MatchResults r1 = m.match(txt, emptyFilter);
1468 assert(r1.size() == 2);
1469
1470- onlyTakeFirst.addNewSubTerm(filterField1, val1);
1471- MatchResults r2;
1472- m.match(txt, r2, onlyTakeFirst);
1473+ onlyTakeFirst.getResultFilter().addNewSubTerm(filterField1, val1);
1474+ MatchResults r2 = m.match(txt, onlyTakeFirst);
1475 assert(r2.size() == 1);
1476 assert(r2.getDocumentID(0) == 1);
1477
1478- onlyTakeSecond.addNewSubTerm(filterField1, val2);
1479- MatchResults r3;
1480- m.match(txt, r3, onlyTakeSecond);
1481+ onlyTakeSecond.getResultFilter().addNewSubTerm(filterField1, val2);
1482+ MatchResults r3 = m.match(txt, onlyTakeSecond);
1483 assert(r3.size() == 1);
1484 assert(r3.getDocumentID(0) == 2);
1485
1486- orTest.addNewSubTerm(filterField1, val1);
1487- orTest.addNewTerm();
1488- orTest.addNewSubTerm(filterField1, val2);
1489- MatchResults orResults;
1490- m.match(txt, orResults, orTest);
1491+ orTest.getResultFilter().addNewSubTerm(filterField1, val1);
1492+ orTest.getResultFilter().addNewTerm();
1493+ orTest.getResultFilter().addNewSubTerm(filterField1, val2);
1494+ MatchResults orResults = m.match(txt, orTest);
1495 assert(orResults.size() == 2);
1496
1497- andTest.addNewSubTerm(filterField2, val2);
1498- andTest.addNewSubTerm(filterField1, val1);
1499- MatchResults andResults;
1500- m.match(txt, andResults, andTest);
1501+ andTest.getResultFilter().addNewSubTerm(filterField2, val2);
1502+ andTest.getResultFilter().addNewSubTerm(filterField1, val1);
1503+ MatchResults andResults = m.match(txt, andTest);
1504 assert(andResults.size() == 0);
1505 }
1506
1507
1508=== added file 'test/SearchParametersTest.cc'
1509--- test/SearchParametersTest.cc 1970-01-01 00:00:00 +0000
1510+++ test/SearchParametersTest.cc 2013-08-09 12:23:15 +0000
1511@@ -0,0 +1,86 @@
1512+/*
1513+ * Copyright (C) 2013 Canonical, Ltd.
1514+ *
1515+ * Authors:
1516+ * Jussi Pakkanen <jussi.pakkanen@canonical.com>
1517+ *
1518+ * This library is free software; you can redistribute it and/or modify it under
1519+ * the terms of version 3 of the GNU Lesser General Public License as published
1520+ * by the Free Software Foundation.
1521+ *
1522+ * This library is distributed in the hope that it will be useful, but WITHOUT
1523+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
1524+ * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
1525+ * details.
1526+ *
1527+ * You should have received a copy of the GNU Lesser General Public License
1528+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
1529+ */
1530+
1531+#include"SearchParameters.hh"
1532+#include"Word.hh"
1533+#include"Matcher.hh"
1534+#include"Document.hh"
1535+#include"Corpus.hh"
1536+#include"MatchResults.hh"
1537+#include<cassert>
1538+
1539+using namespace Columbus;
1540+
1541+void testDynamic() {
1542+ SearchParameters sp;
1543+ assert(sp.isDynamic());
1544+
1545+ sp.setDynamic(false);
1546+ assert(!sp.isDynamic());
1547+
1548+ sp.setDynamic(true);
1549+ assert(sp.isDynamic());
1550+}
1551+
1552+void testNosearch() {
1553+ SearchParameters sp;
1554+ Word w1("abc");
1555+ Word w2("def");
1556+
1557+ assert(!sp.isNonsearchingField(w1));
1558+ assert(!sp.isNonsearchingField(w2));
1559+
1560+ sp.addNonsearchingField(w1);
1561+ assert(sp.isNonsearchingField(w1));
1562+ assert(!sp.isNonsearchingField(w2));
1563+
1564+ sp.addNonsearchingField(w2);
1565+ assert(sp.isNonsearchingField(w1));
1566+ assert(sp.isNonsearchingField(w2));
1567+}
1568+
1569+void testNosearchMatching() {
1570+ Word textField("text");
1571+ Word search("field1");
1572+ Word nonSearch("field2");
1573+ const char *val1str = "one";
1574+ Corpus c;
1575+ Matcher m;
1576+ SearchParameters sp;
1577+ MatchResults r;
1578+ Document d1(1);
1579+ Document d2(2);
1580+
1581+ sp.addNonsearchingField(nonSearch);
1582+ d1.addText(search, val1str);
1583+ d2.addText(nonSearch, val1str);
1584+ c.addDocument(d1);
1585+ c.addDocument(d2);
1586+ m.index(c);
1587+
1588+ r = m.match(val1str, sp);
1589+ assert(r.size() == 1);
1590+ assert(r.getDocumentID(0) == 1);
1591+}
1592+
1593+int main(int /*argc*/, char **/*argv*/) {
1594+ testDynamic();
1595+ testNosearch();
1596+ testNosearchMatching();
1597+}
1598
1599=== modified file 'test/pythontest.py'
1600--- test/pythontest.py 2013-01-23 13:50:20 +0000
1601+++ test/pythontest.py 2013-08-09 12:23:15 +0000
1602@@ -89,7 +89,7 @@
1603 def test_doc(self):
1604 docid = 435
1605 field = columbus.Word('fieldname')
1606- text = columbus.split_to_words('ye olde butcherede englishe')
1607+ text = 'ye olde butcherede englishe'
1608 d = columbus.Document(docid)
1609
1610 self.assertEqual(d.get_id(), docid, 'Document ID got mangled.')
1611@@ -98,7 +98,7 @@
1612 d.add_text(field, text)
1613 self.assertEqual(d.field_count(), 1, 'field count did not increase')
1614 self.assertGreater(len(text), 0)
1615- self.assertEqual(len(d.get_text(field)), len(text), 'stored text got mangled')
1616+ self.assertEqual(len(d.get_text(field)), len(text.split()), 'stored text got mangled')
1617
1618 class TestCorpus(unittest.TestCase):
1619
1620@@ -138,24 +138,23 @@
1621 def test_simple_match(self):
1622 c = columbus.Corpus()
1623 m = columbus.Matcher()
1624- matches = columbus.MatchResults()
1625 name1 = 0;
1626 name2 = 10;
1627 name3 = 1000;
1628 textName = columbus.Word("title")
1629
1630 d1 = columbus.Document(name1)
1631- d1.add_text(textName, columbus.split_to_words("abc def"))
1632+ d1.add_text(textName, "abc def")
1633 d2 = columbus.Document(name2)
1634- d2.add_text(textName, columbus.split_to_words("abe test"))
1635+ d2.add_text(textName, "abe test")
1636 dFar = columbus.Document(name3)
1637- dFar.add_text(textName, columbus.split_to_words("faraway donotmatchme"))
1638+ dFar.add_text(textName, "faraway donotmatchme")
1639 c.add_document(d1)
1640 c.add_document(d2)
1641 c.add_document(dFar)
1642 m.index(c)
1643
1644- m.match(columbus.split_to_words("abe"), matches)
1645+ matches = m.match("abe")
1646 self.assertEqual(len(matches), 2)
1647 self.assertNotEqual(matches.get_document_id(0), name3);
1648 self.assertNotEqual(matches.get_document_id(1), name3);
1649
1650=== modified file 'tools/hudtest.cc'
1651--- tools/hudtest.cc 2013-04-03 13:50:54 +0000
1652+++ tools/hudtest.cc 2013-08-09 12:23:15 +0000
1653@@ -64,7 +64,7 @@
1654 double queryStart, queryEnd;
1655 try {
1656 queryStart = hiresTimestamp();
1657- app->m->match(gtk_entry_get_text(GTK_ENTRY(app->entry)), matches);
1658+ matches = app->m->match(gtk_entry_get_text(GTK_ENTRY(app->entry)));
1659 queryEnd = hiresTimestamp();
1660 } catch(exception &e) {
1661 printf("Matching failed: %s\n", e.what());
1662@@ -181,8 +181,8 @@
1663 if(line[line.size()-2] == '\r')
1664 line[line.size()-2] = '\0';
1665 splitShowableParts(line, pathText, commandText);
1666- splitToWords(pathText.c_str(), path);
1667- splitToWords(commandText.c_str(), command);
1668+ path = splitToWords(pathText.c_str());
1669+ command = splitToWords(commandText.c_str());
1670 if(command.size() == 0)
1671 continue;
1672 Document d(app.pathSource.size());
1673
1674=== modified file 'tools/numberpad.cc'
1675--- tools/numberpad.cc 2013-04-03 13:50:54 +0000
1676+++ tools/numberpad.cc 2013-08-09 12:23:15 +0000
1677@@ -89,7 +89,7 @@
1678 double queryStart, queryEnd;
1679 try {
1680 queryStart = hiresTimestamp();
1681- app->m->match(gtk_entry_get_text(GTK_ENTRY(app->entry)), matches);
1682+ matches = app->m->match(gtk_entry_get_text(GTK_ENTRY(app->entry)));
1683 queryEnd = hiresTimestamp();
1684 } catch(exception &e) {
1685 printf("Matching failed: %s\n", e.what());
1686
1687=== modified file 'tools/queryapp.cc'
1688--- tools/queryapp.cc 2013-04-03 13:50:54 +0000
1689+++ tools/queryapp.cc 2013-08-09 12:23:15 +0000
1690@@ -61,7 +61,7 @@
1691 double queryStart, queryEnd;
1692 try {
1693 queryStart = hiresTimestamp();
1694- app->m->match(gtk_entry_get_text(GTK_ENTRY(app->entry)), matches);
1695+ matches = app->m->match(gtk_entry_get_text(GTK_ENTRY(app->entry)));
1696 queryEnd = hiresTimestamp();
1697 } catch(exception &e) {
1698 printf("Matching failed: %s\n", e.what());
1699
1700=== modified file 'tools/sctest.cc'
1701--- tools/sctest.cc 2013-04-03 13:50:54 +0000
1702+++ tools/sctest.cc 2013-08-09 12:23:15 +0000
1703@@ -88,7 +88,7 @@
1704 double queryStart, queryEnd;
1705 try {
1706 queryStart = hiresTimestamp();
1707- app->m->match(gtk_entry_get_text(GTK_ENTRY(app->entry)), matches);
1708+ matches = app->m->match(gtk_entry_get_text(GTK_ENTRY(app->entry)));
1709 queryEnd = hiresTimestamp();
1710 } catch(exception &e) {
1711 printf("Matching failed: %s\n", e.what());
1712@@ -176,7 +176,7 @@
1713 Word n;
1714 size_t equalsLoc = line.find('=', 0);
1715 if(equalsLoc < line.length()) {
1716- splitToWords(line.c_str() + equalsLoc + 1, vals);
1717+ vals = splitToWords(line.c_str() + equalsLoc + 1);
1718 line[equalsLoc] = '\0';
1719 try {
1720 n = line.c_str();

Subscribers

People subscribed via source and target branches

to all changes: