Merge lp:~zorba-coders/zorba/no_unicode into lp:zorba

Proposed by Rodolfo Ochoa
Status: Merged
Approved by: Rodolfo Ochoa
Approved revision: 10540
Merged at revision: 10761
Proposed branch: lp:~zorba-coders/zorba/no_unicode
Merge into: lp:zorba
Diff against target: 9007 lines (+3904/-1422)
269 files modified
CMakeConfiguration.txt (+5/-5)
CMakeLists.txt (+6/-2)
ChangeLog (+7/-0)
KNOWN_ISSUES.txt (+1/-1)
doc/cxx/examples/context.cpp (+4/-0)
include/zorba/config.h.cmake (+3/-1)
include/zorba/static_context.h (+4/-0)
include/zorba/util/time.h (+1/-1)
src/CMakeLists.txt (+4/-0)
src/api/serialization/serializer.cpp (+36/-33)
src/api/serialization/serializer.h (+2/-4)
src/diagnostics/diagnostic_en.xml (+116/-27)
src/diagnostics/pregenerated/dict_en.cpp (+98/-20)
src/precompiled/stdafx.h (+74/-356)
src/runtime/full_text/CMakeLists.txt (+3/-3)
src/runtime/full_text/default_tokenizer.cpp (+4/-4)
src/runtime/full_text/latin_tokenizer.cpp (+3/-2)
src/runtime/full_text/latin_tokenizer.h (+9/-8)
src/runtime/numerics/format_integer_impl.cpp (+1/-1)
src/runtime/numerics/numerics_impl.cpp (+1/-1)
src/runtime/strings/strings_impl.cpp (+58/-20)
src/store/api/store.h (+1/-1)
src/store/naive/simple_store.h (+7/-3)
src/store/naive/store.h (+12/-11)
src/system/globalenv.cpp (+7/-7)
src/unit_tests/CMakeLists.txt (+2/-2)
src/unit_tests/string.cpp (+8/-0)
src/unit_tests/unit_test_list.h (+2/-2)
src/unit_tests/unit_tests.cpp (+2/-2)
src/util/CMakeLists.txt (+4/-4)
src/util/icu_streambuf.h (+1/-0)
src/util/passthru_streambuf.cpp (+2/-2)
src/util/passthru_streambuf.h (+10/-2)
src/util/regex.cpp (+96/-82)
src/util/regex.h (+22/-34)
src/util/regex_xquery.cpp (+1860/-489)
src/util/regex_xquery.h (+359/-123)
src/util/transcode_streambuf.h (+5/-5)
src/util/unicode_categories.cpp (+3/-3)
src/util/unicode_categories.h (+44/-37)
src/util/unicode_util.cpp (+20/-2)
src/util/unicode_util.h (+47/-15)
src/util/utf8_util.cpp (+6/-6)
src/util/utf8_util.h (+29/-13)
src/util/utf8_util.tcc (+10/-2)
src/zorbatypes/collation_manager.cpp (+17/-17)
src/zorbatypes/collation_manager.h (+3/-3)
src/zorbatypes/libicu.h (+0/-32)
src/zorbatypes/transcoder.cpp (+8/-4)
src/zorbatypes/transcoder.h (+9/-9)
src/zorbautils/hashmap_itemh.h (+1/-1)
src/zorbautils/string_util.cpp (+19/-18)
src/zorbautils/string_util.h (+15/-1)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a1.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a10.xml.res (+242/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a11.xml.res (+6/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a2.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a3.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a5.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a6.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a7.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a8.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a9.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m1.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m10.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m11.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m12.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m13.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m14.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m15.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m16.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m17.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m18.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m19.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m2.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m20.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m21.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m22.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m23.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m24.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m25.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m26.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m27.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m28.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m29.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m3.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m30.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m31.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m32.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m33.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m34.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m35.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m36.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m37.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m38.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m39.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m4.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m40.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m41.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m42.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m43.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m44.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m45.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m46.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m47.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m48.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m49.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m5.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m50.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m51.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m52.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m53.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m6.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m7.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m8.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m9.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_prime1.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r1.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r10.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r11.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r12.xml.res (+5/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r2.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r3.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r4.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r5.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r6.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r9.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_t1.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_t4.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_t5.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/testdriver/bom_bug.xml.res (+1/-0)
test/rbkt/Queries/CMakeLists.txt (+16/-1)
test/rbkt/Queries/zorba/string/Regex/regex_a1.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a10.xq (+11/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a11.xq (+9/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a2.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a3.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a5.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a6.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a7.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a8.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a9.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err1.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err1.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err10.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err10.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err11.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err11.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err12.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err12.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err13.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err13.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err14.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err14.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err15.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err15.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err16.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err16.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err17.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err17.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err18.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err18.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err19.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err19.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err2.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err2.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err20.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err20.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err21.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err21.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err22.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err22.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err23.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err23.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err24.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err24.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err25.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err25.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err3.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err3.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err4.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err4.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err5.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err5.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err7.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err7.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err8.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err8.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err9.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err9.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m1.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m10.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m11.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m12.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m13.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m14.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m15.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m16.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m17.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m18.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m19.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m2.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m20.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m21.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m22.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m23.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m24.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m25.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m26.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m27.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m28.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m29.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m3.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m30.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m31.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m32.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m33.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m34.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m35.xq (+4/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m36.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m37.xq (+4/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m38.xq (+4/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m39.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m4.xq (+6/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m40.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m41.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m42.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m43.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m44.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m45.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m46.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m47.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m48.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m49.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m5.xq (+6/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m50.xq (+2/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m51.xq (+2/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m52.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m53.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m6.xq (+6/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m7.xq (+6/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m8.xq (+7/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m9.xq (+7/-0)
test/rbkt/Queries/zorba/string/Regex/regex_prime1.xq (+17/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r1.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r10.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r11.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r12.xq (+7/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r2.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r3.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r4.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r5.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r6.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r7_err.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r7_err.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r8_err.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r8_err.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r9.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_t1.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_t2.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_t3_err.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_t3_err.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_t4.xq (+2/-0)
test/rbkt/Queries/zorba/string/Regex/regex_t5.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/zorba.html (+242/-0)
test/rbkt/Queries/zorba/string/Regex/zorba2.html (+5/-0)
test/rbkt/Queries/zorba/testdriver/bom_bug.xq (+1/-0)
test/unit/static_context.cpp (+2/-0)
test/update/CMakeLists.txt (+9/-0)
To merge this branch: bzr merge lp:~zorba-coders/zorba/no_unicode
Reviewer Review Type Date Requested Status
Rodolfo Ochoa Approve
Markos Zaharioudakis Approve
Review via email: mp+101588@code.launchpad.net

This proposal supersedes a proposal from 2012-04-07.

Commit message

"No Unicode" is now "No ICU."
Added a q-flag fix for a previously undiscovered bug.

Description of the change

"No Unicode" is now "No ICU."
Added a q-flag fix for a previously undiscovered bug.

To post a comment you must log in.
Revision history for this message
Matthias Brantner (matthias-brantner) wrote : Posted in a previous version of this proposal

Compiling with ZORBA_NO_ICU=ON fails on Linux:

[ 1%] Building CXX object src/CMakeFiles/zorba_simplestore.dir/api/zorba_string.cpp.o
In file included from /home/mbrantner/zorba/sandbox/src/util/regex.h:501:0,
                 from /home/mbrantner/zorba/sandbox/src/api/zorba_string.cpp:23:
/home/mbrantner/zorba/sandbox/src/util/regex_xquery.h:209:3: error: a class-key must be used when declaring a friend
/home/mbrantner/zorba/sandbox/src/util/regex_xquery.h:209:3: error: friend declaration does not name a class or function
/home/mbrantner/zorba/sandbox/src/util/regex_xquery.h:253:3: error: a class-key must be used when declaring a friend
/home/mbrantner/zorba/sandbox/src/util/regex_xquery.h:253:3: error: friend declaration does not name a class or function
make[2]: *** [src/CMakeFiles/zorba_simplestore.dir/api/zorba_string.cpp.o] Erro

Revision history for this message
Matthias Brantner (matthias-brantner) : Posted in a previous version of this proposal
review: Needs Fixing
Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote : Posted in a previous version of this proposal

There are additional revisions which have not been approved in review. Please seek review and approval of these new revisions.

Revision history for this message
Matthias Brantner (matthias-brantner) wrote : Posted in a previous version of this proposal

The test suite doesn't run clean on my system (Linux) without ICU. This prevents us from adding the build to the remote queue. For example, the following tests fail without ICU (some of them also seem to fail with ICU):

 1294 - test/rbkt/zorba/string/Regex/regex_a10 (Failed)
 1548 - test/rbkt/zorba/fulltext/ft-wildcard-true-2 (Failed)
 1560 - test/rbkt/zorba/fulltext/ft-wildcard-true-4 (Failed)
 1574 - test/rbkt/zorba/fulltext/ft-same-sentence-true-4 (Failed)
 1581 - test/rbkt/zorba/fulltext/ft-wildcard-true-3 (Failed)
 1587 - test/rbkt/zorba/fulltext/ft-wildcard-true-9 (Failed)
 1600 - test/rbkt/zorba/fulltext/ft-diacritics-insensitive-true-1 (Failed)
 1605 - test/rbkt/zorba/fulltext/ft-wildcard-true-8 (Failed)
 1612 - test/rbkt/zorba/fulltext/ft-wildcard-true-10 (Failed)
 1635 - test/rbkt/zorba/fulltext/ft-wildcard-true-7 (Failed)
 1637 - test/rbkt/zorba/fulltext/ft-wildcard-true-11 (Failed)
 1643 - test/rbkt/zorba/fulltext/ft-wildcard-FTDY0020-3 (Failed)
 1789 - test/rbkt/zorba/index/numbers (Failed)
 2345 - test/unit/string_test (Failed)
 2534 - test/update/zorba/store/sc3 (Failed)
 2544 - doc/cxx/examples/context.cpp (Failed)

Please make sure the test suite runs clean.

review: Needs Fixing
Revision history for this message
Paul J. Lucas (paul-lucas) wrote : Posted in a previous version of this proposal

Try it now.

Revision history for this message
Daniel Turcanu (danielturcanu) wrote : Posted in a previous version of this proposal

Before committing this branch, the branch lp:~danielturcanu/zorba/my_conv_module should be merged.

Revision history for this message
Chris Hillery (ceejatec) wrote : Posted in a previous version of this proposal

FWIW, I've skimmed the change for CMake-related changes, and they all look fine (mostly quite trivial).

Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote : Posted in a previous version of this proposal

Attempt to merge into lp:zorba failed due to conflicts:

text conflict in ChangeLog

Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote : Posted in a previous version of this proposal
Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote : Posted in a previous version of this proposal

The attempt to merge lp:~zorba-coders/zorba/no_unicode into lp:zorba failed. Below is the output from the failed tests.

CMake Error at /home/ceej/zo/testing/zorbatest/tester/TarmacLander.cmake:274 (message):
  Validation queue job no_unicode-2012-03-30T19-15-23.23Z is finished. The
  final status was:

  6 tests did not succeed - changes not commited.

Error in read script: /home/ceej/zo/testing/zorbatest/tester/TarmacLander.cmake

Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote : Posted in a previous version of this proposal
Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote : Posted in a previous version of this proposal

The attempt to merge lp:~zorba-coders/zorba/no_unicode into lp:zorba failed. Below is the output from the failed tests.

CMake Error at /home/ceej/zo/testing/zorbatest/tester/TarmacLander.cmake:274 (message):
  Validation queue job no_unicode-2012-04-03T15-17-37.639Z is finished. The
  final status was:

  6 tests did not succeed - changes not commited.

Error in read script: /home/ceej/zo/testing/zorbatest/tester/TarmacLander.cmake

Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote : Posted in a previous version of this proposal
Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote : Posted in a previous version of this proposal

The attempt to merge lp:~zorba-coders/zorba/no_unicode into lp:zorba failed. Below is the output from the failed tests.

CMake Error at /home/ceej/zo/testing/zorbatest/tester/TarmacLander.cmake:274 (message):
  Validation queue job no_unicode-2012-04-06T00-21-13.829Z is finished. The
  final status was:

  6 tests did not succeed - changes not commited.

Error in read script: /home/ceej/zo/testing/zorbatest/tester/TarmacLander.cmake

Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote : Posted in a previous version of this proposal

There are additional revisions which have not been approved in review. Please seek review and approval of these new revisions.

Revision history for this message
Markos Zaharioudakis (markos-za) :
review: Approve
Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :
Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :

The attempt to merge lp:~zorba-coders/zorba/no_unicode into lp:zorba failed. Below is the output from the failed tests.

CMake Error at /home/ceej/zo/testing/zorbatest/tester/TarmacLander.cmake:274 (message):
  Validation queue job no_unicode-2012-04-11T22-01-43.083Z is finished. The
  final status was:

  No tests were run - build or configure step must have failed.

  Not commiting changes.

Error in read script: /home/ceej/zo/testing/zorbatest/tester/TarmacLander.cmake

Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :
Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :

The attempt to merge lp:~zorba-coders/zorba/no_unicode into lp:zorba failed. Below is the output from the failed tests.

CMake Error at /home/ceej/zo/testing/zorbatest/tester/TarmacLander.cmake:274 (message):
  Validation queue job no_unicode-2012-04-11T23-27-15.591Z is finished. The
  final status was:

  No tests were run - build or configure step must have failed.

  Not commiting changes.

Error in read script: /home/ceej/zo/testing/zorbatest/tester/TarmacLander.cmake

Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :

Attempt to merge into lp:zorba failed due to conflicts:

text conflict in src/zorbautils/hashmap_itemh.h

Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :
Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :

The attempt to merge lp:~zorba-coders/zorba/no_unicode into lp:zorba failed. Below is the output from the failed tests.

CMake Error at /home/ceej/zo/testing/zorbatest/tester/TarmacLander.cmake:274 (message):
  Validation queue job no_unicode-2012-04-12T02-15-23.414Z is finished. The
  final status was:

  6 tests did not succeed - changes not commited.

Error in read script: /home/ceej/zo/testing/zorbatest/tester/TarmacLander.cmake

Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :
Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :

Validation queue job no_unicode-2012-04-13T23-51-19.988Z is finished. The final status was:

All tests succeeded!

Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :

Voting does not meet specified criteria. Required: Approve > 1, Disapprove < 1, Needs Fixing < 1, Pending < 1. Got: 2 Approve, 1 Pending.

Revision history for this message
Rodolfo Ochoa (rodolfo-ochoa) :
review: Approve
Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :
Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :

Validation queue job no_unicode-2012-04-14T01-06-17.625Z is finished. The final status was:

All tests succeeded!

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'CMakeConfiguration.txt'
2--- CMakeConfiguration.txt 2012-03-28 05:19:57 +0000
3+++ CMakeConfiguration.txt 2012-04-13 19:45:38 +0000
4@@ -135,14 +135,14 @@
5 SET (ZORBA_DEBUG_STRING ${ZORBA_DEBUG_STRING} CACHE BOOL "debug strings")
6 MESSAGE (STATUS "ZORBA_DEBUG_STRING: " ${ZORBA_DEBUG_STRING})
7
8-SET(ZORBA_NO_UNICODE OFF CACHE BOOL "disable ICU")
9-MESSAGE(STATUS "ZORBA_NO_UNICODE: " ${ZORBA_NO_UNICODE})
10+SET(ZORBA_NO_ICU OFF CACHE BOOL "disable ICU")
11+MESSAGE(STATUS "ZORBA_NO_ICU: " ${ZORBA_NO_ICU})
12
13-IF (ZORBA_NO_UNICODE)
14+IF (ZORBA_NO_ICU)
15 SET (no_full_text ON)
16-ELSE (ZORBA_NO_UNICODE)
17+ELSE (ZORBA_NO_ICU)
18 SET (no_full_text OFF)
19-ENDIF (ZORBA_NO_UNICODE)
20+ENDIF (ZORBA_NO_ICU)
21 SET (ZORBA_NO_FULL_TEXT ${no_full_text} CACHE BOOL "disable XQuery Full-Text support")
22 MESSAGE(STATUS "ZORBA_NO_FULL_TEXT: " ${ZORBA_NO_FULL_TEXT})
23
24
25=== modified file 'CMakeLists.txt'
26--- CMakeLists.txt 2012-03-28 05:19:57 +0000
27+++ CMakeLists.txt 2012-04-13 19:45:38 +0000
28@@ -123,10 +123,14 @@
29 CHECK_TYPE_SIZE("int64_t" ZORBA_HAVE_INT64_T)
30
31 CHECK_CXX_SOURCE_COMPILES ("#include <type_traits>\nint main() { std::enable_if<true,int> x; }" ZORBA_CXX_ENABLE_IF)
32-CHECK_CXX_SOURCE_COMPILES ("int main() { int *p = nullptr; }" ZORBA_CXX_NULLPTR)
33-CHECK_CXX_SOURCE_COMPILES ("int main() { static_assert(1,\"\"); }" ZORBA_CXX_STATIC_ASSERT)
34+SET(CMAKE_EXTRA_INCLUDE_FILES wchar.h)
35+CHECK_TYPE_SIZE("wchar_t" ZORBA_SIZEOF_WCHAR_T)
36+SET(CMAKE_EXTRA_INCLUDE_FILES)
37 CHECK_CXX_SOURCE_COMPILES ("#include <memory>\nint main() { std::unique_ptr<int> p; }" ZORBA_CXX_UNIQUE_PTR)
38
39+CHECK_CXX_SOURCE_COMPILES("int main() { int *p = nullptr; }" ZORBA_CXX_NULLPTR)
40+CHECK_CXX_SOURCE_COMPILES("int main() { static_assert(1,\"\"); }" ZORBA_CXX_STATIC_ASSERT)
41+
42 ################################################################################
43 # Various cmake macros
44
45
46=== modified file 'ChangeLog'
47--- ChangeLog 2012-04-13 11:34:54 +0000
48+++ ChangeLog 2012-04-13 19:45:38 +0000
49@@ -4,6 +4,7 @@
50
51 New Features:
52 * Extended API for Python, Java, PHP and Ruby.
53+ * Added support for NO_ICU (to not use ICU for unicode processing)
54
55 Optimization:
56
57@@ -154,7 +155,9 @@
58 * Fixed bug when parsing a document with a base-uri attribute.
59 * Fixed bug #863320 (Sentence is incorrectly incremented when token characters end without sentence terminator)
60 * Fixed bug #863730 (static delete-node* functions don't raise ZDDY0012)
61+ * Implemented the probe-index-range-value for general indexes
62 * Removed ZSTR0005 and ZSTR0006 error codes
63+ * Fixed bug #867662 ("nullptr" warning)
64 * Fixed bug #868258 (Assertion failure with two delete collection)
65 * Fixed bug #871623 and #871629 (assertion failures with insertions in dynamic collections)
66 * Fixed bug #867262 (allow reuse of iterator over ExtFuncArgItemSequence)
67@@ -163,6 +166,8 @@
68 * New node-reference module. References can be obtained for any node, and
69 different nodes cannot have the same identifier.
70 * Fixed bug #872697 (segmentation fault with validation of NMTOKENS)
71+ * General index cannot be declared as unique if the type of its key is
72+ xs:anyAtomicType or xs:untypedAtomic.
73 * Added undo for node revalidation
74 * Optimization for count(collection()) expressions
75 * Fixed bug #872796 (validate-in-place can interfere with other update primitives)
76@@ -181,6 +186,8 @@
77 * Fixed bug #855715 (Invalid escaped characters in regex not caught)
78 * Fixed bug #862089 (Split binary/xq install directories for modules) by
79 splitting "module path" into separate URI and Library paths
80+ * New node-position module. This module allows to obtain a representation of a node position, which
81+ can be used to assess structural relationships with other nodes.
82 * Fixed bug #872502 (validation of the JSON module xqdoc fails)
83 * Fixed bug #897619 (testdriver_mt can not run the XQueryX tests)
84 * Fixed bug #867107 (xqdoc dependency to zorba is wrong)
85
86=== modified file 'KNOWN_ISSUES.txt'
87--- KNOWN_ISSUES.txt 2012-03-28 05:19:57 +0000
88+++ KNOWN_ISSUES.txt 2012-04-13 19:45:38 +0000
89@@ -37,7 +37,7 @@
90 * The serializer currently doesn't implement character maps as specified
91 (http://www.w3.org/TR/xslt-xquery-serialization/#character-maps)
92
93-* In the 2.0 release, setting the CMake variables ZORBA_NO_UNICODE to
94+* In the 2.0 release, setting the CMake variables ZORBA_NO_ICU to
95 ON is not supported.
96
97 * The PHP language binding is not supported on Mac OS X. For details,
98
99=== modified file 'doc/cxx/examples/context.cpp'
100--- doc/cxx/examples/context.cpp 2012-03-28 05:19:57 +0000
101+++ doc/cxx/examples/context.cpp 2012-04-13 19:45:38 +0000
102@@ -149,7 +149,11 @@
103 outStream2 << lQuery << std::endl;
104 std::cout << outStream2.str() << std::endl;
105
106+#ifndef ZORBA_NO_ICU
107 if (outStream2.str() != "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\nBook 1.1\n")
108+#else
109+ if (outStream2.str() != "<?xml version=\"1.0\"?>\nBook 1.1\n")
110+#endif /* ZORBA_NO_ICU */
111 {
112 std::cerr << "Test 4 failed with a wrong result : " << std::endl
113 << outStream2.str() << std::endl;
114
115=== modified file 'include/zorba/config.h.cmake'
116--- include/zorba/config.h.cmake 2012-03-28 05:19:57 +0000
117+++ include/zorba/config.h.cmake 2012-04-13 19:45:38 +0000
118@@ -96,6 +96,8 @@
119 typedef __int64 int64_t;
120 #endif /* ZORBA_HAVE_INT64_T */
121
122+#cmakedefine ZORBA_SIZEOF_WCHAR_T @ZORBA_SIZEOF_WCHAR_T@
123+
124 // Compiler
125 #cmakedefine CLANG
126 #cmakedefine MSVC
127@@ -148,7 +150,7 @@
128
129 // Zorba features
130 #cmakedefine ZORBA_NO_FULL_TEXT
131-#cmakedefine ZORBA_NO_UNICODE
132+#cmakedefine ZORBA_NO_ICU
133 #cmakedefine ZORBA_NO_XMLSCHEMA
134 #cmakedefine ZORBA_NUMERIC_OPTIMIZATION
135 #cmakedefine ZORBA_VERIFY_PEER_SSL_CERTIFICATE
136
137=== modified file 'include/zorba/static_context.h'
138--- include/zorba/static_context.h 2012-04-13 09:11:32 +0000
139+++ include/zorba/static_context.h 2012-04-13 19:45:38 +0000
140@@ -26,9 +26,13 @@
141 #include <zorba/function.h>
142 #include <zorba/annotation.h>
143 #include <zorba/smart_ptr.h>
144+#include <zorba/smart_ptr.h>
145 #ifndef ZORBA_NO_FULL_TEXT
146 #include <zorba/thesaurus.h>
147 #endif /* ZORBA_NO_FULL_TEXT */
148+#include <zorba/zorba.h>
149+#include <zorba/store_manager.h>
150+#include <zorba/zorba_exception.h>
151
152 namespace zorba {
153
154
155=== modified file 'include/zorba/util/time.h'
156--- include/zorba/util/time.h 2012-03-28 05:19:57 +0000
157+++ include/zorba/util/time.h 2012-04-13 19:45:38 +0000
158@@ -178,7 +178,7 @@
159
160 inline long get_walltime_in_millis(const walltime& t)
161 {
162- return t.time * 1000 + t.millitm;
163+ return (long)(t.time * 1000 + t.millitm);
164 }
165
166 #else /* not Windows, and no clock_gettime() */
167
168=== modified file 'src/CMakeLists.txt'
169--- src/CMakeLists.txt 2012-03-28 05:19:57 +0000
170+++ src/CMakeLists.txt 2012-04-13 19:45:38 +0000
171@@ -59,7 +59,10 @@
172 #
173 # Next, add the files to be compiled into the library
174 #
175+
176+MESSAGE(STATUS "PRECOMPILED HEADERS: " ${ZORBA_PRECOMPILED_HEADERS})
177 SET(ZORBA_PRECOMPILED_HEADERS OFF CACHE BOOL "Activate Zorba precompiled headers.")
178+MESSAGE(STATUS "PRECOMPILED HEADERS: " ${ZORBA_PRECOMPILED_HEADERS})
179
180 SET(ZORBA_SRCS)
181 ADD_SRC_SUBFOLDER(ZORBA_SRCS api API_SRCS)
182@@ -97,6 +100,7 @@
183 ENDIF(ZORBA_WITH_DEBUGGER)
184 ADD_SRC_SUBFOLDER(ZORBA_SRCS unit_tests UNIT_TEST_SRCS)
185
186+MESSAGE(STATUS "PRECOMPILED HEADERS: " ${ZORBA_PRECOMPILED_HEADERS})
187 IF(ZORBA_PRECOMPILED_HEADERS)
188 ADD_SRC_SUBFOLDER(ZORBA_SRCS precompiled ZORBAMISC_SRCS)
189 INCLUDE_DIRECTORIES("${CMAKE_SOURCE_DIR}/src/precompiled")
190
191=== modified file 'src/api/serialization/serializer.cpp'
192--- src/api/serialization/serializer.cpp 2012-03-28 05:19:57 +0000
193+++ src/api/serialization/serializer.cpp 2012-04-13 19:45:38 +0000
194@@ -180,7 +180,6 @@
195 for (; chars < chars_end; chars++ )
196 {
197
198-#ifndef ZORBA_NO_UNICODE
199 // the input string is UTF-8
200 int char_length = utf8::char_length(*chars);
201 if (char_length == 0)
202@@ -217,7 +216,6 @@
203
204 continue;
205 }
206-#endif//ZORBA_NO_UNICODE
207
208 // raise an error iff (1) the serialization format is XML 1.0 and (2) the given character is an invalid XML 1.0 character
209 if (ser && ser->method == PARAMETER_VALUE_XML &&
210@@ -332,14 +330,12 @@
211 {
212 tr << (char)0xEF << (char)0xBB << (char)0xBF;
213 }
214-#ifndef ZORBA_NO_UNICODE
215 else if (ser->encoding == PARAMETER_VALUE_UTF_16)
216 {
217 // Little-endian
218 tr.verbatim((char)0xFF);
219 tr.verbatim((char)0xFE);
220 }
221-#endif
222 }
223 }
224
225@@ -862,13 +858,17 @@
226 emitter::emit_declaration();
227
228 if (ser->omit_xml_declaration == PARAMETER_VALUE_NO) {
229- tr << "<?xml version=\"" << ser->version << "\" encoding=\"";
230- if (ser->encoding == PARAMETER_VALUE_UTF_8) {
231- tr << "UTF-8";
232-#ifndef ZORBA_NO_UNICODE
233- } else if (ser->encoding == PARAMETER_VALUE_UTF_16) {
234- tr << "UTF-16";
235-#endif
236+ tr << "<?xml version=\"" << ser->version;
237+ switch (ser->encoding) {
238+ case PARAMETER_VALUE_UTF_8:
239+ case PARAMETER_VALUE_UTF_16:
240+ tr << "\" encoding=\"";
241+ switch (ser->encoding) {
242+ case PARAMETER_VALUE_UTF_8 : tr << "UTF-8" ; break;
243+ case PARAMETER_VALUE_UTF_16: tr << "UTF-16"; break;
244+ default : ZORBA_ASSERT(false);
245+ }
246+ break;
247 }
248 tr << "\"";
249
250@@ -1174,14 +1174,18 @@
251 }
252
253 tr << "<meta http-equiv=\"content-type\" content=\""
254- << ser->media_type << "; charset=";
255-
256- if (ser->encoding == PARAMETER_VALUE_UTF_8)
257- tr << "UTF-8";
258-#ifndef ZORBA_NO_UNICODE
259- else if (ser->encoding == PARAMETER_VALUE_UTF_16)
260- tr << "UTF-16";
261-#endif
262+ << ser->media_type;
263+ switch (ser->encoding) {
264+ case PARAMETER_VALUE_UTF_8:
265+ case PARAMETER_VALUE_UTF_16:
266+ tr << "\" charset=\"";
267+ switch (ser->encoding) {
268+ case PARAMETER_VALUE_UTF_8 : tr << "UTF-8" ; break;
269+ case PARAMETER_VALUE_UTF_16: tr << "UTF-16"; break;
270+ default : ZORBA_ASSERT(false);
271+ }
272+ break;
273+ }
274 tr << "\"";
275 // closed_parent_tag = 1;
276 }
277@@ -1371,14 +1375,18 @@
278 }
279
280 tr << "<meta http-equiv=\"content-type\" content=\""
281- << ser->media_type << "; charset=";
282-
283- if (ser->encoding == PARAMETER_VALUE_UTF_8)
284- tr << "UTF-8";
285-#ifndef ZORBA_NO_UNICODE
286- else if (ser->encoding == PARAMETER_VALUE_UTF_16)
287- tr << "UTF-16";
288-#endif
289+ << ser->media_type;
290+ switch (ser->encoding) {
291+ case PARAMETER_VALUE_UTF_8:
292+ case PARAMETER_VALUE_UTF_16:
293+ tr << "\" charset=\"";
294+ switch (ser->encoding) {
295+ case PARAMETER_VALUE_UTF_8 : tr << "UTF-8" ; break;
296+ case PARAMETER_VALUE_UTF_16: tr << "UTF-16"; break;
297+ default : ZORBA_ASSERT(false);
298+ }
299+ break;
300+ }
301 tr << "\"/";
302 //closed_parent_tag = 1;
303 }
304@@ -2098,10 +2106,8 @@
305 {
306 if (!strcmp(aValue, "UTF-8"))
307 encoding = PARAMETER_VALUE_UTF_8;
308-#ifndef ZORBA_NO_UNICODE
309 else if (!strcmp(aValue, "UTF-16"))
310 encoding = PARAMETER_VALUE_UTF_16;
311-#endif
312 else
313 throw XQUERY_EXCEPTION(
314 err::SEPM0016, ERROR_PARAMS( aValue, aName, ZED( GoodValuesAreUTF8 ) )
315@@ -2210,16 +2216,13 @@
316 {
317 tr = new transcoder(os, false);
318 }
319-#ifndef ZORBA_NO_UNICODE
320 else if (encoding == PARAMETER_VALUE_UTF_16)
321 {
322 tr = new transcoder(os, true);
323 }
324-#endif
325 else
326 {
327- ZORBA_ASSERT(0);
328- return false;
329+ ZORBA_ASSERT(false);
330 }
331
332 if (method == PARAMETER_VALUE_XML)
333
334=== modified file 'src/api/serialization/serializer.h'
335--- src/api/serialization/serializer.h 2012-03-28 05:19:57 +0000
336+++ src/api/serialization/serializer.h 2012-04-13 19:45:38 +0000
337@@ -70,10 +70,8 @@
338 PARAMETER_VALUE_TEXT,
339 PARAMETER_VALUE_BINARY,
340
341- PARAMETER_VALUE_UTF_8
342-#ifndef ZORBA_NO_UNICODE
343- ,PARAMETER_VALUE_UTF_16
344-#endif
345+ PARAMETER_VALUE_UTF_8,
346+ PARAMETER_VALUE_UTF_16
347 } PARAMETER_VALUE_TYPE;
348
349 protected:
350
351=== modified file 'src/diagnostics/diagnostic_en.xml'
352--- src/diagnostics/diagnostic_en.xml 2012-04-10 13:10:22 +0000
353+++ src/diagnostics/diagnostic_en.xml 2012-04-13 19:45:38 +0000
354@@ -2517,11 +2517,11 @@
355 <value>attribute node</value>
356 </entry>
357
358- <entry key="BackRef0Illegal">
359+ <entry key="BackRef0Illegal" if="!defined(ZORBA_NO_ICU)">
360 <value>"0": illegal backreference</value>
361 </entry>
362
363- <entry key="BackRefIllegalInCharClass">
364+ <entry key="BackRefIllegalInCharClass" if="!defined(ZORBA_NO_ICU)">
365 <value>backreference illegal in character class</value>
366 </entry>
367
368@@ -2569,7 +2569,7 @@
369 <value>invalid library module</value>
370 </entry>
371
372- <entry key="BadRegexEscape_3">
373+ <entry key="BadRegexEscape_3" if="!defined(ZORBA_NO_ICU)">
374 <value>"$3": illegal escape character</value>
375 </entry>
376
377@@ -3029,7 +3029,7 @@
378 <value>nodeid component too big for encoding</value>
379 </entry>
380
381- <entry key="NonClosedBackRef_3">
382+ <entry key="NonClosedBackRef_3" if="!defined(ZORBA_NO_ICU)">
383 <value>'$$3': non-closed backreference</value>
384 </entry>
385
386@@ -3041,7 +3041,7 @@
387 <value>non-localhost authority</value>
388 </entry>
389
390- <entry key="NonexistentBackRef_3">
391+ <entry key="NonexistentBackRef_3" if="!defined(ZORBA_NO_ICU)">
392 <value>'$$3': non-existent backreference</value>
393 </entry>
394
395@@ -3193,94 +3193,183 @@
396 <value>item type is not a subtype of "$3"</value>
397 </entry>
398
399- <entry key="U_REGEX_BAD_ESCAPE_SEQUENCE" if="!defined(ZORBA_NO_UNICODE)">
400+ <entry key="U_REGEX_BAD_ESCAPE_SEQUENCE" if="!defined(ZORBA_NO_ICU)">
401 <value>unrecognized backslash escape sequence</value>
402 </entry>
403
404- <entry key="U_REGEX_BAD_INTERVAL" if="!defined(ZORBA_NO_UNICODE)">
405+ <entry key="U_REGEX_BAD_INTERVAL" if="!defined(ZORBA_NO_ICU)">
406 <value>error in {min,max} interval</value>
407 </entry>
408
409- <entry key="U_REGEX_INTERNAL_ERROR" if="!defined(ZORBA_NO_UNICODE)">
410+ <entry key="U_REGEX_INTERNAL_ERROR" if="!defined(ZORBA_NO_ICU)">
411 <value>an internal ICU error (bug) was detected</value>
412 </entry>
413
414- <entry key="U_REGEX_INVALID_BACK_REF" if="!defined(ZORBA_NO_UNICODE)">
415+ <entry key="U_REGEX_INVALID_BACK_REF" if="!defined(ZORBA_NO_ICU)">
416 <value>backreference to a non-existent capture group</value>
417 </entry>
418
419- <entry key="U_REGEX_INVALID_FLAG" if="!defined(ZORBA_NO_UNICODE)">
420+ <entry key="U_REGEX_INVALID_FLAG" if="!defined(ZORBA_NO_ICU)">
421 <value>invalid value for match mode flags</value>
422 </entry>
423
424- <entry key="U_REGEX_INVALID_RANGE" if="!defined(ZORBA_NO_UNICODE)">
425+ <entry key="U_REGEX_INVALID_RANGE" if="!defined(ZORBA_NO_ICU)">
426 <value>in character range [x-y], x is greater than y</value>
427 </entry>
428
429- <entry key="U_REGEX_INVALID_STATE" if="!defined(ZORBA_NO_UNICODE)">
430+ <entry key="U_REGEX_INVALID_STATE" if="!defined(ZORBA_NO_ICU)">
431 <value>RegexMatcher in invalid state for requested operation</value>
432 </entry>
433
434- <entry key="U_REGEX_LOOK_BEHIND_LIMIT" if="!defined(ZORBA_NO_UNICODE)">
435+ <entry key="U_REGEX_LOOK_BEHIND_LIMIT" if="!defined(ZORBA_NO_ICU)">
436 <value>look-behind pattern matches must have a bounded maximum length</value>
437 </entry>
438
439- <entry key="U_REGEX_MAX_LT_MIN" if="!defined(ZORBA_NO_UNICODE)">
440+ <entry key="U_REGEX_MAX_LT_MIN" if="!defined(ZORBA_NO_ICU)">
441 <value>in {min,max}, max is less than min</value>
442 </entry>
443
444- <entry key="U_REGEX_MISMATCHED_PAREN" if="!defined(ZORBA_NO_UNICODE)">
445+ <entry key="U_REGEX_MISMATCHED_PAREN" if="!defined(ZORBA_NO_ICU)">
446 <value>incorrectly nested parentheses</value>
447 </entry>
448
449- <entry key="U_REGEX_MISSING_CLOSE_BRACKET" if="!defined(ZORBA_NO_UNICODE)">
450+ <entry key="U_REGEX_MISSING_CLOSE_BRACKET" if="!defined(ZORBA_NO_ICU)">
451 <value>missing ']'</value>
452 </entry>
453
454- <entry key="U_REGEX_NUMBER_TOO_BIG" if="!defined(ZORBA_NO_UNICODE)">
455+ <entry key="U_REGEX_NUMBER_TOO_BIG" if="!defined(ZORBA_NO_ICU)">
456 <value>decimal number is too large</value>
457 </entry>
458
459- <entry key="U_REGEX_OCTAL_TOO_BIG" if="!defined(ZORBA_NO_UNICODE)">
460+ <entry key="U_REGEX_OCTAL_TOO_BIG" if="!defined(ZORBA_NO_ICU)">
461 <value>octal character constants must be &lt;= 0377</value>
462 </entry>
463
464- <entry key="U_REGEX_PROPERTY_SYNTAX" if="!defined(ZORBA_NO_UNICODE)">
465+ <entry key="U_REGEX_PROPERTY_SYNTAX" if="!defined(ZORBA_NO_ICU)">
466 <value>incorrect Unicode property</value>
467 </entry>
468
469- <entry key="U_REGEX_RULE_SYNTAX" if="!defined(ZORBA_NO_UNICODE)">
470+ <entry key="U_REGEX_RULE_SYNTAX" if="!defined(ZORBA_NO_ICU)">
471 <value>syntax error</value>
472 </entry>
473
474- <entry key="U_REGEX_SET_CONTAINS_STRING" if="!defined(ZORBA_NO_UNICODE)">
475+ <entry key="U_REGEX_SET_CONTAINS_STRING" if="!defined(ZORBA_NO_ICU)">
476 <value>can not have UnicodeSets containing strings</value>
477 </entry>
478
479- <entry key="U_REGEX_STACK_OVERFLOW" if="!defined(ZORBA_NO_UNICODE)">
480+ <entry key="U_REGEX_STACK_OVERFLOW" if="!defined(ZORBA_NO_ICU)">
481 <value>backtrack stack overflow</value>
482 </entry>
483
484- <entry key="U_REGEX_STOPPED_BY_CALLER" if="!defined(ZORBA_NO_UNICODE)">
485+ <entry key="U_REGEX_STOPPED_BY_CALLER" if="!defined(ZORBA_NO_ICU)">
486 <value>matching operation aborted by user callback fn</value>
487 </entry>
488
489- <entry key="U_REGEX_TIME_OUT" if="!defined(ZORBA_NO_UNICODE)">
490+ <entry key="U_REGEX_TIME_OUT" if="!defined(ZORBA_NO_ICU)">
491 <value>maximum allowed match time exceeded</value>
492 </entry>
493
494- <entry key="U_REGEX_UNIMPLEMENTED" if="!defined(ZORBA_NO_UNICODE)">
495- <value>use of regular expression feature that is not yet implemented</value>
496+ <entry key="U_REGEX_UNIMPLEMENTED" if="!defined(ZORBA_NO_ICU)">
497+ <value>use of regular expression feature that is not yet implemented</value>
498+ </entry>
499+
500+ <!-- Regex ASCII error messages (used when ZORBA_NO_ICU is defined) -->
501+ <entry key="REGEX_UNIMPLEMENTED" if="defined(ZORBA_NO_ICU)">
502+ <value>use of regular expression feature that is not yet implemented</value>
503+ </entry>
504+
505+ <entry key="REGEX_MISMATCHED_PAREN" if="defined(ZORBA_NO_ICU)">
506+ <value>incorrectly nested parentheses</value>
507+ </entry>
508+
509+ <entry key="REGEX_BROKEN_P_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
510+ <value>broken \\p construct</value>
511+ </entry>
512+
513+ <entry key="REGEX_UNKNOWN_PL_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
514+ <value>unknown \\p{L?} category; supported categories: L, Lu, Ll, Lt, Lm, Lo</value>
515+ </entry>
516+
517+ <entry key="REGEX_UNKNOWN_PM_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
518+ <value>unknown \\p{M?} category; supported categories: M, Mn, Mc, Me</value>
519+ </entry>
520+
521+ <entry key="REGEX_UNKNOWN_PN_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
522+ <value>unknown \\p{N?} category; supported categories: N, Nd, Nl, No</value>
523+ </entry>
524+
525+ <entry key="REGEX_UNKNOWN_PP_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
526+ <value>unknown \\p{P?} category; supported categories: P, Pc, Pd, Ps, Pe, Pi, Pf, Po</value>
527+ </entry>
528+
529+ <entry key="REGEX_UNKNOWN_PZ_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
530+ <value>unknown \\p{Z?} category; supported categories: Z, Zs, Zl, Zp</value>
531+ </entry>
532+
533+ <entry key="REGEX_UNKNOWN_PS_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
534+ <value>unknown \\p{S?} category; supported categories: S, Sm, Sc, Sk, So</value>
535+ </entry>
536+
537+ <entry key="REGEX_UNKNOWN_PC_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
538+ <value>unknown \\p{C?} category; supported categories: C, Cc, Cf, Co, Cn(for not assigned)</value>
539+ </entry>
540+
541+ <entry key="REGEX_BROKEN_PIs_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
542+ <value>broken \\p{Is} construct; valid characters are [a-zA-Z0-9-]</value>
543+ </entry>
544+
545+ <entry key="REGEX_UNKNOWN_PIs_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
546+ <value>unknown \\p{Is} category block; see supported block escapes here: http://www.w3.org/TR/xmlschema-2/#charcter-classes</value>
547+ </entry>
548+
549+ <entry key="REGEX_INVALID_UNICODE_CODEPOINT_u" if="defined(ZORBA_NO_ICU)">
550+ <value>invalid unicode hex, should be in form \\uXXXX or \\UXXXXXXXX</value>
551+ </entry>
552+
553+ <entry key="REGEX_UNKNOWN_ESC_CHAR" if="defined(ZORBA_NO_ICU)">
554+ <value>unknown \\? escape char; supported escapes are: \\[nrt\\|.?*+(){}[]-^$] for char escapes, \\[pP] for categories and \\[sSiIcCdDwW] for multichar groups</value>
555+ </entry>
556+
557+ <entry key="REGEX_INVALID_BACK_REF" if="defined(ZORBA_NO_ICU)">
558+ <value>\\$3 backreference to a non-existent capture group ($4 groups so far)</value>
559+ </entry>
560+
561+ <entry key="REGEX_INVALID_ATOM_CHAR" if="defined(ZORBA_NO_ICU)">
562+ <value>'$3': invalid character for an atom; forbidden characters are: [{}?*+|^]</value>
563+ </entry>
564+
565+ <entry key="REGEX_INVALID_SUBCLASS" if="defined(ZORBA_NO_ICU)">
566+ <value>malformed class subtraction</value>
567+ </entry>
568+
569+ <entry key="REGEX_INVALID_USE_OF_SUBCLASS" if="defined(ZORBA_NO_ICU)">
570+ <value>improper use of class subtraction: it must be the last construct in a class group [xxx-[yyy]]</value>
571+ </entry>
572+
573+ <entry key="REGEX_MULTICHAR_IN_CHAR_RANGE" if="defined(ZORBA_NO_ICU)">
574+ <value>multichars or char categories cannot be part of a char range</value>
575+ </entry>
576+
577+ <entry key="REGEX_MISSING_CLOSE_BRACKET" if="defined(ZORBA_NO_ICU)">
578+ <value>missing ']' in character group</value>
579+ </entry>
580+
581+ <entry key="REGEX_MAX_LT_MIN" if="defined(ZORBA_NO_ICU)">
582+ <value>in {min,max}, max is less than min</value>
583 </entry>
584
585 <entry key="UnaryArithOp">
586 <value>unary arithmetic operator</value>
587 </entry>
588
589- <entry key="UnbalancedChar_3">
590+ <entry key="UnbalancedChar_3" if="!defined(ZORBA_NO_ICU)">
591 <value>missing '$3'</value>
592 </entry>
593
594+ <entry key="UnescapedChar_3" if="!defined(ZORBA_NO_ICU)">
595+ <value>character '$3' must be escaped here</value>
596+ </entry>
597+
598 <entry key="UnexpectedElement">
599 <value>unexpected element</value>
600 </entry>
601
602=== modified file 'src/diagnostics/pregenerated/dict_en.cpp'
603--- src/diagnostics/pregenerated/dict_en.cpp 2012-04-10 13:10:22 +0000
604+++ src/diagnostics/pregenerated/dict_en.cpp 2012-04-13 19:45:38 +0000
605@@ -437,8 +437,12 @@
606 { "~AtomizationOfGroupByMakesMoreThanOneItem", "atomization of groupby variable produces more than one item" },
607 { "~AttributeName", "attribute name" },
608 { "~AttributeNode", "attribute node" },
609+#if !defined(ZORBA_NO_ICU)
610 { "~BackRef0Illegal", "\"0\": illegal backreference" },
611+#endif
612+#if !defined(ZORBA_NO_ICU)
613 { "~BackRefIllegalInCharClass", "backreference illegal in character class" },
614+#endif
615 { "~BadAnyURI", "invalid xs:anyURI" },
616 { "~BadArgTypeForFn_2o34o", "${\"2\": }invalid argument type for function $3()${: 4}" },
617 { "~BadCharAfter_34", "'$3': illegal character after '$4'" },
618@@ -451,7 +455,9 @@
619 { "~BadIterator", "invalid iterator" },
620 { "~BadLibraryModule", "invalid library module" },
621 { "~BadPath", "invalid path" },
622+#if !defined(ZORBA_NO_ICU)
623 { "~BadRegexEscape_3", "\"$3\": illegal escape character" },
624+#endif
625 { "~BadStreamState", "bad I/O stream state" },
626 { "~BadTokenInBraces_3", "\"$3\": illegal token within { }" },
627 { "~BadTraceStream", "trace stream not retrievable using SerializationCallback" },
628@@ -567,10 +573,14 @@
629 { "~NoUntypedKeyNodeValue_2", "node with untyped key value found during probe on index \"$2\"" },
630 { "~NodeIDNeedsBytes_2", "nodeid requires more than $2 bytes" },
631 { "~NodeIDTooBig", "nodeid component too big for encoding" },
632+#if !defined(ZORBA_NO_ICU)
633 { "~NonClosedBackRef_3", "'$$3': non-closed backreference" },
634+#endif
635 { "~NonFileThesaurusURI", "non-file thesaurus URI" },
636 { "~NonLocalhostAuthority", "non-localhost authority" },
637+#if !defined(ZORBA_NO_ICU)
638 { "~NonexistentBackRef_3", "'$$3': non-existent backreference" },
639+#endif
640 { "~NotAllowedForTypeName", "not allowed for typeName (use xsd:untyped instead)" },
641 { "~NotAmongInScopeSchemaTypes", "not among in-scope schema types" },
642 { "~NotDefInDynamicCtx", "not defined in dynamic context" },
643@@ -589,6 +599,69 @@
644 { "~ParserNoCreateTree", "XML tree creation failed" },
645 { "~PromotionImpossible", "promotion not possible" },
646 { "~QuotedColon_23", "\"$2\": $3" },
647+#if defined(ZORBA_NO_ICU)
648+ { "~REGEX_BROKEN_PIs_CONSTRUCT", "broken \\p{Is} construct; valid characters are [a-zA-Z0-9-]" },
649+#endif
650+#if defined(ZORBA_NO_ICU)
651+ { "~REGEX_BROKEN_P_CONSTRUCT", "broken \\p construct" },
652+#endif
653+#if defined(ZORBA_NO_ICU)
654+ { "~REGEX_INVALID_ATOM_CHAR", "'$3': invalid character for an atom; forbidden characters are: [{}?*+|^]" },
655+#endif
656+#if defined(ZORBA_NO_ICU)
657+ { "~REGEX_INVALID_BACK_REF", "\\$3 backreference to a non-existent capture group ($4 groups so far)" },
658+#endif
659+#if defined(ZORBA_NO_ICU)
660+ { "~REGEX_INVALID_SUBCLASS", "malformed class subtraction" },
661+#endif
662+#if defined(ZORBA_NO_ICU)
663+ { "~REGEX_INVALID_UNICODE_CODEPOINT_u", "invalid unicode hex, should be in form \\uXXXX or \\UXXXXXXXX" },
664+#endif
665+#if defined(ZORBA_NO_ICU)
666+ { "~REGEX_INVALID_USE_OF_SUBCLASS", "improper use of class subtraction: it must be the last construct in a class group [xxx-[yyy]]" },
667+#endif
668+#if defined(ZORBA_NO_ICU)
669+ { "~REGEX_MAX_LT_MIN", "in {min,max}, max is less than min" },
670+#endif
671+#if defined(ZORBA_NO_ICU)
672+ { "~REGEX_MISMATCHED_PAREN", "incorrectly nested parentheses" },
673+#endif
674+#if defined(ZORBA_NO_ICU)
675+ { "~REGEX_MISSING_CLOSE_BRACKET", "missing ']' in character group" },
676+#endif
677+#if defined(ZORBA_NO_ICU)
678+ { "~REGEX_MULTICHAR_IN_CHAR_RANGE", "multichars or char categories cannot be part of a char range" },
679+#endif
680+#if defined(ZORBA_NO_ICU)
681+ { "~REGEX_UNIMPLEMENTED", "use of regular expression feature that is not yet implemented" },
682+#endif
683+#if defined(ZORBA_NO_ICU)
684+ { "~REGEX_UNKNOWN_ESC_CHAR", "unknown \\? escape char; supported escapes are: \\[nrt\\|.?*+(){}[]-^$] for char escapes, \\[pP] for categories and \\[sSiIcCdDwW] for multichar groups" },
685+#endif
686+#if defined(ZORBA_NO_ICU)
687+ { "~REGEX_UNKNOWN_PC_CONSTRUCT", "unknown \\p{C?} category; supported categories: C, Cc, Cf, Co, Cn(for not assigned)" },
688+#endif
689+#if defined(ZORBA_NO_ICU)
690+ { "~REGEX_UNKNOWN_PIs_CONSTRUCT", "unknown \\p{Is} category block; see supported block escapes here: http://www.w3.org/TR/xmlschema-2/#charcter-classes" },
691+#endif
692+#if defined(ZORBA_NO_ICU)
693+ { "~REGEX_UNKNOWN_PL_CONSTRUCT", "unknown \\p{L?} category; supported categories: L, Lu, Ll, Lt, Lm, Lo" },
694+#endif
695+#if defined(ZORBA_NO_ICU)
696+ { "~REGEX_UNKNOWN_PM_CONSTRUCT", "unknown \\p{M?} category; supported categories: M, Mn, Mc, Me" },
697+#endif
698+#if defined(ZORBA_NO_ICU)
699+ { "~REGEX_UNKNOWN_PN_CONSTRUCT", "unknown \\p{N?} category; supported categories: N, Nd, Nl, No" },
700+#endif
701+#if defined(ZORBA_NO_ICU)
702+ { "~REGEX_UNKNOWN_PP_CONSTRUCT", "unknown \\p{P?} category; supported categories: P, Pc, Pd, Ps, Pe, Pi, Pf, Po" },
703+#endif
704+#if defined(ZORBA_NO_ICU)
705+ { "~REGEX_UNKNOWN_PS_CONSTRUCT", "unknown \\p{S?} category; supported categories: S, Sm, Sc, Sk, So" },
706+#endif
707+#if defined(ZORBA_NO_ICU)
708+ { "~REGEX_UNKNOWN_PZ_CONSTRUCT", "unknown \\p{Z?} category; supported categories: Z, Zs, Zl, Zp" },
709+#endif
710 { "~SEPM0009_Not10", "the version parameter has a value other than \"1.0\" and the doctype-system parameter is specified" },
711 { "~SEPM0009_NotOmit", "the standalone attribute has a value other than \"omit\"" },
712 { "~SchemaAttributeName", "schema-attribute name" },
713@@ -610,68 +683,73 @@
714 { "~TwoDecimalFormatsSameName_2", "\"$2\": two decimal formats with this name" },
715 { "~TwoDefaultDecimalFormats", "two default decimal formats" },
716 { "~TypeIsNotSubtype", "item type is not a subtype of \"$3\"" },
717-#if !defined(ZORBA_NO_UNICODE)
718+#if !defined(ZORBA_NO_ICU)
719 { "~U_REGEX_BAD_ESCAPE_SEQUENCE", "unrecognized backslash escape sequence" },
720 #endif
721-#if !defined(ZORBA_NO_UNICODE)
722+#if !defined(ZORBA_NO_ICU)
723 { "~U_REGEX_BAD_INTERVAL", "error in {min,max} interval" },
724 #endif
725-#if !defined(ZORBA_NO_UNICODE)
726+#if !defined(ZORBA_NO_ICU)
727 { "~U_REGEX_INTERNAL_ERROR", "an internal ICU error (bug) was detected" },
728 #endif
729-#if !defined(ZORBA_NO_UNICODE)
730+#if !defined(ZORBA_NO_ICU)
731 { "~U_REGEX_INVALID_BACK_REF", "backreference to a non-existent capture group" },
732 #endif
733-#if !defined(ZORBA_NO_UNICODE)
734+#if !defined(ZORBA_NO_ICU)
735 { "~U_REGEX_INVALID_FLAG", "invalid value for match mode flags" },
736 #endif
737-#if !defined(ZORBA_NO_UNICODE)
738+#if !defined(ZORBA_NO_ICU)
739 { "~U_REGEX_INVALID_RANGE", "in character range [x-y], x is greater than y" },
740 #endif
741-#if !defined(ZORBA_NO_UNICODE)
742+#if !defined(ZORBA_NO_ICU)
743 { "~U_REGEX_INVALID_STATE", "RegexMatcher in invalid state for requested operation" },
744 #endif
745-#if !defined(ZORBA_NO_UNICODE)
746+#if !defined(ZORBA_NO_ICU)
747 { "~U_REGEX_LOOK_BEHIND_LIMIT", "look-behind pattern matches must have a bounded maximum length" },
748 #endif
749-#if !defined(ZORBA_NO_UNICODE)
750+#if !defined(ZORBA_NO_ICU)
751 { "~U_REGEX_MAX_LT_MIN", "in {min,max}, max is less than min" },
752 #endif
753-#if !defined(ZORBA_NO_UNICODE)
754+#if !defined(ZORBA_NO_ICU)
755 { "~U_REGEX_MISMATCHED_PAREN", "incorrectly nested parentheses" },
756 #endif
757-#if !defined(ZORBA_NO_UNICODE)
758+#if !defined(ZORBA_NO_ICU)
759 { "~U_REGEX_MISSING_CLOSE_BRACKET", "missing ']'" },
760 #endif
761-#if !defined(ZORBA_NO_UNICODE)
762+#if !defined(ZORBA_NO_ICU)
763 { "~U_REGEX_NUMBER_TOO_BIG", "decimal number is too large" },
764 #endif
765-#if !defined(ZORBA_NO_UNICODE)
766+#if !defined(ZORBA_NO_ICU)
767 { "~U_REGEX_OCTAL_TOO_BIG", "octal character constants must be <= 0377" },
768 #endif
769-#if !defined(ZORBA_NO_UNICODE)
770+#if !defined(ZORBA_NO_ICU)
771 { "~U_REGEX_PROPERTY_SYNTAX", "incorrect Unicode property" },
772 #endif
773-#if !defined(ZORBA_NO_UNICODE)
774+#if !defined(ZORBA_NO_ICU)
775 { "~U_REGEX_RULE_SYNTAX", "syntax error" },
776 #endif
777-#if !defined(ZORBA_NO_UNICODE)
778+#if !defined(ZORBA_NO_ICU)
779 { "~U_REGEX_SET_CONTAINS_STRING", "can not have UnicodeSets containing strings" },
780 #endif
781-#if !defined(ZORBA_NO_UNICODE)
782+#if !defined(ZORBA_NO_ICU)
783 { "~U_REGEX_STACK_OVERFLOW", "backtrack stack overflow" },
784 #endif
785-#if !defined(ZORBA_NO_UNICODE)
786+#if !defined(ZORBA_NO_ICU)
787 { "~U_REGEX_STOPPED_BY_CALLER", "matching operation aborted by user callback fn" },
788 #endif
789-#if !defined(ZORBA_NO_UNICODE)
790+#if !defined(ZORBA_NO_ICU)
791 { "~U_REGEX_TIME_OUT", "maximum allowed match time exceeded" },
792 #endif
793-#if !defined(ZORBA_NO_UNICODE)
794+#if !defined(ZORBA_NO_ICU)
795 { "~U_REGEX_UNIMPLEMENTED", "use of regular expression feature that is not yet implemented" },
796 #endif
797 { "~UnaryArithOp", "unary arithmetic operator" },
798+#if !defined(ZORBA_NO_ICU)
799 { "~UnbalancedChar_3", "missing '$3'" },
800+#endif
801+#if !defined(ZORBA_NO_ICU)
802+ { "~UnescapedChar_3", "character '$3' must be escaped here" },
803+#endif
804 { "~UnexpectedElement", "unexpected element" },
805 { "~VarValMustBeSingleItem_2", "\"$2\": variable value must be single item" },
806 { "~Variable", "variable" },
807
808=== modified file 'src/precompiled/stdafx.h'
809--- src/precompiled/stdafx.h 2012-03-28 05:19:57 +0000
810+++ src/precompiled/stdafx.h 2012-04-13 19:45:38 +0000
811@@ -15,363 +15,81 @@
812
813 */
814
815-#if defined STDAFX
816-#include <iostream>
817-#include <stdexcept>
818-#include <cassert>
819-#include <cstring>
820-#include <memory>
821-
822-#include <sstream>
823-#include <xfwrap>
824-#include <xfwrap1>
825-#include <istream>
826-#include <cstdio>
827-#include <xxshared>
828-#include <crtdefs.h>
829-#include <map>
830-#include <set>
831-//#include <poppack.h>
832-//#include <xxtype_traits>
833-//#include <xxcallwrap>
834-
835-// #include <xxcallpmf>
836-// //#include <xxbind0>
837-// //#include <xxbind1>
838-// //#include <xxresult>
839-// #include <zorba/audit.h>
840-// #include "api/auditimpl.h"
841-// #include <zorba/audit.h>
842-
843- //#include "unicode/unistr.h"
844- #include "runtime/sequences/sequences.h"
845- #include "diagnostics/xquery_diagnostics.h"
846- #include "xercesc/util/xercesdefs.hpp"
847- #include "runtime/collections/collections.h"
848- #include "unicode/utypes.h"
849- #include "zorba/config.h"
850- #include "store/api/store.h"
851- #include "zorba/zorba.h"
852- #include "zorba/api_shared_types.h"
853- #include "compiler/parsetree/parsenodes.h"
854- #include "compiler/parser/parse_constants.h"
855- //#include "compiler/api/compilercb.h"
856- #include "zorbautils/checked_vector.h"
857- #include "compiler/parser/xquery_driver.h"
858- #include "util/sorter.h"
859- #include "compiler/xqueryx/xqueryx_to_xquery.h"
860-// #include "compiler/xqueryx/xqueryx_xslt.h"
861-//#include "compiler/parser/xquery_scanner.h"
862-//#include "compiler/parsetree/parsenode_base.h"
863-//#include "compiler/parsetree/parsenode_visitor.h"
864-// #include "runtime/core/flwor_iterator.h"
865-// #include "context/static_context.h"
866-// #include "zorbautils/fatal.h"
867-// #include "runtime/base/unarybase.h"
868-// #include "compiler/expression/expr_consts.h"
869-// #include "api/iterator_singleton.h"
870-// #include "runtime/visitors/printer_visitor_api.h"
871-// //#include "compiler/parsetree/parsenode_print_dot_visitor.h"
872-// //#include "compiler/parsetree/parsenode_print_dot_visitor.h"
873-// //#include "runtime/visitors/planiter_visitor_impl_code.h"
874-// //#include "runtime/visitors/planiter_visitor_impl_include.h"
875-// //#include "runtime/visitors/printer_visitor_impl.h"
876-// //#include "runtime/core/path.h"
877-// #include "compiler/expression/ft_expr.h"
878-// #include "compiler/expression/ftnode.h"
879-// #include "compiler/parser/query_loc.h"
880+#ifdef STDAFX
881+
882+ #include <fstream>
883+ #include <iostream>
884+ #include <stdexcept>
885+ #include <cassert>
886+ #include <cstring>
887+ #include <memory>
888+
889+ #include <sstream>
890+ #include <xfwrap>
891+ #include <xfwrap1>
892+ #include <istream>
893+ #include <cstdio>
894+ #include <xxshared>
895+ #include <crtdefs.h>
896+ #include <map>
897+ #include <set>
898+
899+ #include "runtime/sequences/sequences.h"
900+ #include "diagnostics/xquery_diagnostics.h"
901+ #include "xercesc/util/xercesdefs.hpp"
902+ #include "runtime/collections/collections.h"
903+ #include "unicode/utypes.h"
904+ #include "zorba/config.h"
905+ #include "store/api/store.h"
906+ #include "zorba/zorba.h"
907+ #include "zorba/api_shared_types.h"
908+ #include "compiler/parsetree/parsenodes.h"
909+ #include "compiler/parser/parse_constants.h"
910+ #include "zorbautils/checked_vector.h"
911+ #include "compiler/parser/xquery_driver.h"
912+ #include "util/sorter.h"
913+ #include "compiler/xqueryx/xqueryx_to_xquery.h"
914+ #include <zorba/store_manager.h>
915+ #include <zorba/xquery.h>
916+ #include <zorba/xquery_exception.h>
917 #include "util/cxx_util.h"
918-// #include "util/indent.h"
919-// #include "util/stl_util.h"
920-// #include "diagnostics/xquery_diagnostics.h"
921-// #include "zorbatypes/numconversions.h"
922+ #include "diagnostics/assert.h"
923+ #include "zorbatypes/mapm/m_apm_lc.h"
924+ #include "zorbatypes/datetime/parse.h"
925+ #include "zorbatypes/chartype.h"
926+ #include "zorbatypes/collation_manager.h"
927+ #include "zorbatypes/ft_token.h"
928+ #include "zorbatypes/m_apm.h"
929+ #include "zorbatypes/rclock.h"
930+ #include "zorbatypes/schema_types.h"
931+ #include "zorbatypes/timezone.h"
932+ #include "zorbatypes/transcoder.h"
933+ #include "zorbatypes/URI.h"
934+ #include "zorbatypes/xerces_xmlcharray.h"
935+ #include "zorbatypes/zorbatypes_decl.h"
936+ #include "zorbatypes/zstring.h"
937+ #include "zorbautils/condition.h"
938+ #include "zorbautils/hashfun.h"
939+ #include "zorbautils/hashmap.h"
940+ #include "zorbautils/hashmap_itemp.h"
941+ #include "zorbautils/hashmap_str_obj.h"
942+ #include "zorbautils/hashmap_zstring.h"
943+ #include "zorbautils/hashset.h"
944+ #include "zorbautils/hashset_itemh.h"
945+ #include "zorbautils/latch.h"
946+ #include "zorbautils/locale.h"
947+ #include "zorbautils/lock.h"
948+ #include "zorbautils/mutex.h"
949+ #include "zorbautils/runnable.h"
950+ #include "zorbautils/SAXParser.h"
951+ #include "zorbautils/stack.h"
952+ #include "zorbautils/string_util.h"
953+ #include "unit_tests/unit_test_list.h"
954+ #include "zorba/diagnostic_handler.h"
955+ #include "zorba/xquery_warning.h"
956+ #include "runtime/full_text/ftcontains_visitor.h"
957+ #include "store/api/ft_token_iterator.h"
958+ #include "store/naive/ft_token_store.h"
959
960-// #include "api/serialization/serializable.h"
961-// #include "api/serialization/serializer.h"
962-// #include "api/collectionimpl.h"
963-// #include "api/dynamiccontextimpl.h"
964-// #include "api/fileimpl.h"
965-// #include "api/functionimpl.h"
966-// #include "api/invoke_item_sequence.h"
967-// #include "api/itemfactoryimpl.h"
968-// #include "api/resultiteratorchainer.h"
969-// #include "api/resultiteratorimpl.h"
970-// #include "api/sax2impl.h"
971-// #include "api/serializerimpl.h"
972-// #include "api/staticcontextimpl.h"
973-// #include "api/storeiteratorimpl.h"
974-// #include "api/unmarshaller.h"
975-// #include "api/uri_resolver_wrappers.h"
976-// #include "api/vectoriterator.h"
977-// #include "api/xmldatamanagerimpl.h"
978-// //#include "api/xqueryimpl.h"
979-// #include "api/zorbaimpl.h"
980-// #include "capi/cdynamic_context.h"
981-// #include "capi/cexpression.h"
982-// #include "capi/cexternal_function.h"
983-// #include "capi/cimplementation.h"
984-// #include "capi/csequence.h"
985-// #include "capi/cstatic_context.h"
986-// #include "capi/error.h"
987-// #include "capi/external_module.h"
988-// #include "capi/single_item_sequence.h"
989-// #include "capi/user_item_sequence.h"
990-// #include "compiler/parser/flexlexer.h"
991-// #include "compiler/parser/ft_types.h"
992-// #include "compiler/parser/symbol_table.h"
993-// #include "compiler/parser/xqdoc_comment.h"
994-// #include "compiler/parsetree/parsenode_print_xml_visitor.h"
995-// #include "compiler/parsetree/parsenode_print_xqdoc_visitor.h"
996-// #include "compiler/parsetree/parsenode_print_xquery_visitor.h"
997-// #include "compiler/parsetree/parsenode_xqdoc_visitor.h"
998-// #include "compiler/translator/prolog_graph.h"
999-// #include "compiler/translator/translator.h"
1000-// #include "compiler/codegen/plan_visitor.h"
1001-// #include "compiler/expression/abstract_expr_visitor.h"
1002-// #include "compiler/expression/expr.h"
1003-// #include "compiler/expression/expr_annotations.h"
1004-// #include "compiler/expression/expr_base.h"
1005-// #include "compiler/expression/expr_classes.h"
1006-// #include "compiler/expression/expr_iter.h"
1007-// #include "compiler/expression/expr_utils.h"
1008-// #include "compiler/expression/expr_visitor.h"
1009-// #include "compiler/expression/flwor_expr.h"
1010-// //#include "compiler/expression/fo_expr.h"
1011-// #include "compiler/expression/ftnode_classes.h"
1012-// #include "compiler/expression/ftnode_visitor.h"
1013-// #include "compiler/expression/function_item_expr.h"
1014-// #include "compiler/expression/path_expr.h"
1015-// #include "compiler/expression/script_exprs.h"
1016-// #include "compiler/expression/update_exprs.h"
1017-// #include "compiler/expression/var_expr.h"
1018-// #include "compiler/rewriter/framework/rewriter.h"
1019-// #include "compiler/rewriter/framework/rewriter_context.h"
1020-// #include "compiler/rewriter/framework/rule_driver.h"
1021-// #include "compiler/rewriter/framework/sequential_rewriter.h"
1022-// #include "compiler/rewriter/rewriters/common_rewriter.h"
1023-// #include "compiler/rewriter/rewriters/default_optimizer.h"
1024-// #include "compiler/rewriter/rewriters/phase1_rewriter.h"
1025-// #include "compiler/rewriter/rules/ruleset.h"
1026-// #include "compiler/rewriter/rules/rule_base.h"
1027-// #include "compiler/rewriter/rules/type_rules.h"
1028-// #include "compiler/rewriter/tools/dataflow_annotations.h"
1029-// #include "compiler/rewriter/tools/expr_tools.h"
1030-// #include "compiler/rewriter/tools/udf_graph.h"
1031-// #include "compiler/xqddf/collection_decl.h"
1032-// #include "compiler/xqddf/value_ic.h"
1033-// #include "compiler/xqddf/value_index.h"
1034-// #include "compiler/semantic_annotations/annotations.h"
1035-// #include "compiler/semantic_annotations/annotation_holder.h"
1036-// #include "compiler/semantic_annotations/annotation_keys.h"
1037-// #include "compiler/api/compiler_api.h"
1038-// #include "compiler/api/compiler_api_impl.h"
1039-// #include "system/globalenv.h"
1040-// #include "system/properties.h"
1041-// #include "system/zorba_properties.h"
1042-// #include "context/decimal_format.h"
1043-// #include "context/default_uri_mappers.h"
1044-// #include "context/default_url_resolvers.h"
1045-// #include "context/dynamic_context.h"
1046-// #include "context/dynamic_loader.h"
1047-// #include "context/internal_uri_resolvers.h"
1048-// //#include "context/namespace_context.h"
1049-// #include "context/root_static_context.h"
1050-// #include "context/sctx_map_iterator.h"
1051-// #include "context/standard_uri_resolvers.h"
1052-// #include "context/static_context_consts.h"
1053-// #include "context/stemmer_wrappers.h"
1054-// #include "context/uri_resolver.h"
1055-// #include "context/uri_resolver_wrapper.h"
1056-#include "diagnostics/assert.h"
1057-// #include "diagnostics/diagnostic.h"
1058-// #include "diagnostics/dict.h"
1059-// #include "diagnostics/dict_impl.h"
1060-// #include "diagnostics/StackWalker.h"
1061-// #include "diagnostics/user_error.h"
1062-// #include "diagnostics/user_exception.h"
1063-// #include "diagnostics/xquery_exception.h"
1064-// #include "diagnostics/xquery_stack_trace.h"
1065-// #include "diagnostics/xquery_warning.h"
1066-// #include "diagnostics/zorba_exception.h"
1067-// //#include "functions/annotation.h"
1068-// #include "functions/external_function.h"
1069-// #include "functions/function.h"
1070-// #include "functions/function_consts.h"
1071-// #include "functions/function_impl.h"
1072-// #include "functions/func_accessors_impl.h"
1073-// #include "functions/func_apply.h"
1074-// #include "functions/func_arithmetic.h"
1075-// #include "functions/func_booleans_impl.h"
1076-// #include "functions/func_durations_dates_times_impl.h"
1077-// #include "functions/func_enclosed.h"
1078-// #include "functions/func_eval.h"
1079-// #include "functions/func_hoist.h"
1080-// #include "functions/func_index_ddl.h"
1081-// #include "functions/func_node_sort_distinct.h"
1082-// #include "functions/func_numerics_impl.h"
1083-// #include "functions/func_reflection.h"
1084-// #include "functions/func_sequences_impl.h"
1085-// #include "functions/func_var_decl.h"
1086-// #include "functions/library.h"
1087-// #include "functions/signature.h"
1088-// #include "functions/udf.h"
1089-// #include "runtime/full_text/thesauri/decode_base128.h"
1090-// #include "runtime/full_text/thesauri/encoded_list.h"
1091-// #include "runtime/full_text/thesauri/iso2788.h"
1092-// #include "runtime/full_text/thesauri/wn_db_segment.h"
1093-// #include "runtime/full_text/thesauri/wn_synset.h"
1094-// #include "runtime/full_text/thesauri/wn_thesaurus.h"
1095-// #include "runtime/full_text/thesauri/wn_types.h"
1096-// #include "runtime/full_text/thesauri/xqftts_relationship.h"
1097-// #include "runtime/full_text/thesauri/xqftts_thesaurus.h"
1098-// #include "runtime/full_text/ft_match.h"
1099-// #include "runtime/full_text/ft_query_item.h"
1100-// #include "runtime/full_text/ft_single_token_iterator.h"
1101-// #include "runtime/full_text/ft_stop_words_set.h"
1102-// #include "runtime/full_text/ft_thesaurus.h"
1103-// #include "runtime/full_text/ft_token_matcher.h"
1104-// #include "runtime/full_text/ft_token_seq_iterator.h"
1105-// #include "runtime/full_text/ft_token_span.h"
1106-// #include "runtime/full_text/ft_wildcard.h"
1107-// #include "runtime/full_text/full_text.h"
1108-// #include "runtime/full_text/apply.h"
1109-// #include "runtime/full_text/ft_util.h"
1110-// #include "runtime/collections/collections_base.h"
1111-// #include "runtime/core/apply_updates.h"
1112-// #include "runtime/core/arithmetic_impl.h"
1113-// #include "runtime/core/constructors.h"
1114-// #include "runtime/core/fncall_iterator.h"
1115-// #include "runtime/core/internal_operators.h"
1116-// #include "runtime/core/item_iterator.h"
1117-// #include "runtime/core/nodeid_iterators.h"
1118-// #include "runtime/core/path_iterators.h"
1119-// #include "runtime/core/sequencetypes.h"
1120-// #include "runtime/core/trycatch.h"
1121-// #include "runtime/core/var_iterators.h"
1122-// #include "runtime/numerics/NumericsImpl.h"
1123-// #include "runtime/booleans/BooleanImpl.h"
1124-// #include "runtime/base/binarybase.h"
1125-// #include "runtime/base/narybase.h"
1126-// #include "runtime/base/noarybase.h"
1127-// #include "runtime/base/plan_iterator.h"
1128-// #include "runtime/sequences/SequencesImpl.h"
1129-// #include "runtime/visitors/iterprinter.h"
1130-// #include "runtime/misc/materialize.h"
1131-// #include "runtime/scripting/scripting.h"
1132-// #include "types/schema/EventSchemaValidator.h"
1133-// #include "types/schema/LoadSchemaErrorHandler.h"
1134-// #include "types/schema/PrintSchema.h"
1135-// #include "types/schema/revalidateUtils.h"
1136-// #include "types/schema/schema.h"
1137-// #include "types/schema/SchemaValidatorFilter.h"
1138-// #include "types/schema/StrX.h"
1139-// #include "types/schema/validate.h"
1140-// #include "types/schema/ValidationEventHandler.h"
1141-// #include "types/schema/xercesIncludes.h"
1142-// #include "types/schema/XercesParseUtils.h"
1143-// #include "types/schema/XercSchemaValidator.h"
1144-// #include "types/casting.h"
1145-// #include "types/collation.h"
1146-// #include "types/node_test.h"
1147-// #include "types/root_typemanager.h"
1148-// #include "types/typeconstants.h"
1149-// #include "types/typeimpl.h"
1150-// #include "types/typemanager.h"
1151-// #include "types/typemanagerimpl.h"
1152-// #include "types/typeops.h"
1153-// #include "util/fx/fxarray.h"
1154-// #include "util/fx/fxcharheap.h"
1155-// #include "util/ascii_util.h"
1156-// #include "util/atomic_int.h"
1157-// #include "util/auto_vector.h"
1158-// #include "util/curl_util.h"
1159-// #include "util/dir.h"
1160-// #include "util/dynamic_bitset.h"
1161-// #include "util/empty.h"
1162-// #include "util/error_util.h"
1163-// #include "util/fs_util.h"
1164-// #include "util/hashmap.h"
1165-// //#include "util/hashmap32.h"
1166-// #include "util/less.h"
1167-// #include "util/mmap_file.h"
1168-// #include "util/nonatomic_int.h"
1169-// #include "util/omanip.h"
1170-// #include "util/oseparator.h"
1171-// #include "util/regex.h"
1172-// #include "util/singleton.h"
1173-// #include "util/string_util.h"
1174-// #include "util/threads.h"
1175-// #include "util/tokenbuf.h"
1176-// #include "util/tracer.h"
1177-// #include "util/triple.h"
1178-// #include "util/unicode_categories.h"
1179-// #include "util/unicode_util.h"
1180-// #include "util/uri_util.h"
1181-// #include "util/utf8_string.h"
1182-// #include "util/utf8_util.h"
1183-// #include "util/utf8_util_base.h"
1184-// #include "util/void_int.h"
1185-// #include "util/xml_util.h"
1186-// #include "zorbamisc/config/platform.h"
1187-// //#include "zorbaserialization/archiver.h"
1188-// #include "zorbaserialization/base64impl.h"
1189-// #include "zorbaserialization/bin_archiver.h"
1190-// //#include "zorbaserialization/class_serializer.h"
1191-// #include "zorbaserialization/mem_archiver.h"
1192-// #include "zorbaserialization/serialization_engine.h"
1193-// #include "zorbaserialization/template_serializer.h"
1194-// #include "zorbaserialization/xml_archiver.h"
1195-// #include "zorbaserialization/zorba_class_serializer.h"
1196- #include "zorbatypes/mapm/m_apm_lc.h"
1197- #include "zorbatypes/datetime/parse.h"
1198- //#include "zorbatypes/binary.h"
1199- #include "zorbatypes/chartype.h"
1200- #include "zorbatypes/collation_manager.h"
1201- //#include "zorbatypes/datetime.h"
1202- //#include "zorbatypes/decimal.h"
1203- //#include "zorbatypes/duration.h"
1204- //#include "zorbatypes/floatimpl.h"
1205- #include "zorbatypes/ft_token.h"
1206- //#include "zorbatypes/integer.h"
1207- #include "zorbatypes/libicu.h"
1208- #include "zorbatypes/m_apm.h"
1209- //#include "zorbatypes/rchandle.h"
1210- #include "zorbatypes/rclock.h"
1211- //#include "zorbatypes/regex_ascii.h"
1212- #include "zorbatypes/schema_types.h"
1213- #include "zorbatypes/timezone.h"
1214- #include "zorbatypes/transcoder.h"
1215- #include "zorbatypes/URI.h"
1216- #include "zorbatypes/xerces_xmlcharray.h"
1217- #include "zorbatypes/zorbatypes_decl.h"
1218- #include "zorbatypes/zstring.h"
1219- //#include "zorbautils/stemmer/sb_stemmer.h"
1220- #include "zorbautils/condition.h"
1221- #include "zorbautils/hashfun.h"
1222- #include "zorbautils/hashmap.h"
1223- #include "zorbautils/hashmap_itemp.h"
1224- #include "zorbautils/hashmap_str_obj.h"
1225- #include "zorbautils/hashmap_zstring.h"
1226- #include "zorbautils/hashset.h"
1227- #include "zorbautils/hashset_itemh.h"
1228- //#include "zorbautils/icu_tokenizer.h"
1229- #include "zorbautils/latch.h"
1230- #include "zorbautils/locale.h"
1231- #include "zorbautils/lock.h"
1232- #include "zorbautils/mutex.h"
1233- #include "zorbautils/runnable.h"
1234- #include "zorbautils/SAXParser.h"
1235- #include "zorbautils/stack.h"
1236-// #include "zorbautils/stemmer.h"
1237- #include "zorbautils/string_util.h"
1238- //#include "zorbautils/synchronous_logger.h"
1239- //#include "zorbautils/tokenizer.h"
1240- #include "unit_tests/unit_test_list.h"
1241- #include "zorba/diagnostic_handler.h"
1242- #include "zorba/xquery_warning.h"
1243- #include "runtime/full_text/ftcontains_visitor.h"
1244- #include "store/naive/naive_ft_token_iterator.h"
1245- #include "store/api/ft_token_iterator.h"
1246- #include "store/naive/ft_token_store.h"
1247 #endif
1248 /* vim:set et sw=2 ts=2: */
1249
1250=== modified file 'src/runtime/full_text/CMakeLists.txt'
1251--- src/runtime/full_text/CMakeLists.txt 2012-03-28 05:19:57 +0000
1252+++ src/runtime/full_text/CMakeLists.txt 2012-04-13 19:45:38 +0000
1253@@ -42,11 +42,11 @@
1254 default_tokenizer.cpp
1255 )
1256
1257-IF (ZORBA_NO_UNICODE)
1258+IF (ZORBA_NO_ICU)
1259 LIST(APPEND FULLTEXT_SRCS latin_tokenizer.cpp)
1260-ELSE (ZORBA_NO_UNICODE)
1261+ELSE (ZORBA_NO_ICU)
1262 LIST(APPEND FULLTEXT_SRCS icu_tokenizer.cpp)
1263-ENDIF (ZORBA_NO_UNICODE)
1264+ENDIF (ZORBA_NO_ICU)
1265
1266 ADD_SRC_SUBFOLDER(FULLTEXT_SRCS stemmer LIBSTEMMER_SRCS)
1267
1268
1269=== modified file 'src/runtime/full_text/default_tokenizer.cpp'
1270--- src/runtime/full_text/default_tokenizer.cpp 2012-03-28 05:19:57 +0000
1271+++ src/runtime/full_text/default_tokenizer.cpp 2012-04-13 19:45:38 +0000
1272@@ -19,22 +19,22 @@
1273 #include <zorba/config.h>
1274
1275 #include "default_tokenizer.h"
1276-#ifdef ZORBA_NO_UNICODE
1277+#ifdef ZORBA_NO_ICU
1278 # include "latin_tokenizer.h"
1279 #else
1280 # include "icu_tokenizer.h"
1281-#endif /* ZORBA_NO_UNICODE */
1282+#endif /* ZORBA_NO_ICU */
1283
1284 namespace zorba {
1285
1286 ///////////////////////////////////////////////////////////////////////////////
1287
1288 TokenizerProvider const& default_tokenizer_provider() {
1289-#ifdef ZORBA_NO_UNICODE
1290+#ifdef ZORBA_NO_ICU
1291 static LatinTokenizerProvider const instance;
1292 #else
1293 static ICU_TokenizerProvider const instance;
1294-#endif /* ZORBA_NO_UNICODE */
1295+#endif /* ZORBA_NO_ICU */
1296 return instance;
1297 };
1298
1299
1300=== modified file 'src/runtime/full_text/latin_tokenizer.cpp'
1301--- src/runtime/full_text/latin_tokenizer.cpp 2012-03-28 05:19:57 +0000
1302+++ src/runtime/full_text/latin_tokenizer.cpp 2012-04-13 19:45:38 +0000
1303@@ -18,8 +18,9 @@
1304 #include <functional>
1305
1306 #include <zorba/diagnostic_list.h>
1307-#include <zorba/xquery_exception.h>
1308-#include <zorba/zorba.h>
1309+
1310+#include "diagnostics/dict.h"
1311+#include "diagnostics/xquery_exception.h"
1312
1313 #include "latin_tokenizer.h"
1314
1315
1316=== modified file 'src/runtime/full_text/latin_tokenizer.h'
1317--- src/runtime/full_text/latin_tokenizer.h 2012-03-28 05:19:57 +0000
1318+++ src/runtime/full_text/latin_tokenizer.h 2012-04-13 19:45:38 +0000
1319@@ -14,12 +14,12 @@
1320 * limitations under the License.
1321 */
1322
1323-#ifndef ZORBA_WESTERN_TOKENIZER_H
1324-#define ZORBA_WESTERN_TOKENIZER_H
1325+#ifndef ZORBA_LATIN_TOKENIZER_H
1326+#define ZORBA_LATIN_TOKENIZER_H
1327
1328 #include <zorba/config.h>
1329
1330-#ifdef ZORBA_NO_FULL_TEXT
1331+#ifdef ZORBA_NO_ICU
1332
1333 #include <zorba/tokenizer.h>
1334 #include "zorbatypes/zstring.h"
1335@@ -38,8 +38,8 @@
1336
1337 // inherited
1338 void destroy() const;
1339- void tokenize( char const*, size_type, iso639_1::type, bool, Callback&,
1340- void* );
1341+ void tokenize( char const*, size_type, locale::iso639_1::type, bool,
1342+ Callback&, void* );
1343
1344 private:
1345 typedef zstring string_type;
1346@@ -64,13 +64,14 @@
1347 class LatinTokenizerProvider : public TokenizerProvider {
1348 public:
1349 // inherited
1350- Tokenizer::ptr getTokenizer( iso639_1::type, Tokenizer::Numbers& ) const;
1351+ Tokenizer::ptr getTokenizer( locale::iso639_1::type,
1352+ Tokenizer::Numbers& ) const;
1353 };
1354
1355 ///////////////////////////////////////////////////////////////////////////////
1356
1357 } // namespace zorba
1358
1359-#endif /* ZORBA_NO_FULL_TEXT */
1360-#endif /* ZORBA_WESTERN_TOKENIZER_H */
1361+#endif /* ZORBA_NO_ICU */
1362+#endif /* ZORBA_LATIN_TOKENIZER_H */
1363 /* vim:set et sw=2 ts=2: */
1364
1365=== modified file 'src/runtime/numerics/format_integer_impl.cpp'
1366--- src/runtime/numerics/format_integer_impl.cpp 2012-03-28 05:19:57 +0000
1367+++ src/runtime/numerics/format_integer_impl.cpp 2012-04-13 19:45:38 +0000
1368@@ -881,7 +881,7 @@
1369 utf8_result += (*valueit);
1370 }
1371 else
1372- utf8_result += (0x2080 + *valueit - '0');
1373+ utf8_result += (unicode::code_point)(0x2080 + *valueit - '0');
1374 }
1375 }
1376 else if((c0 == 0x2460) || //CIRCLED DIGIT ONE (1-20)
1377
1378=== modified file 'src/runtime/numerics/numerics_impl.cpp'
1379--- src/runtime/numerics/numerics_impl.cpp 2012-03-28 05:19:57 +0000
1380+++ src/runtime/numerics/numerics_impl.cpp 2012-04-13 19:45:38 +0000
1381@@ -462,7 +462,7 @@
1382 minus( "-" )
1383 {
1384 utf8_string<zstring> u_per_mille( per_mille );
1385- u_per_mille = 0x2030;
1386+ u_per_mille = (unicode::code_point)0x2030;
1387 }
1388
1389 void readFormat(const DecimalFormat_t& df_t)
1390
1391=== modified file 'src/runtime/strings/strings_impl.cpp'
1392--- src/runtime/strings/strings_impl.cpp 2012-03-28 05:19:57 +0000
1393+++ src/runtime/strings/strings_impl.cpp 2012-04-13 19:45:38 +0000
1394@@ -810,7 +810,9 @@
1395 zstring normForm;
1396 zstring resStr;
1397 unicode::normalization::type normType;
1398+#ifndef ZORBA_NO_ICU
1399 bool success;
1400+#endif /* ZORBA_NO_ICU */
1401
1402 PlanIteratorState* state;
1403 DEFAULT_STACK_INIT(PlanIteratorState, state, planState);
1404@@ -860,10 +862,10 @@
1405 }
1406
1407 item0->getStringValue2(resStr);
1408-#ifndef ZORBA_NO_UNICODE
1409+#ifndef ZORBA_NO_ICU
1410 success = utf8::normalize(resStr, normType, &resStr);
1411 ZORBA_ASSERT(success);
1412-#endif//#ifndef ZORBA_NO_UNICODE
1413+#endif//#ifndef ZORBA_NO_ICU
1414 STACK_PUSH(GENV_ITEMFACTORY->createString(result, resStr), state );
1415 }
1416 else
1417@@ -992,7 +994,7 @@
1418 trans_map[ *map_i ] = *trans_i;
1419
1420 for ( ; map_i != map_end; ++map_i )
1421- trans_map[ *map_i ] = ~0;
1422+ trans_map[ *map_i ] = static_cast<unicode::code_point>( ~0 );
1423 }
1424
1425 utf8_string<zstring> u_result_string( result_string );
1426@@ -1007,7 +1009,7 @@
1427 cp_map_type::const_iterator const found_i = trans_map.find( cp );
1428 if ( found_i != trans_map.end() ) {
1429 cp = found_i->second;
1430- if ( cp == ~0 )
1431+ if ( cp == static_cast<unicode::code_point>( ~0 ) )
1432 continue;
1433 }
1434 u_result_string += cp;
1435@@ -1795,16 +1797,33 @@
1436 int &utf8start,
1437 unsigned int &bytestart,
1438 int utf8end,
1439+ unsigned int byteend,
1440 zstring &out)
1441 {
1442+#ifndef ZORBA_NO_ICU
1443 utf8::size_type clen;
1444- while(utf8start < utf8end)
1445- {
1446- clen = utf8::char_length(*sin);
1447- out.append(sin, clen);
1448- utf8start++;
1449- bytestart += clen;
1450- sin += clen;
1451+ if(utf8end)
1452+ {
1453+ while(utf8start < utf8end)
1454+ {
1455+ clen = utf8::char_length(*sin);
1456+ if(clen == 0)
1457+ clen = 1;
1458+ out.append(sin, clen);
1459+ utf8start++;
1460+ bytestart += clen;
1461+ sin += clen;
1462+ }
1463+ }
1464+ else
1465+#endif
1466+ {
1467+ if(!utf8end)
1468+ utf8end = byteend;
1469+ out.append(sin, utf8end-bytestart);
1470+ sin += utf8end-bytestart;
1471+ utf8start = utf8end;
1472+ bytestart = utf8end;
1473 }
1474 }
1475
1476@@ -1812,6 +1831,7 @@
1477 int &match_end1,
1478 unsigned int &match_end1_bytes,
1479 int match_start2,
1480+ unsigned int match_start2_bytes,
1481 const char *&strin)
1482 {
1483 store::Item_t non_match_elem;
1484@@ -1833,7 +1853,7 @@
1485 // utf8_it++;
1486 // match_end1++;
1487 //}
1488- copyUtf8Chars(strin, match_end1, match_end1_bytes, match_start2, non_match_str);
1489+ copyUtf8Chars(strin, match_end1, match_end1_bytes, match_start2, match_start2_bytes, non_match_str);
1490 store::Item_t non_match_text_item;
1491 GENV_ITEMFACTORY->createTextNode(non_match_text_item, non_match_elem, non_match_str);
1492 }
1493@@ -1864,19 +1884,31 @@
1494 i--;
1495 break;
1496 }
1497+#ifndef ZORBA_NO_ICU
1498 match_startg = rx.get_match_start(i+1);
1499 if((match_startg < 0) && (gparent < 0))
1500 continue;
1501+#else
1502+ int temp_endg;
1503+ match_startg = -1;
1504+ temp_endg = -1;
1505+ if(!rx.get_match_start_end_bytes(i+1, &match_startg, &temp_endg) && (gparent < 0))
1506+ continue;
1507+#endif
1508 if(match_endgood < match_startg)
1509 {
1510 //add non-group match text
1511 zstring non_group_str;
1512
1513- copyUtf8Chars(sin, match_endgood, match_end1_bytes, match_startg, non_group_str);
1514+ copyUtf8Chars(sin, match_endgood, match_end1_bytes, match_startg, 0, non_group_str);
1515 store::Item_t non_group_text_item;
1516 GENV_ITEMFACTORY->createTextNode(non_group_text_item, parent.getp(), non_group_str);
1517 }
1518+#ifndef ZORBA_NO_ICU
1519 match_endg = rx.get_match_end(i+1);
1520+#else
1521+ match_endg = temp_endg;
1522+#endif
1523 //add group match text
1524 GENV_ITEMFACTORY->createQName(group_element_name,
1525 static_context::W3C_FN_NS, "fn", "group");
1526@@ -1907,7 +1939,7 @@
1527 }
1528 zstring group_str;
1529
1530- copyUtf8Chars(sin, match_startg, match_end1_bytes, match_endg, group_str);
1531+ copyUtf8Chars(sin, match_startg, match_end1_bytes, match_endg, 0, group_str);
1532 store::Item_t group_text_item;
1533 GENV_ITEMFACTORY->createTextNode(group_text_item, group_elem.getp(), group_str);
1534 }
1535@@ -1916,7 +1948,7 @@
1536 {
1537 zstring non_group_str;
1538
1539- copyUtf8Chars(sin, match_endgood, match_end1_bytes, match_end2, non_group_str);
1540+ copyUtf8Chars(sin, match_endgood, match_end1_bytes, match_end2, 0, non_group_str);
1541 store::Item_t non_group_text_item;
1542 GENV_ITEMFACTORY->createTextNode(non_group_text_item, parent, non_group_str);
1543 }
1544@@ -2144,8 +2176,14 @@
1545 reachedEnd = false;
1546 while(rx.find_next_match(&reachedEnd))
1547 {
1548- int match_start2 = rx.get_match_start();
1549- int match_end2 = rx.get_match_end();
1550+ int match_start2;
1551+ int match_end2;
1552+#ifndef ZORBA_NO_ICU
1553+ match_start2 = rx.get_match_start();
1554+ match_end2 = rx.get_match_end();
1555+#else
1556+ rx.get_match_start_end_bytes(0, &match_start2, &match_end2);
1557+#endif
1558 ZORBA_ASSERT(match_start2 >= 0);
1559
1560 if(is_input_stream && reachedEnd && !instream->eof())
1561@@ -2157,7 +2195,7 @@
1562 //construct the fn:non-match
1563 if(match_start2 > match_end1)
1564 {
1565- addNonMatchElement(result, match_end1, match_end1_bytes, match_start2, instr);
1566+ addNonMatchElement(result, match_end1, match_end1_bytes, match_start2, 0, instr);
1567 }
1568
1569 //construct the fn:match
1570@@ -2165,7 +2203,7 @@
1571 match_end1 = match_end2;
1572 }
1573
1574- if(is_input_stream && reachedEnd && !instream->eof())
1575+ if(is_input_stream && !instream->eof())
1576 {
1577 //load some more data, maybe the match will be different
1578 if(match_end1_bytes)
1579@@ -2213,7 +2251,7 @@
1580 else
1581 {
1582 if(match_end1_bytes < streambuf_read)
1583- addNonMatchElement(result, match_end1, match_end1_bytes, streambuf_read, instr);
1584+ addNonMatchElement(result, match_end1, match_end1_bytes, 0, streambuf_read, instr);
1585 if(is_input_stream && instream->eof())
1586 reachedEnd = true;
1587 }
1588
1589=== modified file 'src/store/api/store.h'
1590--- src/store/api/store.h 2012-04-10 20:59:34 +0000
1591+++ src/store/api/store.h 2012-04-13 19:45:38 +0000
1592@@ -16,7 +16,7 @@
1593 #ifndef ZORBA_STORE_STORE_H
1594 #define ZORBA_STORE_STORE_H
1595
1596-#include <zorba/config.h>
1597+#include "zorba/config.h"
1598 #include "zorbatypes/schema_types.h"
1599
1600 #include "store/api/shared_types.h"
1601
1602=== modified file 'src/store/naive/simple_store.h'
1603--- src/store/naive/simple_store.h 2012-03-28 23:58:23 +0000
1604+++ src/store/naive/simple_store.h 2012-04-13 19:45:38 +0000
1605@@ -16,7 +16,11 @@
1606 #ifndef ZORBA_SIMPLE_STORE
1607 #define ZORBA_SIMPLE_STORE
1608
1609-#include "store.h"
1610+#include "store/naive/store.h"
1611+
1612+#include "store/naive/node_factory.h"
1613+#include "store/naive/pul_primitive_factory.h"
1614+#include "store/naive/tree_id_generator.h"
1615
1616 namespace zorba {
1617 namespace simplestore {
1618@@ -72,7 +76,7 @@
1619
1620 NodeFactory* createNodeFactory() const;
1621
1622- void destroyNodeFactory(NodeFactory*) const;
1623+ void destroyNodeFactory(zorba::simplestore::NodeFactory*) const;
1624
1625 store::ItemFactory* createItemFactory() const;
1626
1627@@ -84,7 +88,7 @@
1628
1629 PULPrimitiveFactory* createPULFactory() const;
1630
1631- void destroyPULFactory(PULPrimitiveFactory*) const;
1632+ void destroyPULFactory(zorba::simplestore::PULPrimitiveFactory*) const;
1633
1634 CollectionSet* createCollectionSet() const;
1635
1636
1637=== modified file 'src/store/naive/store.h'
1638--- src/store/naive/store.h 2012-03-28 22:09:36 +0000
1639+++ src/store/naive/store.h 2012-04-13 19:45:38 +0000
1640@@ -16,10 +16,18 @@
1641 #ifndef ZORBA_SIMPLESTORE_STORE_H
1642 #define ZORBA_SIMPLESTORE_STORE_H
1643
1644+#include "store/api/store.h"
1645+
1646 #include "shared_types.h"
1647 #include "store_defs.h"
1648 #include "hashmap_nodep.h"
1649 #include "tree_id.h"
1650+#include "store/util/hashmap_stringbuf.h"
1651+#include "zorbautils/mutex.h"
1652+#include "zorbautils/lock.h"
1653+#include "zorbautils/hashmap.h"
1654+#include "zorbautils/hashmap_itemp.h"
1655+#include "zorbautils/hashmap_zstring_nonserializable.h"
1656
1657 #if (defined (WIN32) || defined (WINCE))
1658 #include "node_items.h"
1659@@ -28,14 +36,7 @@
1660 #include "store/api/ic.h"
1661 #endif
1662
1663-#include "store/api/store.h"
1664-
1665-#include "store/util/hashmap_stringbuf.h"
1666-
1667-#include "zorbautils/mutex.h"
1668-#include "zorbautils/lock.h"
1669-#include "zorbautils/hashmap_itemp.h"
1670-#include "zorbautils/hashmap_zstring_nonserializable.h"
1671+using namespace zorba;
1672
1673 namespace zorba
1674 {
1675@@ -63,9 +64,9 @@
1676 class TreeIdGeneratorFactory;
1677 class TreeIdGenerator;
1678
1679-typedef zorba::HashMapZString<XmlNode_t> DocumentSet;
1680-typedef ItemPointerHashMap<store::Index_t> IndexSet;
1681-typedef ItemPointerHashMap<store::IC_t> ICSet;
1682+typedef HashMapZString<XmlNode_t> DocumentSet;
1683+typedef zorba::ItemPointerHashMap<store::Index_t> IndexSet;
1684+typedef zorba::ItemPointerHashMap<store::IC_t> ICSet;
1685
1686
1687
1688
1689=== modified file 'src/system/globalenv.cpp'
1690--- src/system/globalenv.cpp 2012-04-12 09:21:02 +0000
1691+++ src/system/globalenv.cpp 2012-04-13 19:45:38 +0000
1692@@ -17,11 +17,11 @@
1693
1694 #include "common/common.h"
1695
1696-#ifndef ZORBA_NO_UNICODE
1697+#ifndef ZORBA_NO_ICU
1698 # include <unicode/uclean.h>
1699 # include <unicode/utypes.h>
1700 # include <unicode/udata.h>
1701-#endif /* ZORBA_NO_UNICODE */
1702+#endif /* ZORBA_NO_ICU */
1703
1704 #ifdef ZORBA_WITH_BIG_INTEGER
1705 # include "zorbatypes/m_apm.h"
1706@@ -206,7 +206,7 @@
1707 // from one thread only
1708 // see http://www.icu-project.org/userguide/design.html#Init_and_Termination
1709 // and http://www.icu-project.org/apiref/icu4c/uclean_8h.html
1710-#ifndef ZORBA_NO_UNICODE
1711+#ifndef ZORBA_NO_ICU
1712 # if defined U_STATIC_IMPLEMENTATION && (defined WIN32 || defined WINCE)
1713 {
1714 TCHAR self_path[1024];
1715@@ -236,13 +236,13 @@
1716 udata_setCommonData(icu_appdata, &data_err);
1717 ZORBA_ASSERT(data_err == U_ZERO_ERROR);
1718
1719- // u_setDataDirectory(self_path);
1720+ // u_setDataDirectory(self_path);
1721 }
1722 # endif
1723 UErrorCode lICUInitStatus = U_ZERO_ERROR;
1724 u_init(&lICUInitStatus);
1725 ZORBA_ASSERT(lICUInitStatus == U_ZERO_ERROR);
1726-#endif//ifndef ZORBA_NO_UNICODE
1727+#endif /* ZORBA_NO_ICU */
1728 }
1729
1730
1731@@ -254,12 +254,12 @@
1732 // releases statically initialized memory and prevents
1733 // valgrind from reporting those problems at the end
1734 // see http://www.icu-project.org/apiref/icu4c/uclean_8h.html#93f27d0ddc7c196a1da864763f2d8920
1735-#ifndef ZORBA_NO_UNICODE
1736+#ifndef ZORBA_NO_ICU
1737 u_cleanup();
1738 # if defined U_STATIC_IMPLEMENTATION && (defined WIN32 || defined WINCE)
1739 delete[] icu_appdata;
1740 # endif
1741-#endif//ifndef ZORBA_NO_UNICODE
1742+#endif /* ZORBA_NO_ICU */
1743 }
1744
1745
1746
1747=== modified file 'src/unit_tests/CMakeLists.txt'
1748--- src/unit_tests/CMakeLists.txt 2012-03-28 05:19:57 +0000
1749+++ src/unit_tests/CMakeLists.txt 2012-04-13 19:45:38 +0000
1750@@ -29,9 +29,9 @@
1751 tokenizer.cpp)
1752 ENDIF (NOT ZORBA_NO_FULL_TEXT)
1753
1754-IF (NOT ZORBA_NO_UNICODE)
1755+IF (NOT ZORBA_NO_ICU)
1756 LIST (APPEND UNIT_TEST_SRCS
1757 test_icu_streambuf.cpp)
1758-ENDIF (NOT ZORBA_NO_UNICODE)
1759+ENDIF (NOT ZORBA_NO_ICU)
1760
1761 # vim:set et sw=2 tw=2:
1762
1763=== modified file 'src/unit_tests/string.cpp'
1764--- src/unit_tests/string.cpp 2012-03-28 05:19:57 +0000
1765+++ src/unit_tests/string.cpp 2012-04-13 19:45:38 +0000
1766@@ -569,6 +569,7 @@
1767 ASSERT_TRUE( t == s );
1768 }
1769
1770+#ifndef ZORBA_NO_ICU
1771 template<class StringType>
1772 static void test_to_string_from_wchar_t() {
1773 wchar_t const w[] = L"hello";
1774@@ -578,6 +579,7 @@
1775 for ( string::size_type i = 0; i < s.length(); ++i )
1776 ASSERT_TRUE( s[i] == w[i] );
1777 }
1778+#endif /* ZORBA_NO_ICU */
1779
1780 template<class StringType>
1781 static void test_to_upper() {
1782@@ -605,6 +607,7 @@
1783 }
1784 }
1785
1786+#ifndef ZORBA_NO_ICU
1787 static void test_to_wchar_t() {
1788 string const s = "hello";
1789 wchar_t *w;
1790@@ -616,6 +619,7 @@
1791 ASSERT_TRUE( w[i] == s[i] );
1792 delete[] w;
1793 }
1794+#endif /* ZORBA_NO_ICU */
1795
1796 static void test_trim_start() {
1797 char const *s;
1798@@ -873,16 +877,20 @@
1799 test_to_string_from_utf8<zstring>();
1800 test_to_string_from_utf8<zstring_p>();
1801
1802+#ifndef ZORBA_NO_ICU
1803 test_to_string_from_wchar_t<string>();
1804 test_to_string_from_wchar_t<zstring>();
1805 test_to_string_from_wchar_t<zstring_p>();
1806+#endif /* ZORBA_NO_ICU */
1807
1808 test_to_upper<string>();
1809 test_to_upper<zstring>();
1810 test_to_upper<zstring_p>();
1811 test_to_upper<String>();
1812
1813+#ifndef ZORBA_NO_ICU
1814 test_to_wchar_t();
1815+#endif /* ZORBA_NO_ICU */
1816
1817 test_trim_start();
1818 test_trim_end();
1819
1820=== modified file 'src/unit_tests/unit_test_list.h'
1821--- src/unit_tests/unit_test_list.h 2012-03-28 05:19:57 +0000
1822+++ src/unit_tests/unit_test_list.h 2012-04-13 19:45:38 +0000
1823@@ -36,9 +36,9 @@
1824 /**
1825 * ADD NEW UNIT TESTS HERE
1826 */
1827-#ifndef ZORBA_NO_UNICODE
1828+#ifndef ZORBA_NO_ICU
1829 int test_icu_streambuf( int, char*[] );
1830-#endif /* ZORBA_NO_UNICODE */
1831+#endif /* ZORBA_NO_ICU */
1832 int json_parser( int, char*[] );
1833
1834 void initializeTestList();
1835
1836=== modified file 'src/unit_tests/unit_tests.cpp'
1837--- src/unit_tests/unit_tests.cpp 2012-03-28 05:19:57 +0000
1838+++ src/unit_tests/unit_tests.cpp 2012-04-13 19:45:38 +0000
1839@@ -39,9 +39,9 @@
1840 void initializeTestList() {
1841 libunittests["string"] = test_string;
1842 libunittests["uri"] = runUriTest;
1843-#ifndef ZORBA_NO_UNICODE
1844+#ifndef ZORBA_NO_ICU
1845 libunittests["icu_streambuf"] = test_icu_streambuf;
1846-#endif /* ZORBA_NO_UNICODE */
1847+#endif /* ZORBA_NO_ICU */
1848 libunittests["json_parser"] = json_parser;
1849 libunittests["unique_ptr"] = test_unique_ptr;
1850 #ifndef ZORBA_NO_FULL_TEXT
1851
1852=== modified file 'src/util/CMakeLists.txt'
1853--- src/util/CMakeLists.txt 2012-03-28 05:19:57 +0000
1854+++ src/util/CMakeLists.txt 2012-04-13 19:45:38 +0000
1855@@ -40,14 +40,14 @@
1856 LIST(APPEND UTIL_SRCS mmap_file.cpp)
1857 ENDIF(ZORBA_WITH_FILE_ACCESS)
1858
1859-IF(ZORBA_NO_UNICODE)
1860+IF(ZORBA_NO_ICU)
1861 LIST(APPEND UTIL_SRCS
1862- regex_ascii.cpp
1863+ regex_xquery.cpp
1864 passthru_streambuf.cpp)
1865-ELSE(ZORBA_NO_UNICODE)
1866+ELSE(ZORBA_NO_ICU)
1867 LIST(APPEND UTIL_SRCS
1868 icu_streambuf.cpp)
1869-ENDIF(ZORBA_NO_UNICODE)
1870+ENDIF(ZORBA_NO_ICU)
1871
1872 HEADER_GROUP_SUBFOLDER(UTIL_SRCS fx)
1873 HEADER_GROUP_SUBFOLDER(UTIL_SRCS win32)
1874
1875=== modified file 'src/util/icu_streambuf.h'
1876--- src/util/icu_streambuf.h 2012-02-04 01:26:18 +0000
1877+++ src/util/icu_streambuf.h 2012-04-13 19:45:38 +0000
1878@@ -17,6 +17,7 @@
1879 #ifndef ZORBA_ICU_STREAMBUF_H
1880 #define ZORBA_ICU_STREAMBUF_H
1881
1882+#include <unicode/ucnv.h>
1883 #include <zorba/transcode_stream.h>
1884
1885 #include "util/utf8_util.h"
1886
1887=== modified file 'src/util/passthru_streambuf.cpp'
1888--- src/util/passthru_streambuf.cpp 2012-02-04 01:26:18 +0000
1889+++ src/util/passthru_streambuf.cpp 2012-04-13 19:45:38 +0000
1890@@ -14,8 +14,8 @@
1891 * limitations under the License.
1892 */
1893
1894+#include "stdafx.h"
1895 #include "passthru_streambuf.h"
1896-
1897 using namespace std;
1898
1899 namespace zorba {
1900@@ -47,7 +47,7 @@
1901 }
1902
1903 bool passthru_streambuf::is_supported( char const *cc_charset ) {
1904- return !is_necessary( charset );
1905+ return !is_necessary( cc_charset );
1906 }
1907
1908 passthru_streambuf::pos_type
1909
1910=== modified file 'src/util/passthru_streambuf.h'
1911--- src/util/passthru_streambuf.h 2012-02-02 18:37:24 +0000
1912+++ src/util/passthru_streambuf.h 2012-04-13 19:45:38 +0000
1913@@ -17,8 +17,9 @@
1914 #ifndef ZORBA_PASSTHRU_STREAMBUF_H
1915 #define ZORBA_PASSTHRU_STREAMBUF_H
1916
1917-#include <zorba/transcode_streambuf.h>
1918-
1919+#include <zorba/transcode_stream.h>
1920+#include "zorbatypes/zstring.h"
1921+#include "util/ascii_util.h"
1922 namespace zorba {
1923
1924 ///////////////////////////////////////////////////////////////////////////////
1925@@ -48,6 +49,13 @@
1926 * @return \c true only if the character encoding is supported.
1927 */
1928 static bool is_supported( char const *charset );
1929+ static bool is_necessary( char const *cc_charset );
1930+
1931+ typedef std::streambuf::char_type char_type;
1932+ typedef std::streambuf::int_type int_type;
1933+ typedef std::streambuf::off_type off_type;
1934+ typedef std::streambuf::pos_type pos_type;
1935+ typedef std::streambuf::traits_type traits_type;
1936
1937 protected:
1938 void imbue( std::locale const& );
1939
1940=== modified file 'src/util/regex.cpp'
1941--- src/util/regex.cpp 2012-03-28 05:19:57 +0000
1942+++ src/util/regex.cpp 2012-04-13 19:45:38 +0000
1943@@ -15,8 +15,6 @@
1944 */
1945 #include "stdafx.h"
1946
1947-#include "regex.h"
1948-
1949 #include <cstring>
1950 #include <vector>
1951
1952@@ -28,13 +26,13 @@
1953
1954 #include "ascii_util.h"
1955 #include "cxx_util.h"
1956+#include "regex.h"
1957 #include "stl_util.h"
1958
1959 #define INVALID_RE_EXCEPTION(...) \
1960 XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS( __VA_ARGS__ ) )
1961
1962-
1963-#ifndef ZORBA_NO_UNICODE
1964+#ifndef ZORBA_NO_ICU
1965 # include <unicode/uversion.h>
1966 U_NAMESPACE_USE
1967
1968@@ -103,6 +101,7 @@
1969
1970 bool got_backslash = false;
1971 bool in_char_class = false; // within [...]
1972+ bool is_first_char = true; // to check ^ placement
1973
1974 bool in_backref = false; // '\'[1-9][0-9]*
1975 unsigned backref_no = 0; // 1-based
1976@@ -231,6 +230,8 @@
1977 ++open_cap_subs;
1978 cap_sub.push_back( true );
1979 cur_cap_sub = cap_sub.size();
1980+ is_first_char = true;
1981+ goto append;
1982 }
1983 break;
1984 case ')':
1985@@ -245,8 +246,10 @@
1986 case '[':
1987 if ( q_flag )
1988 *icu_re += '\\';
1989- else
1990+ else {
1991 in_char_class = true;
1992+ goto append;
1993+ }
1994 break;
1995 case ']':
1996 if ( q_flag )
1997@@ -254,6 +257,19 @@
1998 else
1999 in_char_class = false;
2000 break;
2001+ case '^':
2002+ if ( q_flag )
2003+ *icu_re += '\\';
2004+ else if ( !is_first_char && !in_char_class )
2005+ throw INVALID_RE_EXCEPTION( xq_re, ZED( UnescapedChar_3 ), *xq_c );
2006+ break;
2007+ case '|':
2008+ if ( q_flag )
2009+ *icu_re += '\\';
2010+ else {
2011+ is_first_char = true;
2012+ goto append;
2013+ }
2014 default:
2015 if ( x_flag && ascii::is_space( *xq_c ) ) {
2016 if ( !in_char_class )
2017@@ -265,37 +281,42 @@
2018 //
2019 *icu_re += '\\';
2020 }
2021- }
2022- }
2023+ } // switch
2024+ } // else
2025+ is_first_char = false;
2026+append:
2027 *icu_re += *xq_c;
2028 } // FOR_EACH
2029
2030- if ( i_flag ) {
2031- //
2032- // XQuery 3.0 F&O 5.6.1.1: All other constructs are unaffected by the "i"
2033- // flag. For example, "\p{Lu}" continues to match upper-case letters only.
2034- //
2035- // However, ICU lower-cases everything for the 'i' flag; hence we have to
2036- // turn off the 'i' flag for just the \p{Lu}.
2037- //
2038- // Note that the "6" and "12" below are correct since "\\" represents a
2039- // single '\'.
2040- //
2041- ascii::replace_all( *icu_re, "\\p{Lu}", 6, "(?-i:\\p{Lu})", 12 );
2042- }
2043+ if ( !q_flag ) {
2044+ if ( i_flag ) {
2045+ //
2046+ // XQuery 3.0 F&O 5.6.1.1: All other constructs are unaffected by the "i"
2047+ // flag. For example, "\p{Lu}" continues to match upper-case letters
2048+ // only.
2049+ //
2050+ // However, ICU lower-cases everything for the 'i' flag; hence we have to
2051+ // turn off the 'i' flag for just the \p{Lu}.
2052+ //
2053+ // Note that the "6" and "12" below are correct since "\\" represents a
2054+ // single '\'.
2055+ //
2056+ ascii::replace_all( *icu_re, "\\p{Lu}", 6, "(?-i:\\p{Lu})", 12 );
2057+ }
2058
2059- //
2060- // XML Schema Part 2 F.1.1: [Unicode Database] groups code points into a
2061- // number of blocks such as Basic Latin (i.e., ASCII), Latin-1 Supplement,
2062- // Hangul Jamo, CJK Compatibility, etc. The set containing all characters
2063- // that have block name X (with all white space stripped out), can be
2064- // identified with a block escape \p{IsX}.
2065- //
2066- // However, ICU uses \p{InX} rather than \p{IsX}.
2067- //
2068- // Note that the "5" below is correct since "\\" represents a single '\'.
2069- //
2070- ascii::replace_all( *icu_re, "\\p{Is", 5, "\\p{In", 5 );
2071+ //
2072+ // XML Schema Part 2 F.1.1: [Unicode Database] groups code points into a
2073+ // number of blocks such as Basic Latin (i.e., ASCII), Latin-1 Supplement,
2074+ // Hangul Jamo, CJK Compatibility, etc. The set containing all characters
2075+ // that have block name X (with all white space stripped out), can be
2076+ // identified with a block escape \p{IsX}.
2077+ //
2078+ // However, ICU uses \p{InX} rather than \p{IsX}.
2079+ //
2080+ // Note that the "5" below is correct since "\\" represents a single '\'.
2081+ //
2082+ ascii::replace_all( *icu_re, "\\p{Is", 5, "\\p{In", 5 );
2083+ } // q_flag
2084 }
2085
2086 ///////////////////////////////////////////////////////////////////////////////
2087@@ -442,11 +463,11 @@
2088 }
2089
2090 } // namespace unicode
2091-
2092-}//namespace zorba
2093-
2094-
2095-#else /* ZORBA_NO_UNICODE */
2096+} // namespace zorba
2097+
2098+///////////////////////////////////////////////////////////////////////////////
2099+
2100+#else /* ZORBA_NO_ICU */
2101
2102 #include "zorbatypes/zstring.h"
2103
2104@@ -470,7 +491,7 @@
2105 case 'i': flags |= REGEX_ASCII_CASE_INSENSITIVE; break;
2106 case 's': flags |= REGEX_ASCII_DOTALL; break;
2107 case 'm': flags |= REGEX_ASCII_MULTILINE; break;
2108- case 'x': flags |= REGEX_ASCII_COMMENTS; break;
2109+ case 'x': flags |= REGEX_ASCII_NO_WHITESPACE; break;
2110 case 'q': flags |= REGEX_ASCII_LITERAL; break;
2111 default:
2112 throw XQUERY_EXCEPTION( err::FORX0001, ERROR_PARAMS( *p ) );
2113@@ -483,6 +504,7 @@
2114 void regex::compile( char const *pattern, char const *flags)
2115 {
2116 parsed_flags = parse_regex_flags(flags);
2117+ regex_xquery::CRegexXQuery_parser regex_parser;
2118 regex_matcher = regex_parser.parse(pattern, parsed_flags);
2119 if(!regex_matcher)
2120 throw INVALID_RE_EXCEPTION(pattern);
2121@@ -517,6 +539,8 @@
2122 bool regex::next_token( char const *s, size_type *pos, zstring *token,
2123 bool *matched)
2124 {
2125+ if(!s[*pos])
2126+ return false;
2127 bool retval;
2128 int match_pos;
2129 int matched_len;
2130@@ -528,14 +552,8 @@
2131 token->assign(s+*pos, match_pos);
2132 *pos += match_pos + matched_len;
2133 if(matched)
2134- if(match_pos)
2135- *matched = true;
2136- else
2137- *matched = false;
2138- if(match_pos)
2139- return true;
2140- else
2141- return false;
2142+ *matched = true;
2143+ return true;
2144 }
2145 else
2146 {
2147@@ -544,7 +562,7 @@
2148 *pos += strlen(s+*pos);
2149 if(matched)
2150 *matched = false;
2151- return s[*pos] != 0;
2152+ return true;
2153 }
2154 }
2155
2156@@ -554,13 +572,9 @@
2157 int matched_pos;
2158 int matched_len;
2159
2160- bool prev_align = regex_matcher->set_align_begin(true);
2161- retval = regex_matcher->match_from(s, parsed_flags, &matched_pos, &matched_len);
2162- regex_matcher->set_align_begin(prev_align);
2163+ retval = regex_matcher->match_anywhere(s, parsed_flags|REGEX_ASCII_WHOLE_MATCH, &matched_pos, &matched_len);
2164 if(!retval)
2165 return false;
2166- if(matched_len != strlen(s))
2167- return false;
2168 return true;
2169 }
2170
2171@@ -587,14 +601,19 @@
2172 //look for dollars
2173 if(*temprepl == '\\')
2174 {
2175- temprepl++;
2176- if(!*temprepl || (*temprepl != '\\') || (*temprepl != '$'))//Invalid replacement string.
2177- throw XQUERY_EXCEPTION( err::FORX0004, ERROR_PARAMS( replacement ) );
2178+ if(!(parsed_flags & REGEX_ASCII_LITERAL))
2179+ {
2180+ temprepl++;
2181+ if(!*temprepl)
2182+ temprepl--;
2183+ else if((*temprepl != '\\') && (*temprepl != '$'))//Invalid replacement string.
2184+ throw XQUERY_EXCEPTION( err::FORX0004, ERROR_PARAMS( replacement ) );
2185+ }
2186 result->append(1, *temprepl);
2187 temprepl++;
2188 continue;
2189 }
2190- if(*temprepl == '$')
2191+ if((*temprepl == '$') && !(parsed_flags & REGEX_ASCII_LITERAL))
2192 {
2193 temprepl++;
2194 index = 0;
2195@@ -648,7 +667,7 @@
2196 if(retval)
2197 {
2198 m_match_pos += m_pos;
2199- m_pos = m_match_pos = m_matched_len;
2200+ m_pos = m_match_pos + m_matched_len;
2201 }
2202 else
2203 {
2204@@ -666,35 +685,30 @@
2205 return (int)regex_matcher->get_indexed_regex_count();
2206 }
2207
2208-int regex::get_match_start( int groupId )
2209-{
2210- if(groupId == 0)
2211- return m_match_pos;
2212- if(groupId > (int)regex_matcher->get_indexed_regex_count())
2213- return -1;
2214- const char *submatched_source;
2215- int submatched_len;
2216- if(!regex_matcher->get_indexed_match(groupId, &submatched_source, &submatched_len))
2217- return -1;
2218- return submatched_source - s_in_.c_str();
2219-}
2220-
2221-int regex::get_match_end( int groupId )
2222-{
2223- if(groupId == 0)
2224- return m_match_pos + m_matched_len;
2225- if(groupId > (int)regex_matcher->get_indexed_regex_count())
2226- return -1;
2227- const char *submatched_source;
2228- int submatched_len;
2229- if(!regex_matcher->get_indexed_match(groupId, &submatched_source, &submatched_len))
2230- return -1;
2231- return submatched_source - s_in_.c_str() + submatched_len;
2232+bool regex::get_match_start_end_bytes( int groupId, int *start, int *end )
2233+{
2234+ *start = -1;
2235+ *end = -1;
2236+ if(groupId == 0)
2237+ {
2238+ *start = m_match_pos;
2239+ *end = m_match_pos + m_matched_len;
2240+ return true;
2241+ }
2242+ if(groupId > (int)regex_matcher->get_indexed_regex_count())
2243+ return false;
2244+ const char *submatched_source;
2245+ int submatched_len;
2246+ if(!regex_matcher->get_indexed_match(groupId, &submatched_source, &submatched_len))
2247+ return false;
2248+ *start = submatched_source - s_in_.c_str();
2249+ *end = *start + submatched_len;
2250+ return true;
2251 }
2252
2253 } // namespace unicode
2254 } // namespace zorba
2255-#endif /* ZORBA_NO_UNICODE */
2256+#endif /* ZORBA_NO_ICU */
2257
2258 ///////////////////////////////////////////////////////////////////////////////
2259
2260
2261=== modified file 'src/util/regex.h'
2262--- src/util/regex.h 2012-03-28 05:19:57 +0000
2263+++ src/util/regex.h 2012-04-13 19:45:38 +0000
2264@@ -17,15 +17,13 @@
2265 #ifndef ZORBA_REGEX_H
2266 #define ZORBA_REGEX_H
2267
2268-#ifndef ZORBA_NO_UNICODE
2269-#include <unicode/regex.h>
2270-#endif
2271-
2272 #include "cxx_util.h"
2273 #include "unicode_util.h"
2274 #include "zorbatypes/zstring.h"
2275
2276-#ifndef ZORBA_NO_UNICODE
2277+#ifndef ZORBA_NO_ICU
2278+
2279+#include <unicode/regex.h>
2280
2281 namespace zorba {
2282
2283@@ -496,15 +494,17 @@
2284 } // namespace unicode
2285 } // namespace zorba
2286
2287-#else ///ZORBA_NO_UNICODE (ascii part:)
2288-
2289-#include "util/regex_ascii.h"
2290+///////////////////////////////////////////////////////////////////////////////
2291+
2292+#else /* ZORBA_NO_ICU */
2293+
2294+#include "util/regex_xquery.h"
2295 #include <string>
2296
2297 namespace zorba{
2298 /**
2299 * Converts an XQuery regular expression to the form used by the regular
2300- * expression library Zorba is using (here regex_ascii).
2301+ * expression library Zorba is using (here regex_xquery).
2302 *
2303 * @param xq_re The XQuery regular expression.
2304 * @param lib_re A pointer to the resuling library regular expression.
2305@@ -525,7 +525,7 @@
2306 /**
2307 * Constructs a %regex.
2308 */
2309- regex() : regex_matcher( NULL ) { }
2310+ regex() : regex_matcher( nullptr ) { }
2311
2312 /**
2313 * Destroys a %regex.
2314@@ -835,31 +835,21 @@
2315
2316 /**
2317 * Get the start position of the matched group.
2318- * If groupId is zero, then the start position of the whole match is returned.
2319- * If groupId is non-zero, then the start position of that group is returned.
2320- * If that group has not been matched, -1 is returned.
2321+ * If groupId is zero, then the start and end position of the whole match is returned.
2322+ * If groupId is non-zero, then the start and end position of that group is returned.
2323+ * If that group has not been matched, false is returned.
2324 *
2325 * @param groupId the id of the group, either zero for the entire regex,
2326 * or [1 .. group_count] for that specific group
2327- * @return the start position, zero based, or -1 if that group didn't match
2328+ * @param start to return start position in bytes
2329+ * @param end to return end position in bytes
2330+ * @return true if that group exists and has been matched
2331 */
2332- int get_match_start( int groupId = 0 );
2333+ bool get_match_start_end_bytes( int groupId, int *start, int *end );
2334
2335- /**
2336- * Get the end position of the matched group.
2337- * If groupId is zero, then the end position of the whole match is returned.
2338- * If groupId is non-zero, then the end position of that group is returned.
2339- * If that group has not been matched, -1 is returned.
2340- *
2341- * @param groupId the id of the group, either zero for the entire regex,
2342- * or [1 .. group_count] for that specific group
2343- * @return the end position, zero based, or -1 if that group didn't match
2344- */
2345- int get_match_end( int groupId = 0 );
2346
2347 private:
2348- regex_ascii::CRegexAscii_parser regex_parser;
2349- regex_ascii::CRegexAscii_regex *regex_matcher;
2350+ regex_xquery::CRegexXQuery_regex *regex_matcher;
2351 uint32_t parsed_flags;
2352
2353 zstring s_in_;
2354@@ -873,15 +863,13 @@
2355 regex( regex const& );
2356 regex& operator=( regex const& );
2357 };
2358+
2359+///////////////////////////////////////////////////////////////////////////////
2360+
2361 } // namespace unicode
2362 } // namespace zorba
2363
2364-#endif /* ZORBA_NO_UNICODE */
2365-
2366-
2367-///////////////////////////////////////////////////////////////////////////////
2368-
2369-
2370+#endif /* ZORBA_NO_ICU */
2371 #endif /* ZORBA_REGEX_H */
2372 /*
2373 * Local variables:
2374
2375=== renamed file 'src/util/regex_ascii.cpp' => 'src/util/regex_xquery.cpp'
2376--- src/util/regex_ascii.cpp 2012-03-28 05:19:57 +0000
2377+++ src/util/regex_xquery.cpp 2012-04-13 19:45:38 +0000
2378@@ -1,4 +1,4 @@
2379-a/*
2380+/*
2381 * Copyright 2006-2008 The FLWOR Foundation.
2382 *
2383 * Licensed under the Apache License, Version 2.0 (the "License");
2384@@ -18,12 +18,15 @@
2385
2386 #include "diagnostics/xquery_diagnostics.h"
2387
2388-#include "regex_ascii.h"
2389+#include "regex_xquery.h"
2390 #include <string.h>
2391 #include "zorbatypes/chartype.h"
2392+#include "util/unicode_categories.h"
2393+#include "util/ascii_util.h"
2394+#include "util/utf8_string.h"
2395
2396 namespace zorba {
2397- namespace regex_ascii{
2398+ namespace regex_xquery{
2399 //ascii regular expression matching
2400
2401 /*http://www.w3.org/TR/xmlschema-2/#regexs
2402@@ -62,96 +65,138 @@
2403 + http://www.w3.org/TR/xquery-operators/#regex-syntax (not implemented)
2404 */
2405
2406+
2407+static bool compare_ascii_i(const char *str1, const char *str2)
2408+{
2409+ while(*str1 && *str2)
2410+ {
2411+ if(ascii::to_lower(*str1) != ascii::to_lower(*str2))
2412+ return false;
2413+ str1++;
2414+ str2++;
2415+ }
2416+ if(*str1 || *str2)
2417+ return false;
2418+ return true;
2419+}
2420+
2421+static bool compare_unicode_ni(const char *str1, const char *str2, int len)
2422+{
2423+ while(len > 0)
2424+ {
2425+ const char *temp_str1 = str1;
2426+ const char *temp_str2 = str2;
2427+ unicode::code_point cp1 = unicode::to_upper(utf8::next_char(temp_str1));
2428+ unicode::code_point cp2 = unicode::to_upper(utf8::next_char(temp_str2));
2429+ if(cp1 != cp2)
2430+ return false;
2431+ len -= temp_str1-str1;
2432+ str1 = temp_str1;
2433+ str2 = temp_str2;
2434+ }
2435+ return true;
2436+}
2437+static utf8::size_type myutf8len(const char *source)
2438+{
2439+ utf8::size_type len = utf8::char_length(*source);
2440+ if(!len)
2441+ return 1;
2442+ else
2443+ return len;
2444+}
2445 ////////////////////////////////////
2446 ////Regular expression parsing and building of the tree
2447 ////////////////////////////////////
2448
2449-CRegexAscii_regex* CRegexAscii_parser::parse(const char *pattern, unsigned int flags)
2450+CRegexXQuery_regex* CRegexXQuery_parser::parse(const char *pattern, unsigned int flags)
2451 {
2452 this->flags = flags;
2453- bool align_begin = false;
2454
2455- if(!(flags & REGEX_ASCII_LITERAL) && (pattern[0] == '^'))
2456- align_begin = true;
2457-
2458 int regex_len;
2459- CRegexAscii_regex* regex = parse_regexp(pattern + (align_begin?1:0), &regex_len);
2460+ CRegexXQuery_regex* regex = parse_regexp(pattern, &regex_len);
2461
2462- if(regex)
2463- regex->set_align_begin(align_begin);
2464-
2465 return regex;
2466 }
2467
2468 //until '\0' or ')'
2469-CRegexAscii_regex* CRegexAscii_parser::parse_regexp(const char *pattern,
2470+CRegexXQuery_regex* CRegexXQuery_parser::parse_regexp(const char *pattern,
2471 int *regex_len)
2472 {
2473 *regex_len = 0;
2474 int branch_len;
2475 regex_depth++;
2476- CRegexAscii_regex *regex = new CRegexAscii_regex(current_regex);
2477+ std::auto_ptr<CRegexXQuery_regex> regex(new CRegexXQuery_regex(current_regex));
2478 if(!current_regex)
2479- current_regex = regex;
2480+ current_regex = regex.get();
2481 if(regex_depth >= 2)
2482 {
2483 //mark this as group if it does not start with ?:
2484 if(pattern[0] != '?' || pattern[1] != ':')
2485- current_regex->subregex.push_back(regex);
2486+ current_regex->subregex.push_back(regex.get());
2487 else
2488 *regex_len = 2;
2489 }
2490- CRegexAscii_branch *branch;
2491+ CRegexXQuery_branch *branch;
2492+ bool must_read_another_branch = true;
2493 while(pattern[*regex_len] && (pattern[*regex_len] != ')'))
2494 {
2495 branch = parse_branch(pattern+*regex_len, &branch_len);
2496 if(!branch)
2497 {
2498 regex_depth--;
2499- delete regex;
2500 return NULL;
2501 }
2502 regex->add_branch(branch);
2503 *regex_len += branch_len;
2504+ if(pattern[*regex_len] == '|')
2505+ (*regex_len)++;
2506+ else
2507+ must_read_another_branch = false;
2508 }
2509- if((current_regex == regex) && (pattern[*regex_len] == ')'))
2510+ if((current_regex == regex.get()) && (pattern[*regex_len] == ')'))
2511 {
2512- throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(U_REGEX_MISMATCHED_PAREN)) );
2513+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MISMATCHED_PAREN)) );
2514 }
2515 if(pattern[*regex_len])
2516 (*regex_len)++;
2517+ if(must_read_another_branch)
2518+ regex->add_branch(new CRegexXQuery_branch(current_regex));//add empty branch
2519 regex->flags = 0;//finished initialization
2520 regex_depth--;
2521- return regex;
2522+ return regex.release();
2523 }
2524
2525-CRegexAscii_branch* CRegexAscii_parser::parse_branch(const char *pattern, int *branch_len)
2526+CRegexXQuery_branch* CRegexXQuery_parser::parse_branch(const char *pattern, int *branch_len)
2527 {
2528 int piece_len;
2529
2530- CRegexAscii_branch *branch = new CRegexAscii_branch(current_regex);
2531- CRegexAscii_piece *piece;
2532+ std::auto_ptr<CRegexXQuery_branch> branch(new CRegexXQuery_branch(current_regex));
2533+ CRegexXQuery_piece *piece;
2534 *branch_len = 0;
2535 while(pattern[*branch_len] && (pattern[*branch_len] != '|') && (pattern[*branch_len] != ')'))
2536 {
2537 piece = parse_piece(pattern+*branch_len, &piece_len);
2538 if(!piece)
2539 {
2540- delete branch;
2541 return NULL;
2542 }
2543+ if(branch->piece_list.size() && dynamic_cast<CRegexXQuery_pinstart*>(piece->atom))
2544+ {
2545+ //found ^ that is not at the beginning of branch
2546+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_ATOM_CHAR), '^') );
2547+ }
2548 branch->add_piece(piece);
2549 *branch_len += piece_len;
2550 }
2551- if(pattern[*branch_len] == '|')
2552- (*branch_len)++;
2553- return branch;
2554+ //if(pattern[*branch_len] == '|')
2555+ // (*branch_len)++;
2556+ return branch.release();
2557 }
2558
2559 //piece = atom + quantifier
2560-CRegexAscii_piece* CRegexAscii_parser::parse_piece(const char *pattern, int *piece_len)
2561+CRegexXQuery_piece* CRegexXQuery_parser::parse_piece(const char *pattern, int *piece_len)
2562 {
2563- CRegexAscii_piece *piece = new CRegexAscii_piece;
2564+ std::auto_ptr<CRegexXQuery_piece> piece(new CRegexXQuery_piece);
2565 IRegexAtom *atom;
2566 *piece_len = 0;
2567
2568@@ -160,19 +205,18 @@
2569 atom = read_atom(pattern, &atom_len);
2570 if(!atom)
2571 {
2572- delete piece;
2573 return NULL;
2574 }
2575 piece->set_atom(atom);
2576 if(!(flags & REGEX_ASCII_LITERAL))
2577- read_quantifier(piece, pattern+atom_len, &quantif_len);
2578+ read_quantifier(piece.get(), pattern+atom_len, &quantif_len);
2579
2580 *piece_len += atom_len + quantif_len;
2581
2582- return piece;
2583+ return piece.release();
2584 }
2585
2586-char CRegexAscii_parser::myishex(char c)
2587+char CRegexXQuery_parser::myishex(char c)
2588 {
2589 if((c >= '0') && (c <= '9'))
2590 return c-'0'+1;
2591@@ -183,26 +227,125 @@
2592 return 0;//not a hex
2593 }
2594
2595-bool CRegexAscii_parser::myisdigit(char c)
2596-{
2597- return (c >= '0') || (c <= '9');
2598-}
2599-
2600-char CRegexAscii_parser::readChar(const char *pattern, int *char_len, bool *is_multichar)
2601+bool CRegexXQuery_parser::myisdigit(char c)
2602+{
2603+ return (c >= '0') && (c <= '9');
2604+}
2605+
2606+bool CRegexXQuery_parser::myisletterAZ(char c)
2607+{
2608+ return ((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z'));
2609+}
2610+
2611+static const unicode::code_point specials_extcp[] = {0xFFF0, 0xFFFD, 0};
2612+
2613+static CRegexXQuery_parser::block_escape_t block_escape[] =
2614+{
2615+{{0x0000, 0x007F}, NULL, "BasicLatin"},
2616+{{0x0080, 0x00FF}, NULL, "Latin-1Supplement"},
2617+{{0x0100, 0x017F}, NULL, "LatinExtended-A"},
2618+{{0x0180, 0x024F}, NULL, "LatinExtended-B"},
2619+{{0x0250, 0x02AF}, NULL, "IPAExtensions"},
2620+{{0x02B0, 0x02FF}, NULL, "SpacingModifierLetters"},
2621+{{0x0300, 0x036F}, NULL, "CombiningDiacriticalMarks"},
2622+{{0x0370, 0x03FF}, NULL, "Greek"},
2623+{{0x0400, 0x04FF}, NULL, "Cyrillic"},
2624+{{0x0530, 0x058F}, NULL, "Armenian"},
2625+{{0x0590, 0x05FF}, NULL, "Hebrew"},
2626+{{0x0600, 0x06FF}, NULL, "Arabic"},
2627+{{0x0700, 0x074F}, NULL, "Syriac"},
2628+{{0x0780, 0x07BF}, NULL, "Thaana"},
2629+{{0x0900, 0x097F}, NULL, "Devanagari"},
2630+{{0x0980, 0x09FF}, NULL, "Bengali"},
2631+{{0x0A00, 0x0A7F}, NULL, "Gurmukhi"},
2632+{{0x0A80, 0x0AFF}, NULL, "Gujarati"},
2633+{{0x0B00, 0x0B7F}, NULL, "Oriya"},
2634+{{0x0B80, 0x0BFF}, NULL, "Tamil"},
2635+{{0x0C00, 0x0C7F}, NULL, "Telugu"},
2636+{{0x0C80, 0x0CFF}, NULL, "Kannada"},
2637+{{0x0D00, 0x0D7F}, NULL, "Malayalam"},
2638+{{0x0D80, 0x0DFF}, NULL, "Sinhala"},
2639+{{0x0E00, 0x0E7F}, NULL, "Thai"},
2640+{{0x0E80, 0x0EFF}, NULL, "Lao"},
2641+{{0x0F00, 0x0FFF}, NULL, "Tibetan"},
2642+{{0x1000, 0x109F}, NULL, "Myanmar"},
2643+{{0x10A0, 0x10FF}, NULL, "Georgian"},
2644+{{0x1100, 0x11FF}, NULL, "HangulJamo"},
2645+{{0x1200, 0x137F}, NULL, "Ethiopic"},
2646+{{0x13A0, 0x13FF}, NULL, "Cherokee"},
2647+{{0x1400, 0x167F}, NULL, "UnifiedCanadianAboriginalSyllabics"},
2648+{{0x1680, 0x169F}, NULL, "Ogham"},
2649+{{0x16A0, 0x16FF}, NULL, "Runic"},
2650+{{0x1780, 0x17FF}, NULL, "Khmer"},
2651+{{0x1800, 0x18AF}, NULL, "Mongolian"},
2652+{{0x1E00, 0x1EFF}, NULL, "LatinExtendedAdditional"},
2653+{{0x1F00, 0x1FFF}, NULL, "GreekExtended"},
2654+{{0x2000, 0x206F}, NULL, "GeneralPunctuation"},
2655+{{0x2070, 0x209F}, NULL, "SuperscriptsandSubscripts"},
2656+{{0x20A0, 0x20CF}, NULL, "CurrencySymbols"},
2657+{{0x20D0, 0x20FF}, NULL, "CombiningMarksforSymbols"},
2658+{{0x2100, 0x214F}, NULL, "LetterlikeSymbols"},
2659+{{0x2150, 0x218F}, NULL, "NumberForms"},
2660+{{0x2190, 0x21FF}, NULL, "Arrows"},
2661+{{0x2200, 0x22FF}, NULL, "MathematicalOperators"},
2662+{{0x2300, 0x23FF}, NULL, "MiscellaneousTechnical"},
2663+{{0x2400, 0x243F}, NULL, "ControlPictures"},
2664+{{0x2440, 0x245F}, NULL, "OpticalCharacterRecognition"},
2665+{{0x2460, 0x24FF}, NULL, "EnclosedAlphanumerics"},
2666+{{0x2500, 0x257F}, NULL, "BoxDrawing"},
2667+{{0x2580, 0x259F}, NULL, "BlockElements"},
2668+{{0x25A0, 0x25FF}, NULL, "GeometricShapes"},
2669+{{0x2600, 0x26FF}, NULL, "MiscellaneousSymbols"},
2670+{{0x2700, 0x27BF}, NULL, "Dingbats"},
2671+{{0x2800, 0x28FF}, NULL, "BraillePatterns"},
2672+{{0x2E80, 0x2EFF}, NULL, "CJKRadicalsSupplement"},
2673+{{0x2F00, 0x2FDF}, NULL, "KangxiRadicals"},
2674+{{0x2FF0, 0x2FFF}, NULL, "IdeographicDescriptionCharacters"},
2675+{{0x3000, 0x303F}, NULL, "CJKSymbolsandPunctuation"},
2676+{{0x3040, 0x309F}, NULL, "Hiragana"},
2677+{{0x30A0, 0x30FF}, NULL, "Katakana"},
2678+{{0x3100, 0x312F}, NULL, "Bopomofo"},
2679+{{0x3130, 0x318F}, NULL, "HangulCompatibilityJamo"},
2680+{{0x3190, 0x319F}, NULL, "Kanbun"},
2681+{{0x31A0, 0x31BF}, NULL, "BopomofoExtended"},
2682+{{0x3200, 0x32FF}, NULL, "EnclosedCJKLettersandMonths"},
2683+{{0x3300, 0x33FF}, NULL, "CJKCompatibility"},
2684+{{0x3400, 0x4DB5}, NULL, "CJKUnifiedIdeographsExtensionA"},
2685+{{0x4E00, 0x9FFF}, NULL, "CJKUnifiedIdeographs"},
2686+{{0xA000, 0xA48F}, NULL, "YiSyllables"},
2687+{{0xA490, 0xA4CF}, NULL, "YiRadicals"},
2688+{{0xAC00, 0xD7A3}, NULL, "HangulSyllables"},
2689+{{0xE000, 0xF8FF}, NULL, "PrivateUse"},
2690+{{0xF900, 0xFAFF}, NULL, "CJKCompatibilityIdeographs"},
2691+{{0xFB00, 0xFB4F}, NULL, "AlphabeticPresentationForms"},
2692+{{0xFB50, 0xFDFF}, NULL, "ArabicPresentationForms-A"},
2693+{{0xFE20, 0xFE2F}, NULL, "CombiningHalfMarks"},
2694+{{0xFE30, 0xFE4F}, NULL, "CJKCompatibilityForms"},
2695+{{0xFE50, 0xFE6F}, NULL, "SmallFormVariants"},
2696+{{0xFE70, 0xFEFE}, NULL, "ArabicPresentationForms-B"},
2697+{{0xFEFF, 0xFEFF}, specials_extcp, "Specials"},
2698+{{0xFF00, 0xFFEF}, NULL, "HalfwidthandFullwidthForms"}
2699+};
2700+
2701+CRegexXQuery_charmatch* CRegexXQuery_parser::readChar(const char *pattern,
2702+ int *char_len,
2703+ enum CHARGROUP_t *multichar_type)
2704 {
2705 char c = 0;
2706 *char_len = 0;
2707- *is_multichar = false;
2708+ *multichar_type = CHARGROUP_NO_MULTICHAR;
2709 switch(pattern[*char_len])
2710 {
2711 case '\\':
2712- { (*char_len)++;
2713+ {
2714+ (*char_len)++;
2715 switch(pattern[*char_len])
2716 {
2717- case 'n': c = '\n';break;
2718- case 'r': c = '\r';break;
2719- case 't': c = '\t';break;
2720+ case 'n': c = '\n';(*char_len)++;return new CRegexXQuery_char_ascii(current_regex, c);
2721+ case 'r': c = '\r';(*char_len)++;return new CRegexXQuery_char_ascii(current_regex, c);
2722+ case 't': c = '\t';(*char_len)++;return new CRegexXQuery_char_ascii(current_regex, c);
2723 case '\\':
2724+ case '/'://+
2725 case '|':
2726 case '.':
2727 case '?':
2728@@ -216,19 +359,205 @@
2729 case '['://#x5B
2730 case ']'://#x5D
2731 case '^'://#x5E
2732+ case '$'://+
2733 c = pattern[*char_len];
2734- break;
2735+ (*char_len)++;
2736+ *multichar_type = CHARGROUP_FLAGS_ONECHAR_ASCII;
2737+ return new CRegexXQuery_char_ascii(current_regex, c);
2738 case 'p'://catEsc
2739 case 'P'://complEsc
2740+ {
2741 //ignore the prop for now
2742- c = pattern[*char_len];
2743- *is_multichar = true;
2744- if(pattern[*char_len+1] == '{')
2745- {
2746- while(pattern[*char_len] != '}')
2747+ *multichar_type = CHARGROUP_FLAGS_MULTICHAR_p;//(CHARGROUP_t)((pattern[*char_len] == 'P') ? 128 : 0);
2748+ bool is_reverse = (pattern[*char_len] == 'P');
2749+ c = 0;
2750+ if(pattern[(*char_len)+1] != '{')
2751+ {
2752+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_P_CONSTRUCT)) );
2753+ }
2754+ (*char_len) += 2;
2755+ switch(pattern[*char_len])
2756+ {//IsCategory
2757+ case 'L':
2758+ {
2759+ switch(pattern[(*char_len)+1])
2760+ {
2761+ case '}':
2762+ c = unicode::UNICODE_Ll + 50;break;
2763+ case 'u':
2764+ c = unicode::UNICODE_Lu; (*char_len)++;break;
2765+ case 'l':
2766+ c = unicode::UNICODE_Ll; (*char_len)++;break;
2767+ case 't':
2768+ c = unicode::UNICODE_Lt; (*char_len)++;break;
2769+ case 'm':
2770+ c = unicode::UNICODE_Lm; (*char_len)++;break;
2771+ case 'o':
2772+ c = unicode::UNICODE_Lo; (*char_len)++;break;
2773+ default:
2774+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PL_CONSTRUCT)) );
2775+ }
2776+ }break;
2777+ case 'M':
2778+ {
2779+ switch(pattern[(*char_len)+1])
2780+ {
2781+ case '}':
2782+ c = unicode::UNICODE_Mc + 50;break;
2783+ case 'n':
2784+ c = unicode::UNICODE_Mn; (*char_len)++;break;
2785+ case 'c':
2786+ c = unicode::UNICODE_Mc; (*char_len)++;break;
2787+ case 'e':
2788+ c = unicode::UNICODE_Me; (*char_len)++;break;
2789+ default:
2790+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PM_CONSTRUCT)) );
2791+ }
2792+ }break;
2793+ case 'N':
2794+ {
2795+ switch(pattern[(*char_len)+1])
2796+ {
2797+ case '}':
2798+ c = unicode::UNICODE_Nd + 50;break;
2799+ case 'd':
2800+ c = unicode::UNICODE_Nd; (*char_len)++;break;
2801+ case 'l':
2802+ c = unicode::UNICODE_Nl; (*char_len)++;break;
2803+ case 'o':
2804+ c = unicode::UNICODE_No; (*char_len)++;break;
2805+ default:
2806+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PN_CONSTRUCT)) );
2807+ }
2808+ }break;
2809+ case 'P':
2810+ {
2811+ switch(pattern[(*char_len)+1])
2812+ {
2813+ case '}':
2814+ c = unicode::UNICODE_Pc + 50;break;
2815+ case 'c':
2816+ c = unicode::UNICODE_Pc; (*char_len)++;break;
2817+ case 'd':
2818+ c = unicode::UNICODE_Pd; (*char_len)++;break;
2819+ case 's':
2820+ c = unicode::UNICODE_Ps; (*char_len)++;break;
2821+ case 'e':
2822+ c = unicode::UNICODE_Pe; (*char_len)++;break;
2823+ case 'i':
2824+ c = unicode::UNICODE_Pi; (*char_len)++;break;
2825+ case 'f':
2826+ c = unicode::UNICODE_Pf; (*char_len)++;break;
2827+ case 'o':
2828+ c = unicode::UNICODE_Po; (*char_len)++;break;
2829+ default:
2830+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PP_CONSTRUCT)) );
2831+ }
2832+ }break;
2833+ case 'Z':
2834+ {
2835+ switch(pattern[(*char_len)+1])
2836+ {
2837+ case '}':
2838+ c = unicode::UNICODE_Zl + 50;break;
2839+ case 's':
2840+ c = unicode::UNICODE_Zs; (*char_len)++;break;
2841+ case 'l':
2842+ c = unicode::UNICODE_Zl; (*char_len)++;break;
2843+ case 'p':
2844+ c = unicode::UNICODE_Zp; (*char_len)++;break;
2845+ default:
2846+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PZ_CONSTRUCT)) );
2847+ }
2848+ }break;
2849+ case 'S':
2850+ {
2851+ switch(pattern[(*char_len)+1])
2852+ {
2853+ case '}':
2854+ c = unicode::UNICODE_Sc + 50;break;
2855+ case 'm':
2856+ c = unicode::UNICODE_Sm; (*char_len)++;break;
2857+ case 'c':
2858+ c = unicode::UNICODE_Sc; (*char_len)++;break;
2859+ case 'k':
2860+ c = unicode::UNICODE_Sk; (*char_len)++;break;
2861+ case 'o':
2862+ c = unicode::UNICODE_So; (*char_len)++;break;
2863+ default:
2864+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PS_CONSTRUCT)) );
2865+ }
2866+ }break;
2867+ case 'C':
2868+ {
2869+ switch(pattern[(*char_len)+1])
2870+ {
2871+ case '}':
2872+ c = unicode::UNICODE_Cc + 50;break;
2873+ case 'c':
2874+ c = unicode::UNICODE_Cc; (*char_len)++;break;
2875+ case 'f':
2876+ c = unicode::UNICODE_Cf; (*char_len)++;break;
2877+ case 'o':
2878+ c = unicode::UNICODE_Co; (*char_len)++;break;
2879+ case 'n':
2880+ c = unicode::UNICODE_Cn; (*char_len)++;break;
2881+ default:
2882+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PC_CONSTRUCT)) );
2883+ }
2884+ }break;
2885+ }//end switch
2886+ if(c)
2887+ {
2888+ if(pattern[(*char_len) + 1] != '}')
2889+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_P_CONSTRUCT)) );
2890+ (*char_len)++;
2891+ (*char_len)++;
2892+ return new CRegexXQuery_multicharP(current_regex, c, is_reverse);
2893+ }
2894+ if(pattern[*char_len] == 'I')
2895+ {
2896+ if(pattern[(*char_len)+1] == 's')//IsBlock
2897+ {
2898+ *multichar_type = CHARGROUP_FLAGS_MULTICHAR_Is;
2899+ (*char_len) += 2;
2900+ zstring block_name;
2901+ char tempc = pattern[(*char_len)];
2902+ while(tempc && (tempc != '}'))
2903+ {
2904+ if(!myisletterAZ(tempc) && !myisdigit(tempc) && (tempc != '-'))
2905+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_PIs_CONSTRUCT)) );
2906+ block_name.append(1, tempc);
2907+ (*char_len)++;
2908+ tempc = pattern[(*char_len)];
2909+ }
2910+ if(!tempc)
2911+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_PIs_CONSTRUCT)) );
2912+ //search for the block name
2913+ int i;
2914+ int nr_blocks = sizeof(block_escape)/sizeof(CRegexXQuery_parser::block_escape_t);
2915+ for(i=0;i<nr_blocks;i++)
2916+ {
2917+ if(compare_ascii_i(block_name.c_str(), block_escape[i].group_name))
2918+ {
2919+ c = i;
2920+ break;
2921+ }
2922+ }
2923+ if(i==nr_blocks)
2924+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PIs_CONSTRUCT)) );
2925 (*char_len)++;
2926- }
2927- break;
2928+ return new CRegexXQuery_multicharIs(current_regex, i, is_reverse);
2929+ }
2930+ else
2931+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_PIs_CONSTRUCT)) );
2932+ }
2933+ else
2934+ {
2935+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_P_CONSTRUCT)) );
2936+ }
2937+ break;//unreachable
2938+ }//end case 'p'
2939 //multiCharEsc
2940 case 's':
2941 case 'S':
2942@@ -240,40 +569,104 @@
2943 case 'D':
2944 case 'w':
2945 case 'W':
2946- *is_multichar = true;
2947+ *multichar_type = CHARGROUP_FLAGS_MULTICHAR_OTHER;
2948 c = pattern[*char_len];
2949- break;
2950- }
2951- break;
2952- }
2953- case '#':///might be #xXX
2954- {
2955- if((pattern[*char_len+1] == 'x') &&
2956- myishex(pattern[*char_len+2]) && myishex(pattern[*char_len+3]))
2957- {
2958- c = (myishex(pattern[*char_len+2])-1)<<4 | (myishex(pattern[*char_len+3])-1);
2959- *char_len += 3;
2960- break;
2961- }
2962- }
2963+ (*char_len)++;
2964+ return new CRegexXQuery_multicharOther(current_regex, c);
2965+ case 'u'://unicode codepoint \uXXXX
2966+ {
2967+ unicode::code_point utf8c = 0;
2968+ (*char_len)++;
2969+ for(int i=0;i<4;i++)
2970+ {
2971+ char hex = myishex(pattern[*char_len]);
2972+ if(!hex)
2973+ {
2974+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_UNICODE_CODEPOINT_u)) );
2975+ }
2976+ utf8c <<= 4;
2977+ utf8c |= (hex-1) & 0x0f;
2978+ (*char_len)++;
2979+ }
2980+ return create_charmatch(utf8c, NULL, 0, multichar_type);
2981+ }
2982+ case 'U'://unicode codepoint \UXXXXXXXX
2983+ {
2984+ unicode::code_point utf8c = 0;
2985+ (*char_len)++;
2986+ for(int i=0;i<8;i++)
2987+ {
2988+ char hex = myishex(pattern[*char_len]);
2989+ if(!hex)
2990+ {
2991+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_UNICODE_CODEPOINT_u)) );
2992+ }
2993+ utf8c <<= 4;
2994+ utf8c |= (hex-1) & 0x0f;
2995+ (*char_len)++;
2996+ }
2997+ return create_charmatch(utf8c, NULL, 0, multichar_type);
2998+ }
2999+ default:
3000+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_ESC_CHAR)) );
3001+ }
3002+ assert(false);
3003+ break;//unreachable
3004+ }//end case '\'
3005 default:
3006- c = pattern[*char_len];
3007- break;
3008- }
3009-
3010- (*char_len)++;
3011- return c;
3012-}
3013-
3014-
3015-
3016-IRegexAtom* CRegexAscii_parser::read_atom(const char *pattern, int *atom_len)
3017+ {
3018+ const char *temp_pattern = pattern;
3019+ unicode::code_point utf8c = utf8::next_char(temp_pattern);
3020+ (*char_len) = temp_pattern - pattern;
3021+ return create_charmatch(utf8c, pattern, *char_len, multichar_type);
3022+ }
3023+ }
3024+ return NULL;
3025+}
3026+
3027+CRegexXQuery_charmatch *CRegexXQuery_parser::create_charmatch(unicode::code_point utf8c,
3028+ const char *pattern, int utf8len,
3029+ enum CHARGROUP_t *multichar_type)
3030+{
3031+ if(utf8c <= 0x7F)
3032+ {
3033+ *multichar_type = CHARGROUP_FLAGS_ONECHAR_ASCII;
3034+ if(flags & REGEX_ASCII_CASE_INSENSITIVE)
3035+ return new CRegexXQuery_char_ascii_i(current_regex, (char)utf8c);
3036+ else
3037+ return new CRegexXQuery_char_ascii(current_regex, (char)utf8c);
3038+ }
3039+ else
3040+ {
3041+ *multichar_type = CHARGROUP_FLAGS_ONECHAR_UNICODE;
3042+ if(flags & REGEX_ASCII_CASE_INSENSITIVE)
3043+ return new CRegexXQuery_char_unicode_i(current_regex, utf8c);
3044+ else
3045+ {
3046+ if(pattern)
3047+ return new CRegexXQuery_char_unicode(current_regex, pattern, utf8len);
3048+ else
3049+ return new CRegexXQuery_char_unicode_cp(current_regex, utf8c);
3050+ }
3051+ }
3052+}
3053+
3054+IRegexAtom* CRegexXQuery_parser::read_atom(const char *pattern, int *atom_len)
3055 {
3056 *atom_len = 0;
3057- char c;
3058- bool is_end_line = false;
3059- c = pattern[*atom_len];
3060- if((!(flags & REGEX_ASCII_LITERAL)) && (c == '\\'))
3061+ if(flags & REGEX_ASCII_LITERAL)
3062+ {
3063+ unicode::code_point utf8c;
3064+ //bool is_end_line = false;
3065+ const char *temp_pattern = pattern;
3066+ utf8c = utf8::next_char(temp_pattern);
3067+ *atom_len = temp_pattern - pattern;
3068+ enum CHARGROUP_t multichar_type;
3069+ return create_charmatch(utf8c, pattern, *atom_len, &multichar_type);
3070+ }
3071+
3072+ char c = *pattern;
3073+ if(c == '\\')
3074 {
3075 //check for back reference
3076 if(myisdigit(pattern[(*atom_len)+1]))
3077@@ -281,13 +674,13 @@
3078 (*atom_len)++;
3079 if(pattern[*atom_len] == '0')
3080 {
3081- throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(U_REGEX_INVALID_BACK_REF)) );
3082+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_BACK_REF), 0, current_regex->subregex.size()) );
3083 }
3084 unsigned int backref = pattern[*atom_len] - '0';
3085 if((backref > current_regex->subregex.size()) ||
3086 (current_regex->subregex.at(backref-1)->flags != 0))
3087 {
3088- throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(U_REGEX_INVALID_BACK_REF)) );
3089+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_BACK_REF), backref, current_regex->subregex.size()) );
3090 }
3091 while(current_regex->subregex.size() >= backref*10)
3092 {
3093@@ -303,70 +696,86 @@
3094 break;
3095 }
3096 }
3097- return new CRegexAscii_backref(current_regex, backref);
3098+ (*atom_len)++;
3099+ return new CRegexXQuery_backref(current_regex, backref);
3100 }
3101 }
3102+ if(c == '^')
3103+ {
3104+ (*atom_len)++;
3105+ return new CRegexXQuery_pinstart(current_regex);
3106+ }
3107+ if((c == '}') || (c == '{') || (c == '?') || (c == '*') || (c == '+') || (c == '|'))
3108+ {
3109+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_ATOM_CHAR), c) );
3110+ }
3111 switch(c)
3112 {
3113 case '[':
3114 {
3115- if(!(flags & REGEX_ASCII_LITERAL))
3116- {
3117- (*atom_len)++;
3118- CRegexAscii_chargroup *chargroup = NULL;
3119- int chargroup_len;
3120- chargroup = readchargroup(pattern+*atom_len, &chargroup_len);
3121- *atom_len += chargroup_len;
3122- return chargroup;
3123- }
3124+ (*atom_len)++;
3125+ CRegexXQuery_chargroup *chargroup = NULL;
3126+ int chargroup_len;
3127+ chargroup = readchargroup(pattern+*atom_len, &chargroup_len);
3128+ *atom_len += chargroup_len;
3129+ return chargroup;
3130 }
3131 case '.'://WildCharEsc
3132 {
3133- if(!(flags & REGEX_ASCII_LITERAL))
3134- {
3135- CRegexAscii_wildchar *wildchar = new CRegexAscii_wildchar(current_regex);
3136- (*atom_len)++;
3137- return wildchar;
3138- }
3139+ (*atom_len)++;
3140+ return new CRegexXQuery_wildchar(current_regex);
3141 }
3142 case '('://begin an embedded reg exp
3143 {
3144- if(!(flags & REGEX_ASCII_LITERAL))
3145- {
3146- (*atom_len)++;
3147- CRegexAscii_regex *emb_regex = NULL;
3148- int regex_len;
3149- emb_regex = parse_regexp(pattern + *atom_len, &regex_len);
3150- *atom_len += regex_len;
3151- return emb_regex;
3152- }
3153+ (*atom_len)++;
3154+ CRegexXQuery_regex *emb_regex = NULL;
3155+ int regex_len;
3156+ emb_regex = parse_regexp(pattern + *atom_len, &regex_len);
3157+ *atom_len += regex_len;
3158+ return emb_regex;
3159 }
3160 case '$'://end line
3161- if(!(flags & REGEX_ASCII_LITERAL))
3162- {
3163- is_end_line = true;
3164- }
3165+ //is_end_line = true;
3166+ (*atom_len)++;
3167+ return new CRegexXQuery_endline(current_regex);
3168 default:
3169 {
3170- char c;
3171+ //char c;
3172+ CRegexXQuery_charmatch *charmatch = NULL;
3173 int c_len;
3174- bool is_multichar = false;
3175- if(!(flags & REGEX_ASCII_LITERAL))
3176- c = readChar(pattern+*atom_len, &c_len, &is_multichar);
3177- else
3178+ CHARGROUP_t multichar_type = CHARGROUP_NO_MULTICHAR;
3179+ *atom_len = 0;
3180+ while(pattern[*atom_len])
3181 {
3182- c = pattern[*atom_len];
3183- c_len = 1;
3184+ charmatch = readChar(pattern+*atom_len, &c_len, &multichar_type);
3185+ *atom_len += c_len;
3186+ if((flags & REGEX_ASCII_NO_WHITESPACE) && (multichar_type == CHARGROUP_FLAGS_ONECHAR_ASCII))
3187+ {
3188+ char c = (char)charmatch->get_c();
3189+ if((c == ' ') || (c == '\t') || (c == '\r') || (c == '\n'))
3190+ {
3191+ //ignore this whitespace
3192+ delete charmatch;
3193+ continue;
3194+ }
3195+ else
3196+ break;
3197+ }
3198+ else
3199+ break;
3200 }
3201- CRegexAscii_chargroup *chargroup = new CRegexAscii_chargroup(current_regex);
3202- if(is_multichar)
3203- chargroup->addMultiChar(c);
3204+ /*
3205+ std::auto_ptr<CRegexXQuery_chargroup> chargroup(new CRegexXQuery_chargroup(current_regex));
3206+ if(multichar_type)
3207+ chargroup->addMultiChar(c, multichar_type);
3208 else if(is_end_line)
3209 chargroup->addEndLine();
3210 else
3211- chargroup->addCharRange(c, c);
3212+ chargroup->addOneChar(c);
3213 *atom_len += c_len;
3214- return chargroup;
3215+ return chargroup.release();
3216+ */
3217+ return charmatch;
3218 }
3219 }
3220 }
3221@@ -374,81 +783,119 @@
3222 //read until ']'
3223 //posCharGroup ::= ( charRange | charClassEsc )+
3224 //charRange ::= seRange | XmlCharIncDash
3225-CRegexAscii_chargroup* CRegexAscii_parser::readchargroup(const char *pattern, int *chargroup_len)
3226+CRegexXQuery_chargroup* CRegexXQuery_parser::readchargroup(const char *pattern, int *chargroup_len)
3227 {
3228- CRegexAscii_chargroup *chargroup = NULL;
3229+ std::auto_ptr<CRegexXQuery_chargroup> chargroup;
3230 *chargroup_len = 0;
3231 if(pattern[*chargroup_len] == '^')//negative group
3232 {
3233 (*chargroup_len)++;
3234- chargroup = new CRegexAscii_negchargroup(current_regex);
3235+ chargroup.reset(new CRegexXQuery_negchargroup(current_regex));
3236 }
3237 else
3238- chargroup = new CRegexAscii_chargroup(current_regex);
3239+ chargroup.reset(new CRegexXQuery_chargroup(current_regex));
3240 while(pattern[*chargroup_len] && (pattern[*chargroup_len]!=']'))
3241 {
3242- char c1, c2;
3243- bool is_multichar;
3244+ //char c1, c2;
3245+ CHARGROUP_t multichar_type = CHARGROUP_NO_MULTICHAR;
3246 int c1_len;
3247- c1 = pattern[*chargroup_len];
3248- c2 = pattern[*chargroup_len+1];
3249- if((c1 == '-') && (c2 == '['))//charClassSub
3250+ if((pattern[*chargroup_len] == '-') && (pattern[(*chargroup_len)+1] == '['))//charClassSub
3251 {
3252 int classsub_len;
3253- CRegexAscii_chargroup *classsub = readchargroup(pattern + *chargroup_len+1 + 1, &classsub_len);
3254+ CRegexXQuery_chargroup *classsub = readchargroup(pattern + (*chargroup_len)+1 + 1, &classsub_len);
3255 if(!classsub)
3256 {
3257- delete chargroup;
3258- return NULL;
3259+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_SUBCLASS)) );
3260 }
3261 chargroup->addClassSub(classsub);
3262 *chargroup_len += 2 + classsub_len + 1;
3263 if(pattern[*chargroup_len-1] != ']')
3264 {
3265- delete chargroup;
3266- return NULL;
3267+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_USE_OF_SUBCLASS)) );
3268 }
3269- return chargroup;
3270+ return chargroup.release();
3271 }
3272
3273- c1 = readChar(pattern+*chargroup_len, &c1_len, &is_multichar);
3274- if(is_multichar)//first char is multichar
3275+ std::unique_ptr<CRegexXQuery_charmatch> charmatch(readChar(pattern+*chargroup_len, &c1_len, &multichar_type));
3276+ if((multichar_type == CHARGROUP_FLAGS_MULTICHAR_p) ||
3277+ (multichar_type == CHARGROUP_FLAGS_MULTICHAR_Is) ||
3278+ (multichar_type == CHARGROUP_FLAGS_MULTICHAR_OTHER))//first char is multichar
3279 {
3280- chargroup->addMultiChar(c1);
3281+ if((pattern[*chargroup_len+c1_len] == '-') &&///should not be a range
3282+ (pattern[*chargroup_len+c1_len+1] != ']'))
3283+ {
3284+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MULTICHAR_IN_CHAR_RANGE)) );
3285+ }
3286+ //chargroup->addMultiChar(c1, multichar_type);
3287+ chargroup->addCharMatch(charmatch.release());
3288 *chargroup_len += c1_len;
3289 continue;
3290 }
3291- if(pattern[*chargroup_len+c1_len] == '-')///might be a range
3292+ (*chargroup_len) += c1_len;
3293+ if(pattern[*chargroup_len] == '-')///might be a range
3294 {
3295- if(pattern[*chargroup_len+c1_len+1] == ']')//no range, just the last char is '-'
3296+ if(pattern[(*chargroup_len)+1] == ']')//no range, just the last char is '-'
3297 {
3298- chargroup->addCharRange(c1, c1);
3299- chargroup->addCharRange('-', '-');
3300- *chargroup_len += c1_len + 1;
3301+ //chargroup->addOneChar(c1);
3302+ //chargroup->addOneChar('-');
3303+ chargroup->addCharMatch(charmatch.release());
3304+ chargroup->addCharMatch(new CRegexXQuery_char_ascii(current_regex, '-'));
3305+ (*chargroup_len)++;
3306 continue;
3307 }
3308- else
3309+ else if(pattern[(*chargroup_len)+1] != '[')
3310 {
3311 //it is a range
3312- char c3;
3313- int c3_len;
3314- c3 = readChar(pattern+*chargroup_len+c1_len+1, &c3_len, &is_multichar);
3315- if(is_multichar)
3316- return NULL;//error
3317- chargroup->addCharRange(c1, c3);
3318- *chargroup_len += c1_len + 1 + c3_len;
3319+ (*chargroup_len)++;
3320+ std::unique_ptr<CRegexXQuery_charmatch> charmatch2;
3321+ CHARGROUP_t multichar_type2 = CHARGROUP_NO_MULTICHAR;
3322+ int c2_len;
3323+ charmatch2.reset(readChar(pattern+(*chargroup_len), &c2_len, &multichar_type2));
3324+ if((multichar_type2 != CHARGROUP_FLAGS_ONECHAR_ASCII) &&
3325+ (multichar_type2 != CHARGROUP_FLAGS_ONECHAR_ASCII))//second char in range is multichar
3326+ {
3327+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MULTICHAR_IN_CHAR_RANGE)) );
3328+ }
3329+ //chargroup->addCharRange(c1, c3);
3330+ if((multichar_type == CHARGROUP_FLAGS_ONECHAR_ASCII) && (multichar_type2 == CHARGROUP_FLAGS_ONECHAR_ASCII))
3331+ {
3332+ if(flags & REGEX_ASCII_CASE_INSENSITIVE)
3333+ chargroup->addCharMatch(new CRegexXQuery_char_range_ascii_i(current_regex,
3334+ (char)charmatch->get_c(),
3335+ (char)charmatch2->get_c()));
3336+ else
3337+ chargroup->addCharMatch(new CRegexXQuery_char_range_ascii(current_regex,
3338+ (char)charmatch->get_c(),
3339+ (char)charmatch2->get_c()));
3340+ }
3341+ else
3342+ {
3343+ if(flags & REGEX_ASCII_CASE_INSENSITIVE)
3344+ chargroup->addCharMatch(new CRegexXQuery_char_range_unicode_i(current_regex,
3345+ charmatch->get_c(),
3346+ charmatch2->get_c()));
3347+ else
3348+ chargroup->addCharMatch(new CRegexXQuery_char_range_unicode(current_regex,
3349+ charmatch->get_c(),
3350+ charmatch2->get_c()));
3351+ }
3352+ *chargroup_len += c2_len;
3353 continue;
3354 }
3355 }
3356- chargroup->addCharRange(c1, c1);
3357- *chargroup_len += c1_len;
3358+ //chargroup->addOneChar(c1);
3359+ chargroup->addCharMatch(charmatch.release());
3360 }
3361 if(pattern[*chargroup_len])
3362 (*chargroup_len)++;
3363- return chargroup;
3364+ else
3365+ {
3366+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MISSING_CLOSE_BRACKET)) );
3367+ }
3368+ return chargroup.release();
3369 }
3370
3371-void CRegexAscii_parser::read_quantifier(CRegexAscii_piece *piece,
3372+void CRegexXQuery_parser::read_quantifier(CRegexXQuery_piece *piece,
3373 const char *pattern, int *quantif_len)
3374 {
3375 *quantif_len = 0;
3376@@ -496,6 +943,10 @@
3377 max = max*10 + pattern[*quantif_len] - '0';
3378 (*quantif_len)++;
3379 }
3380+ if(max < min)
3381+ {
3382+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MAX_LT_MIN)) );
3383+ }
3384 piece->set_quantifier_min_max(min, max, true);
3385 }
3386 while(pattern[*quantif_len] && (pattern[*quantif_len] != '}'))
3387@@ -524,23 +975,25 @@
3388 ///Constructors and destructors and internal functions
3389 ////////////////////////////
3390
3391-CRegexAscii_regex::CRegexAscii_regex(CRegexAscii_regex *topregex) : IRegexAtom(topregex?topregex:this)
3392+CRegexXQuery_regex::CRegexXQuery_regex(CRegexXQuery_regex *topregex) : IRegexAtom(topregex?topregex:this)
3393 {
3394 matched_source = NULL;
3395 matched_len = 0;
3396+// backup_matched_source = NULL;
3397+// backup_matched_len = 0;
3398 flags = 128;//set to 0 after initialization
3399 }
3400
3401-CRegexAscii_regex::~CRegexAscii_regex()
3402+CRegexXQuery_regex::~CRegexXQuery_regex()
3403 {
3404- std::list<CRegexAscii_branch*>::iterator branch_it;
3405+ std::list<CRegexXQuery_branch*>::iterator branch_it;
3406
3407 for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++)
3408 {
3409 delete (*branch_it);
3410 }
3411 /*
3412- std::vector<CRegexAscii_regex*>::iterator subregex_it;
3413+ std::vector<CRegexXQuery_regex*>::iterator subregex_it;
3414 for(subregex_it = subregex.begin(); subregex_it != subregex.end(); subregex_it++)
3415 {
3416 delete (*subregex_it);
3417@@ -548,25 +1001,18 @@
3418 */
3419 }
3420
3421-bool CRegexAscii_regex::set_align_begin(bool align_begin)
3422-{
3423- bool prev_align = this->align_begin;
3424- this->align_begin = align_begin;
3425- return prev_align;
3426-}
3427-
3428-void CRegexAscii_regex::add_branch(CRegexAscii_branch *branch)
3429+void CRegexXQuery_regex::add_branch(CRegexXQuery_branch *branch)
3430 {
3431 branch_list.push_back(branch);
3432 }
3433
3434-bool CRegexAscii_regex::get_indexed_match(int index,
3435+bool CRegexXQuery_regex::get_indexed_match(int index,
3436 const char **matched_source,
3437 int *matched_len)
3438 {
3439 if(!index || index > (int)subregex.size())
3440 return false;
3441- CRegexAscii_regex *subr = subregex[index-1];
3442+ CRegexXQuery_regex *subr = subregex[index-1];
3443 *matched_source = subr->matched_source;
3444 if(!*matched_source)
3445 return false;
3446@@ -574,145 +1020,209 @@
3447 return true;
3448 }
3449
3450-unsigned int CRegexAscii_regex::get_indexed_regex_count()
3451+unsigned int CRegexXQuery_regex::get_indexed_regex_count()
3452 {
3453 return subregex.size();
3454 }
3455
3456-CRegexAscii_branch::CRegexAscii_branch(CRegexAscii_regex* regex) :
3457- IRegexMatcher(regex)
3458+CRegexXQuery_branch::CRegexXQuery_branch(CRegexXQuery_regex* regex)
3459+ //:
3460+ //IRegexMatcher(regex)
3461 {
3462 }
3463
3464-CRegexAscii_branch::~CRegexAscii_branch()
3465+CRegexXQuery_branch::~CRegexXQuery_branch()
3466 {
3467- std::list<CRegexAscii_piece*>::iterator piece_it;
3468+ std::list<RegexAscii_pieceinfo>::iterator piece_it;
3469
3470 for(piece_it = piece_list.begin(); piece_it != piece_list.end(); piece_it++)
3471 {
3472- delete (*piece_it);
3473+ delete (*piece_it).piece;
3474 }
3475 }
3476
3477-void CRegexAscii_branch::add_piece(CRegexAscii_piece *piece)
3478+void CRegexXQuery_branch::add_piece(CRegexXQuery_piece *piece)
3479 {
3480 piece_list.push_back(piece);
3481 }
3482
3483-CRegexAscii_piece::CRegexAscii_piece()
3484+CRegexXQuery_piece::CRegexXQuery_piece()
3485 {
3486+ atom = NULL;
3487+ regex_atom = NULL;
3488 }
3489
3490-CRegexAscii_piece::~CRegexAscii_piece()
3491+CRegexXQuery_piece::~CRegexXQuery_piece()
3492 {
3493 delete atom;
3494 }
3495
3496-void CRegexAscii_piece::set_atom(IRegexAtom *atom)
3497+void CRegexXQuery_piece::set_atom(IRegexAtom *atom)
3498 {
3499 this->atom = atom;
3500+ this->regex_atom = dynamic_cast<CRegexXQuery_regex*>(atom);
3501 }
3502
3503-void CRegexAscii_piece::set_quantifier_min_max(int min, int max, bool strict_max)
3504+void CRegexXQuery_piece::set_quantifier_min_max(int min, int max, bool strict_max)
3505 {
3506 this->min = min;
3507 this->max = max;
3508 this->strict_max = strict_max;
3509 }
3510-void CRegexAscii_piece::set_is_reluctant(bool is_reluctant)
3511+void CRegexXQuery_piece::set_is_reluctant(bool is_reluctant)
3512 {
3513 this->is_reluctant = is_reluctant;
3514 }
3515-void CRegexAscii_piece::get_quantifier(int *min, int *max, bool *strict_max)
3516+void CRegexXQuery_piece::get_quantifier(int *min, int *max, bool *strict_max)
3517 {
3518 *min = this->min;
3519 *max = this->max;
3520 *strict_max = this->strict_max;
3521 }
3522-bool CRegexAscii_piece::get_is_reluctant()
3523+bool CRegexXQuery_piece::get_is_reluctant()
3524 {
3525+ if(atom->regex_intern->flags & REGEX_ASCII_MINIMAL_MATCH)
3526+ return true;
3527 return is_reluctant;
3528 }
3529
3530
3531-CRegexAscii_chargroup::CRegexAscii_chargroup(CRegexAscii_regex* regex) :
3532+CRegexXQuery_charmatch::CRegexXQuery_charmatch(CRegexXQuery_regex* regex) :
3533+ IRegexAtom(regex)
3534+{
3535+}
3536+CRegexXQuery_multicharP::CRegexXQuery_multicharP(CRegexXQuery_regex* regex, char type, bool is_reverse) :
3537+ CRegexXQuery_charmatch(regex)
3538+{
3539+ this->multichar_type = type; this->is_reverse = is_reverse;
3540+}
3541+CRegexXQuery_multicharIs::CRegexXQuery_multicharIs(CRegexXQuery_regex* regex, int block_index, bool is_reverse) :
3542+ CRegexXQuery_charmatch(regex)
3543+{
3544+ this->block_index = block_index; this->is_reverse = is_reverse;
3545+}
3546+CRegexXQuery_multicharOther::CRegexXQuery_multicharOther(CRegexXQuery_regex* regex, char type) :
3547+ CRegexXQuery_charmatch(regex)
3548+{
3549+ this->multichar_type = type;
3550+}
3551+CRegexXQuery_char_ascii::CRegexXQuery_char_ascii(CRegexXQuery_regex* regex, char c) :
3552+ CRegexXQuery_charmatch(regex)
3553+{
3554+ this->c = c;
3555+}
3556+CRegexXQuery_char_ascii_i::CRegexXQuery_char_ascii_i(CRegexXQuery_regex* regex, char c) :
3557+ CRegexXQuery_char_ascii(regex, toupper(c))
3558+{
3559+}
3560+CRegexXQuery_char_range_ascii::CRegexXQuery_char_range_ascii(CRegexXQuery_regex* regex, char c1, char c2) :
3561+ CRegexXQuery_charmatch(regex)
3562+{
3563+ this->c1 = c1; this->c2 = c2;
3564+}
3565+CRegexXQuery_char_range_ascii_i::CRegexXQuery_char_range_ascii_i(CRegexXQuery_regex* regex, char c1, char c2) :
3566+ CRegexXQuery_char_range_ascii(regex, toupper(c1), toupper(c2))
3567+{
3568+}
3569+CRegexXQuery_char_unicode::CRegexXQuery_char_unicode(CRegexXQuery_regex* regex, const char *source, int len) :
3570+ CRegexXQuery_charmatch(regex)
3571+{
3572+ this->len = len;
3573+ memcpy(c, source, len);
3574+}
3575+CRegexXQuery_char_unicode_cp::CRegexXQuery_char_unicode_cp(CRegexXQuery_regex* regex, unicode::code_point c) :
3576+ CRegexXQuery_charmatch(regex)
3577+{
3578+ this->c = c;
3579+}
3580+CRegexXQuery_char_unicode_i::CRegexXQuery_char_unicode_i(CRegexXQuery_regex* regex, unicode::code_point c) :
3581+ CRegexXQuery_char_unicode_cp(regex, unicode::to_upper(c))
3582+{
3583+}
3584+CRegexXQuery_char_range_unicode::CRegexXQuery_char_range_unicode(CRegexXQuery_regex* regex, unicode::code_point c1, unicode::code_point c2) :
3585+ CRegexXQuery_charmatch(regex)
3586+{
3587+ this->c1 = c1; this->c2 = c2;
3588+}
3589+CRegexXQuery_char_range_unicode_i::CRegexXQuery_char_range_unicode_i(CRegexXQuery_regex* regex, unicode::code_point c1, unicode::code_point c2) :
3590+ CRegexXQuery_char_range_unicode(regex, unicode::to_upper(c1), unicode::to_upper(c2))
3591+{
3592+}
3593+CRegexXQuery_endline::CRegexXQuery_endline(CRegexXQuery_regex* regex) :
3594+ CRegexXQuery_charmatch(regex)
3595+{
3596+}
3597+
3598+unicode::code_point CRegexXQuery_char_unicode::get_c()
3599+{
3600+ const char *temp_c = (const char*)c;
3601+ return utf8::next_char(temp_c);
3602+}
3603+
3604+
3605+CRegexXQuery_chargroup::CRegexXQuery_chargroup(CRegexXQuery_regex* regex) :
3606 IRegexAtom(regex)
3607 {
3608 classsub = NULL;
3609 }
3610
3611-CRegexAscii_chargroup::~CRegexAscii_chargroup()
3612+CRegexXQuery_chargroup::~CRegexXQuery_chargroup()
3613 {
3614 delete classsub;
3615-}
3616-
3617-void CRegexAscii_chargroup::addMultiChar(char c)
3618-{
3619- chargroup_t cgt;
3620- cgt.flags = CHARGROUP_FLAGS_MULTICHAR;
3621- cgt.c1 = c;
3622- cgt.c2 = 0;
3623- chargroup_list.push_back(cgt);
3624-}
3625-
3626-void CRegexAscii_chargroup::addEndLine()
3627-{
3628- chargroup_t cgt;
3629- cgt.flags = CHARGROUP_FLAGS_ENDLINE;
3630- cgt.c1 = '$';
3631- cgt.c2 = 0;
3632- chargroup_list.push_back(cgt);
3633-}
3634-
3635-void CRegexAscii_chargroup::addCharRange(char c1, char c2)
3636-{
3637- chargroup_t cgt;
3638- cgt.flags = 0;
3639- cgt.c1 = c1;
3640- cgt.c2 = c2;
3641- chargroup_list.push_back(cgt);
3642-}
3643-
3644-void CRegexAscii_chargroup::addClassSub(CRegexAscii_chargroup* classsub)
3645+ std::list<CRegexXQuery_charmatch* >::iterator charmatch_it;
3646+ for(charmatch_it=chargroup_list.begin(); charmatch_it != chargroup_list.end(); charmatch_it++)
3647+ delete (*charmatch_it);
3648+}
3649+
3650+void CRegexXQuery_chargroup::addCharMatch(CRegexXQuery_charmatch *charmatch)
3651+{
3652+ chargroup_list.push_back(charmatch);
3653+}
3654+void CRegexXQuery_chargroup::addClassSub(CRegexXQuery_chargroup* classsub)
3655 {
3656 this->classsub = classsub;
3657 }
3658
3659-CRegexAscii_negchargroup::CRegexAscii_negchargroup(CRegexAscii_regex* regex) :
3660- CRegexAscii_chargroup(regex)
3661-{
3662-}
3663-
3664-CRegexAscii_negchargroup::~CRegexAscii_negchargroup()
3665-{
3666-}
3667-
3668-CRegexAscii_wildchar::CRegexAscii_wildchar(CRegexAscii_regex* regex) :
3669+CRegexXQuery_negchargroup::CRegexXQuery_negchargroup(CRegexXQuery_regex* regex) :
3670+ CRegexXQuery_chargroup(regex)
3671+{
3672+}
3673+
3674+CRegexXQuery_negchargroup::~CRegexXQuery_negchargroup()
3675+{
3676+}
3677+
3678+CRegexXQuery_wildchar::CRegexXQuery_wildchar(CRegexXQuery_regex* regex) :
3679 IRegexAtom(regex)
3680 {
3681 }
3682
3683-CRegexAscii_wildchar::~CRegexAscii_wildchar()
3684+CRegexXQuery_wildchar::~CRegexXQuery_wildchar()
3685 {
3686 }
3687
3688-CRegexAscii_backref::CRegexAscii_backref(CRegexAscii_regex* regex, unsigned int backref_) :
3689+CRegexXQuery_backref::CRegexXQuery_backref(CRegexXQuery_regex* regex, unsigned int backref_) :
3690 IRegexAtom(regex),
3691 backref(backref_)
3692 {
3693 }
3694
3695-CRegexAscii_backref::~CRegexAscii_backref()
3696-{
3697-}
3698-
3699-CRegexAscii_parser::CRegexAscii_parser()
3700+CRegexXQuery_backref::~CRegexXQuery_backref()
3701+{
3702+}
3703+
3704+CRegexXQuery_pinstart::CRegexXQuery_pinstart(CRegexXQuery_regex* regex):
3705+ IRegexAtom(regex)
3706+{
3707+}
3708+
3709+CRegexXQuery_parser::CRegexXQuery_parser()
3710 {
3711 current_regex = NULL;
3712 regex_depth = 0;
3713 }
3714
3715-CRegexAscii_parser::~CRegexAscii_parser()
3716+CRegexXQuery_parser::~CRegexXQuery_parser()
3717 {
3718 }
3719
3720@@ -720,9 +1230,68 @@
3721 //////////////////////////////////////////
3722 ////Matching the pattern on a string
3723 /////////////////////////////////////////
3724+static std::list<RegexAscii_pieceinfo> empty_pieces;//empty list of pieces
3725+/*
3726+std::list<RegexAscii_pieceinfo>::iterator
3727+IRegexAtom::choose_next_piece(const char *source, int *matched_len,
3728+ std::list<RegexAscii_pieceinfo>::iterator this_piece,
3729+ std::list<RegexAscii_pieceinfo>::iterator end_piece)
3730+{
3731+ //if this_piece is repetition, repeat until max, then go to next piece
3732+ int min, max;
3733+ bool strict_max;
3734+ while(this_piece != end_piece)
3735+ {
3736+ (*this_piece).piece->get_quantifier(&min, &max, &strict_max);
3737+ if(max <= ((*this_piece).nr_matches))//finished this piece
3738+ {
3739+ this_piece++;
3740+ }
3741+ else
3742+ break;
3743+ }
3744+ return this_piece;
3745+}
3746+*/
3747+
3748+bool IRegexAtom::match(const char *source, int *start_from_branch, int *matched_len,
3749+ std::list<RegexAscii_pieceinfo>::iterator this_piece,
3750+ std::list<RegexAscii_pieceinfo>::iterator end_piece)
3751+{
3752+ *start_from_branch = 0;
3753+ bool retmatch;
3754+ retmatch = match_internal(source, start_from_branch, matched_len);
3755+ if(!retmatch)
3756+ return false;
3757+
3758+ if(this_piece == end_piece)
3759+ return true;
3760+
3761+ (*this_piece).nr_matches++;
3762+ int min,max;
3763+ bool strict_max;
3764+ (*this_piece).piece->get_quantifier(&min, &max, &strict_max);
3765+ std::list<RegexAscii_pieceinfo>::iterator init_piece = this_piece;
3766+ if(((min == 1) && (max == 1)) || //the simple common case
3767+ ((*matched_len == 0) && ((*this_piece).nr_matches>=min)))//to avoid infinite loop
3768+ {
3769+ this_piece++;
3770+ if(this_piece == end_piece)
3771+ return true;
3772+ }
3773+ int matched_len2;
3774+ retmatch = (*this_piece).piece->match_piece(this_piece, end_piece, source + *matched_len, &matched_len2);
3775+ if(!retmatch)
3776+ {
3777+ (*init_piece).nr_matches--;
3778+ return false;
3779+ }
3780+ *matched_len += matched_len2;
3781+ return true;
3782+}
3783
3784 //try every position in source to match the pattern
3785-bool CRegexAscii_regex::match_anywhere(const char *source, unsigned int flags,
3786+bool CRegexXQuery_regex::match_anywhere(const char *source, unsigned int flags,
3787 int *match_pos, int *matched_len)
3788 {
3789 *match_pos = 0;
3790@@ -730,43 +1299,66 @@
3791 return match_from(source, flags, match_pos, matched_len);
3792 }
3793
3794-bool CRegexAscii_regex::match_from(const char *source, unsigned int flags,
3795+bool CRegexXQuery_regex::match_from(const char *source, unsigned int flags,
3796 int *match_pos, int *matched_len)
3797 {
3798 this->flags = flags;
3799+ this->source_start = source;
3800 reachedEnd = false;
3801
3802- std::vector<CRegexAscii_regex*>::iterator regex_it;
3803+ std::vector<CRegexXQuery_regex*>::iterator regex_it;
3804 for(regex_it = subregex.begin(); regex_it != subregex.end(); regex_it++)
3805 {
3806 (*regex_it)->matched_source = NULL;
3807 }
3808-// if(!source[0])
3809-// {
3810-// if(branch_list.empty())
3811-// return true;
3812-// else
3813-// return false;
3814-// }
3815-
3816- bool skip_first_match = false;
3817- if(*match_pos && align_begin)
3818- skip_first_match = true;
3819+
3820+ std::vector<std::pair<const char*, int> > saved_subregex;
3821+
3822+ if(*match_pos && (flags & REGEX_ASCII_WHOLE_MATCH))
3823+ return false;
3824+
3825 do
3826 {
3827- if(!skip_first_match)
3828- {
3829- if(match(source + *match_pos, matched_len))
3830- return true;
3831- }
3832- skip_first_match = false;
3833- if(align_begin)
3834+ int start_from_branch = 0;
3835+ int longest_match = -1;
3836+ while(1)
3837+ {
3838+ if(!match(source + *match_pos, &start_from_branch, matched_len, empty_pieces.begin(), empty_pieces.end()))
3839+ break;
3840+ if(longest_match < *matched_len)
3841+ {
3842+ longest_match = *matched_len;
3843+ if(start_from_branch && (flags & REGEX_ASCII_GET_LONGEST_BRANCH))
3844+ save_subregex_list(saved_subregex);
3845+ }
3846+ if(!start_from_branch || !(flags & REGEX_ASCII_GET_LONGEST_BRANCH))
3847+ break;
3848+ //else try the other branches to see which is longer
3849+ }
3850+ if(longest_match != -1)
3851+ {
3852+ *matched_len = longest_match;
3853+ if(saved_subregex.size())
3854+ load_subregex_list(saved_subregex);
3855+ if(flags & REGEX_ASCII_WHOLE_MATCH)
3856+ {
3857+ if(!source[*match_pos+*matched_len])
3858+ return true;
3859+ if((flags & REGEX_ASCII_MULTILINE) &&
3860+ ((source[*match_pos+*matched_len] == '\n') || (source[*match_pos+*matched_len] == '\r')))
3861+ return true;
3862+ return false;
3863+ }
3864+ return true;
3865+ }
3866+
3867+ if(flags & REGEX_ASCII_WHOLE_MATCH)
3868 {
3869 if(flags & REGEX_ASCII_MULTILINE)
3870 {
3871- //goto the next line
3872+ //go to next line
3873 while(source[*match_pos] && (source[*match_pos] != '\n') && (source[*match_pos] != '\r'))
3874- (*match_pos)++;
3875+ (*match_pos) += myutf8len(source);
3876 if(source[*match_pos] == '\n')
3877 {
3878 (*match_pos)++;
3879@@ -780,190 +1372,1039 @@
3880 (*match_pos)++;
3881 }
3882 if(!source[*match_pos])
3883- return false;
3884+ break;
3885 continue;
3886 }
3887- return false;
3888+ break;
3889 }
3890 if(!source[*match_pos])
3891 break;
3892- (*match_pos)++;
3893+ (*match_pos) += myutf8len(source);
3894 }
3895 while(source[*match_pos]);
3896+// if(!source[*match_pos])
3897+// {
3898+// reachedEnd = true;
3899+// }
3900 return false;
3901 }
3902
3903+void CRegexXQuery_regex::reset_match() // Forgets this regex's recorded match and resets every branch in branch_list.
3904+{
3905+// this->backup_matched_source = this->matched_source;
3906+// this->backup_matched_len = this->matched_len;
3907+ this->matched_source = NULL; // no match position recorded any more
3908+ this->matched_len = 0;
3909+ std::list<CRegexXQuery_branch*>::iterator branch_it;
3910+ for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++)
3911+ {
3912+ (*branch_it)->reset(); // CRegexXQuery_branch::reset() calls reset_match() on the atom of each of its pieces
3913+ }
3914+}
3915+/*
3916+void CRegexXQuery_regex::restore_match()
3917+{
3918+ this->matched_source = this->backup_matched_source;
3919+ this->matched_len = this->backup_matched_len;
3920+ std::list<CRegexXQuery_branch*>::iterator branch_it;
3921+ for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++)
3922+ {
3923+ (*branch_it)->restore();
3924+ }
3925+}
3926+*/
3927 //match any of the branches
3928-bool CRegexAscii_regex::match(const char *source, int *matched_len)
3929+bool CRegexXQuery_regex::match(const char *source, int *start_from_branch, int *matched_len,
3930+ std::list<RegexAscii_pieceinfo>::iterator next_piece,
3931+ std::list<RegexAscii_pieceinfo>::iterator end_piece)
3932 {
3933 reachedEnd = false;
3934- std::list<CRegexAscii_branch*>::iterator branch_it;
3935-
3936- for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++)
3937- {
3938- if((*branch_it)->match(source, matched_len))
3939- {
3940- matched_source = source;
3941- this->matched_len = *matched_len;
3942+ if(!(flags & REGEX_ASCII_GROUPING_LEN_WHOLE_PIECE) ||
3943+ (this->matched_source == NULL) || ((this->matched_source + this->matched_len) != source))
3944+ this->matched_source = source;
3945+ *matched_len = 0;
3946+ std::list<CRegexXQuery_branch*>::iterator branch_it;
3947+
3948+ if(*start_from_branch == 0)
3949+ {
3950+ for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++)
3951+ {
3952+ (*branch_it)->reset();
3953+ }
3954+ }
3955+
3956+ branch_it = branch_list.begin();
3957+ if(*start_from_branch)
3958+ {
3959+ for(int i=0;i<*start_from_branch;i++)
3960+ branch_it++;
3961+ }
3962+ (*start_from_branch)++;
3963+ for(; branch_it != branch_list.end(); branch_it++,(*start_from_branch)++)
3964+ {
3965+ if((*branch_it)->match(source, matched_len, this, next_piece, end_piece))
3966+ {
3967+ //matched_source = source;
3968+ //this->matched_len = *matched_len;
3969 return true;
3970 }
3971 }
3972- matched_source = NULL;
3973- matched_len = 0;
3974+ *start_from_branch = 0;
3975+ if(this->matched_source == source)
3976+ this->matched_source = NULL;
3977+ *matched_len = 0;
3978 return false;
3979 }
3980
3981+void CRegexXQuery_regex::save_subregex_list(std::vector<std::pair<const char*, int> > &saved_subregex) // Snapshots (matched_source, matched_len) of every entry of subregex into saved_subregex, in order.
3982+{
3983+ saved_subregex.resize(0); // previous contents of the output vector are discarded
3984+ saved_subregex.reserve(subregex.size());
3985+ std::vector<CRegexXQuery_regex*>::iterator it;
3986+ for(it=subregex.begin(); it != subregex.end(); it++)
3987+ {
3988+ saved_subregex.push_back(std::pair<const char*, int>((*it)->matched_source, (*it)->matched_len));
3989+ }
3990+}
3991+
3992+void CRegexXQuery_regex::load_subregex_list(std::vector<std::pair<const char*, int> > &saved_subregex) // Writes the saved (matched_source, matched_len) pairs back into the entries of subregex, in order.
3993+{
3994+ std::vector<std::pair<const char*, int> >::iterator it;
3995+ std::vector<CRegexXQuery_regex*>::iterator subit;
3996+ for(it=saved_subregex.begin(), subit = subregex.begin(); it != saved_subregex.end(); it++, subit++) // NOTE(review): only 'it' is bounds-checked; assumes saved_subregex has no more entries than subregex (true for a snapshot taken by save_subregex_list on this object) - confirm
3997+ {
3998+ (*subit)->matched_source = (*it).first;
3999+ (*subit)->matched_len = (*it).second;
4000+ }
4001+}
4002+
4003+void CRegexXQuery_branch::reset() // Calls reset_match() on the atom of every piece in this branch's piece_list.
4004+{
4005+ std::list<RegexAscii_pieceinfo>::iterator piece_it;
4006+ for(piece_it = piece_list.begin(); piece_it != piece_list.end(); piece_it++)
4007+ {
4008+ (*piece_it).piece->atom->reset_match();
4009+ }
4010+}
4011+/*
4012+void CRegexXQuery_branch::restore()
4013+{
4014+ std::list<RegexAscii_pieceinfo>::iterator piece_it;
4015+ for(piece_it = piece_list.begin(); piece_it != piece_list.end(); piece_it++)
4016+ {
4017+ (*piece_it).piece->atom->restore_match();
4018+ }
4019+}
4020+*/
4021 //match all the pieces
4022-bool CRegexAscii_branch::match(const char *source, int *matched_len)
4023+bool CRegexXQuery_branch::match(const char *source, int *matched_len,
4024+ CRegexXQuery_regex* group_regex,
4025+ std::list<RegexAscii_pieceinfo>::iterator next_piece,
4026+ std::list<RegexAscii_pieceinfo>::iterator end_piece)
4027 {
4028- std::list<CRegexAscii_piece*>::iterator piece_it;
4029+ std::list<RegexAscii_pieceinfo>::iterator piece_it;
4030
4031 piece_it = piece_list.begin();
4032+ //if(piece_it == piece_list.end())
4033+ //if(!source[0])
4034+ // return true;
4035+ //else
4036+ // return false;
4037 if(piece_it == piece_list.end())
4038- if(source[0])
4039- return false;
4040+ {
4041+ piece_it = next_piece;
4042+ if(next_piece == end_piece)
4043+ {
4044+ group_regex->matched_len = 0;
4045+ return true;
4046+ }
4047+ }
4048+
4049+ std::list<RegexAscii_pieceinfo> temp_pieces(piece_list);
4050+ temp_pieces.push_back(group_regex);//this will be used to store the group match
4051+ temp_pieces.insert(temp_pieces.end(), next_piece, end_piece);
4052+
4053+ return (*piece_it).piece->match_piece(temp_pieces.begin(), temp_pieces.end(), source, matched_len);
4054+}
4055+
4056+bool CRegexXQuery_piece::match_piece(std::list<RegexAscii_pieceinfo>::iterator piece_it,
4057+ std::list<RegexAscii_pieceinfo>::iterator end_it,
4058+ const char *source, int *matched_len)
4059+{
4060+ if((*piece_it).nr_matches < 0)
4061+ {
4062+ //special case, store the group match
4063+ (*piece_it).group_regex->matched_len = source - (*piece_it).group_regex->matched_source;
4064+ piece_it++;
4065+ if(piece_it == end_it)
4066+ return true;
4067 else
4068- return true;
4069- if(!(*piece_it)->get_is_reluctant())
4070- return match_piece_iter_normal(piece_it, source, matched_len);
4071+ return (*piece_it).piece->match_piece(piece_it, end_it, source, matched_len);
4072+ }
4073+
4074+ if(!get_is_reluctant())
4075+ return match_piece_iter_normal(piece_it, end_it, source, matched_len);
4076 else
4077- return match_piece_iter_reluctant(piece_it, source, matched_len);
4078-}
4079-
4080-//match as less as possible
4081-bool CRegexAscii_branch::match_piece_iter_reluctant(
4082- std::list<CRegexAscii_piece*>::iterator piece_it,
4083+ return match_piece_iter_reluctant(piece_it, end_it, source, matched_len);
4084+}
4085+
4086+int CRegexXQuery_piece::choose_another_branch(std::vector<std::pair<int,int> > &match_lens) // Truncates match_lens back to the latest entry (the last one excluded) whose .second is non-zero; returns the new size, or -1 (match_lens untouched) if there is none.
4087+{
4088+ int i = match_lens.size()-1;
4089+ i--; // the last entry is never a candidate; the search starts at the one before it (an empty vector yields i == -2 and falls through to -1)
4090+ while((i >= 0) && (match_lens.at(i).second == 0)) // .second is the branch value stored by match_piece_times; 0 entries are skipped
4091+ i--;
4092+ if(i < 0)
4093+ return -1;//no more branches
4094+ match_lens.resize(i+1); // keep entries 0..i, drop everything recorded after the chosen one
4095+ i++;
4096+ return i;
4097+}
4098+
4099+bool CRegexXQuery_piece::is_regex_atom() // True when this piece's regex_atom pointer is set (non-NULL).
4100+{
4101+ return regex_atom != NULL;
4102+}
4103+
4104+//match as little as possible (shortest string)
4105+bool CRegexXQuery_piece::match_piece_iter_reluctant(
4106+ std::list<RegexAscii_pieceinfo>::iterator piece_it,
4107+ std::list<RegexAscii_pieceinfo>::iterator end_it,
4108 const char *source, int *matched_len)
4109 {
4110 *matched_len = 0;
4111- if(piece_it == piece_list.end())
4112+ if(piece_it == end_it)
4113 return true;
4114
4115 int min, max;
4116 bool strict_max;
4117 //std::vector<int> match_lens;
4118- (*piece_it)->get_quantifier(&min, &max, &strict_max);
4119- if(strict_max && (max >= 0))
4120+ (*piece_it).piece->get_quantifier(&min, &max, &strict_max);
4121+
4122+ std::vector<std::pair<const char*, int> > saved_subregex;
4123+
4124+ if(is_regex_atom())
4125 {
4126- int timeslen;
4127- //check if the piece doesn't exceed the max match
4128- if((*piece_it)->match_piece_times(source, &timeslen, max+1, NULL))
4129- return false;///too many matches
4130+ //recursive
4131+ bool retmatch;
4132+ atom->regex_intern->save_subregex_list(saved_subregex);
4133+ if((*piece_it).nr_matches >= min)
4134+ {
4135+ //go to next piece
4136+ std::list<RegexAscii_pieceinfo>::iterator next_it = piece_it;
4137+ next_it++;
4138+ if(next_it == end_it)
4139+ return true;
4140+ retmatch = (*next_it).piece->match_piece(next_it, end_it, source, matched_len);
4141+ if(retmatch)
4142+ return true;
4143+ }
4144+ if(((max == -1) || ((*piece_it).nr_matches < max)) &&//try further with this piece
4145+ (((*piece_it).nr_matches < min) || ((*piece_it).nr_matches == 0) || ((*piece_it).piece->regex_atom->matched_len)))//if matched_len is zero, avoid infinite loop
4146+ {
4147+ int start_from_branch = 0;
4148+ int shortest_len = -1;
4149+ bool branch_saved = false;
4150+ //try all branches to get the shortest len
4151+ (*piece_it).nr_matches++;
4152+ while(atom->match(source, &start_from_branch, matched_len, piece_it, end_it))
4153+ {
4154+ if((shortest_len == -1) || (shortest_len > *matched_len))
4155+ {
4156+ shortest_len = *matched_len;
4157+ if(start_from_branch && (atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH))
4158+ {
4159+ atom->regex_intern->save_subregex_list(saved_subregex);
4160+ branch_saved = true;
4161+ }
4162+ }
4163+ if(!start_from_branch || !(atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH))
4164+ break;
4165+ }
4166+ if(shortest_len != -1)
4167+ {
4168+ *matched_len = shortest_len;
4169+ if(branch_saved)
4170+ atom->regex_intern->load_subregex_list(saved_subregex);
4171+ return true;
4172+ }
4173+ else
4174+ {
4175+ (*piece_it).nr_matches--;
4176+ atom->regex_intern->load_subregex_list(saved_subregex);
4177+ return false;
4178+ }
4179+ }
4180+ else
4181+ {
4182+ atom->regex_intern->load_subregex_list(saved_subregex);
4183+ return false;
4184+ }
4185 }
4186
4187- int i=min;
4188- std::list<CRegexAscii_piece*>::iterator next_it = piece_it;
4189+ int i=0;
4190+ int shortest_len = -1;
4191+ int otherpieces_shortest = -1;
4192+ int i_shortest = -1;
4193+ std::list<RegexAscii_pieceinfo>::iterator next_it = piece_it;
4194+ std::vector<std::pair<int,int> > match_lens;
4195 next_it++;
4196 int pieceslen = 0;
4197 while(1)
4198 {
4199- if((max > 0) && (i>max))
4200- break;
4201- int piecelen = 0;
4202- if((*piece_it)->match_piece_times(source+pieceslen, &piecelen, !pieceslen ? i : 1, NULL))
4203- {
4204- pieceslen += piecelen;
4205+ int piecelen = 0;
4206+ bool retmatch;
4207+ retmatch = match_piece_times(source, &piecelen, i < min ? min : i, &match_lens);
4208+ i = match_lens.size()-1;//number of matches
4209+ if(i<0)
4210+ i = 0;
4211+ if((i>=min))
4212+ {
4213+ pieceslen = piecelen;
4214+ if((shortest_len >= 0) && (shortest_len <= pieceslen))//this branch is longer
4215+ {//try another branch
4216+ i = choose_another_branch(match_lens);
4217+ if(i >= 0)
4218+ continue;//try another branch
4219+ else
4220+ break;
4221+ }
4222 int otherpieces = 0;
4223- if((next_it == piece_list.end()) ||
4224- ((*next_it)->get_is_reluctant() && match_piece_iter_reluctant(next_it, source+pieceslen, &otherpieces)) ||
4225- (!(*next_it)->get_is_reluctant() && match_piece_iter_normal(next_it, source+pieceslen, &otherpieces)))
4226- {
4227- *matched_len = pieceslen + otherpieces;
4228- return true;
4229- }
4230+ if((next_it == end_it) ||
4231+ (*next_it).piece->match_piece(next_it, end_it, source+pieceslen, &otherpieces)
4232+ )
4233+ {
4234+ if((i == pieceslen) || (match_lens.at(0).second == 0) ||//minimum achieved already, cannot go lower than that
4235+ !(atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH))
4236+ {
4237+ *matched_len = pieceslen + otherpieces;
4238+ return true;
4239+ }
4240+ if((shortest_len < 0) || (shortest_len > pieceslen))
4241+ {
4242+ shortest_len = pieceslen;
4243+ otherpieces_shortest = otherpieces;
4244+ i_shortest = i;
4245+ if(match_lens.at(0).second != 0)
4246+ atom->regex_intern->save_subregex_list(saved_subregex);
4247+ }
4248+ i = choose_another_branch(match_lens);
4249+ if(i >= 0)
4250+ continue;//try another branch
4251+ else
4252+ break;
4253+ }
4254+ else
4255+ {
4256+ //try further
4257+ if(retmatch)
4258+ {
4259+ i++;
4260+ if((max < 0) || (i<=max))
4261+ continue;
4262+ i--;
4263+ }
4264+ }
4265+ }
4266+
4267+ if(i==0)
4268+ {
4269+ break;
4270 }
4271 else
4272- break;
4273- i++;
4274+ {
4275+ i = choose_another_branch(match_lens);
4276+ if(i >= 0)
4277+ continue;//try another branch
4278+ else
4279+ break;
4280+ }
4281 }
4282
4283+ if(shortest_len >= 0)
4284+ {
4285+ if(strict_max && (max>=0) && (i_shortest > max))
4286+ return false;
4287+ *matched_len = shortest_len + otherpieces_shortest;
4288+ if(saved_subregex.size())
4289+ atom->regex_intern->load_subregex_list(saved_subregex);
4290+ return true;
4291+ }
4292 return false;
4293 }
4294
4295 //match as much as possible
4296-bool CRegexAscii_branch::match_piece_iter_normal(
4297- std::list<CRegexAscii_piece*>::iterator piece_it,
4298+bool CRegexXQuery_piece::match_piece_iter_normal(
4299+ std::list<RegexAscii_pieceinfo>::iterator piece_it,
4300+ std::list<RegexAscii_pieceinfo>::iterator end_it,
4301 const char *source, int *matched_len)
4302 {
4303 *matched_len = 0;
4304
4305 int min, max;
4306 bool strict_max;
4307- std::vector<int> match_lens;
4308- (*piece_it)->get_quantifier(&min, &max, &strict_max);
4309- int timeslen;
4310- if(strict_max && (max >= 0))
4311+ std::vector<std::pair<int,int> > match_lens;
4312+ (*piece_it).piece->get_quantifier(&min, &max, &strict_max);
4313+ int timeslen = 0;
4314+ std::vector<std::pair<const char*, int> > saved_subregex;
4315+
4316+ if(is_regex_atom())
4317 {
4318- //check if the piece doesn't exceed the max match
4319- //if((*piece_it)->match_piece_times(source, &timeslen, max+1, &match_lens))
4320- // return false;///too many matches
4321- (*piece_it)->match_piece_times(source, &timeslen, max, &match_lens);
4322+ //recursive
4323+ bool retmatch;
4324+ atom->regex_intern->save_subregex_list(saved_subregex);
4325+ if(((max == -1) || ((*piece_it).nr_matches < max)) && //try further with this piece
4326+ (((*piece_it).nr_matches < min) || ((*piece_it).nr_matches == 0) || ((*piece_it).piece->regex_atom->matched_len)))//if matched_len is zero, avoid infinite loop
4327+ {
4328+ int start_from_branch = 0;
4329+ int longest_len = -1;
4330+ bool branch_saved = false;
4331+ //try all branches to get the longest len
4332+ (*piece_it).nr_matches++;
4333+ while(atom->match(source, &start_from_branch, matched_len, piece_it, end_it))
4334+ {
4335+ if((longest_len < *matched_len))
4336+ {
4337+ longest_len = *matched_len;
4338+ if(start_from_branch && (atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH))
4339+ {
4340+ atom->regex_intern->save_subregex_list(saved_subregex);
4341+ branch_saved = true;
4342+ }
4343+ }
4344+ if(!start_from_branch || !(atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH))
4345+ break;
4346+ }
4347+ if(longest_len != -1)
4348+ {
4349+ *matched_len = longest_len;
4350+ if(branch_saved)
4351+ atom->regex_intern->load_subregex_list(saved_subregex);
4352+ return true;
4353+ }
4354+ else
4355+ {
4356+ atom->regex_intern->load_subregex_list(saved_subregex);
4357+ (*piece_it).nr_matches--;
4358+ }
4359+ }
4360+ if((*piece_it).nr_matches >= min)
4361+ {
4362+ //go to next piece
4363+ std::list<RegexAscii_pieceinfo>::iterator next_it = piece_it;
4364+ next_it++;
4365+ if(next_it == end_it)
4366+ return true;
4367+ retmatch = (*next_it).piece->match_piece(next_it, end_it, source, matched_len);
4368+ if(!retmatch)
4369+ atom->regex_intern->load_subregex_list(saved_subregex);
4370+ return retmatch;
4371+ }
4372+ else
4373+ {
4374+ // regex_atom->restore_match();
4375+ atom->regex_intern->load_subregex_list(saved_subregex);
4376+ return false;
4377+ }
4378 }
4379- else if(!strict_max && (max >= 0))
4380- (*piece_it)->match_piece_times(source, &timeslen, max, &match_lens);
4381- else
4382- (*piece_it)->match_piece_times(source, &timeslen, -1, &match_lens);
4383
4384- int i;
4385- std::list<CRegexAscii_piece*>::iterator next_it = piece_it;
4386+ int longest_len = -1;
4387+ int otherpieces_longest = -1;
4388+ int i_longest = -1;
4389+ int i = max;
4390+ std::list<RegexAscii_pieceinfo>::iterator next_it = piece_it;
4391 next_it++;
4392- if(next_it == piece_list.end())
4393+
4394+ bool retmatch;
4395+ while(1)
4396 {
4397- if((int)match_lens.size() > min)
4398- {
4399- *matched_len = timeslen;
4400- return true;
4401+ retmatch = match_piece_times(source, &timeslen, i, &match_lens);
4402+ i=match_lens.size()-1;//number of matches
4403+ if((i>=min))
4404+ {
4405+ if(timeslen < longest_len)
4406+ {//this branch is no use
4407+ i = choose_another_branch(match_lens);
4408+ if(i >= 0)
4409+ {
4410+ i = max;
4411+ continue;//try another branch
4412+ }
4413+ else
4414+ break;
4415+ }
4416+ //int piecelen = 0;
4417+ int otherpieces = 0;
4418+ if((next_it == end_it) ||
4419+ (*next_it).piece->match_piece(next_it, end_it, source+timeslen, &otherpieces)
4420+ )
4421+ {
4422+ if(timeslen > longest_len)
4423+ {
4424+ longest_len = timeslen;
4425+ otherpieces_longest = otherpieces;
4426+ i_longest = i;
4427+ if(!(atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH))
4428+ {
4429+ *matched_len = longest_len + otherpieces_longest;
4430+ return true;
4431+ }
4432+ else
4433+ {
4434+ if(match_lens.at(0).second)
4435+ atom->regex_intern->save_subregex_list(saved_subregex);
4436+ }
4437+ }
4438+ }
4439+ else
4440+ {
4441+ if(!match_lens.at(0).second)
4442+ {
4443+ match_lens.resize(match_lens.size()-1);
4444+ i--;
4445+ if(i >= 0)
4446+ continue;//try smaller
4447+ else
4448+ break;
4449+ }
4450+ else
4451+ {
4452+ i = choose_another_branch(match_lens);
4453+ if(i >= 0)
4454+ continue;//try another branch
4455+ else
4456+ break;
4457+ }
4458+ }
4459+ }
4460+ //now try another branch
4461+ i = choose_another_branch(match_lens);
4462+ if(i >= 0)
4463+ {
4464+ i = max;
4465+ continue;//try another branch
4466 }
4467 else
4468- return false;
4469- }
4470- for(i=match_lens.size()-1; i>=min; i--)
4471+ break;
4472+ }//end while
4473+
4474+ if(longest_len >= 0)
4475 {
4476- int piecelen = 0;
4477- int otherpieces = 0;
4478- if(((*next_it)->get_is_reluctant() && match_piece_iter_reluctant(next_it, source+match_lens[i]+piecelen, &otherpieces)) ||
4479- (!(*next_it)->get_is_reluctant() && match_piece_iter_normal(next_it, source+match_lens[i]+piecelen, &otherpieces)))
4480- {
4481- *matched_len = match_lens[i] + piecelen + otherpieces;
4482- return true;
4483- }
4484+ *matched_len = longest_len + otherpieces_longest;
4485+ if(saved_subregex.size())
4486+ atom->regex_intern->load_subregex_list(saved_subregex);
4487+ return true;
4488 }
4489
4490 return false;
4491 }
4492
4493-bool CRegexAscii_piece::match_piece_times(const char *source,
4494+bool CRegexXQuery_piece::match_piece_times(const char *source,
4495 int *piecelen,
4496 int times,
4497- std::vector<int> *match_lens)
4498+ std::vector<std::pair<int,int> > *match_lens)
4499 {
4500- *piecelen = 0;
4501- for(int i=0;(times < 0) || (i<times);i++)
4502- {
4503+ int i=0;
4504+ if(match_lens && match_lens->size())
4505+ {
4506+ i = match_lens->size()-1;
4507+ }
4508+ if(match_lens && match_lens->size())
4509+ *piecelen = match_lens->at(match_lens->size()-1).first;
4510+ else
4511+ *piecelen = 0;
4512+ if((times >= 0) && (i>=times))
4513+ return true;
4514+ for(;(times < 0) || (i<times);i++)
4515+ {
4516+ int atomlen;
4517+ int start_from_branch = 0;
4518+ if(match_lens && (i<(int)match_lens->size()))
4519+ start_from_branch = match_lens->at(i).second;
4520+ bool first_branch = (start_from_branch == 0);
4521+ if(!atom->match(source+*piecelen, &start_from_branch, &atomlen, empty_pieces.begin(), empty_pieces.end()))
4522+ {
4523+ if(match_lens)
4524+ {
4525+ if(i >= (int)match_lens->size())
4526+ match_lens->push_back(std::pair<int,int>(*piecelen, 0));
4527+ else
4528+ (*match_lens)[i] = std::pair<int,int>(*piecelen, 0);
4529+ }
4530+ return false;
4531+ }
4532 if(match_lens)
4533- match_lens->push_back(*piecelen);
4534- int atomlen;
4535- if(!atom->match(source+*piecelen, &atomlen))
4536- return false;
4537+ {
4538+ if(i >= (int)match_lens->size())
4539+ match_lens->push_back(std::pair<int,int>(*piecelen, start_from_branch));
4540+ else
4541+ (*match_lens)[i] = std::pair<int,int>(*piecelen, start_from_branch);
4542+ }
4543 *piecelen += atomlen;
4544 if(!atomlen && !source[*piecelen])
4545 {
4546- atom->regex_intern->reachedEnd = true;
4547+ // atom->regex_intern->set_reachedEnd(source);
4548+ break;
4549+ }
4550+ if(first_branch && (atomlen == 0))//avoid infinite loop
4551+ {
4552 break;
4553 }
4554 }
4555 if(match_lens)
4556- match_lens->push_back(*piecelen);
4557+ {
4558+ // if(i >= match_lens->size())
4559+ match_lens->push_back(std::pair<int,int>(*piecelen, 0));
4560+ // else
4561+ // (*match_lens)[i] = std::pair<int,int>(*piecelen, 0);
4562+ }
4563
4564 return true;
4565 }
4566
4567+bool CRegexXQuery_multicharP::match_internal(const char *source, int *start_from_branch, int *matched_len) // Tests the next UTF-8 character of source against the Unicode category (or category group) selected by multichar_type; is_reverse inverts the test. On success *matched_len receives the character's byte length; on failure it is left untouched. start_from_branch is not used here.
4568+{
4569+ if(!source[0])
4570+ {
4571+ regex_intern->set_reachedEnd(source); // nothing left to match: report end of input to the owning regex
4572+ return false;
4573+ }
4574+ bool found = false;
4575+ const char *temp_source = source;
4576+ unicode::code_point utf8c = utf8::next_char(temp_source); // temp_source's offset from source is used below as the matched byte length
4577+ switch(multichar_type)
4578+ {
4579+ case unicode::UNICODE_Ll + 50: // group case: accepts any of Ll, Lm, Lo, Lt, Lu (presumably the one-letter class L; the "+ 50" encoding is set by code not shown here)
4580+ if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Ll) ||
4581+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Lm) ||
4582+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Lo) ||
4583+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Lt) ||
4584+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Lu))
4585+ {
4586+ if(!is_reverse)
4587+ found = true;
4588+ }
4589+ else
4590+ {
4591+ if(is_reverse)
4592+ found = true;
4593+ }
4594+ break;
4595+ case unicode::UNICODE_Mc + 50: // group case: Mn, Mc, Me
4596+ if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Mn) ||
4597+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Mc) ||
4598+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Me))
4599+ {
4600+ if(!is_reverse)
4601+ found = true;
4602+ }
4603+ else
4604+ {
4605+ if(is_reverse)
4606+ found = true;
4607+ }
4608+ break;
4609+ case unicode::UNICODE_Nd + 50: // group case: Nd, Nl, No
4610+ if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Nd) ||
4611+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Nl) ||
4612+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_No))
4613+ {
4614+ if(!is_reverse)
4615+ found = true;
4616+ }
4617+ else
4618+ {
4619+ if(is_reverse)
4620+ found = true;
4621+ }
4622+ break;
4623+ case unicode::UNICODE_Pc + 50: // group case: Pc, Pd, Ps, Pe, Pi, Pf, Po
4624+ if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pc) ||
4625+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pd) ||
4626+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Ps) ||
4627+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pe) ||
4628+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pi) ||
4629+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pf) ||
4630+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Po))
4631+ {
4632+ if(!is_reverse)
4633+ found = true;
4634+ }
4635+ else
4636+ {
4637+ if(is_reverse)
4638+ found = true;
4639+ }
4640+ break;
4641+ case unicode::UNICODE_Zl + 50: // group case: Zs, Zl, Zp
4642+ if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zs) ||
4643+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zl) ||
4644+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zp))
4645+ {
4646+ if(!is_reverse)
4647+ found = true;
4648+ }
4649+ else
4650+ {
4651+ if(is_reverse)
4652+ found = true;
4653+ }
4654+ break;
4655+ case unicode::UNICODE_Sc + 50: // group case: Sm, Sc, Sk, So
4656+ if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Sm) ||
4657+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Sc) ||
4658+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Sk) ||
4659+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_So))
4660+ {
4661+ if(!is_reverse)
4662+ found = true;
4663+ }
4664+ else
4665+ {
4666+ if(is_reverse)
4667+ found = true;
4668+ }
4669+ break;
4670+ case unicode::UNICODE_Cc + 50: // group case: Cc, Cf, Co
4671+ if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Cc) ||
4672+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Cf) ||
4673+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Co))//ignore unicode::UNICODE_Cn
4674+ {
4675+ if(!is_reverse)
4676+ found = true;
4677+ }
4678+ else
4679+ {
4680+ if(is_reverse)
4681+ found = true;
4682+ }
4683+ break;
4684+ default: // any other value is used directly as a single unicode::category
4685+ if(unicode::check_codepoint_category(utf8c, (unicode::category)multichar_type))
4686+ {
4687+ if(!is_reverse)
4688+ found = true;
4689+ }
4690+ else
4691+ {
4692+ if(is_reverse)
4693+ found = true;
4694+ }
4695+ break;
4696+ }
4697+
4698+ if(found)
4699+ {
4700+ *matched_len = temp_source - source; // byte length of the character just consumed
4701+ }
4702+ return found;
4703+}
4704+
4705+bool CRegexXQuery_multicharIs::match_internal(const char *source, int *start_from_branch, int *matched_len) // Tests the next UTF-8 character of source against the code point ranges of block_escape[block_index]; is_reverse inverts the test. On success *matched_len receives the character's byte length. start_from_branch is not used here.
4706+{
4707+ if(!source[0])
4708+ {
4709+ regex_intern->set_reachedEnd(source); // nothing left to match: report end of input to the owning regex
4710+ return false;
4711+ }
4712+ bool found = false;
4713+ const char *temp_source = source;
4714+ unicode::code_point utf8c = utf8::next_char(temp_source);
4715+ const unicode::code_point *cp = block_escape[block_index].cp; // cp[0]..cp[1] is the block's primary inclusive range
4716+ if((utf8c >= cp[0]) && (utf8c <= cp[1]))
4717+ {
4718+ if(!is_reverse) // inside the primary range: a reversed test leaves found == false
4719+ found = true;
4720+ }
4721+ else if(block_escape[block_index].ext_cp)
4722+ {
4723+ cp = block_escape[block_index].ext_cp; // optional extra ranges, read as (low, high) pairs until a 0 entry
4724+ while(*cp)
4725+ {
4726+ if((utf8c >= cp[0]) && (utf8c <= cp[1]))
4727+ break;
4728+ cp += 2;
4729+ }
4730+ if(*cp) // loop stopped on a matching pair rather than on the 0 terminator
4731+ {
4732+ if(!is_reverse)
4733+ found = true;
4734+ }
4735+ else
4736+ {
4737+ if(is_reverse)
4738+ found = true;
4739+ }
4740+ }
4741+ else
4742+ {
4743+ if(is_reverse)
4744+ found = true;
4745+ }
4746+ if(found)
4747+ {
4748+ *matched_len = temp_source - source; // byte length of the character just consumed
4749+ }
4750+ return found;
4751+}
4752+
4753+bool CRegexXQuery_multicharOther::match_internal(const char *source, int *start_from_branch, int *matched_len) // Tests the next UTF-8 character of source against the class selected by multichar_type (s, i, c, d, w or the upper-case negated forms). On success *matched_len receives the character's byte length. Throws FORX0002 for any other multichar_type. start_from_branch is not used here.
4754+{
4755+ if(!source[0])
4756+ {
4757+ regex_intern->set_reachedEnd(source); // nothing left to match: report end of input to the owning regex
4758+ return false;
4759+ }
4760+ bool found = false;
4761+ bool value_true = true; // expected value of 'found' for a match; the upper-case cases flip it to false
4762+ const char *temp_source = source;
4763+ unicode::code_point utf8c = utf8::next_char(temp_source);
4764+ switch(multichar_type)
4765+ {
4766+ case 'S':value_true = false;//[^\s]
4767+ case 's'://[#x20\t\n\r]
4768+ switch(utf8c)
4769+ {
4770+ case '\t':
4771+ case '\r':
4772+ case '\n':
4773+ case ' ':
4774+ found = true;
4775+ default:
4776+ break;
4777+ }
4778+ break;
4779+ case 'I':value_true = false;//[^\i]
4780+ case 'i'://the set of initial name characters, those matched by Letter | '_' | ':'
4781+ if((utf8c == '_') ||
4782+ (utf8c == ':') ||
4783+ XQCharType::isLetter(utf8c))
4784+ {
4785+ found = true;
4786+ }
4787+ break;
4788+ case 'C':value_true = false;//[^\c]
4789+ case 'c'://the set of name characters, those matched by NameChar
4790+ if(XQCharType::isNameChar(utf8c))
4791+ {
4792+ found = true;
4793+ }
4794+ break;
4795+ case 'D':value_true = false;//[^\d]
4796+ case 'd':
4797+ if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Nd))
4798+ found = true;
4799+ break;
4800+ case 'W':value_true = false;//[^\w]
4801+ case 'w': // found unless the code point is in one of the P*, Z* or Cc/Cf/Co categories listed below
4802+ found = !(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pc) ||
4803+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pd) ||
4804+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Ps) ||
4805+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pe) ||
4806+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pi) ||
4807+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pf) ||
4808+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Po) ||
4809+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zs) ||
4810+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zl) ||
4811+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zp) ||
4812+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Cc) ||
4813+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Cf) ||
4814+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Co));//ignore unicode::UNICODE_Cn
4815+ break;
4816+ default:
4817+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(source, ZED(REGEX_UNIMPLEMENTED)) );
4818+ }
4819+ if((found && value_true) || (!found && !value_true)) // i.e. found == value_true
4820+ {
4821+ *matched_len = temp_source - source; // byte length of the character just consumed
4822+ return true;
4823+ }
4824+ else
4825+ {
4826+ return false;
4827+ }
4828+}
4829+
4830+bool CRegexXQuery_char_ascii::match_internal(const char *source, int *start_from_branch, int *matched_len) // Matches exactly one byte of source against member c. On success *matched_len is set to 1; on failure it is left untouched. start_from_branch is not used here.
4831+{
4832+ if(!source[0])
4833+ {
4834+ regex_intern->set_reachedEnd(source); // nothing left to match: report end of input to the owning regex
4835+ return false;
4836+ }
4837+ if(source[0] == c)
4838+ {
4839+ *matched_len = 1;
4840+ return true;
4841+ }
4842+ else
4843+ return false;
4844+}
4845+
4846+bool CRegexXQuery_char_ascii_i::match_internal(const char *source, int *start_from_branch, int *matched_len) // Case-insensitive single-byte match: upper-cases source[0] and compares it with member c (presumably stored upper-cased by code not shown here). On success *matched_len is set to 1; on failure it is left untouched. start_from_branch is not used here.
4847+{
4848+ if(!source[0])
4849+ {
4850+ regex_intern->set_reachedEnd(source); // nothing left to match: report end of input to the owning regex
4851+ return false;
4852+ }
4853+ char sup = toupper(static_cast<unsigned char>(source[0])); // cast first: source is UTF-8, bytes >= 0x80 are negative where char is signed, and toupper() on a negative value other than EOF is undefined
4854+ if(sup == c)
4855+ {
4856+ *matched_len = 1;
4857+ return true;
4858+ }
4859+ else
4860+ return false;
4861+}
4862+
4863+bool CRegexXQuery_char_range_ascii::match_internal(const char *source, int *start_from_branch, int *matched_len) // Matches one byte of source that lies in the inclusive range [c1, c2]. On success *matched_len is set to 1; on failure it is left untouched. start_from_branch is not used here.
4864+{
4865+ if(!source[0])
4866+ {
4867+ regex_intern->set_reachedEnd(source); // nothing left to match: report end of input to the owning regex
4868+ return false;
4869+ }
4870+ if((source[0] >= c1) && (source[0] <= c2))
4871+ {
4872+ *matched_len = 1;
4873+ return true;
4874+ }
4875+ else
4876+ return false;
4877+}
4878+
4879+bool CRegexXQuery_char_range_ascii_i::match_internal(const char *source, int *start_from_branch, int *matched_len) // Case-insensitive single-byte range match: upper-cases source[0] and tests it against the inclusive range [c1, c2] (bounds presumably stored upper-cased by code not shown here). On success *matched_len is set to 1; on failure it is left untouched. start_from_branch is not used here.
4880+{
4881+ if(!source[0])
4882+ {
4883+ regex_intern->set_reachedEnd(source); // nothing left to match: report end of input to the owning regex
4884+ return false;
4885+ }
4886+ char sup = toupper(static_cast<unsigned char>(source[0])); // cast first: source is UTF-8, bytes >= 0x80 are negative where char is signed, and toupper() on a negative value other than EOF is undefined
4887+ if((sup >= c1) && (sup <= c2))
4888+ {
4889+ *matched_len = 1;
4890+ return true;
4891+ }
4892+ else
4893+ return false;
4894+}
4895+
4896+bool CRegexXQuery_char_unicode::match_internal(const char *source, int *start_from_branch, int *matched_len) // Matches the len-byte sequence stored in member c at the start of source. On success *matched_len is set to len; on failure it is left untouched. start_from_branch is not used here.
4897+{
4898+ if(!source[0])
4899+ {
4900+ regex_intern->set_reachedEnd(source); // nothing left to match: report end of input to the owning regex
4901+ return false;
4902+ }
4903+ if(!memcmp(source, c, len)) // NOTE(review): memcmp is asked to compare len bytes even if fewer remain before source's terminating NUL - confirm this cannot over-read
4904+ {
4905+ *matched_len = len;
4906+ return true;
4907+ }
4908+ else
4909+ return false;
4910+}
4911+
4912+bool CRegexXQuery_char_unicode_cp::match_internal(const char *source, int *start_from_branch, int *matched_len) // Decodes the next UTF-8 character of source and compares its code point with member c. On success *matched_len receives the character's byte length; on failure it is left untouched. start_from_branch is not used here.
4913+{
4914+ if(!source[0])
4915+ {
4916+ regex_intern->set_reachedEnd(source); // nothing left to match: report end of input to the owning regex
4917+ return false;
4918+ }
4919+ const char *temp_source = source;
4920+ unicode::code_point utf8c = utf8::next_char(temp_source); // temp_source's offset from source is used below as the matched byte length
4921+ if(utf8c == c)
4922+ {
4923+ *matched_len = temp_source - source;
4924+ return true;
4925+ }
4926+ else
4927+ return false;
4928+}
4929+
4930+bool CRegexXQuery_char_unicode_i::match_internal(const char *source, int *start_from_branch, int *matched_len) // Case-insensitive code point match: decodes the next UTF-8 character, upper-cases it with unicode::to_upper and compares it with member c (presumably stored upper-cased by code not shown here). On success *matched_len receives the character's byte length. start_from_branch is not used here.
4931+{
4932+ if(!source[0])
4933+ {
4934+ regex_intern->set_reachedEnd(source); // nothing left to match: report end of input to the owning regex
4935+ return false;
4936+ }
4937+ const char *temp_source = source;
4938+ unicode::code_point sup = unicode::to_upper(utf8::next_char(temp_source));
4939+ if(sup == c)
4940+ {
4941+ *matched_len = temp_source - source; // byte length of the character just consumed
4942+ return true;
4943+ }
4944+ else
4945+ return false;
4946+}
4947+
4948+bool CRegexXQuery_char_range_unicode::match_internal(const char *source, int *start_from_branch, int *matched_len) // Decodes the next UTF-8 character of source and tests its code point against the inclusive range [c1, c2]. On success *matched_len receives the character's byte length; on failure it is left untouched. start_from_branch is not used here.
4949+{
4950+ if(!source[0])
4951+ {
4952+ regex_intern->set_reachedEnd(source); // nothing left to match: report end of input to the owning regex
4953+ return false;
4954+ }
4955+ const char *temp_source = source;
4956+ unicode::code_point utf8c = utf8::next_char(temp_source);
4957+ if((utf8c >= c1) && (utf8c <= c2))
4958+ {
4959+ *matched_len = temp_source - source; // byte length of the character just consumed
4960+ return true;
4961+ }
4962+ else
4963+ return false;
4964+}
4965+
4966+bool CRegexXQuery_char_range_unicode_i::match_internal(const char *source, int *start_from_branch, int *matched_len) // Case-insensitive code point range match: decodes the next UTF-8 character, upper-cases it with unicode::to_upper and tests it against the inclusive range [c1, c2] (bounds presumably stored upper-cased by code not shown here). On success *matched_len receives the character's byte length. start_from_branch is not used here.
4967+{
4968+ if(!source[0])
4969+ {
4970+ regex_intern->set_reachedEnd(source); // nothing left to match: report end of input to the owning regex
4971+ return false;
4972+ }
4973+ const char *temp_source = source;
4974+ unicode::code_point sup = unicode::to_upper(utf8::next_char(temp_source));
4975+ if((sup >= c1) && (sup <= c2))
4976+ {
4977+ *matched_len = temp_source - source; // byte length of the character just consumed
4978+ return true;
4979+ }
4980+ else
4981+ return false;
4982+}
4983+
4984+bool CRegexXQuery_endline::match_internal(const char *source, int *start_from_branch, int *matched_len)
4985+{
4986+ *matched_len = 0;
4987+ if(!source[0])
4988+ {
4989+ // regex_intern->reachedEnd = true;
4990+ return true;
4991+ }
4992+ if((source[0] == 0x0A) || ((source[0] == 0x0D) && (source[1] == 0x0A)))
4993+ {
4994+ if(regex_intern->get_flags() & REGEX_ASCII_MULTILINE)
4995+ {
4996+ // regex_intern->reachedEnd = true;
4997+ return true;
4998+ }
4999+ }
5000+ return false;
The diff has been truncated for viewing.

Subscribers

People subscribed via source and target branches