Merge lp:~zorba-coders/zorba/no_unicode into lp:zorba

Proposed by Paul J. Lucas
Status: Superseded
Proposed branch: lp:~zorba-coders/zorba/no_unicode
Merge into: lp:zorba
Diff against target: 8957 lines (+3879/-1395)
270 files modified
CMakeConfiguration.txt (+5/-5)
CMakeLists.txt (+6/-2)
ChangeLog (+7/-0)
KNOWN_ISSUES.txt (+1/-1)
doc/cxx/examples/context.cpp (+4/-0)
include/zorba/config.h.cmake (+3/-1)
include/zorba/static_context.h (+4/-0)
include/zorba/util/time.h (+1/-1)
src/CMakeLists.txt (+4/-0)
src/api/serialization/serializer.cpp (+36/-33)
src/api/serialization/serializer.h (+2/-4)
src/diagnostics/diagnostic_en.xml (+116/-27)
src/diagnostics/pregenerated/dict_en.cpp (+98/-20)
src/precompiled/stdafx.h (+74/-356)
src/runtime/full_text/CMakeLists.txt (+3/-3)
src/runtime/full_text/default_tokenizer.cpp (+4/-4)
src/runtime/full_text/latin_tokenizer.cpp (+3/-2)
src/runtime/full_text/latin_tokenizer.h (+9/-8)
src/runtime/numerics/format_integer_impl.cpp (+1/-1)
src/runtime/numerics/numerics_impl.cpp (+1/-1)
src/runtime/strings/strings_impl.cpp (+58/-20)
src/store/api/store.h (+1/-1)
src/store/naive/simple_store.h (+7/-3)
src/store/naive/store.cpp (+1/-1)
src/store/naive/store.h (+12/-11)
src/system/globalenv.cpp (+7/-7)
src/unit_tests/CMakeLists.txt (+2/-2)
src/unit_tests/string.cpp (+8/-0)
src/unit_tests/unit_test_list.h (+2/-2)
src/unit_tests/unit_tests.cpp (+2/-2)
src/util/CMakeLists.txt (+4/-4)
src/util/icu_streambuf.h (+1/-0)
src/util/passthru_streambuf.cpp (+2/-2)
src/util/passthru_streambuf.h (+10/-2)
src/util/regex.cpp (+67/-55)
src/util/regex.h (+22/-34)
src/util/regex_xquery.cpp (+1860/-489)
src/util/regex_xquery.h (+359/-123)
src/util/transcode_streambuf.h (+5/-5)
src/util/unicode_categories.cpp (+3/-3)
src/util/unicode_categories.h (+44/-37)
src/util/unicode_util.cpp (+20/-2)
src/util/unicode_util.h (+47/-15)
src/util/utf8_util.cpp (+6/-6)
src/util/utf8_util.h (+29/-13)
src/util/utf8_util.tcc (+10/-2)
src/zorbatypes/collation_manager.cpp (+17/-17)
src/zorbatypes/collation_manager.h (+3/-3)
src/zorbatypes/libicu.h (+0/-32)
src/zorbatypes/transcoder.cpp (+8/-4)
src/zorbatypes/transcoder.h (+9/-9)
src/zorbautils/hashmap_itemh.h (+4/-0)
src/zorbautils/string_util.cpp (+19/-18)
src/zorbautils/string_util.h (+15/-1)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a1.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a10.xml.res (+242/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a11.xml.res (+6/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a2.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a3.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a5.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a6.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a7.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a8.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a9.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m1.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m10.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m11.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m12.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m13.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m14.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m15.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m16.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m17.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m18.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m19.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m2.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m20.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m21.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m22.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m23.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m24.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m25.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m26.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m27.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m28.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m29.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m3.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m30.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m31.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m32.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m33.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m34.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m35.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m36.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m37.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m38.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m39.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m4.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m40.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m41.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m42.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m43.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m44.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m45.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m46.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m47.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m48.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m49.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m5.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m50.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m51.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m52.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m53.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m6.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m7.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m8.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m9.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_prime1.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r1.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r10.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r11.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r12.xml.res (+5/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r2.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r3.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r4.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r5.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r6.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r9.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_t1.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_t4.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_t5.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/testdriver/bom_bug.xml.res (+1/-0)
test/rbkt/Queries/CMakeLists.txt (+16/-1)
test/rbkt/Queries/zorba/string/Regex/regex_a1.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a10.xq (+11/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a11.xq (+9/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a2.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a3.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a5.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a6.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a7.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a8.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a9.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err1.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err1.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err10.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err10.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err11.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err11.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err12.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err12.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err13.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err13.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err14.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err14.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err15.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err15.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err16.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err16.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err17.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err17.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err18.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err18.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err19.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err19.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err2.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err2.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err20.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err20.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err21.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err21.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err22.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err22.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err23.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err23.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err24.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err24.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err25.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err25.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err3.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err3.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err4.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err4.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err5.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err5.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err7.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err7.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err8.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err8.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err9.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err9.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m1.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m10.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m11.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m12.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m13.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m14.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m15.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m16.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m17.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m18.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m19.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m2.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m20.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m21.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m22.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m23.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m24.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m25.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m26.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m27.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m28.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m29.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m3.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m30.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m31.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m32.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m33.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m34.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m35.xq (+4/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m36.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m37.xq (+4/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m38.xq (+4/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m39.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m4.xq (+6/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m40.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m41.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m42.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m43.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m44.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m45.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m46.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m47.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m48.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m49.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m5.xq (+6/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m50.xq (+2/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m51.xq (+2/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m52.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m53.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m6.xq (+6/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m7.xq (+6/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m8.xq (+7/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m9.xq (+7/-0)
test/rbkt/Queries/zorba/string/Regex/regex_prime1.xq (+17/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r1.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r10.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r11.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r12.xq (+7/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r2.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r3.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r4.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r5.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r6.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r7_err.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r7_err.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r8_err.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r8_err.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r9.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_t1.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_t2.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_t3_err.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_t3_err.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_t4.xq (+2/-0)
test/rbkt/Queries/zorba/string/Regex/regex_t5.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/zorba.html (+242/-0)
test/rbkt/Queries/zorba/string/Regex/zorba2.html (+5/-0)
test/rbkt/Queries/zorba/testdriver/bom_bug.xq (+1/-0)
test/unit/static_context.cpp (+2/-0)
test/update/CMakeLists.txt (+9/-0)
To merge this branch: bzr merge lp:~zorba-coders/zorba/no_unicode
Reviewer Review Type Date Requested Status
Matthias Brantner Pending
Markos Zaharioudakis Pending
Review via email: mp+89104@code.launchpad.net

This proposal supersedes a proposal from 2011-12-09.

This proposal has been superseded by a proposal from 2012-04-06.

Commit message

"No Unicode" is now "No ICU."

Description of the change

"No Unicode" is now "No ICU."

To post a comment you must log in.
Revision history for this message
Matthias Brantner (matthias-brantner) wrote : Posted in a previous version of this proposal

Compiling with ZORBA_NO_ICU=ON fails on Linux:

[ 1%] Building CXX object src/CMakeFiles/zorba_simplestore.dir/api/zorba_string.cpp.o
In file included from /home/mbrantner/zorba/sandbox/src/util/regex.h:501:0,
                 from /home/mbrantner/zorba/sandbox/src/api/zorba_string.cpp:23:
/home/mbrantner/zorba/sandbox/src/util/regex_xquery.h:209:3: error: a class-key must be used when declaring a friend
/home/mbrantner/zorba/sandbox/src/util/regex_xquery.h:209:3: error: friend declaration does not name a class or function
/home/mbrantner/zorba/sandbox/src/util/regex_xquery.h:253:3: error: a class-key must be used when declaring a friend
/home/mbrantner/zorba/sandbox/src/util/regex_xquery.h:253:3: error: friend declaration does not name a class or function
make[2]: *** [src/CMakeFiles/zorba_simplestore.dir/api/zorba_string.cpp.o] Erro

Revision history for this message
Matthias Brantner (matthias-brantner) : Posted in a previous version of this proposal
review: Needs Fixing
Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote : Posted in a previous version of this proposal

There are additional revisions which have not been approved in review. Please seek review and approval of these new revisions.

Revision history for this message
Matthias Brantner (matthias-brantner) wrote : Posted in a previous version of this proposal

The test suite doesn't run clean on my system (Linux) without ICU. This prevents us from adding the build to the remote queue. For example, the following tests fail without ICU (some of them also seem to fail with ICU):

 1294 - test/rbkt/zorba/string/Regex/regex_a10 (Failed)
 1548 - test/rbkt/zorba/fulltext/ft-wildcard-true-2 (Failed)
 1560 - test/rbkt/zorba/fulltext/ft-wildcard-true-4 (Failed)
 1574 - test/rbkt/zorba/fulltext/ft-same-sentence-true-4 (Failed)
 1581 - test/rbkt/zorba/fulltext/ft-wildcard-true-3 (Failed)
 1587 - test/rbkt/zorba/fulltext/ft-wildcard-true-9 (Failed)
 1600 - test/rbkt/zorba/fulltext/ft-diacritics-insensitive-true-1 (Failed)
 1605 - test/rbkt/zorba/fulltext/ft-wildcard-true-8 (Failed)
 1612 - test/rbkt/zorba/fulltext/ft-wildcard-true-10 (Failed)
 1635 - test/rbkt/zorba/fulltext/ft-wildcard-true-7 (Failed)
 1637 - test/rbkt/zorba/fulltext/ft-wildcard-true-11 (Failed)
 1643 - test/rbkt/zorba/fulltext/ft-wildcard-FTDY0020-3 (Failed)
 1789 - test/rbkt/zorba/index/numbers (Failed)
 2345 - test/unit/string_test (Failed)
 2534 - test/update/zorba/store/sc3 (Failed)
 2544 - doc/cxx/examples/context.cpp (Failed)

Please make sure the test suite runs clean.

review: Needs Fixing
Revision history for this message
Paul J. Lucas (paul-lucas) wrote : Posted in a previous version of this proposal

Try it now.

Revision history for this message
Daniel Turcanu (danielturcanu) wrote :

Before committing this branch, the branch lp:~danielturcanu/zorba/my_conv_module should be merged.

Revision history for this message
Chris Hillery (ceejatec) wrote :

FWIW, I've skimmed the change for CMake-related changes, and they all look fine (mostly quite trivial).

Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :

Attempt to merge into lp:zorba failed due to conflicts:

text conflict in ChangeLog

Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :
Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :

The attempt to merge lp:~zorba-coders/zorba/no_unicode into lp:zorba failed. Below is the output from the failed tests.

CMake Error at /home/ceej/zo/testing/zorbatest/tester/TarmacLander.cmake:274 (message):
  Validation queue job no_unicode-2012-03-30T19-15-23.23Z is finished. The
  final status was:

  6 tests did not succeed - changes not commited.

Error in read script: /home/ceej/zo/testing/zorbatest/tester/TarmacLander.cmake

Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :
Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :

The attempt to merge lp:~zorba-coders/zorba/no_unicode into lp:zorba failed. Below is the output from the failed tests.

CMake Error at /home/ceej/zo/testing/zorbatest/tester/TarmacLander.cmake:274 (message):
  Validation queue job no_unicode-2012-04-03T15-17-37.639Z is finished. The
  final status was:

  6 tests did not succeed - changes not commited.

Error in read script: /home/ceej/zo/testing/zorbatest/tester/TarmacLander.cmake

lp:~zorba-coders/zorba/no_unicode updated
10530. By Paul J. Lucas

1. Added fix for not catching bad regexs like "^^".
2. Added if="!defined(ZORBA_NO_ICU)" for some entries in the diagnostics
   dictionary.

10531. By Paul J. Lucas

Merge from trunk.

10532. By Paul J. Lucas

Fix for '^' bug.

10533. By Paul J. Lucas

1. Fixed yet another '^' bug.
2. Marked some regex tests as expected failure with correct bug numbers.

10534. By Paul J. Lucas

No longer doing some stuff when q_flag is set.

10535. By Paul J. Lucas

Tweaked one error message.

10536. By Paul J. Lucas

Merge from trunk.

10537. By Rodolfo Ochoa

Merge from trunk

10538. By Rodolfo Ochoa

Strange error on include guards

10539. By Rodolfo Ochoa

merge from trunk

10540. By Rodolfo Ochoa

fix for regex errors in RQ

Unmerged revisions

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'CMakeConfiguration.txt'
2--- CMakeConfiguration.txt 2012-03-28 05:19:57 +0000
3+++ CMakeConfiguration.txt 2012-04-06 00:18:21 +0000
4@@ -135,14 +135,14 @@
5 SET (ZORBA_DEBUG_STRING ${ZORBA_DEBUG_STRING} CACHE BOOL "debug strings")
6 MESSAGE (STATUS "ZORBA_DEBUG_STRING: " ${ZORBA_DEBUG_STRING})
7
8-SET(ZORBA_NO_UNICODE OFF CACHE BOOL "disable ICU")
9-MESSAGE(STATUS "ZORBA_NO_UNICODE: " ${ZORBA_NO_UNICODE})
10+SET(ZORBA_NO_ICU OFF CACHE BOOL "disable ICU")
11+MESSAGE(STATUS "ZORBA_NO_ICU: " ${ZORBA_NO_ICU})
12
13-IF (ZORBA_NO_UNICODE)
14+IF (ZORBA_NO_ICU)
15 SET (no_full_text ON)
16-ELSE (ZORBA_NO_UNICODE)
17+ELSE (ZORBA_NO_ICU)
18 SET (no_full_text OFF)
19-ENDIF (ZORBA_NO_UNICODE)
20+ENDIF (ZORBA_NO_ICU)
21 SET (ZORBA_NO_FULL_TEXT ${no_full_text} CACHE BOOL "disable XQuery Full-Text support")
22 MESSAGE(STATUS "ZORBA_NO_FULL_TEXT: " ${ZORBA_NO_FULL_TEXT})
23
24
25=== modified file 'CMakeLists.txt'
26--- CMakeLists.txt 2012-03-28 05:19:57 +0000
27+++ CMakeLists.txt 2012-04-06 00:18:21 +0000
28@@ -123,10 +123,14 @@
29 CHECK_TYPE_SIZE("int64_t" ZORBA_HAVE_INT64_T)
30
31 CHECK_CXX_SOURCE_COMPILES ("#include <type_traits>\nint main() { std::enable_if<true,int> x; }" ZORBA_CXX_ENABLE_IF)
32-CHECK_CXX_SOURCE_COMPILES ("int main() { int *p = nullptr; }" ZORBA_CXX_NULLPTR)
33-CHECK_CXX_SOURCE_COMPILES ("int main() { static_assert(1,\"\"); }" ZORBA_CXX_STATIC_ASSERT)
34+SET(CMAKE_EXTRA_INCLUDE_FILES wchar.h)
35+CHECK_TYPE_SIZE("wchar_t" ZORBA_SIZEOF_WCHAR_T)
36+SET(CMAKE_EXTRA_INCLUDE_FILES)
37 CHECK_CXX_SOURCE_COMPILES ("#include <memory>\nint main() { std::unique_ptr<int> p; }" ZORBA_CXX_UNIQUE_PTR)
38
39+CHECK_CXX_SOURCE_COMPILES("int main() { int *p = nullptr; }" ZORBA_CXX_NULLPTR)
40+CHECK_CXX_SOURCE_COMPILES("int main() { static_assert(1,\"\"); }" ZORBA_CXX_STATIC_ASSERT)
41+
42 ################################################################################
43 # Various cmake macros
44
45
46=== modified file 'ChangeLog'
47--- ChangeLog 2012-04-04 15:59:01 +0000
48+++ ChangeLog 2012-04-06 00:18:21 +0000
49@@ -4,6 +4,7 @@
50
51 New Features:
52 * Extended API for Python, Java, PHP and Ruby.
53+ * Added support for NO_ICU (to not use ICU for unicode processing)
54
55 Bug Fixes/Other Changes:
56 * Fixed bug #967864 (var substitution did not update theFreeVars property)
57@@ -148,7 +149,9 @@
58 * Fixed bug when parsing a document with a base-uri attribute.
59 * Fixed bug #863320 (Sentence is incorrectly incremented when token characters end without sentence terminator)
60 * Fixed bug #863730 (static delete-node* functions don't raise ZDDY0012)
61+ * Implemented the probe-index-range-value for general indexes
62 * Removed ZSTR0005 and ZSTR0006 error codes
63+ * Fixed bug #867662 ("nullptr" warning)
64 * Fixed bug #868258 (Assertion failure with two delete collection)
65 * Fixed bug #871623 and #871629 (assertion failures with insertions in dynamic collections)
66 * Fixed bug #867262 (allow reuse of iterator over ExtFuncArgItemSequence)
67@@ -157,6 +160,8 @@
68 * New node-reference module. References can be obtained for any node, and
69 different nodes cannot have the same identifier.
70 * Fixed bug #872697 (segmentation fault with validation of NMTOKENS)
71+ * General index cannot be declared as unique if the type of its key is
72+ xs:anyAtomicType or xs:untypedAtomic.
73 * Added undo for node revalidation
74 * Optimization for count(collection()) expressions
75 * Fixed bug #872796 (validate-in-place can interfere with other update primitives)
76@@ -175,6 +180,8 @@
77 * Fixed bug #855715 (Invalid escaped characters in regex not caught)
78 * Fixed bug #862089 (Split binary/xq install directories for modules) by
79 splitting "module path" into separate URI and Library paths
80+ * New node-position module. This module allows to obtain a representation of a node position, which
81+ can be used to assess structural relationships with other nodes.
82 * Fixed bug #872502 (validation of the JSON module xqdoc fails)
83 * Fixed bug #897619 (testdriver_mt can not run the XQueryX tests)
84 * Fixed bug #867107 (xqdoc dependency to zorba is wrong)
85
86=== modified file 'KNOWN_ISSUES.txt'
87--- KNOWN_ISSUES.txt 2012-03-28 05:19:57 +0000
88+++ KNOWN_ISSUES.txt 2012-04-06 00:18:21 +0000
89@@ -37,7 +37,7 @@
90 * The serializer currently doesn't implement character maps as specified
91 (http://www.w3.org/TR/xslt-xquery-serialization/#character-maps)
92
93-* In the 2.0 release, setting the CMake variables ZORBA_NO_UNICODE to
94+* In the 2.0 release, setting the CMake variables ZORBA_NO_ICU to
95 ON is not supported.
96
97 * The PHP language binding is not supported on Mac OS X. For details,
98
99=== modified file 'doc/cxx/examples/context.cpp'
100--- doc/cxx/examples/context.cpp 2012-03-28 05:19:57 +0000
101+++ doc/cxx/examples/context.cpp 2012-04-06 00:18:21 +0000
102@@ -149,7 +149,11 @@
103 outStream2 << lQuery << std::endl;
104 std::cout << outStream2.str() << std::endl;
105
106+#ifndef ZORBA_NO_ICU
107 if (outStream2.str() != "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\nBook 1.1\n")
108+#else
109+ if (outStream2.str() != "<?xml version=\"1.0\"?>\nBook 1.1\n")
110+#endif /* ZORBA_NO_ICU */
111 {
112 std::cerr << "Test 4 failed with a wrong result : " << std::endl
113 << outStream2.str() << std::endl;
114
115=== modified file 'include/zorba/config.h.cmake'
116--- include/zorba/config.h.cmake 2012-03-28 05:19:57 +0000
117+++ include/zorba/config.h.cmake 2012-04-06 00:18:21 +0000
118@@ -96,6 +96,8 @@
119 typedef __int64 int64_t;
120 #endif /* ZORBA_HAVE_INT64_T */
121
122+#cmakedefine ZORBA_SIZEOF_WCHAR_T @ZORBA_SIZEOF_WCHAR_T@
123+
124 // Compiler
125 #cmakedefine CLANG
126 #cmakedefine MSVC
127@@ -148,7 +150,7 @@
128
129 // Zorba features
130 #cmakedefine ZORBA_NO_FULL_TEXT
131-#cmakedefine ZORBA_NO_UNICODE
132+#cmakedefine ZORBA_NO_ICU
133 #cmakedefine ZORBA_NO_XMLSCHEMA
134 #cmakedefine ZORBA_NUMERIC_OPTIMIZATION
135 #cmakedefine ZORBA_VERIFY_PEER_SSL_CERTIFICATE
136
137=== modified file 'include/zorba/static_context.h'
138--- include/zorba/static_context.h 2012-03-28 05:19:57 +0000
139+++ include/zorba/static_context.h 2012-04-06 00:18:21 +0000
140@@ -26,9 +26,13 @@
141 #include <zorba/function.h>
142 #include <zorba/annotation.h>
143 #include <zorba/smart_ptr.h>
144+#include <zorba/smart_ptr.h>
145 #ifndef ZORBA_NO_FULL_TEXT
146 #include <zorba/thesaurus.h>
147 #endif /* ZORBA_NO_FULL_TEXT */
148+#include <zorba/zorba.h>
149+#include <zorba/store_manager.h>
150+#include <zorba/zorba_exception.h>
151
152 namespace zorba {
153
154
155=== modified file 'include/zorba/util/time.h'
156--- include/zorba/util/time.h 2012-03-28 05:19:57 +0000
157+++ include/zorba/util/time.h 2012-04-06 00:18:21 +0000
158@@ -178,7 +178,7 @@
159
160 inline long get_walltime_in_millis(const walltime& t)
161 {
162- return t.time * 1000 + t.millitm;
163+ return (long)(t.time * 1000 + t.millitm);
164 }
165
166 #else /* not Windows, and no clock_gettime() */
167
168=== modified file 'src/CMakeLists.txt'
169--- src/CMakeLists.txt 2012-03-28 05:19:57 +0000
170+++ src/CMakeLists.txt 2012-04-06 00:18:21 +0000
171@@ -59,7 +59,10 @@
172 #
173 # Next, add the files to be compiled into the library
174 #
175+
176+MESSAGE(STATUS "PRECOMPILED HEADERS: " ${ZORBA_PRECOMPILED_HEADERS})
177 SET(ZORBA_PRECOMPILED_HEADERS OFF CACHE BOOL "Activate Zorba precompiled headers.")
178+MESSAGE(STATUS "PRECOMPILED HEADERS: " ${ZORBA_PRECOMPILED_HEADERS})
179
180 SET(ZORBA_SRCS)
181 ADD_SRC_SUBFOLDER(ZORBA_SRCS api API_SRCS)
182@@ -97,6 +100,7 @@
183 ENDIF(ZORBA_WITH_DEBUGGER)
184 ADD_SRC_SUBFOLDER(ZORBA_SRCS unit_tests UNIT_TEST_SRCS)
185
186+MESSAGE(STATUS "PRECOMPILED HEADERS: " ${ZORBA_PRECOMPILED_HEADERS})
187 IF(ZORBA_PRECOMPILED_HEADERS)
188 ADD_SRC_SUBFOLDER(ZORBA_SRCS precompiled ZORBAMISC_SRCS)
189 INCLUDE_DIRECTORIES("${CMAKE_SOURCE_DIR}/src/precompiled")
190
191=== modified file 'src/api/serialization/serializer.cpp'
192--- src/api/serialization/serializer.cpp 2012-03-28 05:19:57 +0000
193+++ src/api/serialization/serializer.cpp 2012-04-06 00:18:21 +0000
194@@ -180,7 +180,6 @@
195 for (; chars < chars_end; chars++ )
196 {
197
198-#ifndef ZORBA_NO_UNICODE
199 // the input string is UTF-8
200 int char_length = utf8::char_length(*chars);
201 if (char_length == 0)
202@@ -217,7 +216,6 @@
203
204 continue;
205 }
206-#endif//ZORBA_NO_UNICODE
207
208 // raise an error iff (1) the serialization format is XML 1.0 and (2) the given character is an invalid XML 1.0 character
209 if (ser && ser->method == PARAMETER_VALUE_XML &&
210@@ -332,14 +330,12 @@
211 {
212 tr << (char)0xEF << (char)0xBB << (char)0xBF;
213 }
214-#ifndef ZORBA_NO_UNICODE
215 else if (ser->encoding == PARAMETER_VALUE_UTF_16)
216 {
217 // Little-endian
218 tr.verbatim((char)0xFF);
219 tr.verbatim((char)0xFE);
220 }
221-#endif
222 }
223 }
224
225@@ -862,13 +858,17 @@
226 emitter::emit_declaration();
227
228 if (ser->omit_xml_declaration == PARAMETER_VALUE_NO) {
229- tr << "<?xml version=\"" << ser->version << "\" encoding=\"";
230- if (ser->encoding == PARAMETER_VALUE_UTF_8) {
231- tr << "UTF-8";
232-#ifndef ZORBA_NO_UNICODE
233- } else if (ser->encoding == PARAMETER_VALUE_UTF_16) {
234- tr << "UTF-16";
235-#endif
236+ tr << "<?xml version=\"" << ser->version;
237+ switch (ser->encoding) {
238+ case PARAMETER_VALUE_UTF_8:
239+ case PARAMETER_VALUE_UTF_16:
240+ tr << "\" encoding=\"";
241+ switch (ser->encoding) {
242+ case PARAMETER_VALUE_UTF_8 : tr << "UTF-8" ; break;
243+ case PARAMETER_VALUE_UTF_16: tr << "UTF-16"; break;
244+ default : ZORBA_ASSERT(false);
245+ }
246+ break;
247 }
248 tr << "\"";
249
250@@ -1174,14 +1174,18 @@
251 }
252
253 tr << "<meta http-equiv=\"content-type\" content=\""
254- << ser->media_type << "; charset=";
255-
256- if (ser->encoding == PARAMETER_VALUE_UTF_8)
257- tr << "UTF-8";
258-#ifndef ZORBA_NO_UNICODE
259- else if (ser->encoding == PARAMETER_VALUE_UTF_16)
260- tr << "UTF-16";
261-#endif
262+ << ser->media_type;
263+ switch (ser->encoding) {
264+ case PARAMETER_VALUE_UTF_8:
265+ case PARAMETER_VALUE_UTF_16:
266+ tr << "\" charset=\"";
267+ switch (ser->encoding) {
268+ case PARAMETER_VALUE_UTF_8 : tr << "UTF-8" ; break;
269+ case PARAMETER_VALUE_UTF_16: tr << "UTF-16"; break;
270+ default : ZORBA_ASSERT(false);
271+ }
272+ break;
273+ }
274 tr << "\"";
275 // closed_parent_tag = 1;
276 }
277@@ -1371,14 +1375,18 @@
278 }
279
280 tr << "<meta http-equiv=\"content-type\" content=\""
281- << ser->media_type << "; charset=";
282-
283- if (ser->encoding == PARAMETER_VALUE_UTF_8)
284- tr << "UTF-8";
285-#ifndef ZORBA_NO_UNICODE
286- else if (ser->encoding == PARAMETER_VALUE_UTF_16)
287- tr << "UTF-16";
288-#endif
289+ << ser->media_type;
290+ switch (ser->encoding) {
291+ case PARAMETER_VALUE_UTF_8:
292+ case PARAMETER_VALUE_UTF_16:
293+ tr << "; charset="; // charset is a parameter inside the content value, not a separate attribute
294+ switch (ser->encoding) {
295+ case PARAMETER_VALUE_UTF_8 : tr << "UTF-8" ; break;
296+ case PARAMETER_VALUE_UTF_16: tr << "UTF-16"; break;
297+ default : ZORBA_ASSERT(false);
298+ }
299+ break;
300+ }
301 tr << "\"/";
302 //closed_parent_tag = 1;
303 }
304@@ -2098,10 +2106,8 @@
305 {
306 if (!strcmp(aValue, "UTF-8"))
307 encoding = PARAMETER_VALUE_UTF_8;
308-#ifndef ZORBA_NO_UNICODE
309 else if (!strcmp(aValue, "UTF-16"))
310 encoding = PARAMETER_VALUE_UTF_16;
311-#endif
312 else
313 throw XQUERY_EXCEPTION(
314 err::SEPM0016, ERROR_PARAMS( aValue, aName, ZED( GoodValuesAreUTF8 ) )
315@@ -2210,16 +2216,13 @@
316 {
317 tr = new transcoder(os, false);
318 }
319-#ifndef ZORBA_NO_UNICODE
320 else if (encoding == PARAMETER_VALUE_UTF_16)
321 {
322 tr = new transcoder(os, true);
323 }
324-#endif
325 else
326 {
327- ZORBA_ASSERT(0);
328- return false;
329+ ZORBA_ASSERT(false);
330 }
331
332 if (method == PARAMETER_VALUE_XML)
333
334=== modified file 'src/api/serialization/serializer.h'
335--- src/api/serialization/serializer.h 2012-03-28 05:19:57 +0000
336+++ src/api/serialization/serializer.h 2012-04-06 00:18:21 +0000
337@@ -70,10 +70,8 @@
338 PARAMETER_VALUE_TEXT,
339 PARAMETER_VALUE_BINARY,
340
341- PARAMETER_VALUE_UTF_8
342-#ifndef ZORBA_NO_UNICODE
343- ,PARAMETER_VALUE_UTF_16
344-#endif
345+ PARAMETER_VALUE_UTF_8,
346+ PARAMETER_VALUE_UTF_16
347 } PARAMETER_VALUE_TYPE;
348
349 protected:
350
351=== modified file 'src/diagnostics/diagnostic_en.xml'
352--- src/diagnostics/diagnostic_en.xml 2012-03-28 05:19:57 +0000
353+++ src/diagnostics/diagnostic_en.xml 2012-04-06 00:18:21 +0000
354@@ -2517,11 +2517,11 @@
355 <value>attribute node</value>
356 </entry>
357
358- <entry key="BackRef0Illegal">
359+ <entry key="BackRef0Illegal" if="!defined(ZORBA_NO_ICU)">
360 <value>"0": illegal backreference</value>
361 </entry>
362
363- <entry key="BackRefIllegalInCharClass">
364+ <entry key="BackRefIllegalInCharClass" if="!defined(ZORBA_NO_ICU)">
365 <value>backreference illegal in character class</value>
366 </entry>
367
368@@ -2569,7 +2569,7 @@
369 <value>invalid library module</value>
370 </entry>
371
372- <entry key="BadRegexEscape_3">
373+ <entry key="BadRegexEscape_3" if="!defined(ZORBA_NO_ICU)">
374 <value>"$3": illegal escape character</value>
375 </entry>
376
377@@ -3029,7 +3029,7 @@
378 <value>nodeid component too big for encoding</value>
379 </entry>
380
381- <entry key="NonClosedBackRef_3">
382+ <entry key="NonClosedBackRef_3" if="!defined(ZORBA_NO_ICU)">
383 <value>'$$3': non-closed backreference</value>
384 </entry>
385
386@@ -3041,7 +3041,7 @@
387 <value>non-localhost authority</value>
388 </entry>
389
390- <entry key="NonexistentBackRef_3">
391+ <entry key="NonexistentBackRef_3" if="!defined(ZORBA_NO_ICU)">
392 <value>'$$3': non-existent backreference</value>
393 </entry>
394
395@@ -3193,94 +3193,183 @@
396 <value>item type is not a subtype of "$3"</value>
397 </entry>
398
399- <entry key="U_REGEX_BAD_ESCAPE_SEQUENCE" if="!defined(ZORBA_NO_UNICODE)">
400+ <entry key="U_REGEX_BAD_ESCAPE_SEQUENCE" if="!defined(ZORBA_NO_ICU)">
401 <value>unrecognized backslash escape sequence</value>
402 </entry>
403
404- <entry key="U_REGEX_BAD_INTERVAL" if="!defined(ZORBA_NO_UNICODE)">
405+ <entry key="U_REGEX_BAD_INTERVAL" if="!defined(ZORBA_NO_ICU)">
406 <value>error in {min,max} interval</value>
407 </entry>
408
409- <entry key="U_REGEX_INTERNAL_ERROR" if="!defined(ZORBA_NO_UNICODE)">
410+ <entry key="U_REGEX_INTERNAL_ERROR" if="!defined(ZORBA_NO_ICU)">
411 <value>an internal ICU error (bug) was detected</value>
412 </entry>
413
414- <entry key="U_REGEX_INVALID_BACK_REF" if="!defined(ZORBA_NO_UNICODE)">
415+ <entry key="U_REGEX_INVALID_BACK_REF" if="!defined(ZORBA_NO_ICU)">
416 <value>backreference to a non-existent capture group</value>
417 </entry>
418
419- <entry key="U_REGEX_INVALID_FLAG" if="!defined(ZORBA_NO_UNICODE)">
420+ <entry key="U_REGEX_INVALID_FLAG" if="!defined(ZORBA_NO_ICU)">
421 <value>invalid value for match mode flags</value>
422 </entry>
423
424- <entry key="U_REGEX_INVALID_RANGE" if="!defined(ZORBA_NO_UNICODE)">
425+ <entry key="U_REGEX_INVALID_RANGE" if="!defined(ZORBA_NO_ICU)">
426 <value>in character range [x-y], x is greater than y</value>
427 </entry>
428
429- <entry key="U_REGEX_INVALID_STATE" if="!defined(ZORBA_NO_UNICODE)">
430+ <entry key="U_REGEX_INVALID_STATE" if="!defined(ZORBA_NO_ICU)">
431 <value>RegexMatcher in invalid state for requested operation</value>
432 </entry>
433
434- <entry key="U_REGEX_LOOK_BEHIND_LIMIT" if="!defined(ZORBA_NO_UNICODE)">
435+ <entry key="U_REGEX_LOOK_BEHIND_LIMIT" if="!defined(ZORBA_NO_ICU)">
436 <value>look-behind pattern matches must have a bounded maximum length</value>
437 </entry>
438
439- <entry key="U_REGEX_MAX_LT_MIN" if="!defined(ZORBA_NO_UNICODE)">
440+ <entry key="U_REGEX_MAX_LT_MIN" if="!defined(ZORBA_NO_ICU)">
441 <value>in {min,max}, max is less than min</value>
442 </entry>
443
444- <entry key="U_REGEX_MISMATCHED_PAREN" if="!defined(ZORBA_NO_UNICODE)">
445+ <entry key="U_REGEX_MISMATCHED_PAREN" if="!defined(ZORBA_NO_ICU)">
446 <value>incorrectly nested parentheses</value>
447 </entry>
448
449- <entry key="U_REGEX_MISSING_CLOSE_BRACKET" if="!defined(ZORBA_NO_UNICODE)">
450+ <entry key="U_REGEX_MISSING_CLOSE_BRACKET" if="!defined(ZORBA_NO_ICU)">
451 <value>missing ']'</value>
452 </entry>
453
454- <entry key="U_REGEX_NUMBER_TOO_BIG" if="!defined(ZORBA_NO_UNICODE)">
455+ <entry key="U_REGEX_NUMBER_TOO_BIG" if="!defined(ZORBA_NO_ICU)">
456 <value>decimal number is too large</value>
457 </entry>
458
459- <entry key="U_REGEX_OCTAL_TOO_BIG" if="!defined(ZORBA_NO_UNICODE)">
460+ <entry key="U_REGEX_OCTAL_TOO_BIG" if="!defined(ZORBA_NO_ICU)">
461 <value>octal character constants must be &lt;= 0377</value>
462 </entry>
463
464- <entry key="U_REGEX_PROPERTY_SYNTAX" if="!defined(ZORBA_NO_UNICODE)">
465+ <entry key="U_REGEX_PROPERTY_SYNTAX" if="!defined(ZORBA_NO_ICU)">
466 <value>incorrect Unicode property</value>
467 </entry>
468
469- <entry key="U_REGEX_RULE_SYNTAX" if="!defined(ZORBA_NO_UNICODE)">
470+ <entry key="U_REGEX_RULE_SYNTAX" if="!defined(ZORBA_NO_ICU)">
471 <value>syntax error</value>
472 </entry>
473
474- <entry key="U_REGEX_SET_CONTAINS_STRING" if="!defined(ZORBA_NO_UNICODE)">
475+ <entry key="U_REGEX_SET_CONTAINS_STRING" if="!defined(ZORBA_NO_ICU)">
476 <value>can not have UnicodeSets containing strings</value>
477 </entry>
478
479- <entry key="U_REGEX_STACK_OVERFLOW" if="!defined(ZORBA_NO_UNICODE)">
480+ <entry key="U_REGEX_STACK_OVERFLOW" if="!defined(ZORBA_NO_ICU)">
481 <value>backtrack stack overflow</value>
482 </entry>
483
484- <entry key="U_REGEX_STOPPED_BY_CALLER" if="!defined(ZORBA_NO_UNICODE)">
485+ <entry key="U_REGEX_STOPPED_BY_CALLER" if="!defined(ZORBA_NO_ICU)">
486 <value>matching operation aborted by user callback fn</value>
487 </entry>
488
489- <entry key="U_REGEX_TIME_OUT" if="!defined(ZORBA_NO_UNICODE)">
490+ <entry key="U_REGEX_TIME_OUT" if="!defined(ZORBA_NO_ICU)">
491 <value>maximum allowed match time exceeded</value>
492 </entry>
493
494- <entry key="U_REGEX_UNIMPLEMENTED" if="!defined(ZORBA_NO_UNICODE)">
495- <value>use of regular expression feature that is not yet implemented</value>
496+ <entry key="U_REGEX_UNIMPLEMENTED" if="!defined(ZORBA_NO_ICU)">
497+ <value>use of regular expression feature that is not yet implemented</value>
498+ </entry>
499+
500+ <!-- Regex Ascii error messages-->
501+ <entry key="REGEX_UNIMPLEMENTED" if="defined(ZORBA_NO_ICU)">
502+ <value>use of regular expression feature that is not yet implemented</value>
503+ </entry>
504+
505+ <entry key="REGEX_MISMATCHED_PAREN" if="defined(ZORBA_NO_ICU)">
506+ <value>incorrectly nested parentheses</value>
507+ </entry>
508+
509+ <entry key="REGEX_BROKEN_P_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
510+ <value>broken \\p construct</value>
511+ </entry>
512+
513+ <entry key="REGEX_UNKNOWN_PL_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
514+ <value>unknown \\p{L?} category; supported categories: L, Lu, Ll, Lt, Lm, Lo</value>
515+ </entry>
516+
517+ <entry key="REGEX_UNKNOWN_PM_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
518+ <value>unknown \\p{M?} category; supported categories: M, Mn, Mc, Me</value>
519+ </entry>
520+
521+ <entry key="REGEX_UNKNOWN_PN_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
522+ <value>unknown \\p{N?} category; supported categories: N, Nd, Nl, No</value>
523+ </entry>
524+
525+ <entry key="REGEX_UNKNOWN_PP_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
526+ <value>unknown \\p{P?} category; supported categories: P, Pc, Pd, Ps, Pe, Pi, Pf, Po</value>
527+ </entry>
528+
529+ <entry key="REGEX_UNKNOWN_PZ_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
530+ <value>unknown \\p{Z?} category; supported categories: Z, Zs, Zl, Zp</value>
531+ </entry>
532+
533+ <entry key="REGEX_UNKNOWN_PS_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
534+ <value>unknown \\p{S?} category; supported categories: S, Sm, Sc, Sk, So</value>
535+ </entry>
536+
537+ <entry key="REGEX_UNKNOWN_PC_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
538+ <value>unknown \\p{C?} category; supported categories: C, Cc, Cf, Co, Cn(for not assigned)</value>
539+ </entry>
540+
541+ <entry key="REGEX_BROKEN_PIs_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
542+ <value>broken \\p{Is} construct; valid characters are [a-zA-Z0-9-]</value>
543+ </entry>
544+
545+ <entry key="REGEX_UNKNOWN_PIs_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
546+ <value>unknown \\p{Is} category block; see supported block escapes here: http://www.w3.org/TR/xmlschema-2/#charcter-classes</value>
547+ </entry>
548+
549+ <entry key="REGEX_INVALID_UNICODE_CODEPOINT_u" if="defined(ZORBA_NO_ICU)">
550+ <value>invalid unicode hex, should be in form \\uXXXX or \\UXXXXXXXX</value>
551+ </entry>
552+
553+ <entry key="REGEX_UNKNOWN_ESC_CHAR" if="defined(ZORBA_NO_ICU)">
554+ <value>unknown \\? escape char; supported escapes are: \\[nrt\\|.?*+(){}[]-^$] for char escapes, \\[pP] for categories and \\[sSiIcCdDwW] for multichar groups</value>
555+ </entry>
556+
557+ <entry key="REGEX_INVALID_BACK_REF" if="defined(ZORBA_NO_ICU)">
558+ <value>\\$3 backreference to a non-existent capture group ($4 groups so far)</value>
559+ </entry>
560+
561+ <entry key="REGEX_INVALID_ATOM_CHAR" if="defined(ZORBA_NO_ICU)">
562+ <value>$3 - invalid character for an atom; forbidden characters are: [{}?*+|^]</value>
563+ </entry>
564+
565+ <entry key="REGEX_INVALID_SUBCLASS" if="defined(ZORBA_NO_ICU)">
566+ <value>malformed class subtraction</value>
567+ </entry>
568+
569+ <entry key="REGEX_INVALID_USE_OF_SUBCLASS" if="defined(ZORBA_NO_ICU)">
570+ <value>improper use of class subtraction: it must be the last construct in a class group [xxx-[yyy]]</value>
571+ </entry>
572+
573+ <entry key="REGEX_MULTICHAR_IN_CHAR_RANGE" if="defined(ZORBA_NO_ICU)">
574+ <value>multichars or char categories cannot be part of a char range</value>
575+ </entry>
576+
577+ <entry key="REGEX_MISSING_CLOSE_BRACKET" if="defined(ZORBA_NO_ICU)">
578+ <value>missing ']' in character group</value>
579+ </entry>
580+
581+ <entry key="REGEX_MAX_LT_MIN" if="defined(ZORBA_NO_ICU)">
582+ <value>in {min,max}, max is less than min</value>
583 </entry>
584
585 <entry key="UnaryArithOp">
586 <value>unary arithmetic operator</value>
587 </entry>
588
589- <entry key="UnbalancedChar_3">
590+ <entry key="UnbalancedChar_3" if="!defined(ZORBA_NO_ICU)">
591 <value>missing '$3'</value>
592 </entry>
593
594+ <entry key="UnescapedChar_3" if="!defined(ZORBA_NO_ICU)">
595+ <value>character '$3' must be escaped here</value>
596+ </entry>
597+
598 <entry key="UnexpectedElement">
599 <value>unexpected element</value>
600 </entry>
601
602=== modified file 'src/diagnostics/pregenerated/dict_en.cpp'
603--- src/diagnostics/pregenerated/dict_en.cpp 2012-03-28 05:19:57 +0000
604+++ src/diagnostics/pregenerated/dict_en.cpp 2012-04-06 00:18:21 +0000
605@@ -437,8 +437,12 @@
606 { "~AtomizationOfGroupByMakesMoreThanOneItem", "atomization of groupby variable produces more than one item" },
607 { "~AttributeName", "attribute name" },
608 { "~AttributeNode", "attribute node" },
609+#if !defined(ZORBA_NO_ICU)
610 { "~BackRef0Illegal", "\"0\": illegal backreference" },
611+#endif
612+#if !defined(ZORBA_NO_ICU)
613 { "~BackRefIllegalInCharClass", "backreference illegal in character class" },
614+#endif
615 { "~BadAnyURI", "invalid xs:anyURI" },
616 { "~BadArgTypeForFn_2o34o", "${\"2\": }invalid argument type for function $3()${: 4}" },
617 { "~BadCharAfter_34", "'$3': illegal character after '$4'" },
618@@ -451,7 +455,9 @@
619 { "~BadIterator", "invalid iterator" },
620 { "~BadLibraryModule", "invalid library module" },
621 { "~BadPath", "invalid path" },
622+#if !defined(ZORBA_NO_ICU)
623 { "~BadRegexEscape_3", "\"$3\": illegal escape character" },
624+#endif
625 { "~BadStreamState", "bad I/O stream state" },
626 { "~BadTokenInBraces_3", "\"$3\": illegal token within { }" },
627 { "~BadTraceStream", "trace stream not retrievable using SerializationCallback" },
628@@ -567,10 +573,14 @@
629 { "~NoUntypedKeyNodeValue_2", "node with untyped key value found during probe on index \"$2\"" },
630 { "~NodeIDNeedsBytes_2", "nodeid requires more than $2 bytes" },
631 { "~NodeIDTooBig", "nodeid component too big for encoding" },
632+#if !defined(ZORBA_NO_ICU)
633 { "~NonClosedBackRef_3", "'$$3': non-closed backreference" },
634+#endif
635 { "~NonFileThesaurusURI", "non-file thesaurus URI" },
636 { "~NonLocalhostAuthority", "non-localhost authority" },
637+#if !defined(ZORBA_NO_ICU)
638 { "~NonexistentBackRef_3", "'$$3': non-existent backreference" },
639+#endif
640 { "~NotAllowedForTypeName", "not allowed for typeName (use xsd:untyped instead)" },
641 { "~NotAmongInScopeSchemaTypes", "not among in-scope schema types" },
642 { "~NotDefInDynamicCtx", "not defined in dynamic context" },
643@@ -589,6 +599,69 @@
644 { "~ParserNoCreateTree", "XML tree creation failed" },
645 { "~PromotionImpossible", "promotion not possible" },
646 { "~QuotedColon_23", "\"$2\": $3" },
647+#if defined(ZORBA_NO_ICU)
648+ { "~REGEX_BROKEN_PIs_CONSTRUCT", "broken \\p{Is} construct; valid characters are [a-zA-Z0-9-]" },
649+#endif
650+#if defined(ZORBA_NO_ICU)
651+ { "~REGEX_BROKEN_P_CONSTRUCT", "broken \\p construct" },
652+#endif
653+#if defined(ZORBA_NO_ICU)
654+ { "~REGEX_INVALID_ATOM_CHAR", "$3 - invalid character for an atom; forbidden characters are: [{}?*+|^]" },
655+#endif
656+#if defined(ZORBA_NO_ICU)
657+ { "~REGEX_INVALID_BACK_REF", "\\$3 backreference to a non-existent capture group ($4 groups so far)" },
658+#endif
659+#if defined(ZORBA_NO_ICU)
660+ { "~REGEX_INVALID_SUBCLASS", "malformed class subtraction" },
661+#endif
662+#if defined(ZORBA_NO_ICU)
663+ { "~REGEX_INVALID_UNICODE_CODEPOINT_u", "invalid unicode hex, should be in form \\uXXXX or \\UXXXXXXXX" },
664+#endif
665+#if defined(ZORBA_NO_ICU)
666+ { "~REGEX_INVALID_USE_OF_SUBCLASS", "improper use of class subtraction: it must be the last construct in a class group [xxx-[yyy]]" },
667+#endif
668+#if defined(ZORBA_NO_ICU)
669+ { "~REGEX_MAX_LT_MIN", "in {min,max}, max is less than min" },
670+#endif
671+#if defined(ZORBA_NO_ICU)
672+ { "~REGEX_MISMATCHED_PAREN", "incorrectly nested parentheses" },
673+#endif
674+#if defined(ZORBA_NO_ICU)
675+ { "~REGEX_MISSING_CLOSE_BRACKET", "missing ']' in character group" },
676+#endif
677+#if defined(ZORBA_NO_ICU)
678+ { "~REGEX_MULTICHAR_IN_CHAR_RANGE", "multichars or char categories cannot be part of a char range" },
679+#endif
680+#if defined(ZORBA_NO_ICU)
681+ { "~REGEX_UNIMPLEMENTED", "use of regular expression feature that is not yet implemented" },
682+#endif
683+#if defined(ZORBA_NO_ICU)
684+ { "~REGEX_UNKNOWN_ESC_CHAR", "unknown \\? escape char; supported escapes are: \\[nrt\\|.?*+(){}[]-^$] for char escapes, \\[pP] for categories and \\[sSiIcCdDwW] for multichar groups" },
685+#endif
686+#if defined(ZORBA_NO_ICU)
687+ { "~REGEX_UNKNOWN_PC_CONSTRUCT", "unknown \\p{C?} category; supported categories: C, Cc, Cf, Co, Cn(for not assigned)" },
688+#endif
689+#if defined(ZORBA_NO_ICU)
690+ { "~REGEX_UNKNOWN_PIs_CONSTRUCT", "unknown \\p{Is} category block; see supported block escapes here: http://www.w3.org/TR/xmlschema-2/#charcter-classes" },
691+#endif
692+#if defined(ZORBA_NO_ICU)
693+ { "~REGEX_UNKNOWN_PL_CONSTRUCT", "unknown \\p{L?} category; supported categories: L, Lu, Ll, Lt, Lm, Lo" },
694+#endif
695+#if defined(ZORBA_NO_ICU)
696+ { "~REGEX_UNKNOWN_PM_CONSTRUCT", "unknown \\p{M?} category; supported categories: M, Mn, Mc, Me" },
697+#endif
698+#if defined(ZORBA_NO_ICU)
699+ { "~REGEX_UNKNOWN_PN_CONSTRUCT", "unknown \\p{N?} category; supported categories: N, Nd, Nl, No" },
700+#endif
701+#if defined(ZORBA_NO_ICU)
702+ { "~REGEX_UNKNOWN_PP_CONSTRUCT", "unknown \\p{P?} category; supported categories: P, Pc, Pd, Ps, Pe, Pi, Pf, Po" },
703+#endif
704+#if defined(ZORBA_NO_ICU)
705+ { "~REGEX_UNKNOWN_PS_CONSTRUCT", "unknown \\p{S?} category; supported categories: S, Sm, Sc, Sk, So" },
706+#endif
707+#if defined(ZORBA_NO_ICU)
708+ { "~REGEX_UNKNOWN_PZ_CONSTRUCT", "unknown \\p{Z?} category; supported categories: Z, Zs, Zl, Zp" },
709+#endif
710 { "~SEPM0009_Not10", "the version parameter has a value other than \"1.0\" and the doctype-system parameter is specified" },
711 { "~SEPM0009_NotOmit", "the standalone attribute has a value other than \"omit\"" },
712 { "~SchemaAttributeName", "schema-attribute name" },
713@@ -610,68 +683,73 @@
714 { "~TwoDecimalFormatsSameName_2", "\"$2\": two decimal formats with this name" },
715 { "~TwoDefaultDecimalFormats", "two default decimal formats" },
716 { "~TypeIsNotSubtype", "item type is not a subtype of \"$3\"" },
717-#if !defined(ZORBA_NO_UNICODE)
718+#if !defined(ZORBA_NO_ICU)
719 { "~U_REGEX_BAD_ESCAPE_SEQUENCE", "unrecognized backslash escape sequence" },
720 #endif
721-#if !defined(ZORBA_NO_UNICODE)
722+#if !defined(ZORBA_NO_ICU)
723 { "~U_REGEX_BAD_INTERVAL", "error in {min,max} interval" },
724 #endif
725-#if !defined(ZORBA_NO_UNICODE)
726+#if !defined(ZORBA_NO_ICU)
727 { "~U_REGEX_INTERNAL_ERROR", "an internal ICU error (bug) was detected" },
728 #endif
729-#if !defined(ZORBA_NO_UNICODE)
730+#if !defined(ZORBA_NO_ICU)
731 { "~U_REGEX_INVALID_BACK_REF", "backreference to a non-existent capture group" },
732 #endif
733-#if !defined(ZORBA_NO_UNICODE)
734+#if !defined(ZORBA_NO_ICU)
735 { "~U_REGEX_INVALID_FLAG", "invalid value for match mode flags" },
736 #endif
737-#if !defined(ZORBA_NO_UNICODE)
738+#if !defined(ZORBA_NO_ICU)
739 { "~U_REGEX_INVALID_RANGE", "in character range [x-y], x is greater than y" },
740 #endif
741-#if !defined(ZORBA_NO_UNICODE)
742+#if !defined(ZORBA_NO_ICU)
743 { "~U_REGEX_INVALID_STATE", "RegexMatcher in invalid state for requested operation" },
744 #endif
745-#if !defined(ZORBA_NO_UNICODE)
746+#if !defined(ZORBA_NO_ICU)
747 { "~U_REGEX_LOOK_BEHIND_LIMIT", "look-behind pattern matches must have a bounded maximum length" },
748 #endif
749-#if !defined(ZORBA_NO_UNICODE)
750+#if !defined(ZORBA_NO_ICU)
751 { "~U_REGEX_MAX_LT_MIN", "in {min,max}, max is less than min" },
752 #endif
753-#if !defined(ZORBA_NO_UNICODE)
754+#if !defined(ZORBA_NO_ICU)
755 { "~U_REGEX_MISMATCHED_PAREN", "incorrectly nested parentheses" },
756 #endif
757-#if !defined(ZORBA_NO_UNICODE)
758+#if !defined(ZORBA_NO_ICU)
759 { "~U_REGEX_MISSING_CLOSE_BRACKET", "missing ']'" },
760 #endif
761-#if !defined(ZORBA_NO_UNICODE)
762+#if !defined(ZORBA_NO_ICU)
763 { "~U_REGEX_NUMBER_TOO_BIG", "decimal number is too large" },
764 #endif
765-#if !defined(ZORBA_NO_UNICODE)
766+#if !defined(ZORBA_NO_ICU)
767 { "~U_REGEX_OCTAL_TOO_BIG", "octal character constants must be <= 0377" },
768 #endif
769-#if !defined(ZORBA_NO_UNICODE)
770+#if !defined(ZORBA_NO_ICU)
771 { "~U_REGEX_PROPERTY_SYNTAX", "incorrect Unicode property" },
772 #endif
773-#if !defined(ZORBA_NO_UNICODE)
774+#if !defined(ZORBA_NO_ICU)
775 { "~U_REGEX_RULE_SYNTAX", "syntax error" },
776 #endif
777-#if !defined(ZORBA_NO_UNICODE)
778+#if !defined(ZORBA_NO_ICU)
779 { "~U_REGEX_SET_CONTAINS_STRING", "can not have UnicodeSets containing strings" },
780 #endif
781-#if !defined(ZORBA_NO_UNICODE)
782+#if !defined(ZORBA_NO_ICU)
783 { "~U_REGEX_STACK_OVERFLOW", "backtrack stack overflow" },
784 #endif
785-#if !defined(ZORBA_NO_UNICODE)
786+#if !defined(ZORBA_NO_ICU)
787 { "~U_REGEX_STOPPED_BY_CALLER", "matching operation aborted by user callback fn" },
788 #endif
789-#if !defined(ZORBA_NO_UNICODE)
790+#if !defined(ZORBA_NO_ICU)
791 { "~U_REGEX_TIME_OUT", "maximum allowed match time exceeded" },
792 #endif
793-#if !defined(ZORBA_NO_UNICODE)
794+#if !defined(ZORBA_NO_ICU)
795 { "~U_REGEX_UNIMPLEMENTED", "use of regular expression feature that is not yet implemented" },
796 #endif
797 { "~UnaryArithOp", "unary arithmetic operator" },
798+#if !defined(ZORBA_NO_ICU)
799 { "~UnbalancedChar_3", "missing '$3'" },
800+#endif
801+#if !defined(ZORBA_NO_ICU)
802+ { "~UnescapedChar_3", "character '$3' must be escaped here" },
803+#endif
804 { "~UnexpectedElement", "unexpected element" },
805 { "~VarValMustBeSingleItem_2", "\"$2\": variable value must be single item" },
806 { "~Variable", "variable" },
807
808=== modified file 'src/precompiled/stdafx.h'
809--- src/precompiled/stdafx.h 2012-03-28 05:19:57 +0000
810+++ src/precompiled/stdafx.h 2012-04-06 00:18:21 +0000
811@@ -15,363 +15,81 @@
812
813 */
814
815-#if defined STDAFX
816-#include <iostream>
817-#include <stdexcept>
818-#include <cassert>
819-#include <cstring>
820-#include <memory>
821-
822-#include <sstream>
823-#include <xfwrap>
824-#include <xfwrap1>
825-#include <istream>
826-#include <cstdio>
827-#include <xxshared>
828-#include <crtdefs.h>
829-#include <map>
830-#include <set>
831-//#include <poppack.h>
832-//#include <xxtype_traits>
833-//#include <xxcallwrap>
834-
835-// #include <xxcallpmf>
836-// //#include <xxbind0>
837-// //#include <xxbind1>
838-// //#include <xxresult>
839-// #include <zorba/audit.h>
840-// #include "api/auditimpl.h"
841-// #include <zorba/audit.h>
842-
843- //#include "unicode/unistr.h"
844- #include "runtime/sequences/sequences.h"
845- #include "diagnostics/xquery_diagnostics.h"
846- #include "xercesc/util/xercesdefs.hpp"
847- #include "runtime/collections/collections.h"
848- #include "unicode/utypes.h"
849- #include "zorba/config.h"
850- #include "store/api/store.h"
851- #include "zorba/zorba.h"
852- #include "zorba/api_shared_types.h"
853- #include "compiler/parsetree/parsenodes.h"
854- #include "compiler/parser/parse_constants.h"
855- //#include "compiler/api/compilercb.h"
856- #include "zorbautils/checked_vector.h"
857- #include "compiler/parser/xquery_driver.h"
858- #include "util/sorter.h"
859- #include "compiler/xqueryx/xqueryx_to_xquery.h"
860-// #include "compiler/xqueryx/xqueryx_xslt.h"
861-//#include "compiler/parser/xquery_scanner.h"
862-//#include "compiler/parsetree/parsenode_base.h"
863-//#include "compiler/parsetree/parsenode_visitor.h"
864-// #include "runtime/core/flwor_iterator.h"
865-// #include "context/static_context.h"
866-// #include "zorbautils/fatal.h"
867-// #include "runtime/base/unarybase.h"
868-// #include "compiler/expression/expr_consts.h"
869-// #include "api/iterator_singleton.h"
870-// #include "runtime/visitors/printer_visitor_api.h"
871-// //#include "compiler/parsetree/parsenode_print_dot_visitor.h"
872-// //#include "compiler/parsetree/parsenode_print_dot_visitor.h"
873-// //#include "runtime/visitors/planiter_visitor_impl_code.h"
874-// //#include "runtime/visitors/planiter_visitor_impl_include.h"
875-// //#include "runtime/visitors/printer_visitor_impl.h"
876-// //#include "runtime/core/path.h"
877-// #include "compiler/expression/ft_expr.h"
878-// #include "compiler/expression/ftnode.h"
879-// #include "compiler/parser/query_loc.h"
880+#ifdef STDAFX
881+
882+ #include <fstream>
883+ #include <iostream>
884+ #include <stdexcept>
885+ #include <cassert>
886+ #include <cstring>
887+ #include <memory>
888+
889+ #include <sstream>
890+ #include <xfwrap>
891+ #include <xfwrap1>
892+ #include <istream>
893+ #include <cstdio>
894+ #include <xxshared>
895+ #include <crtdefs.h>
896+ #include <map>
897+ #include <set>
898+
899+ #include "runtime/sequences/sequences.h"
900+ #include "diagnostics/xquery_diagnostics.h"
901+ #include "xercesc/util/xercesdefs.hpp"
902+ #include "runtime/collections/collections.h"
903+ #include "unicode/utypes.h"
904+ #include "zorba/config.h"
905+ #include "store/api/store.h"
906+ #include "zorba/zorba.h"
907+ #include "zorba/api_shared_types.h"
908+ #include "compiler/parsetree/parsenodes.h"
909+ #include "compiler/parser/parse_constants.h"
910+ #include "zorbautils/checked_vector.h"
911+ #include "compiler/parser/xquery_driver.h"
912+ #include "util/sorter.h"
913+ #include "compiler/xqueryx/xqueryx_to_xquery.h"
914+ #include <zorba/store_manager.h>
915+ #include <zorba/xquery.h>
916+ #include <zorba/xquery_exception.h>
917 #include "util/cxx_util.h"
918-// #include "util/indent.h"
919-// #include "util/stl_util.h"
920-// #include "diagnostics/xquery_diagnostics.h"
921-// #include "zorbatypes/numconversions.h"
922+ #include "diagnostics/assert.h"
923+ #include "zorbatypes/mapm/m_apm_lc.h"
924+ #include "zorbatypes/datetime/parse.h"
925+ #include "zorbatypes/chartype.h"
926+ #include "zorbatypes/collation_manager.h"
927+ #include "zorbatypes/ft_token.h"
928+ #include "zorbatypes/m_apm.h"
929+ #include "zorbatypes/rclock.h"
930+ #include "zorbatypes/schema_types.h"
931+ #include "zorbatypes/timezone.h"
932+ #include "zorbatypes/transcoder.h"
933+ #include "zorbatypes/URI.h"
934+ #include "zorbatypes/xerces_xmlcharray.h"
935+ #include "zorbatypes/zorbatypes_decl.h"
936+ #include "zorbatypes/zstring.h"
937+ #include "zorbautils/condition.h"
938+ #include "zorbautils/hashfun.h"
939+ #include "zorbautils/hashmap.h"
940+ #include "zorbautils/hashmap_itemp.h"
941+ #include "zorbautils/hashmap_str_obj.h"
942+ #include "zorbautils/hashmap_zstring.h"
943+ #include "zorbautils/hashset.h"
944+ #include "zorbautils/hashset_itemh.h"
945+ #include "zorbautils/latch.h"
946+ #include "zorbautils/locale.h"
947+ #include "zorbautils/lock.h"
948+ #include "zorbautils/mutex.h"
949+ #include "zorbautils/runnable.h"
950+ #include "zorbautils/SAXParser.h"
951+ #include "zorbautils/stack.h"
952+ #include "zorbautils/string_util.h"
953+ #include "unit_tests/unit_test_list.h"
954+ #include "zorba/diagnostic_handler.h"
955+ #include "zorba/xquery_warning.h"
956+ #include "runtime/full_text/ftcontains_visitor.h"
957+ #include "store/api/ft_token_iterator.h"
958+ #include "store/naive/ft_token_store.h"
959
960-// #include "api/serialization/serializable.h"
961-// #include "api/serialization/serializer.h"
962-// #include "api/collectionimpl.h"
963-// #include "api/dynamiccontextimpl.h"
964-// #include "api/fileimpl.h"
965-// #include "api/functionimpl.h"
966-// #include "api/invoke_item_sequence.h"
967-// #include "api/itemfactoryimpl.h"
968-// #include "api/resultiteratorchainer.h"
969-// #include "api/resultiteratorimpl.h"
970-// #include "api/sax2impl.h"
971-// #include "api/serializerimpl.h"
972-// #include "api/staticcontextimpl.h"
973-// #include "api/storeiteratorimpl.h"
974-// #include "api/unmarshaller.h"
975-// #include "api/uri_resolver_wrappers.h"
976-// #include "api/vectoriterator.h"
977-// #include "api/xmldatamanagerimpl.h"
978-// //#include "api/xqueryimpl.h"
979-// #include "api/zorbaimpl.h"
980-// #include "capi/cdynamic_context.h"
981-// #include "capi/cexpression.h"
982-// #include "capi/cexternal_function.h"
983-// #include "capi/cimplementation.h"
984-// #include "capi/csequence.h"
985-// #include "capi/cstatic_context.h"
986-// #include "capi/error.h"
987-// #include "capi/external_module.h"
988-// #include "capi/single_item_sequence.h"
989-// #include "capi/user_item_sequence.h"
990-// #include "compiler/parser/flexlexer.h"
991-// #include "compiler/parser/ft_types.h"
992-// #include "compiler/parser/symbol_table.h"
993-// #include "compiler/parser/xqdoc_comment.h"
994-// #include "compiler/parsetree/parsenode_print_xml_visitor.h"
995-// #include "compiler/parsetree/parsenode_print_xqdoc_visitor.h"
996-// #include "compiler/parsetree/parsenode_print_xquery_visitor.h"
997-// #include "compiler/parsetree/parsenode_xqdoc_visitor.h"
998-// #include "compiler/translator/prolog_graph.h"
999-// #include "compiler/translator/translator.h"
1000-// #include "compiler/codegen/plan_visitor.h"
1001-// #include "compiler/expression/abstract_expr_visitor.h"
1002-// #include "compiler/expression/expr.h"
1003-// #include "compiler/expression/expr_annotations.h"
1004-// #include "compiler/expression/expr_base.h"
1005-// #include "compiler/expression/expr_classes.h"
1006-// #include "compiler/expression/expr_iter.h"
1007-// #include "compiler/expression/expr_utils.h"
1008-// #include "compiler/expression/expr_visitor.h"
1009-// #include "compiler/expression/flwor_expr.h"
1010-// //#include "compiler/expression/fo_expr.h"
1011-// #include "compiler/expression/ftnode_classes.h"
1012-// #include "compiler/expression/ftnode_visitor.h"
1013-// #include "compiler/expression/function_item_expr.h"
1014-// #include "compiler/expression/path_expr.h"
1015-// #include "compiler/expression/script_exprs.h"
1016-// #include "compiler/expression/update_exprs.h"
1017-// #include "compiler/expression/var_expr.h"
1018-// #include "compiler/rewriter/framework/rewriter.h"
1019-// #include "compiler/rewriter/framework/rewriter_context.h"
1020-// #include "compiler/rewriter/framework/rule_driver.h"
1021-// #include "compiler/rewriter/framework/sequential_rewriter.h"
1022-// #include "compiler/rewriter/rewriters/common_rewriter.h"
1023-// #include "compiler/rewriter/rewriters/default_optimizer.h"
1024-// #include "compiler/rewriter/rewriters/phase1_rewriter.h"
1025-// #include "compiler/rewriter/rules/ruleset.h"
1026-// #include "compiler/rewriter/rules/rule_base.h"
1027-// #include "compiler/rewriter/rules/type_rules.h"
1028-// #include "compiler/rewriter/tools/dataflow_annotations.h"
1029-// #include "compiler/rewriter/tools/expr_tools.h"
1030-// #include "compiler/rewriter/tools/udf_graph.h"
1031-// #include "compiler/xqddf/collection_decl.h"
1032-// #include "compiler/xqddf/value_ic.h"
1033-// #include "compiler/xqddf/value_index.h"
1034-// #include "compiler/semantic_annotations/annotations.h"
1035-// #include "compiler/semantic_annotations/annotation_holder.h"
1036-// #include "compiler/semantic_annotations/annotation_keys.h"
1037-// #include "compiler/api/compiler_api.h"
1038-// #include "compiler/api/compiler_api_impl.h"
1039-// #include "system/globalenv.h"
1040-// #include "system/properties.h"
1041-// #include "system/zorba_properties.h"
1042-// #include "context/decimal_format.h"
1043-// #include "context/default_uri_mappers.h"
1044-// #include "context/default_url_resolvers.h"
1045-// #include "context/dynamic_context.h"
1046-// #include "context/dynamic_loader.h"
1047-// #include "context/internal_uri_resolvers.h"
1048-// //#include "context/namespace_context.h"
1049-// #include "context/root_static_context.h"
1050-// #include "context/sctx_map_iterator.h"
1051-// #include "context/standard_uri_resolvers.h"
1052-// #include "context/static_context_consts.h"
1053-// #include "context/stemmer_wrappers.h"
1054-// #include "context/uri_resolver.h"
1055-// #include "context/uri_resolver_wrapper.h"
1056-#include "diagnostics/assert.h"
1057-// #include "diagnostics/diagnostic.h"
1058-// #include "diagnostics/dict.h"
1059-// #include "diagnostics/dict_impl.h"
1060-// #include "diagnostics/StackWalker.h"
1061-// #include "diagnostics/user_error.h"
1062-// #include "diagnostics/user_exception.h"
1063-// #include "diagnostics/xquery_exception.h"
1064-// #include "diagnostics/xquery_stack_trace.h"
1065-// #include "diagnostics/xquery_warning.h"
1066-// #include "diagnostics/zorba_exception.h"
1067-// //#include "functions/annotation.h"
1068-// #include "functions/external_function.h"
1069-// #include "functions/function.h"
1070-// #include "functions/function_consts.h"
1071-// #include "functions/function_impl.h"
1072-// #include "functions/func_accessors_impl.h"
1073-// #include "functions/func_apply.h"
1074-// #include "functions/func_arithmetic.h"
1075-// #include "functions/func_booleans_impl.h"
1076-// #include "functions/func_durations_dates_times_impl.h"
1077-// #include "functions/func_enclosed.h"
1078-// #include "functions/func_eval.h"
1079-// #include "functions/func_hoist.h"
1080-// #include "functions/func_index_ddl.h"
1081-// #include "functions/func_node_sort_distinct.h"
1082-// #include "functions/func_numerics_impl.h"
1083-// #include "functions/func_reflection.h"
1084-// #include "functions/func_sequences_impl.h"
1085-// #include "functions/func_var_decl.h"
1086-// #include "functions/library.h"
1087-// #include "functions/signature.h"
1088-// #include "functions/udf.h"
1089-// #include "runtime/full_text/thesauri/decode_base128.h"
1090-// #include "runtime/full_text/thesauri/encoded_list.h"
1091-// #include "runtime/full_text/thesauri/iso2788.h"
1092-// #include "runtime/full_text/thesauri/wn_db_segment.h"
1093-// #include "runtime/full_text/thesauri/wn_synset.h"
1094-// #include "runtime/full_text/thesauri/wn_thesaurus.h"
1095-// #include "runtime/full_text/thesauri/wn_types.h"
1096-// #include "runtime/full_text/thesauri/xqftts_relationship.h"
1097-// #include "runtime/full_text/thesauri/xqftts_thesaurus.h"
1098-// #include "runtime/full_text/ft_match.h"
1099-// #include "runtime/full_text/ft_query_item.h"
1100-// #include "runtime/full_text/ft_single_token_iterator.h"
1101-// #include "runtime/full_text/ft_stop_words_set.h"
1102-// #include "runtime/full_text/ft_thesaurus.h"
1103-// #include "runtime/full_text/ft_token_matcher.h"
1104-// #include "runtime/full_text/ft_token_seq_iterator.h"
1105-// #include "runtime/full_text/ft_token_span.h"
1106-// #include "runtime/full_text/ft_wildcard.h"
1107-// #include "runtime/full_text/full_text.h"
1108-// #include "runtime/full_text/apply.h"
1109-// #include "runtime/full_text/ft_util.h"
1110-// #include "runtime/collections/collections_base.h"
1111-// #include "runtime/core/apply_updates.h"
1112-// #include "runtime/core/arithmetic_impl.h"
1113-// #include "runtime/core/constructors.h"
1114-// #include "runtime/core/fncall_iterator.h"
1115-// #include "runtime/core/internal_operators.h"
1116-// #include "runtime/core/item_iterator.h"
1117-// #include "runtime/core/nodeid_iterators.h"
1118-// #include "runtime/core/path_iterators.h"
1119-// #include "runtime/core/sequencetypes.h"
1120-// #include "runtime/core/trycatch.h"
1121-// #include "runtime/core/var_iterators.h"
1122-// #include "runtime/numerics/NumericsImpl.h"
1123-// #include "runtime/booleans/BooleanImpl.h"
1124-// #include "runtime/base/binarybase.h"
1125-// #include "runtime/base/narybase.h"
1126-// #include "runtime/base/noarybase.h"
1127-// #include "runtime/base/plan_iterator.h"
1128-// #include "runtime/sequences/SequencesImpl.h"
1129-// #include "runtime/visitors/iterprinter.h"
1130-// #include "runtime/misc/materialize.h"
1131-// #include "runtime/scripting/scripting.h"
1132-// #include "types/schema/EventSchemaValidator.h"
1133-// #include "types/schema/LoadSchemaErrorHandler.h"
1134-// #include "types/schema/PrintSchema.h"
1135-// #include "types/schema/revalidateUtils.h"
1136-// #include "types/schema/schema.h"
1137-// #include "types/schema/SchemaValidatorFilter.h"
1138-// #include "types/schema/StrX.h"
1139-// #include "types/schema/validate.h"
1140-// #include "types/schema/ValidationEventHandler.h"
1141-// #include "types/schema/xercesIncludes.h"
1142-// #include "types/schema/XercesParseUtils.h"
1143-// #include "types/schema/XercSchemaValidator.h"
1144-// #include "types/casting.h"
1145-// #include "types/collation.h"
1146-// #include "types/node_test.h"
1147-// #include "types/root_typemanager.h"
1148-// #include "types/typeconstants.h"
1149-// #include "types/typeimpl.h"
1150-// #include "types/typemanager.h"
1151-// #include "types/typemanagerimpl.h"
1152-// #include "types/typeops.h"
1153-// #include "util/fx/fxarray.h"
1154-// #include "util/fx/fxcharheap.h"
1155-// #include "util/ascii_util.h"
1156-// #include "util/atomic_int.h"
1157-// #include "util/auto_vector.h"
1158-// #include "util/curl_util.h"
1159-// #include "util/dir.h"
1160-// #include "util/dynamic_bitset.h"
1161-// #include "util/empty.h"
1162-// #include "util/error_util.h"
1163-// #include "util/fs_util.h"
1164-// #include "util/hashmap.h"
1165-// //#include "util/hashmap32.h"
1166-// #include "util/less.h"
1167-// #include "util/mmap_file.h"
1168-// #include "util/nonatomic_int.h"
1169-// #include "util/omanip.h"
1170-// #include "util/oseparator.h"
1171-// #include "util/regex.h"
1172-// #include "util/singleton.h"
1173-// #include "util/string_util.h"
1174-// #include "util/threads.h"
1175-// #include "util/tokenbuf.h"
1176-// #include "util/tracer.h"
1177-// #include "util/triple.h"
1178-// #include "util/unicode_categories.h"
1179-// #include "util/unicode_util.h"
1180-// #include "util/uri_util.h"
1181-// #include "util/utf8_string.h"
1182-// #include "util/utf8_util.h"
1183-// #include "util/utf8_util_base.h"
1184-// #include "util/void_int.h"
1185-// #include "util/xml_util.h"
1186-// #include "zorbamisc/config/platform.h"
1187-// //#include "zorbaserialization/archiver.h"
1188-// #include "zorbaserialization/base64impl.h"
1189-// #include "zorbaserialization/bin_archiver.h"
1190-// //#include "zorbaserialization/class_serializer.h"
1191-// #include "zorbaserialization/mem_archiver.h"
1192-// #include "zorbaserialization/serialization_engine.h"
1193-// #include "zorbaserialization/template_serializer.h"
1194-// #include "zorbaserialization/xml_archiver.h"
1195-// #include "zorbaserialization/zorba_class_serializer.h"
1196- #include "zorbatypes/mapm/m_apm_lc.h"
1197- #include "zorbatypes/datetime/parse.h"
1198- //#include "zorbatypes/binary.h"
1199- #include "zorbatypes/chartype.h"
1200- #include "zorbatypes/collation_manager.h"
1201- //#include "zorbatypes/datetime.h"
1202- //#include "zorbatypes/decimal.h"
1203- //#include "zorbatypes/duration.h"
1204- //#include "zorbatypes/floatimpl.h"
1205- #include "zorbatypes/ft_token.h"
1206- //#include "zorbatypes/integer.h"
1207- #include "zorbatypes/libicu.h"
1208- #include "zorbatypes/m_apm.h"
1209- //#include "zorbatypes/rchandle.h"
1210- #include "zorbatypes/rclock.h"
1211- //#include "zorbatypes/regex_ascii.h"
1212- #include "zorbatypes/schema_types.h"
1213- #include "zorbatypes/timezone.h"
1214- #include "zorbatypes/transcoder.h"
1215- #include "zorbatypes/URI.h"
1216- #include "zorbatypes/xerces_xmlcharray.h"
1217- #include "zorbatypes/zorbatypes_decl.h"
1218- #include "zorbatypes/zstring.h"
1219- //#include "zorbautils/stemmer/sb_stemmer.h"
1220- #include "zorbautils/condition.h"
1221- #include "zorbautils/hashfun.h"
1222- #include "zorbautils/hashmap.h"
1223- #include "zorbautils/hashmap_itemp.h"
1224- #include "zorbautils/hashmap_str_obj.h"
1225- #include "zorbautils/hashmap_zstring.h"
1226- #include "zorbautils/hashset.h"
1227- #include "zorbautils/hashset_itemh.h"
1228- //#include "zorbautils/icu_tokenizer.h"
1229- #include "zorbautils/latch.h"
1230- #include "zorbautils/locale.h"
1231- #include "zorbautils/lock.h"
1232- #include "zorbautils/mutex.h"
1233- #include "zorbautils/runnable.h"
1234- #include "zorbautils/SAXParser.h"
1235- #include "zorbautils/stack.h"
1236-// #include "zorbautils/stemmer.h"
1237- #include "zorbautils/string_util.h"
1238- //#include "zorbautils/synchronous_logger.h"
1239- //#include "zorbautils/tokenizer.h"
1240- #include "unit_tests/unit_test_list.h"
1241- #include "zorba/diagnostic_handler.h"
1242- #include "zorba/xquery_warning.h"
1243- #include "runtime/full_text/ftcontains_visitor.h"
1244- #include "store/naive/naive_ft_token_iterator.h"
1245- #include "store/api/ft_token_iterator.h"
1246- #include "store/naive/ft_token_store.h"
1247 #endif
1248 /* vim:set et sw=2 ts=2: */
1249
1250=== modified file 'src/runtime/full_text/CMakeLists.txt'
1251--- src/runtime/full_text/CMakeLists.txt 2012-03-28 05:19:57 +0000
1252+++ src/runtime/full_text/CMakeLists.txt 2012-04-06 00:18:21 +0000
1253@@ -42,11 +42,11 @@
1254 default_tokenizer.cpp
1255 )
1256
1257-IF (ZORBA_NO_UNICODE)
1258+IF (ZORBA_NO_ICU)
1259 LIST(APPEND FULLTEXT_SRCS latin_tokenizer.cpp)
1260-ELSE (ZORBA_NO_UNICODE)
1261+ELSE (ZORBA_NO_ICU)
1262 LIST(APPEND FULLTEXT_SRCS icu_tokenizer.cpp)
1263-ENDIF (ZORBA_NO_UNICODE)
1264+ENDIF (ZORBA_NO_ICU)
1265
1266 ADD_SRC_SUBFOLDER(FULLTEXT_SRCS stemmer LIBSTEMMER_SRCS)
1267
1268
1269=== modified file 'src/runtime/full_text/default_tokenizer.cpp'
1270--- src/runtime/full_text/default_tokenizer.cpp 2012-03-28 05:19:57 +0000
1271+++ src/runtime/full_text/default_tokenizer.cpp 2012-04-06 00:18:21 +0000
1272@@ -19,22 +19,22 @@
1273 #include <zorba/config.h>
1274
1275 #include "default_tokenizer.h"
1276-#ifdef ZORBA_NO_UNICODE
1277+#ifdef ZORBA_NO_ICU
1278 # include "latin_tokenizer.h"
1279 #else
1280 # include "icu_tokenizer.h"
1281-#endif /* ZORBA_NO_UNICODE */
1282+#endif /* ZORBA_NO_ICU */
1283
1284 namespace zorba {
1285
1286 ///////////////////////////////////////////////////////////////////////////////
1287
1288 TokenizerProvider const& default_tokenizer_provider() {
1289-#ifdef ZORBA_NO_UNICODE
1290+#ifdef ZORBA_NO_ICU
1291 static LatinTokenizerProvider const instance;
1292 #else
1293 static ICU_TokenizerProvider const instance;
1294-#endif /* ZORBA_NO_UNICODE */
1295+#endif /* ZORBA_NO_ICU */
1296 return instance;
1297 };
1298
1299
1300=== modified file 'src/runtime/full_text/latin_tokenizer.cpp'
1301--- src/runtime/full_text/latin_tokenizer.cpp 2012-03-28 05:19:57 +0000
1302+++ src/runtime/full_text/latin_tokenizer.cpp 2012-04-06 00:18:21 +0000
1303@@ -18,8 +18,9 @@
1304 #include <functional>
1305
1306 #include <zorba/diagnostic_list.h>
1307-#include <zorba/xquery_exception.h>
1308-#include <zorba/zorba.h>
1309+
1310+#include "diagnostics/dict.h"
1311+#include "diagnostics/xquery_exception.h"
1312
1313 #include "latin_tokenizer.h"
1314
1315
1316=== modified file 'src/runtime/full_text/latin_tokenizer.h'
1317--- src/runtime/full_text/latin_tokenizer.h 2012-03-28 05:19:57 +0000
1318+++ src/runtime/full_text/latin_tokenizer.h 2012-04-06 00:18:21 +0000
1319@@ -14,12 +14,12 @@
1320 * limitations under the License.
1321 */
1322
1323-#ifndef ZORBA_WESTERN_TOKENIZER_H
1324-#define ZORBA_WESTERN_TOKENIZER_H
1325+#ifndef ZORBA_LATIN_TOKENIZER_H
1326+#define ZORBA_LATIN_TOKENIZER_H
1327
1328 #include <zorba/config.h>
1329
1330-#ifdef ZORBA_NO_FULL_TEXT
1331+#ifdef ZORBA_NO_ICU
1332
1333 #include <zorba/tokenizer.h>
1334 #include "zorbatypes/zstring.h"
1335@@ -38,8 +38,8 @@
1336
1337 // inherited
1338 void destroy() const;
1339- void tokenize( char const*, size_type, iso639_1::type, bool, Callback&,
1340- void* );
1341+ void tokenize( char const*, size_type, locale::iso639_1::type, bool,
1342+ Callback&, void* );
1343
1344 private:
1345 typedef zstring string_type;
1346@@ -64,13 +64,14 @@
1347 class LatinTokenizerProvider : public TokenizerProvider {
1348 public:
1349 // inherited
1350- Tokenizer::ptr getTokenizer( iso639_1::type, Tokenizer::Numbers& ) const;
1351+ Tokenizer::ptr getTokenizer( locale::iso639_1::type,
1352+ Tokenizer::Numbers& ) const;
1353 };
1354
1355 ///////////////////////////////////////////////////////////////////////////////
1356
1357 } // namespace zorba
1358
1359-#endif /* ZORBA_NO_FULL_TEXT */
1360-#endif /* ZORBA_WESTERN_TOKENIZER_H */
1361+#endif /* ZORBA_NO_ICU */
1362+#endif /* ZORBA_LATIN_TOKENIZER_H */
1363 /* vim:set et sw=2 ts=2: */
1364
1365=== modified file 'src/runtime/numerics/format_integer_impl.cpp'
1366--- src/runtime/numerics/format_integer_impl.cpp 2012-03-28 05:19:57 +0000
1367+++ src/runtime/numerics/format_integer_impl.cpp 2012-04-06 00:18:21 +0000
1368@@ -881,7 +881,7 @@
1369 utf8_result += (*valueit);
1370 }
1371 else
1372- utf8_result += (0x2080 + *valueit - '0');
1373+ utf8_result += (unicode::code_point)(0x2080 + *valueit - '0');
1374 }
1375 }
1376 else if((c0 == 0x2460) || //CIRCLED DIGIT ONE (1-20)
1377
1378=== modified file 'src/runtime/numerics/numerics_impl.cpp'
1379--- src/runtime/numerics/numerics_impl.cpp 2012-03-28 05:19:57 +0000
1380+++ src/runtime/numerics/numerics_impl.cpp 2012-04-06 00:18:21 +0000
1381@@ -462,7 +462,7 @@
1382 minus( "-" )
1383 {
1384 utf8_string<zstring> u_per_mille( per_mille );
1385- u_per_mille = 0x2030;
1386+ u_per_mille = (unicode::code_point)0x2030;
1387 }
1388
1389 void readFormat(const DecimalFormat_t& df_t)
1390
1391=== modified file 'src/runtime/strings/strings_impl.cpp'
1392--- src/runtime/strings/strings_impl.cpp 2012-03-28 05:19:57 +0000
1393+++ src/runtime/strings/strings_impl.cpp 2012-04-06 00:18:21 +0000
1394@@ -810,7 +810,9 @@
1395 zstring normForm;
1396 zstring resStr;
1397 unicode::normalization::type normType;
1398+#ifndef ZORBA_NO_ICU
1399 bool success;
1400+#endif /* ZORBA_NO_ICU */
1401
1402 PlanIteratorState* state;
1403 DEFAULT_STACK_INIT(PlanIteratorState, state, planState);
1404@@ -860,10 +862,10 @@
1405 }
1406
1407 item0->getStringValue2(resStr);
1408-#ifndef ZORBA_NO_UNICODE
1409+#ifndef ZORBA_NO_ICU
1410 success = utf8::normalize(resStr, normType, &resStr);
1411 ZORBA_ASSERT(success);
1412-#endif//#ifndef ZORBA_NO_UNICODE
1412+#endif /* ZORBA_NO_ICU */
1414 STACK_PUSH(GENV_ITEMFACTORY->createString(result, resStr), state );
1415 }
1416 else
1417@@ -992,7 +994,7 @@
1418 trans_map[ *map_i ] = *trans_i;
1419
1420 for ( ; map_i != map_end; ++map_i )
1421- trans_map[ *map_i ] = ~0;
1422+ trans_map[ *map_i ] = static_cast<unicode::code_point>( ~0 );
1423 }
1424
1425 utf8_string<zstring> u_result_string( result_string );
1426@@ -1007,7 +1009,7 @@
1427 cp_map_type::const_iterator const found_i = trans_map.find( cp );
1428 if ( found_i != trans_map.end() ) {
1429 cp = found_i->second;
1430- if ( cp == ~0 )
1431+ if ( cp == static_cast<unicode::code_point>( ~0 ) )
1432 continue;
1433 }
1434 u_result_string += cp;
1435@@ -1795,16 +1797,33 @@
1436 int &utf8start,
1437 unsigned int &bytestart,
1438 int utf8end,
1439+ unsigned int byteend,
1440 zstring &out)
1441 {
1442+#ifndef ZORBA_NO_ICU
1443 utf8::size_type clen;
1444- while(utf8start < utf8end)
1445- {
1446- clen = utf8::char_length(*sin);
1447- out.append(sin, clen);
1448- utf8start++;
1449- bytestart += clen;
1450- sin += clen;
1451+ if(utf8end)
1452+ {
1453+ while(utf8start < utf8end)
1454+ {
1455+ clen = utf8::char_length(*sin);
1456+ if(clen == 0)
1457+ clen = 1;
1458+ out.append(sin, clen);
1459+ utf8start++;
1460+ bytestart += clen;
1461+ sin += clen;
1462+ }
1463+ }
1464+ else
1465+#endif
1466+ {
1467+ if(!utf8end)
1468+ utf8end = byteend;
1469+ out.append(sin, utf8end-bytestart);
1470+ sin += utf8end-bytestart;
1471+ utf8start = utf8end;
1472+ bytestart = utf8end;
1473 }
1474 }
1475
1476@@ -1812,6 +1831,7 @@
1477 int &match_end1,
1478 unsigned int &match_end1_bytes,
1479 int match_start2,
1480+ unsigned int match_start2_bytes,
1481 const char *&strin)
1482 {
1483 store::Item_t non_match_elem;
1484@@ -1833,7 +1853,7 @@
1485 // utf8_it++;
1486 // match_end1++;
1487 //}
1488- copyUtf8Chars(strin, match_end1, match_end1_bytes, match_start2, non_match_str);
1489+ copyUtf8Chars(strin, match_end1, match_end1_bytes, match_start2, match_start2_bytes, non_match_str);
1490 store::Item_t non_match_text_item;
1491 GENV_ITEMFACTORY->createTextNode(non_match_text_item, non_match_elem, non_match_str);
1492 }
1493@@ -1864,19 +1884,31 @@
1494 i--;
1495 break;
1496 }
1497+#ifndef ZORBA_NO_ICU
1498 match_startg = rx.get_match_start(i+1);
1499 if((match_startg < 0) && (gparent < 0))
1500 continue;
1501+#else
1502+ int temp_endg;
1503+ match_startg = -1;
1504+ temp_endg = -1;
1505+ if(!rx.get_match_start_end_bytes(i+1, &match_startg, &temp_endg) && (gparent < 0))
1506+ continue;
1507+#endif
1508 if(match_endgood < match_startg)
1509 {
1510 //add non-group match text
1511 zstring non_group_str;
1512
1513- copyUtf8Chars(sin, match_endgood, match_end1_bytes, match_startg, non_group_str);
1514+ copyUtf8Chars(sin, match_endgood, match_end1_bytes, match_startg, 0, non_group_str);
1515 store::Item_t non_group_text_item;
1516 GENV_ITEMFACTORY->createTextNode(non_group_text_item, parent.getp(), non_group_str);
1517 }
1518+#ifndef ZORBA_NO_ICU
1519 match_endg = rx.get_match_end(i+1);
1520+#else
1521+ match_endg = temp_endg;
1522+#endif
1523 //add group match text
1524 GENV_ITEMFACTORY->createQName(group_element_name,
1525 static_context::W3C_FN_NS, "fn", "group");
1526@@ -1907,7 +1939,7 @@
1527 }
1528 zstring group_str;
1529
1530- copyUtf8Chars(sin, match_startg, match_end1_bytes, match_endg, group_str);
1531+ copyUtf8Chars(sin, match_startg, match_end1_bytes, match_endg, 0, group_str);
1532 store::Item_t group_text_item;
1533 GENV_ITEMFACTORY->createTextNode(group_text_item, group_elem.getp(), group_str);
1534 }
1535@@ -1916,7 +1948,7 @@
1536 {
1537 zstring non_group_str;
1538
1539- copyUtf8Chars(sin, match_endgood, match_end1_bytes, match_end2, non_group_str);
1540+ copyUtf8Chars(sin, match_endgood, match_end1_bytes, match_end2, 0, non_group_str);
1541 store::Item_t non_group_text_item;
1542 GENV_ITEMFACTORY->createTextNode(non_group_text_item, parent, non_group_str);
1543 }
1544@@ -2144,8 +2176,14 @@
1545 reachedEnd = false;
1546 while(rx.find_next_match(&reachedEnd))
1547 {
1548- int match_start2 = rx.get_match_start();
1549- int match_end2 = rx.get_match_end();
1550+ int match_start2;
1551+ int match_end2;
1552+#ifndef ZORBA_NO_ICU
1553+ match_start2 = rx.get_match_start();
1554+ match_end2 = rx.get_match_end();
1555+#else
1556+ rx.get_match_start_end_bytes(0, &match_start2, &match_end2);
1557+#endif
1558 ZORBA_ASSERT(match_start2 >= 0);
1559
1560 if(is_input_stream && reachedEnd && !instream->eof())
1561@@ -2157,7 +2195,7 @@
1562 //construct the fn:non-match
1563 if(match_start2 > match_end1)
1564 {
1565- addNonMatchElement(result, match_end1, match_end1_bytes, match_start2, instr);
1566+ addNonMatchElement(result, match_end1, match_end1_bytes, match_start2, 0, instr);
1567 }
1568
1569 //construct the fn:match
1570@@ -2165,7 +2203,7 @@
1571 match_end1 = match_end2;
1572 }
1573
1574- if(is_input_stream && reachedEnd && !instream->eof())
1575+ if(is_input_stream && !instream->eof())
1576 {
1577 //load some more data, maybe the match will be different
1578 if(match_end1_bytes)
1579@@ -2213,7 +2251,7 @@
1580 else
1581 {
1582 if(match_end1_bytes < streambuf_read)
1583- addNonMatchElement(result, match_end1, match_end1_bytes, streambuf_read, instr);
1584+ addNonMatchElement(result, match_end1, match_end1_bytes, 0, streambuf_read, instr);
1585 if(is_input_stream && instream->eof())
1586 reachedEnd = true;
1587 }
1588
1589=== modified file 'src/store/api/store.h'
1590--- src/store/api/store.h 2012-03-28 05:19:57 +0000
1591+++ src/store/api/store.h 2012-04-06 00:18:21 +0000
1592@@ -16,7 +16,7 @@
1593 #ifndef ZORBA_STORE_STORE_H
1594 #define ZORBA_STORE_STORE_H
1595
1596-#include <zorba/config.h>
1597+#include "zorba/config.h"
1598 #include "zorbatypes/schema_types.h"
1599
1600 #include "store/api/shared_types.h"
1601
1602=== modified file 'src/store/naive/simple_store.h'
1603--- src/store/naive/simple_store.h 2012-03-28 23:58:23 +0000
1604+++ src/store/naive/simple_store.h 2012-04-06 00:18:21 +0000
1605@@ -16,7 +16,11 @@
1606 #ifndef ZORBA_SIMPLE_STORE
1607 #define ZORBA_SIMPLE_STORE
1608
1609-#include "store.h"
1610+#include "store/naive/store.h"
1611+
1612+#include "store/naive/node_factory.h"
1613+#include "store/naive/pul_primitive_factory.h"
1614+#include "store/naive/tree_id_generator.h"
1615
1616 namespace zorba {
1617 namespace simplestore {
1618@@ -72,7 +76,7 @@
1619
1620 NodeFactory* createNodeFactory() const;
1621
1622- void destroyNodeFactory(NodeFactory*) const;
1623+ void destroyNodeFactory(zorba::simplestore::NodeFactory*) const;
1624
1625 store::ItemFactory* createItemFactory() const;
1626
1627@@ -84,7 +88,7 @@
1628
1629 PULPrimitiveFactory* createPULFactory() const;
1630
1631- void destroyPULFactory(PULPrimitiveFactory*) const;
1632+ void destroyPULFactory(zorba::simplestore::PULPrimitiveFactory*) const;
1633
1634 CollectionSet* createCollectionSet() const;
1635
1636
1637=== modified file 'src/store/naive/store.cpp'
1638--- src/store/naive/store.cpp 2012-03-28 22:09:36 +0000
1639+++ src/store/naive/store.cpp 2012-04-06 00:18:21 +0000
1640@@ -33,7 +33,7 @@
1641
1642 #include "properties.h"
1643 #include "string_pool.h"
1644-#include "store.h"
1645+#include "simple_store.h"
1646 #include "simple_temp_seq.h"
1647 #include "simple_lazy_temp_seq.h"
1648 #include "collection.h"
1649
1650=== modified file 'src/store/naive/store.h'
1651--- src/store/naive/store.h 2012-03-28 22:09:36 +0000
1652+++ src/store/naive/store.h 2012-04-06 00:18:21 +0000
1653@@ -16,10 +16,18 @@
1654 #ifndef ZORBA_SIMPLESTORE_STORE_H
1655 #define ZORBA_SIMPLESTORE_STORE_H
1656
1657+#include "store/api/store.h"
1658+
1659 #include "shared_types.h"
1660 #include "store_defs.h"
1661 #include "hashmap_nodep.h"
1662 #include "tree_id.h"
1663+#include "store/util/hashmap_stringbuf.h"
1664+#include "zorbautils/mutex.h"
1665+#include "zorbautils/lock.h"
1666+#include "zorbautils/hashmap.h"
1667+#include "zorbautils/hashmap_itemp.h"
1668+#include "zorbautils/hashmap_zstring_nonserializable.h"
1669
1670 #if (defined (WIN32) || defined (WINCE))
1671 #include "node_items.h"
1672@@ -28,14 +36,7 @@
1673 #include "store/api/ic.h"
1674 #endif
1675
1676-#include "store/api/store.h"
1677-
1678-#include "store/util/hashmap_stringbuf.h"
1679-
1680-#include "zorbautils/mutex.h"
1681-#include "zorbautils/lock.h"
1682-#include "zorbautils/hashmap_itemp.h"
1683-#include "zorbautils/hashmap_zstring_nonserializable.h"
1684+using namespace zorba;
1685
1686 namespace zorba
1687 {
1688@@ -63,9 +64,9 @@
1689 class TreeIdGeneratorFactory;
1690 class TreeIdGenerator;
1691
1692-typedef zorba::HashMapZString<XmlNode_t> DocumentSet;
1693-typedef ItemPointerHashMap<store::Index_t> IndexSet;
1694-typedef ItemPointerHashMap<store::IC_t> ICSet;
1695+typedef HashMapZString<XmlNode_t> DocumentSet;
1696+typedef zorba::ItemPointerHashMap<store::Index_t> IndexSet;
1697+typedef zorba::ItemPointerHashMap<store::IC_t> ICSet;
1698
1699
1700
1701
1702=== modified file 'src/system/globalenv.cpp'
1703--- src/system/globalenv.cpp 2012-03-28 05:19:57 +0000
1704+++ src/system/globalenv.cpp 2012-04-06 00:18:21 +0000
1705@@ -17,11 +17,11 @@
1706
1707 #include "common/common.h"
1708
1709-#ifndef ZORBA_NO_UNICODE
1710+#ifndef ZORBA_NO_ICU
1711 # include <unicode/uclean.h>
1712 # include <unicode/utypes.h>
1713 # include <unicode/udata.h>
1714-#endif /* ZORBA_NO_UNICODE */
1715+#endif /* ZORBA_NO_ICU */
1716
1717 #ifdef ZORBA_WITH_BIG_INTEGER
1718 # include "zorbatypes/m_apm.h"
1719@@ -208,7 +208,7 @@
1720 // from one thread only
1721 // see http://www.icu-project.org/userguide/design.html#Init_and_Termination
1722 // and http://www.icu-project.org/apiref/icu4c/uclean_8h.html
1723-#ifndef ZORBA_NO_UNICODE
1724+#ifndef ZORBA_NO_ICU
1725 # if defined U_STATIC_IMPLEMENTATION && (defined WIN32 || defined WINCE)
1726 {
1727 TCHAR self_path[1024];
1728@@ -238,13 +238,13 @@
1729 udata_setCommonData(icu_appdata, &data_err);
1730 ZORBA_ASSERT(data_err == U_ZERO_ERROR);
1731
1732- // u_setDataDirectory(self_path);
1733+ // u_setDataDirectory(self_path);
1734 }
1735 # endif
1736 UErrorCode lICUInitStatus = U_ZERO_ERROR;
1737 u_init(&lICUInitStatus);
1738 ZORBA_ASSERT(lICUInitStatus == U_ZERO_ERROR);
1739-#endif//ifndef ZORBA_NO_UNICODE
1740+#endif /* ZORBA_NO_ICU */
1741 }
1742
1743
1744@@ -256,12 +256,12 @@
1745 // releases statically initialized memory and prevents
1746 // valgrind from reporting those problems at the end
1747 // see http://www.icu-project.org/apiref/icu4c/uclean_8h.html#93f27d0ddc7c196a1da864763f2d8920
1748-#ifndef ZORBA_NO_UNICODE
1749+#ifndef ZORBA_NO_ICU
1750 u_cleanup();
1751 # if defined U_STATIC_IMPLEMENTATION && (defined WIN32 || defined WINCE)
1752 delete[] icu_appdata;
1753 # endif
1754-#endif//ifndef ZORBA_NO_UNICODE
1755+#endif /* ZORBA_NO_ICU */
1756 }
1757
1758
1759
1760=== modified file 'src/unit_tests/CMakeLists.txt'
1761--- src/unit_tests/CMakeLists.txt 2012-03-28 05:19:57 +0000
1762+++ src/unit_tests/CMakeLists.txt 2012-04-06 00:18:21 +0000
1763@@ -29,9 +29,9 @@
1764 tokenizer.cpp)
1765 ENDIF (NOT ZORBA_NO_FULL_TEXT)
1766
1767-IF (NOT ZORBA_NO_UNICODE)
1768+IF (NOT ZORBA_NO_ICU)
1769 LIST (APPEND UNIT_TEST_SRCS
1770 test_icu_streambuf.cpp)
1771-ENDIF (NOT ZORBA_NO_UNICODE)
1772+ENDIF (NOT ZORBA_NO_ICU)
1773
1774 # vim:set et sw=2 tw=2:
1775
1776=== modified file 'src/unit_tests/string.cpp'
1777--- src/unit_tests/string.cpp 2012-03-28 05:19:57 +0000
1778+++ src/unit_tests/string.cpp 2012-04-06 00:18:21 +0000
1779@@ -569,6 +569,7 @@
1780 ASSERT_TRUE( t == s );
1781 }
1782
1783+#ifndef ZORBA_NO_ICU
1784 template<class StringType>
1785 static void test_to_string_from_wchar_t() {
1786 wchar_t const w[] = L"hello";
1787@@ -578,6 +579,7 @@
1788 for ( string::size_type i = 0; i < s.length(); ++i )
1789 ASSERT_TRUE( s[i] == w[i] );
1790 }
1791+#endif /* ZORBA_NO_ICU */
1792
1793 template<class StringType>
1794 static void test_to_upper() {
1795@@ -605,6 +607,7 @@
1796 }
1797 }
1798
1799+#ifndef ZORBA_NO_ICU
1800 static void test_to_wchar_t() {
1801 string const s = "hello";
1802 wchar_t *w;
1803@@ -616,6 +619,7 @@
1804 ASSERT_TRUE( w[i] == s[i] );
1805 delete[] w;
1806 }
1807+#endif /* ZORBA_NO_ICU */
1808
1809 static void test_trim_start() {
1810 char const *s;
1811@@ -873,16 +877,20 @@
1812 test_to_string_from_utf8<zstring>();
1813 test_to_string_from_utf8<zstring_p>();
1814
1815+#ifndef ZORBA_NO_ICU
1816 test_to_string_from_wchar_t<string>();
1817 test_to_string_from_wchar_t<zstring>();
1818 test_to_string_from_wchar_t<zstring_p>();
1819+#endif /* ZORBA_NO_ICU */
1820
1821 test_to_upper<string>();
1822 test_to_upper<zstring>();
1823 test_to_upper<zstring_p>();
1824 test_to_upper<String>();
1825
1826+#ifndef ZORBA_NO_ICU
1827 test_to_wchar_t();
1828+#endif /* ZORBA_NO_ICU */
1829
1830 test_trim_start();
1831 test_trim_end();
1832
1833=== modified file 'src/unit_tests/unit_test_list.h'
1834--- src/unit_tests/unit_test_list.h 2012-03-28 05:19:57 +0000
1835+++ src/unit_tests/unit_test_list.h 2012-04-06 00:18:21 +0000
1836@@ -36,9 +36,9 @@
1837 /**
1838 * ADD NEW UNIT TESTS HERE
1839 */
1840-#ifndef ZORBA_NO_UNICODE
1841+#ifndef ZORBA_NO_ICU
1842 int test_icu_streambuf( int, char*[] );
1843-#endif /* ZORBA_NO_UNICODE */
1844+#endif /* ZORBA_NO_ICU */
1845 int json_parser( int, char*[] );
1846
1847 void initializeTestList();
1848
1849=== modified file 'src/unit_tests/unit_tests.cpp'
1850--- src/unit_tests/unit_tests.cpp 2012-03-28 05:19:57 +0000
1851+++ src/unit_tests/unit_tests.cpp 2012-04-06 00:18:21 +0000
1852@@ -39,9 +39,9 @@
1853 void initializeTestList() {
1854 libunittests["string"] = test_string;
1855 libunittests["uri"] = runUriTest;
1856-#ifndef ZORBA_NO_UNICODE
1857+#ifndef ZORBA_NO_ICU
1858 libunittests["icu_streambuf"] = test_icu_streambuf;
1859-#endif /* ZORBA_NO_UNICODE */
1860+#endif /* ZORBA_NO_ICU */
1861 libunittests["json_parser"] = json_parser;
1862 libunittests["unique_ptr"] = test_unique_ptr;
1863 #ifndef ZORBA_NO_FULL_TEXT
1864
1865=== modified file 'src/util/CMakeLists.txt'
1866--- src/util/CMakeLists.txt 2012-03-28 05:19:57 +0000
1867+++ src/util/CMakeLists.txt 2012-04-06 00:18:21 +0000
1868@@ -40,14 +40,14 @@
1869 LIST(APPEND UTIL_SRCS mmap_file.cpp)
1870 ENDIF(ZORBA_WITH_FILE_ACCESS)
1871
1872-IF(ZORBA_NO_UNICODE)
1873+IF(ZORBA_NO_ICU)
1874 LIST(APPEND UTIL_SRCS
1875- regex_ascii.cpp
1876+ regex_xquery.cpp
1877 passthru_streambuf.cpp)
1878-ELSE(ZORBA_NO_UNICODE)
1879+ELSE(ZORBA_NO_ICU)
1880 LIST(APPEND UTIL_SRCS
1881 icu_streambuf.cpp)
1882-ENDIF(ZORBA_NO_UNICODE)
1883+ENDIF(ZORBA_NO_ICU)
1884
1885 HEADER_GROUP_SUBFOLDER(UTIL_SRCS fx)
1886 HEADER_GROUP_SUBFOLDER(UTIL_SRCS win32)
1887
1888=== modified file 'src/util/icu_streambuf.h'
1889--- src/util/icu_streambuf.h 2012-02-04 01:26:18 +0000
1890+++ src/util/icu_streambuf.h 2012-04-06 00:18:21 +0000
1891@@ -17,6 +17,7 @@
1892 #ifndef ZORBA_ICU_STREAMBUF_H
1893 #define ZORBA_ICU_STREAMBUF_H
1894
1895+#include <unicode/ucnv.h>
1896 #include <zorba/transcode_stream.h>
1897
1898 #include "util/utf8_util.h"
1899
1900=== modified file 'src/util/passthru_streambuf.cpp'
1901--- src/util/passthru_streambuf.cpp 2012-02-04 01:26:18 +0000
1902+++ src/util/passthru_streambuf.cpp 2012-04-06 00:18:21 +0000
1903@@ -14,8 +14,8 @@
1904 * limitations under the License.
1905 */
1906
1907+#include "stdafx.h"
1908 #include "passthru_streambuf.h"
1909-
1910 using namespace std;
1911
1912 namespace zorba {
1913@@ -47,7 +47,7 @@
1914 }
1915
1916 bool passthru_streambuf::is_supported( char const *cc_charset ) {
1917- return !is_necessary( charset );
1918+ return !is_necessary( cc_charset );
1919 }
1920
1921 passthru_streambuf::pos_type
1922
1923=== modified file 'src/util/passthru_streambuf.h'
1924--- src/util/passthru_streambuf.h 2012-02-02 18:37:24 +0000
1925+++ src/util/passthru_streambuf.h 2012-04-06 00:18:21 +0000
1926@@ -17,8 +17,9 @@
1927 #ifndef ZORBA_PASSTHRU_STREAMBUF_H
1928 #define ZORBA_PASSTHRU_STREAMBUF_H
1929
1930-#include <zorba/transcode_streambuf.h>
1931-
1932+#include <zorba/transcode_stream.h>
1933+#include "zorbatypes/zstring.h"
1934+#include "util/ascii_util.h"
1935 namespace zorba {
1936
1937 ///////////////////////////////////////////////////////////////////////////////
1938@@ -48,6 +49,13 @@
1939 * @return \c true only if the character encoding is supported.
1940 */
1941 static bool is_supported( char const *charset );
1942+ static bool is_necessary( char const *cc_charset );
1943+
1944+ typedef std::streambuf::char_type char_type;
1945+ typedef std::streambuf::int_type int_type;
1946+ typedef std::streambuf::off_type off_type;
1947+ typedef std::streambuf::pos_type pos_type;
1948+ typedef std::streambuf::traits_type traits_type;
1949
1950 protected:
1951 void imbue( std::locale const& );
1952
1953=== modified file 'src/util/regex.cpp'
1954--- src/util/regex.cpp 2012-03-28 05:19:57 +0000
1955+++ src/util/regex.cpp 2012-04-06 00:18:21 +0000
1956@@ -33,8 +33,7 @@
1957 #define INVALID_RE_EXCEPTION(...) \
1958 XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS( __VA_ARGS__ ) )
1959
1960-
1961-#ifndef ZORBA_NO_UNICODE
1962+#ifndef ZORBA_NO_ICU
1963 # include <unicode/uversion.h>
1964 U_NAMESPACE_USE
1965
1966@@ -103,6 +102,7 @@
1967
1968 bool got_backslash = false;
1969 bool in_char_class = false; // within [...]
1970+ bool is_first_char = true;
1971
1972 bool in_backref = false; // '\'[1-9][0-9]*
1973 unsigned backref_no = 0; // 1-based
1974@@ -231,6 +231,8 @@
1975 ++open_cap_subs;
1976 cap_sub.push_back( true );
1977 cur_cap_sub = cap_sub.size();
1978+ is_first_char = true;
1979+ goto append;
1980 }
1981 break;
1982 case ')':
1983@@ -245,8 +247,10 @@
1984 case '[':
1985 if ( q_flag )
1986 *icu_re += '\\';
1987- else
1988+ else {
1989 in_char_class = true;
1990+ goto append;
1991+ }
1992 break;
1993 case ']':
1994 if ( q_flag )
1995@@ -254,6 +258,19 @@
1996 else
1997 in_char_class = false;
1998 break;
1999+ case '^':
2000+ if ( q_flag )
2001+ *icu_re += '\\';
2002+ else if ( !is_first_char && !in_char_class )
2003+ throw INVALID_RE_EXCEPTION( xq_re, ZED( UnescapedChar_3 ), *xq_c );
2004+ break;
2005+ case '|':
2006+ if ( q_flag )
2007+ *icu_re += '\\';
2008+ else {
2009+ is_first_char = true;
2010+ goto append;
2011+ }
2012 default:
2013 if ( x_flag && ascii::is_space( *xq_c ) ) {
2014 if ( !in_char_class )
2015@@ -265,8 +282,10 @@
2016 //
2017 *icu_re += '\\';
2018 }
2019- }
2020- }
2021+ } // switch
2022+ } // else
2023+ is_first_char = false;
2024+append:
2025 *icu_re += *xq_c;
2026 } // FOR_EACH
2027
2028@@ -442,11 +461,11 @@
2029 }
2030
2031 } // namespace unicode
2032-
2033-}//namespace zorba
2034-
2035-
2036-#else /* ZORBA_NO_UNICODE */
2037+} // namespace zorba
2038+
2039+///////////////////////////////////////////////////////////////////////////////
2040+
2041+#else /* ZORBA_NO_ICU */
2042
2043 #include "zorbatypes/zstring.h"
2044
2045@@ -470,7 +489,7 @@
2046 case 'i': flags |= REGEX_ASCII_CASE_INSENSITIVE; break;
2047 case 's': flags |= REGEX_ASCII_DOTALL; break;
2048 case 'm': flags |= REGEX_ASCII_MULTILINE; break;
2049- case 'x': flags |= REGEX_ASCII_COMMENTS; break;
2050+ case 'x': flags |= REGEX_ASCII_NO_WHITESPACE; break;
2051 case 'q': flags |= REGEX_ASCII_LITERAL; break;
2052 default:
2053 throw XQUERY_EXCEPTION( err::FORX0001, ERROR_PARAMS( *p ) );
2054@@ -483,6 +502,7 @@
2055 void regex::compile( char const *pattern, char const *flags)
2056 {
2057 parsed_flags = parse_regex_flags(flags);
2058+ regex_xquery::CRegexXQuery_parser regex_parser;
2059 regex_matcher = regex_parser.parse(pattern, parsed_flags);
2060 if(!regex_matcher)
2061 throw INVALID_RE_EXCEPTION(pattern);
2062@@ -517,6 +537,8 @@
2063 bool regex::next_token( char const *s, size_type *pos, zstring *token,
2064 bool *matched)
2065 {
2066+ if(!s[*pos])
2067+ return false;
2068 bool retval;
2069 int match_pos;
2070 int matched_len;
2071@@ -528,14 +550,8 @@
2072 token->assign(s+*pos, match_pos);
2073 *pos += match_pos + matched_len;
2074 if(matched)
2075- if(match_pos)
2076- *matched = true;
2077- else
2078- *matched = false;
2079- if(match_pos)
2080- return true;
2081- else
2082- return false;
2083+ *matched = true;
2084+ return true;
2085 }
2086 else
2087 {
2088@@ -544,7 +560,7 @@
2089 *pos += strlen(s+*pos);
2090 if(matched)
2091 *matched = false;
2092- return s[*pos] != 0;
2093+ return true;
2094 }
2095 }
2096
2097@@ -554,13 +570,9 @@
2098 int matched_pos;
2099 int matched_len;
2100
2101- bool prev_align = regex_matcher->set_align_begin(true);
2102- retval = regex_matcher->match_from(s, parsed_flags, &matched_pos, &matched_len);
2103- regex_matcher->set_align_begin(prev_align);
2104+ retval = regex_matcher->match_anywhere(s, parsed_flags|REGEX_ASCII_WHOLE_MATCH, &matched_pos, &matched_len);
2105 if(!retval)
2106 return false;
2107- if(matched_len != strlen(s))
2108- return false;
2109 return true;
2110 }
2111
2112@@ -587,14 +599,19 @@
2113 //look for dollars
2114 if(*temprepl == '\\')
2115 {
2116- temprepl++;
2117- if(!*temprepl || (*temprepl != '\\') || (*temprepl != '$'))//Invalid replacement string.
2118- throw XQUERY_EXCEPTION( err::FORX0004, ERROR_PARAMS( replacement ) );
2119+ if(!(parsed_flags & REGEX_ASCII_LITERAL))
2120+ {
2121+ temprepl++;
2122+ if(!*temprepl)
2123+ temprepl--;
2124+ else if((*temprepl != '\\') && (*temprepl != '$'))//Invalid replacement string.
2125+ throw XQUERY_EXCEPTION( err::FORX0004, ERROR_PARAMS( replacement ) );
2126+ }
2127 result->append(1, *temprepl);
2128 temprepl++;
2129 continue;
2130 }
2131- if(*temprepl == '$')
2132+ if((*temprepl == '$') && !(parsed_flags & REGEX_ASCII_LITERAL))
2133 {
2134 temprepl++;
2135 index = 0;
2136@@ -648,7 +665,7 @@
2137 if(retval)
2138 {
2139 m_match_pos += m_pos;
2140- m_pos = m_match_pos = m_matched_len;
2141+ m_pos = m_match_pos + m_matched_len;
2142 }
2143 else
2144 {
2145@@ -666,35 +683,30 @@
2146 return (int)regex_matcher->get_indexed_regex_count();
2147 }
2148
2149-int regex::get_match_start( int groupId )
2150-{
2151- if(groupId == 0)
2152- return m_match_pos;
2153- if(groupId > (int)regex_matcher->get_indexed_regex_count())
2154- return -1;
2155- const char *submatched_source;
2156- int submatched_len;
2157- if(!regex_matcher->get_indexed_match(groupId, &submatched_source, &submatched_len))
2158- return -1;
2159- return submatched_source - s_in_.c_str();
2160-}
2161-
2162-int regex::get_match_end( int groupId )
2163-{
2164- if(groupId == 0)
2165- return m_match_pos + m_matched_len;
2166- if(groupId > (int)regex_matcher->get_indexed_regex_count())
2167- return -1;
2168- const char *submatched_source;
2169- int submatched_len;
2170- if(!regex_matcher->get_indexed_match(groupId, &submatched_source, &submatched_len))
2171- return -1;
2172- return submatched_source - s_in_.c_str() + submatched_len;
2173+bool regex::get_match_start_end_bytes( int groupId, int *start, int *end )
2174+{
2175+ *start = -1;
2176+ *end = -1;
2177+ if(groupId == 0)
2178+ {
2179+ *start = m_match_pos;
2180+ *end = m_match_pos + m_matched_len;
2181+ return true;
2182+ }
2183+ if(groupId > (int)regex_matcher->get_indexed_regex_count())
2184+ return false;
2185+ const char *submatched_source;
2186+ int submatched_len;
2187+ if(!regex_matcher->get_indexed_match(groupId, &submatched_source, &submatched_len))
2188+ return false;
2189+ *start = submatched_source - s_in_.c_str();
2190+ *end = *start + submatched_len;
2191+ return true;
2192 }
2193
2194 } // namespace unicode
2195 } // namespace zorba
2196-#endif /* ZORBA_NO_UNICODE */
2197+#endif /* ZORBA_NO_ICU */
2198
2199 ///////////////////////////////////////////////////////////////////////////////
2200
2201
2202=== modified file 'src/util/regex.h'
2203--- src/util/regex.h 2012-03-28 05:19:57 +0000
2204+++ src/util/regex.h 2012-04-06 00:18:21 +0000
2205@@ -17,15 +17,13 @@
2206 #ifndef ZORBA_REGEX_H
2207 #define ZORBA_REGEX_H
2208
2209-#ifndef ZORBA_NO_UNICODE
2210-#include <unicode/regex.h>
2211-#endif
2212-
2213 #include "cxx_util.h"
2214 #include "unicode_util.h"
2215 #include "zorbatypes/zstring.h"
2216
2217-#ifndef ZORBA_NO_UNICODE
2218+#ifndef ZORBA_NO_ICU
2219+
2220+#include <unicode/regex.h>
2221
2222 namespace zorba {
2223
2224@@ -496,15 +494,17 @@
2225 } // namespace unicode
2226 } // namespace zorba
2227
2228-#else ///ZORBA_NO_UNICODE (ascii part:)
2229-
2230-#include "util/regex_ascii.h"
2231+///////////////////////////////////////////////////////////////////////////////
2232+
2233+#else /* ZORBA_NO_ICU */
2234+
2235+#include "util/regex_xquery.h"
2236 #include <string>
2237
2238 namespace zorba{
2239 /**
2240 * Converts an XQuery regular expression to the form used by the regular
2241- * expression library Zorba is using (here regex_ascii).
2242+ * expression library Zorba is using (here regex_xquery).
2243 *
2244 * @param xq_re The XQuery regular expression.
2245 * @param lib_re A pointer to the resuling library regular expression.
2246@@ -525,7 +525,7 @@
2247 /**
2248 * Constructs a %regex.
2249 */
2250- regex() : regex_matcher( NULL ) { }
2251+ regex() : regex_matcher( nullptr ) { }
2252
2253 /**
2254 * Destroys a %regex.
2255@@ -835,31 +835,21 @@
2256
2257 /**
2258 * Get the start position of the matched group.
2259- * If groupId is zero, then the start position of the whole match is returned.
2260- * If groupId is non-zero, then the start position of that group is returned.
2261- * If that group has not been matched, -1 is returned.
2262+ * If groupId is zero, then the start and end positions of the whole match are returned.
2263+ * If groupId is non-zero, then the start and end positions of that group are returned.
2264+ * If that group has not been matched, false is returned.
2265 *
2266 * @param groupId the id of the group, either zero for the entire regex,
2267 * or [1 .. group_count] for that specific group
2268- * @return the start position, zero based, or -1 if that group didn't match
2269+ * @param start receives the start position, in bytes (-1 if false is returned)
2270+ * @param end receives the end position, in bytes (-1 if false is returned)
2271+ * @return true if that group exists and has been matched
2272 */
2273- int get_match_start( int groupId = 0 );
2274+ bool get_match_start_end_bytes( int groupId, int *start, int *end );
2275
2276- /**
2277- * Get the end position of the matched group.
2278- * If groupId is zero, then the end position of the whole match is returned.
2279- * If groupId is non-zero, then the end position of that group is returned.
2280- * If that group has not been matched, -1 is returned.
2281- *
2282- * @param groupId the id of the group, either zero for the entire regex,
2283- * or [1 .. group_count] for that specific group
2284- * @return the end position, zero based, or -1 if that group didn't match
2285- */
2286- int get_match_end( int groupId = 0 );
2287
2288 private:
2289- regex_ascii::CRegexAscii_parser regex_parser;
2290- regex_ascii::CRegexAscii_regex *regex_matcher;
2291+ regex_xquery::CRegexXQuery_regex *regex_matcher;
2292 uint32_t parsed_flags;
2293
2294 zstring s_in_;
2295@@ -873,15 +863,13 @@
2296 regex( regex const& );
2297 regex& operator=( regex const& );
2298 };
2299+
2300+///////////////////////////////////////////////////////////////////////////////
2301+
2302 } // namespace unicode
2303 } // namespace zorba
2304
2305-#endif /* ZORBA_NO_UNICODE */
2306-
2307-
2308-///////////////////////////////////////////////////////////////////////////////
2309-
2310-
2311+#endif /* ZORBA_NO_ICU */
2312 #endif /* ZORBA_REGEX_H */
2313 /*
2314 * Local variables:
2315
2316=== renamed file 'src/util/regex_ascii.cpp' => 'src/util/regex_xquery.cpp'
2317--- src/util/regex_ascii.cpp 2012-03-28 05:19:57 +0000
2318+++ src/util/regex_xquery.cpp 2012-04-06 00:18:21 +0000
2319@@ -1,4 +1,4 @@
2320-a/*
2321+/*
2322 * Copyright 2006-2008 The FLWOR Foundation.
2323 *
2324 * Licensed under the Apache License, Version 2.0 (the "License");
2325@@ -18,12 +18,15 @@
2326
2327 #include "diagnostics/xquery_diagnostics.h"
2328
2329-#include "regex_ascii.h"
2330+#include "regex_xquery.h"
2331 #include <string.h>
2332 #include "zorbatypes/chartype.h"
2333+#include "util/unicode_categories.h"
2334+#include "util/ascii_util.h"
2335+#include "util/utf8_string.h"
2336
2337 namespace zorba {
2338- namespace regex_ascii{
2339+ namespace regex_xquery{
2340 //ascii regular expression matching
2341
2342 /*http://www.w3.org/TR/xmlschema-2/#regexs
2343@@ -62,96 +65,138 @@
2344 + http://www.w3.org/TR/xquery-operators/#regex-syntax (not implemented)
2345 */
2346
2347+
2348+static bool compare_ascii_i(const char *str1, const char *str2)
2349+{
2350+ while(*str1 && *str2)
2351+ {
2352+ if(ascii::to_lower(*str1) != ascii::to_lower(*str2))
2353+ return false;
2354+ str1++;
2355+ str2++;
2356+ }
2357+ if(*str1 || *str2)
2358+ return false;
2359+ return true;
2360+}
2361+
2362+static bool compare_unicode_ni(const char *str1, const char *str2, int len)
2363+{
2364+ while(len > 0)
2365+ {
2366+ const char *temp_str1 = str1;
2367+ const char *temp_str2 = str2;
2368+ unicode::code_point cp1 = unicode::to_upper(utf8::next_char(temp_str1));
2369+ unicode::code_point cp2 = unicode::to_upper(utf8::next_char(temp_str2));
2370+ if(cp1 != cp2)
2371+ return false;
2372+ len -= temp_str1-str1;
2373+ str1 = temp_str1;
2374+ str2 = temp_str2;
2375+ }
2376+ return true;
2377+}
2378+static utf8::size_type myutf8len(const char *source)
2379+{
2380+ utf8::size_type len = utf8::char_length(*source);
2381+ if(!len)
2382+ return 1;
2383+ else
2384+ return len;
2385+}
2386 ////////////////////////////////////
2387 ////Regular expression parsing and building of the tree
2388 ////////////////////////////////////
2389
2390-CRegexAscii_regex* CRegexAscii_parser::parse(const char *pattern, unsigned int flags)
2391+CRegexXQuery_regex* CRegexXQuery_parser::parse(const char *pattern, unsigned int flags)
2392 {
2393 this->flags = flags;
2394- bool align_begin = false;
2395
2396- if(!(flags & REGEX_ASCII_LITERAL) && (pattern[0] == '^'))
2397- align_begin = true;
2398-
2399 int regex_len;
2400- CRegexAscii_regex* regex = parse_regexp(pattern + (align_begin?1:0), &regex_len);
2401+ CRegexXQuery_regex* regex = parse_regexp(pattern, &regex_len);
2402
2403- if(regex)
2404- regex->set_align_begin(align_begin);
2405-
2406 return regex;
2407 }
2408
2409 //until '\0' or ')'
2410-CRegexAscii_regex* CRegexAscii_parser::parse_regexp(const char *pattern,
2411+CRegexXQuery_regex* CRegexXQuery_parser::parse_regexp(const char *pattern,
2412 int *regex_len)
2413 {
2414 *regex_len = 0;
2415 int branch_len;
2416 regex_depth++;
2417- CRegexAscii_regex *regex = new CRegexAscii_regex(current_regex);
2418+ std::auto_ptr<CRegexXQuery_regex> regex(new CRegexXQuery_regex(current_regex));
2419 if(!current_regex)
2420- current_regex = regex;
2421+ current_regex = regex.get();
2422 if(regex_depth >= 2)
2423 {
2424 //mark this as group if it does not start with ?:
2425 if(pattern[0] != '?' || pattern[1] != ':')
2426- current_regex->subregex.push_back(regex);
2427+ current_regex->subregex.push_back(regex.get());
2428 else
2429 *regex_len = 2;
2430 }
2431- CRegexAscii_branch *branch;
2432+ CRegexXQuery_branch *branch;
2433+ bool must_read_another_branch = true;
2434 while(pattern[*regex_len] && (pattern[*regex_len] != ')'))
2435 {
2436 branch = parse_branch(pattern+*regex_len, &branch_len);
2437 if(!branch)
2438 {
2439 regex_depth--;
2440- delete regex;
2441 return NULL;
2442 }
2443 regex->add_branch(branch);
2444 *regex_len += branch_len;
2445+ if(pattern[*regex_len] == '|')
2446+ (*regex_len)++;
2447+ else
2448+ must_read_another_branch = false;
2449 }
2450- if((current_regex == regex) && (pattern[*regex_len] == ')'))
2451+ if((current_regex == regex.get()) && (pattern[*regex_len] == ')'))
2452 {
2453- throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(U_REGEX_MISMATCHED_PAREN)) );
2454+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MISMATCHED_PAREN)) );
2455 }
2456 if(pattern[*regex_len])
2457 (*regex_len)++;
2458+ if(must_read_another_branch)
2459+ regex->add_branch(new CRegexXQuery_branch(current_regex));//add empty branch
2460 regex->flags = 0;//finished initialization
2461 regex_depth--;
2462- return regex;
2463+ return regex.release();
2464 }
2465
2466-CRegexAscii_branch* CRegexAscii_parser::parse_branch(const char *pattern, int *branch_len)
2467+CRegexXQuery_branch* CRegexXQuery_parser::parse_branch(const char *pattern, int *branch_len)
2468 {
2469 int piece_len;
2470
2471- CRegexAscii_branch *branch = new CRegexAscii_branch(current_regex);
2472- CRegexAscii_piece *piece;
2473+ std::auto_ptr<CRegexXQuery_branch> branch(new CRegexXQuery_branch(current_regex));
2474+ CRegexXQuery_piece *piece;
2475 *branch_len = 0;
2476 while(pattern[*branch_len] && (pattern[*branch_len] != '|') && (pattern[*branch_len] != ')'))
2477 {
2478 piece = parse_piece(pattern+*branch_len, &piece_len);
2479 if(!piece)
2480 {
2481- delete branch;
2482 return NULL;
2483 }
2484+ if(branch->piece_list.size() && dynamic_cast<CRegexXQuery_pinstart*>(piece->atom))
2485+ {
2486+ //found ^ that is not at the beginning of branch
2487+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_ATOM_CHAR), '^') );
2488+ }
2489 branch->add_piece(piece);
2490 *branch_len += piece_len;
2491 }
2492- if(pattern[*branch_len] == '|')
2493- (*branch_len)++;
2494- return branch;
2495+ //if(pattern[*branch_len] == '|')
2496+ // (*branch_len)++;
2497+ return branch.release();
2498 }
2499
2500 //piece = atom + quantifier
2501-CRegexAscii_piece* CRegexAscii_parser::parse_piece(const char *pattern, int *piece_len)
2502+CRegexXQuery_piece* CRegexXQuery_parser::parse_piece(const char *pattern, int *piece_len)
2503 {
2504- CRegexAscii_piece *piece = new CRegexAscii_piece;
2505+ std::auto_ptr<CRegexXQuery_piece> piece(new CRegexXQuery_piece);
2506 IRegexAtom *atom;
2507 *piece_len = 0;
2508
2509@@ -160,19 +205,18 @@
2510 atom = read_atom(pattern, &atom_len);
2511 if(!atom)
2512 {
2513- delete piece;
2514 return NULL;
2515 }
2516 piece->set_atom(atom);
2517 if(!(flags & REGEX_ASCII_LITERAL))
2518- read_quantifier(piece, pattern+atom_len, &quantif_len);
2519+ read_quantifier(piece.get(), pattern+atom_len, &quantif_len);
2520
2521 *piece_len += atom_len + quantif_len;
2522
2523- return piece;
2524+ return piece.release();
2525 }
2526
2527-char CRegexAscii_parser::myishex(char c)
2528+char CRegexXQuery_parser::myishex(char c)
2529 {
2530 if((c >= '0') && (c <= '9'))
2531 return c-'0'+1;
2532@@ -183,26 +227,125 @@
2533 return 0;//not a hex
2534 }
2535
2536-bool CRegexAscii_parser::myisdigit(char c)
2537-{
2538- return (c >= '0') || (c <= '9');
2539-}
2540-
2541-char CRegexAscii_parser::readChar(const char *pattern, int *char_len, bool *is_multichar)
2542+bool CRegexXQuery_parser::myisdigit(char c)
2543+{
2544+ return (c >= '0') && (c <= '9');
2545+}
2546+
2547+bool CRegexXQuery_parser::myisletterAZ(char c)
2548+{
2549+ return ((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z'));
2550+}
2551+
2552+static const unicode::code_point specials_extcp[] = {0xFFF0, 0xFFFD, 0};
2553+
2554+static CRegexXQuery_parser::block_escape_t block_escape[] =
2555+{
2556+{{0x0000, 0x007F}, NULL, "BasicLatin"},
2557+{{0x0080, 0x00FF}, NULL, "Latin-1Supplement"},
2558+{{0x0100, 0x017F}, NULL, "LatinExtended-A"},
2559+{{0x0180, 0x024F}, NULL, "LatinExtended-B"},
2560+{{0x0250, 0x02AF}, NULL, "IPAExtensions"},
2561+{{0x02B0, 0x02FF}, NULL, "SpacingModifierLetters"},
2562+{{0x0300, 0x036F}, NULL, "CombiningDiacriticalMarks"},
2563+{{0x0370, 0x03FF}, NULL, "Greek"},
2564+{{0x0400, 0x04FF}, NULL, "Cyrillic"},
2565+{{0x0530, 0x058F}, NULL, "Armenian"},
2566+{{0x0590, 0x05FF}, NULL, "Hebrew"},
2567+{{0x0600, 0x06FF}, NULL, "Arabic"},
2568+{{0x0700, 0x074F}, NULL, "Syriac"},
2569+{{0x0780, 0x07BF}, NULL, "Thaana"},
2570+{{0x0900, 0x097F}, NULL, "Devanagari"},
2571+{{0x0980, 0x09FF}, NULL, "Bengali"},
2572+{{0x0A00, 0x0A7F}, NULL, "Gurmukhi"},
2573+{{0x0A80, 0x0AFF}, NULL, "Gujarati"},
2574+{{0x0B00, 0x0B7F}, NULL, "Oriya"},
2575+{{0x0B80, 0x0BFF}, NULL, "Tamil"},
2576+{{0x0C00, 0x0C7F}, NULL, "Telugu"},
2577+{{0x0C80, 0x0CFF}, NULL, "Kannada"},
2578+{{0x0D00, 0x0D7F}, NULL, "Malayalam"},
2579+{{0x0D80, 0x0DFF}, NULL, "Sinhala"},
2580+{{0x0E00, 0x0E7F}, NULL, "Thai"},
2581+{{0x0E80, 0x0EFF}, NULL, "Lao"},
2582+{{0x0F00, 0x0FFF}, NULL, "Tibetan"},
2583+{{0x1000, 0x109F}, NULL, "Myanmar"},
2584+{{0x10A0, 0x10FF}, NULL, "Georgian"},
2585+{{0x1100, 0x11FF}, NULL, "HangulJamo"},
2586+{{0x1200, 0x137F}, NULL, "Ethiopic"},
2587+{{0x13A0, 0x13FF}, NULL, "Cherokee"},
2588+{{0x1400, 0x167F}, NULL, "UnifiedCanadianAboriginalSyllabics"},
2589+{{0x1680, 0x169F}, NULL, "Ogham"},
2590+{{0x16A0, 0x16FF}, NULL, "Runic"},
2591+{{0x1780, 0x17FF}, NULL, "Khmer"},
2592+{{0x1800, 0x18AF}, NULL, "Mongolian"},
2593+{{0x1E00, 0x1EFF}, NULL, "LatinExtendedAdditional"},
2594+{{0x1F00, 0x1FFF}, NULL, "GreekExtended"},
2595+{{0x2000, 0x206F}, NULL, "GeneralPunctuation"},
2596+{{0x2070, 0x209F}, NULL, "SuperscriptsandSubscripts"},
2597+{{0x20A0, 0x20CF}, NULL, "CurrencySymbols"},
2598+{{0x20D0, 0x20FF}, NULL, "CombiningMarksforSymbols"},
2599+{{0x2100, 0x214F}, NULL, "LetterlikeSymbols"},
2600+{{0x2150, 0x218F}, NULL, "NumberForms"},
2601+{{0x2190, 0x21FF}, NULL, "Arrows"},
2602+{{0x2200, 0x22FF}, NULL, "MathematicalOperators"},
2603+{{0x2300, 0x23FF}, NULL, "MiscellaneousTechnical"},
2604+{{0x2400, 0x243F}, NULL, "ControlPictures"},
2605+{{0x2440, 0x245F}, NULL, "OpticalCharacterRecognition"},
2606+{{0x2460, 0x24FF}, NULL, "EnclosedAlphanumerics"},
2607+{{0x2500, 0x257F}, NULL, "BoxDrawing"},
2608+{{0x2580, 0x259F}, NULL, "BlockElements"},
2609+{{0x25A0, 0x25FF}, NULL, "GeometricShapes"},
2610+{{0x2600, 0x26FF}, NULL, "MiscellaneousSymbols"},
2611+{{0x2700, 0x27BF}, NULL, "Dingbats"},
2612+{{0x2800, 0x28FF}, NULL, "BraillePatterns"},
2613+{{0x2E80, 0x2EFF}, NULL, "CJKRadicalsSupplement"},
2614+{{0x2F00, 0x2FDF}, NULL, "KangxiRadicals"},
2615+{{0x2FF0, 0x2FFF}, NULL, "IdeographicDescriptionCharacters"},
2616+{{0x3000, 0x303F}, NULL, "CJKSymbolsandPunctuation"},
2617+{{0x3040, 0x309F}, NULL, "Hiragana"},
2618+{{0x30A0, 0x30FF}, NULL, "Katakana"},
2619+{{0x3100, 0x312F}, NULL, "Bopomofo"},
2620+{{0x3130, 0x318F}, NULL, "HangulCompatibilityJamo"},
2621+{{0x3190, 0x319F}, NULL, "Kanbun"},
2622+{{0x31A0, 0x31BF}, NULL, "BopomofoExtended"},
2623+{{0x3200, 0x32FF}, NULL, "EnclosedCJKLettersandMonths"},
2624+{{0x3300, 0x33FF}, NULL, "CJKCompatibility"},
2625+{{0x3400, 0x4DB5}, NULL, "CJKUnifiedIdeographsExtensionA"},
2626+{{0x4E00, 0x9FFF}, NULL, "CJKUnifiedIdeographs"},
2627+{{0xA000, 0xA48F}, NULL, "YiSyllables"},
2628+{{0xA490, 0xA4CF}, NULL, "YiRadicals"},
2629+{{0xAC00, 0xD7A3}, NULL, "HangulSyllables"},
2630+{{0xE000, 0xF8FF}, NULL, "PrivateUse"},
2631+{{0xF900, 0xFAFF}, NULL, "CJKCompatibilityIdeographs"},
2632+{{0xFB00, 0xFB4F}, NULL, "AlphabeticPresentationForms"},
2633+{{0xFB50, 0xFDFF}, NULL, "ArabicPresentationForms-A"},
2634+{{0xFE20, 0xFE2F}, NULL, "CombiningHalfMarks"},
2635+{{0xFE30, 0xFE4F}, NULL, "CJKCompatibilityForms"},
2636+{{0xFE50, 0xFE6F}, NULL, "SmallFormVariants"},
2637+{{0xFE70, 0xFEFE}, NULL, "ArabicPresentationForms-B"},
2638+{{0xFEFF, 0xFEFF}, specials_extcp, "Specials"},
2639+{{0xFF00, 0xFFEF}, NULL, "HalfwidthandFullwidthForms"}
2640+};
2641+
2642+CRegexXQuery_charmatch* CRegexXQuery_parser::readChar(const char *pattern,
2643+ int *char_len,
2644+ enum CHARGROUP_t *multichar_type)
2645 {
2646 char c = 0;
2647 *char_len = 0;
2648- *is_multichar = false;
2649+ *multichar_type = CHARGROUP_NO_MULTICHAR;
2650 switch(pattern[*char_len])
2651 {
2652 case '\\':
2653- { (*char_len)++;
2654+ {
2655+ (*char_len)++;
2656 switch(pattern[*char_len])
2657 {
2658- case 'n': c = '\n';break;
2659- case 'r': c = '\r';break;
2660- case 't': c = '\t';break;
2661+ case 'n': c = '\n';(*char_len)++;return new CRegexXQuery_char_ascii(current_regex, c);
2662+ case 'r': c = '\r';(*char_len)++;return new CRegexXQuery_char_ascii(current_regex, c);
2663+ case 't': c = '\t';(*char_len)++;return new CRegexXQuery_char_ascii(current_regex, c);
2664 case '\\':
2665+ case '/'://+
2666 case '|':
2667 case '.':
2668 case '?':
2669@@ -216,19 +359,205 @@
2670 case '['://#x5B
2671 case ']'://#x5D
2672 case '^'://#x5E
2673+ case '$'://+
2674 c = pattern[*char_len];
2675- break;
2676+ (*char_len)++;
2677+ *multichar_type = CHARGROUP_FLAGS_ONECHAR_ASCII;
2678+ return new CRegexXQuery_char_ascii(current_regex, c);
2679 case 'p'://catEsc
2680 case 'P'://complEsc
2681+ {
2682 //ignore the prop for now
2683- c = pattern[*char_len];
2684- *is_multichar = true;
2685- if(pattern[*char_len+1] == '{')
2686- {
2687- while(pattern[*char_len] != '}')
2688+ *multichar_type = CHARGROUP_FLAGS_MULTICHAR_p;//(CHARGROUP_t)((pattern[*char_len] == 'P') ? 128 : 0);
2689+ bool is_reverse = (pattern[*char_len] == 'P');
2690+ c = 0;
2691+ if(pattern[(*char_len)+1] != '{')
2692+ {
2693+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_P_CONSTRUCT)) );
2694+ }
2695+ (*char_len) += 2;
2696+ switch(pattern[*char_len])
2697+ {//IsCategory
2698+ case 'L':
2699+ {
2700+ switch(pattern[(*char_len)+1])
2701+ {
2702+ case '}':
2703+ c = unicode::UNICODE_Ll + 50;break;
2704+ case 'u':
2705+ c = unicode::UNICODE_Lu; (*char_len)++;break;
2706+ case 'l':
2707+ c = unicode::UNICODE_Ll; (*char_len)++;break;
2708+ case 't':
2709+ c = unicode::UNICODE_Lt; (*char_len)++;break;
2710+ case 'm':
2711+ c = unicode::UNICODE_Lm; (*char_len)++;break;
2712+ case 'o':
2713+ c = unicode::UNICODE_Lo; (*char_len)++;break;
2714+ default:
2715+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PL_CONSTRUCT)) );
2716+ }
2717+ }break;
2718+ case 'M':
2719+ {
2720+ switch(pattern[(*char_len)+1])
2721+ {
2722+ case '}':
2723+ c = unicode::UNICODE_Mc + 50;break;
2724+ case 'n':
2725+ c = unicode::UNICODE_Mn; (*char_len)++;break;
2726+ case 'c':
2727+ c = unicode::UNICODE_Mc; (*char_len)++;break;
2728+ case 'e':
2729+ c = unicode::UNICODE_Me; (*char_len)++;break;
2730+ default:
2731+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PM_CONSTRUCT)) );
2732+ }
2733+ }break;
2734+ case 'N':
2735+ {
2736+ switch(pattern[(*char_len)+1])
2737+ {
2738+ case '}':
2739+ c = unicode::UNICODE_Nd + 50;break;
2740+ case 'd':
2741+ c = unicode::UNICODE_Nd; (*char_len)++;break;
2742+ case 'l':
2743+ c = unicode::UNICODE_Nl; (*char_len)++;break;
2744+ case 'o':
2745+ c = unicode::UNICODE_No; (*char_len)++;break;
2746+ default:
2747+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PN_CONSTRUCT)) );
2748+ }
2749+ }break;
2750+ case 'P':
2751+ {
2752+ switch(pattern[(*char_len)+1])
2753+ {
2754+ case '}':
2755+ c = unicode::UNICODE_Pc + 50;break;
2756+ case 'c':
2757+ c = unicode::UNICODE_Pc; (*char_len)++;break;
2758+ case 'd':
2759+ c = unicode::UNICODE_Pd; (*char_len)++;break;
2760+ case 's':
2761+ c = unicode::UNICODE_Ps; (*char_len)++;break;
2762+ case 'e':
2763+ c = unicode::UNICODE_Pe; (*char_len)++;break;
2764+ case 'i':
2765+ c = unicode::UNICODE_Pi; (*char_len)++;break;
2766+ case 'f':
2767+ c = unicode::UNICODE_Pf; (*char_len)++;break;
2768+ case 'o':
2769+ c = unicode::UNICODE_Po; (*char_len)++;break;
2770+ default:
2771+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PP_CONSTRUCT)) );
2772+ }
2773+ }break;
2774+ case 'Z':
2775+ {
2776+ switch(pattern[(*char_len)+1])
2777+ {
2778+ case '}':
2779+ c = unicode::UNICODE_Zl + 50;break;
2780+ case 's':
2781+ c = unicode::UNICODE_Zs; (*char_len)++;break;
2782+ case 'l':
2783+ c = unicode::UNICODE_Zl; (*char_len)++;break;
2784+ case 'p':
2785+ c = unicode::UNICODE_Zp; (*char_len)++;break;
2786+ default:
2787+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PZ_CONSTRUCT)) );
2788+ }
2789+ }break;
2790+ case 'S':
2791+ {
2792+ switch(pattern[(*char_len)+1])
2793+ {
2794+ case '}':
2795+ c = unicode::UNICODE_Sc + 50;break;
2796+ case 'm':
2797+ c = unicode::UNICODE_Sm; (*char_len)++;break;
2798+ case 'c':
2799+ c = unicode::UNICODE_Sc; (*char_len)++;break;
2800+ case 'k':
2801+ c = unicode::UNICODE_Sk; (*char_len)++;break;
2802+ case 'o':
2803+ c = unicode::UNICODE_So; (*char_len)++;break;
2804+ default:
2805+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PS_CONSTRUCT)) );
2806+ }
2807+ }break;
2808+ case 'C':
2809+ {
2810+ switch(pattern[(*char_len)+1])
2811+ {
2812+ case '}':
2813+ c = unicode::UNICODE_Cc + 50;break;
2814+ case 'c':
2815+ c = unicode::UNICODE_Cc; (*char_len)++;break;
2816+ case 'f':
2817+ c = unicode::UNICODE_Cf; (*char_len)++;break;
2818+ case 'o':
2819+ c = unicode::UNICODE_Co; (*char_len)++;break;
2820+ case 'n':
2821+ c = unicode::UNICODE_Cn; (*char_len)++;break;
2822+ default:
2823+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PC_CONSTRUCT)) );
2824+ }
2825+ }break;
2826+ }//end switch
2827+ if(c)
2828+ {
2829+ if(pattern[(*char_len) + 1] != '}')
2830+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_P_CONSTRUCT)) );
2831+ (*char_len)++;
2832+ (*char_len)++;
2833+ return new CRegexXQuery_multicharP(current_regex, c, is_reverse);
2834+ }
2835+ if(pattern[*char_len] == 'I')
2836+ {
2837+ if(pattern[(*char_len)+1] == 's')//IsBlock
2838+ {
2839+ *multichar_type = CHARGROUP_FLAGS_MULTICHAR_Is;
2840+ (*char_len) += 2;
2841+ zstring block_name;
2842+ char tempc = pattern[(*char_len)];
2843+ while(tempc && (tempc != '}'))
2844+ {
2845+ if(!myisletterAZ(tempc) && !myisdigit(tempc) && (tempc != '-'))
2846+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_PIs_CONSTRUCT)) );
2847+ block_name.append(1, tempc);
2848+ (*char_len)++;
2849+ tempc = pattern[(*char_len)];
2850+ }
2851+ if(!tempc)
2852+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_PIs_CONSTRUCT)) );
2853+ //search for the block name
2854+ int i;
2855+ int nr_blocks = sizeof(block_escape)/sizeof(CRegexXQuery_parser::block_escape_t);
2856+ for(i=0;i<nr_blocks;i++)
2857+ {
2858+ if(compare_ascii_i(block_name.c_str(), block_escape[i].group_name))
2859+ {
2860+ c = i;
2861+ break;
2862+ }
2863+ }
2864+ if(i==nr_blocks)
2865+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PIs_CONSTRUCT)) );
2866 (*char_len)++;
2867- }
2868- break;
2869+ return new CRegexXQuery_multicharIs(current_regex, i, is_reverse);
2870+ }
2871+ else
2872+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_PIs_CONSTRUCT)) );
2873+ }
2874+ else
2875+ {
2876+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_P_CONSTRUCT)) );
2877+ }
2878+ break;//unreachable
2879+ }//end case 'p'
2880 //multiCharEsc
2881 case 's':
2882 case 'S':
2883@@ -240,40 +569,104 @@
2884 case 'D':
2885 case 'w':
2886 case 'W':
2887- *is_multichar = true;
2888+ *multichar_type = CHARGROUP_FLAGS_MULTICHAR_OTHER;
2889 c = pattern[*char_len];
2890- break;
2891- }
2892- break;
2893- }
2894- case '#':///might be #xXX
2895- {
2896- if((pattern[*char_len+1] == 'x') &&
2897- myishex(pattern[*char_len+2]) && myishex(pattern[*char_len+3]))
2898- {
2899- c = (myishex(pattern[*char_len+2])-1)<<4 | (myishex(pattern[*char_len+3])-1);
2900- *char_len += 3;
2901- break;
2902- }
2903- }
2904+ (*char_len)++;
2905+ return new CRegexXQuery_multicharOther(current_regex, c);
2906+ case 'u'://unicode codepoint \uXXXX
2907+ {
2908+ unicode::code_point utf8c = 0;
2909+ (*char_len)++;
2910+ for(int i=0;i<4;i++)
2911+ {
2912+ char hex = myishex(pattern[*char_len]);
2913+ if(!hex)
2914+ {
2915+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_UNICODE_CODEPOINT_u)) );
2916+ }
2917+ utf8c <<= 4;
2918+ utf8c |= (hex-1) & 0x0f;
2919+ (*char_len)++;
2920+ }
2921+ return create_charmatch(utf8c, NULL, 0, multichar_type);
2922+ }
2923+ case 'U'://unicode codepoint \UXXXXXXXX
2924+ {
2925+ unicode::code_point utf8c = 0;
2926+ (*char_len)++;
2927+ for(int i=0;i<8;i++)
2928+ {
2929+ char hex = myishex(pattern[*char_len]);
2930+ if(!hex)
2931+ {
2932+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_UNICODE_CODEPOINT_u)) );
2933+ }
2934+ utf8c <<= 4;
2935+ utf8c |= (hex-1) & 0x0f;
2936+ (*char_len)++;
2937+ }
2938+ return create_charmatch(utf8c, NULL, 0, multichar_type);
2939+ }
2940+ default:
2941+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_ESC_CHAR)) );
2942+ }
2943+ assert(false);
2944+ break;//unreachable
2945+ }//end case '\'
2946 default:
2947- c = pattern[*char_len];
2948- break;
2949- }
2950-
2951- (*char_len)++;
2952- return c;
2953-}
2954-
2955-
2956-
2957-IRegexAtom* CRegexAscii_parser::read_atom(const char *pattern, int *atom_len)
2958+ {
2959+ const char *temp_pattern = pattern;
2960+ unicode::code_point utf8c = utf8::next_char(temp_pattern);
2961+ (*char_len) = temp_pattern - pattern;
2962+ return create_charmatch(utf8c, pattern, *char_len, multichar_type);
2963+ }
2964+ }
2965+ return NULL;
2966+}
2967+
2968+CRegexXQuery_charmatch *CRegexXQuery_parser::create_charmatch(unicode::code_point utf8c,
2969+ const char *pattern, int utf8len,
2970+ enum CHARGROUP_t *multichar_type)
2971+{
2972+ if(utf8c <= 0x7F)
2973+ {
2974+ *multichar_type = CHARGROUP_FLAGS_ONECHAR_ASCII;
2975+ if(flags & REGEX_ASCII_CASE_INSENSITIVE)
2976+ return new CRegexXQuery_char_ascii_i(current_regex, (char)utf8c);
2977+ else
2978+ return new CRegexXQuery_char_ascii(current_regex, (char)utf8c);
2979+ }
2980+ else
2981+ {
2982+ *multichar_type = CHARGROUP_FLAGS_ONECHAR_UNICODE;
2983+ if(flags & REGEX_ASCII_CASE_INSENSITIVE)
2984+ return new CRegexXQuery_char_unicode_i(current_regex, utf8c);
2985+ else
2986+ {
2987+ if(pattern)
2988+ return new CRegexXQuery_char_unicode(current_regex, pattern, utf8len);
2989+ else
2990+ return new CRegexXQuery_char_unicode_cp(current_regex, utf8c);
2991+ }
2992+ }
2993+}
2994+
2995+IRegexAtom* CRegexXQuery_parser::read_atom(const char *pattern, int *atom_len)
2996 {
2997 *atom_len = 0;
2998- char c;
2999- bool is_end_line = false;
3000- c = pattern[*atom_len];
3001- if((!(flags & REGEX_ASCII_LITERAL)) && (c == '\\'))
3002+ if(flags & REGEX_ASCII_LITERAL)
3003+ {
3004+ unicode::code_point utf8c;
3005+ //bool is_end_line = false;
3006+ const char *temp_pattern = pattern;
3007+ utf8c = utf8::next_char(temp_pattern);
3008+ *atom_len = temp_pattern - pattern;
3009+ enum CHARGROUP_t multichar_type;
3010+ return create_charmatch(utf8c, pattern, *atom_len, &multichar_type);
3011+ }
3012+
3013+ char c = *pattern;
3014+ if(c == '\\')
3015 {
3016 //check for back reference
3017 if(myisdigit(pattern[(*atom_len)+1]))
3018@@ -281,13 +674,13 @@
3019 (*atom_len)++;
3020 if(pattern[*atom_len] == '0')
3021 {
3022- throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(U_REGEX_INVALID_BACK_REF)) );
3023+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_BACK_REF), 0, current_regex->subregex.size()) );
3024 }
3025 unsigned int backref = pattern[*atom_len] - '0';
3026 if((backref > current_regex->subregex.size()) ||
3027 (current_regex->subregex.at(backref-1)->flags != 0))
3028 {
3029- throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(U_REGEX_INVALID_BACK_REF)) );
3030+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_BACK_REF), backref, current_regex->subregex.size()) );
3031 }
3032 while(current_regex->subregex.size() >= backref*10)
3033 {
3034@@ -303,70 +696,86 @@
3035 break;
3036 }
3037 }
3038- return new CRegexAscii_backref(current_regex, backref);
3039+ (*atom_len)++;
3040+ return new CRegexXQuery_backref(current_regex, backref);
3041 }
3042 }
3043+ if(c == '^')
3044+ {
3045+ (*atom_len)++;
3046+ return new CRegexXQuery_pinstart(current_regex);
3047+ }
3048+ if((c == '}') || (c == '{') || (c == '?') || (c == '*') || (c == '+') || (c == '|'))
3049+ {
3050+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_ATOM_CHAR), c) );
3051+ }
3052 switch(c)
3053 {
3054 case '[':
3055 {
3056- if(!(flags & REGEX_ASCII_LITERAL))
3057- {
3058- (*atom_len)++;
3059- CRegexAscii_chargroup *chargroup = NULL;
3060- int chargroup_len;
3061- chargroup = readchargroup(pattern+*atom_len, &chargroup_len);
3062- *atom_len += chargroup_len;
3063- return chargroup;
3064- }
3065+ (*atom_len)++;
3066+ CRegexXQuery_chargroup *chargroup = NULL;
3067+ int chargroup_len;
3068+ chargroup = readchargroup(pattern+*atom_len, &chargroup_len);
3069+ *atom_len += chargroup_len;
3070+ return chargroup;
3071 }
3072 case '.'://WildCharEsc
3073 {
3074- if(!(flags & REGEX_ASCII_LITERAL))
3075- {
3076- CRegexAscii_wildchar *wildchar = new CRegexAscii_wildchar(current_regex);
3077- (*atom_len)++;
3078- return wildchar;
3079- }
3080+ (*atom_len)++;
3081+ return new CRegexXQuery_wildchar(current_regex);
3082 }
3083 case '('://begin an embedded reg exp
3084 {
3085- if(!(flags & REGEX_ASCII_LITERAL))
3086- {
3087- (*atom_len)++;
3088- CRegexAscii_regex *emb_regex = NULL;
3089- int regex_len;
3090- emb_regex = parse_regexp(pattern + *atom_len, &regex_len);
3091- *atom_len += regex_len;
3092- return emb_regex;
3093- }
3094+ (*atom_len)++;
3095+ CRegexXQuery_regex *emb_regex = NULL;
3096+ int regex_len;
3097+ emb_regex = parse_regexp(pattern + *atom_len, &regex_len);
3098+ *atom_len += regex_len;
3099+ return emb_regex;
3100 }
3101 case '$'://end line
3102- if(!(flags & REGEX_ASCII_LITERAL))
3103- {
3104- is_end_line = true;
3105- }
3106+ //is_end_line = true;
3107+ (*atom_len)++;
3108+ return new CRegexXQuery_endline(current_regex);
3109 default:
3110 {
3111- char c;
3112+ //char c;
3113+ CRegexXQuery_charmatch *charmatch = NULL;
3114 int c_len;
3115- bool is_multichar = false;
3116- if(!(flags & REGEX_ASCII_LITERAL))
3117- c = readChar(pattern+*atom_len, &c_len, &is_multichar);
3118- else
3119+ CHARGROUP_t multichar_type = CHARGROUP_NO_MULTICHAR;
3120+ *atom_len = 0;
3121+ while(pattern[*atom_len])
3122 {
3123- c = pattern[*atom_len];
3124- c_len = 1;
3125+ charmatch = readChar(pattern+*atom_len, &c_len, &multichar_type);
3126+ *atom_len += c_len;
3127+ if((flags & REGEX_ASCII_NO_WHITESPACE) && (multichar_type == CHARGROUP_FLAGS_ONECHAR_ASCII))
3128+ {
3129+ char c = (char)charmatch->get_c();
3130+ if((c == ' ') || (c == '\t') || (c == '\r') || (c == '\n'))
3131+ {
3132+ //ignore this whitespace
3133+ delete charmatch;
3134+ continue;
3135+ }
3136+ else
3137+ break;
3138+ }
3139+ else
3140+ break;
3141 }
3142- CRegexAscii_chargroup *chargroup = new CRegexAscii_chargroup(current_regex);
3143- if(is_multichar)
3144- chargroup->addMultiChar(c);
3145+ /*
3146+ std::auto_ptr<CRegexXQuery_chargroup> chargroup(new CRegexXQuery_chargroup(current_regex));
3147+ if(multichar_type)
3148+ chargroup->addMultiChar(c, multichar_type);
3149 else if(is_end_line)
3150 chargroup->addEndLine();
3151 else
3152- chargroup->addCharRange(c, c);
3153+ chargroup->addOneChar(c);
3154 *atom_len += c_len;
3155- return chargroup;
3156+ return chargroup.release();
3157+ */
3158+ return charmatch;
3159 }
3160 }
3161 }
3162@@ -374,81 +783,119 @@
3163 //read until ']'
3164 //posCharGroup ::= ( charRange | charClassEsc )+
3165 //charRange ::= seRange | XmlCharIncDash
3166-CRegexAscii_chargroup* CRegexAscii_parser::readchargroup(const char *pattern, int *chargroup_len)
3167+CRegexXQuery_chargroup* CRegexXQuery_parser::readchargroup(const char *pattern, int *chargroup_len)
3168 {
3169- CRegexAscii_chargroup *chargroup = NULL;
3170+ std::auto_ptr<CRegexXQuery_chargroup> chargroup;
3171 *chargroup_len = 0;
3172 if(pattern[*chargroup_len] == '^')//negative group
3173 {
3174 (*chargroup_len)++;
3175- chargroup = new CRegexAscii_negchargroup(current_regex);
3176+ chargroup.reset(new CRegexXQuery_negchargroup(current_regex));
3177 }
3178 else
3179- chargroup = new CRegexAscii_chargroup(current_regex);
3180+ chargroup.reset(new CRegexXQuery_chargroup(current_regex));
3181 while(pattern[*chargroup_len] && (pattern[*chargroup_len]!=']'))
3182 {
3183- char c1, c2;
3184- bool is_multichar;
3185+ //char c1, c2;
3186+ CHARGROUP_t multichar_type = CHARGROUP_NO_MULTICHAR;
3187 int c1_len;
3188- c1 = pattern[*chargroup_len];
3189- c2 = pattern[*chargroup_len+1];
3190- if((c1 == '-') && (c2 == '['))//charClassSub
3191+ if((pattern[*chargroup_len] == '-') && (pattern[(*chargroup_len)+1] == '['))//charClassSub
3192 {
3193 int classsub_len;
3194- CRegexAscii_chargroup *classsub = readchargroup(pattern + *chargroup_len+1 + 1, &classsub_len);
3195+ CRegexXQuery_chargroup *classsub = readchargroup(pattern + (*chargroup_len)+1 + 1, &classsub_len);
3196 if(!classsub)
3197 {
3198- delete chargroup;
3199- return NULL;
3200+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_SUBCLASS)) );
3201 }
3202 chargroup->addClassSub(classsub);
3203 *chargroup_len += 2 + classsub_len + 1;
3204 if(pattern[*chargroup_len-1] != ']')
3205 {
3206- delete chargroup;
3207- return NULL;
3208+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_USE_OF_SUBCLASS)) );
3209 }
3210- return chargroup;
3211+ return chargroup.release();
3212 }
3213
3214- c1 = readChar(pattern+*chargroup_len, &c1_len, &is_multichar);
3215- if(is_multichar)//first char is multichar
3216+ std::unique_ptr<CRegexXQuery_charmatch> charmatch(readChar(pattern+*chargroup_len, &c1_len, &multichar_type));
3217+ if((multichar_type == CHARGROUP_FLAGS_MULTICHAR_p) ||
3218+ (multichar_type == CHARGROUP_FLAGS_MULTICHAR_Is) ||
3219+ (multichar_type == CHARGROUP_FLAGS_MULTICHAR_OTHER))//first char is multichar
3220 {
3221- chargroup->addMultiChar(c1);
3222+ if((pattern[*chargroup_len+c1_len] == '-') &&///should not be a range
3223+ (pattern[*chargroup_len+c1_len+1] != ']'))
3224+ {
3225+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MULTICHAR_IN_CHAR_RANGE)) );
3226+ }
3227+ //chargroup->addMultiChar(c1, multichar_type);
3228+ chargroup->addCharMatch(charmatch.release());
3229 *chargroup_len += c1_len;
3230 continue;
3231 }
3232- if(pattern[*chargroup_len+c1_len] == '-')///might be a range
3233+ (*chargroup_len) += c1_len;
3234+ if(pattern[*chargroup_len] == '-')///might be a range
3235 {
3236- if(pattern[*chargroup_len+c1_len+1] == ']')//no range, just the last char is '-'
3237+ if(pattern[(*chargroup_len)+1] == ']')//no range, just the last char is '-'
3238 {
3239- chargroup->addCharRange(c1, c1);
3240- chargroup->addCharRange('-', '-');
3241- *chargroup_len += c1_len + 1;
3242+ //chargroup->addOneChar(c1);
3243+ //chargroup->addOneChar('-');
3244+ chargroup->addCharMatch(charmatch.release());
3245+ chargroup->addCharMatch(new CRegexXQuery_char_ascii(current_regex, '-'));
3246+ (*chargroup_len)++;
3247 continue;
3248 }
3249- else
3250+ else if(pattern[(*chargroup_len)+1] != '[')
3251 {
3252 //it is a range
3253- char c3;
3254- int c3_len;
3255- c3 = readChar(pattern+*chargroup_len+c1_len+1, &c3_len, &is_multichar);
3256- if(is_multichar)
3257- return NULL;//error
3258- chargroup->addCharRange(c1, c3);
3259- *chargroup_len += c1_len + 1 + c3_len;
3260+ (*chargroup_len)++;
3261+ std::unique_ptr<CRegexXQuery_charmatch> charmatch2;
3262+ CHARGROUP_t multichar_type2 = CHARGROUP_NO_MULTICHAR;
3263+ int c2_len;
3264+ charmatch2.reset(readChar(pattern+(*chargroup_len), &c2_len, &multichar_type2));
3265+ if((multichar_type2 != CHARGROUP_FLAGS_ONECHAR_ASCII) &&
3266+ (multichar_type2 != CHARGROUP_FLAGS_ONECHAR_ASCII))//second char in range is multichar
3267+ {
3268+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MULTICHAR_IN_CHAR_RANGE)) );
3269+ }
3270+ //chargroup->addCharRange(c1, c3);
3271+ if((multichar_type == CHARGROUP_FLAGS_ONECHAR_ASCII) && (multichar_type2 == CHARGROUP_FLAGS_ONECHAR_ASCII))
3272+ {
3273+ if(flags & REGEX_ASCII_CASE_INSENSITIVE)
3274+ chargroup->addCharMatch(new CRegexXQuery_char_range_ascii_i(current_regex,
3275+ (char)charmatch->get_c(),
3276+ (char)charmatch2->get_c()));
3277+ else
3278+ chargroup->addCharMatch(new CRegexXQuery_char_range_ascii(current_regex,
3279+ (char)charmatch->get_c(),
3280+ (char)charmatch2->get_c()));
3281+ }
3282+ else
3283+ {
3284+ if(flags & REGEX_ASCII_CASE_INSENSITIVE)
3285+ chargroup->addCharMatch(new CRegexXQuery_char_range_unicode_i(current_regex,
3286+ charmatch->get_c(),
3287+ charmatch2->get_c()));
3288+ else
3289+ chargroup->addCharMatch(new CRegexXQuery_char_range_unicode(current_regex,
3290+ charmatch->get_c(),
3291+ charmatch2->get_c()));
3292+ }
3293+ *chargroup_len += c2_len;
3294 continue;
3295 }
3296 }
3297- chargroup->addCharRange(c1, c1);
3298- *chargroup_len += c1_len;
3299+ //chargroup->addOneChar(c1);
3300+ chargroup->addCharMatch(charmatch.release());
3301 }
3302 if(pattern[*chargroup_len])
3303 (*chargroup_len)++;
3304- return chargroup;
3305+ else
3306+ {
3307+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MISSING_CLOSE_BRACKET)) );
3308+ }
3309+ return chargroup.release();
3310 }
3311
3312-void CRegexAscii_parser::read_quantifier(CRegexAscii_piece *piece,
3313+void CRegexXQuery_parser::read_quantifier(CRegexXQuery_piece *piece,
3314 const char *pattern, int *quantif_len)
3315 {
3316 *quantif_len = 0;
3317@@ -496,6 +943,10 @@
3318 max = max*10 + pattern[*quantif_len] - '0';
3319 (*quantif_len)++;
3320 }
3321+ if(max < min)
3322+ {
3323+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MAX_LT_MIN)) );
3324+ }
3325 piece->set_quantifier_min_max(min, max, true);
3326 }
3327 while(pattern[*quantif_len] && (pattern[*quantif_len] != '}'))
3328@@ -524,23 +975,25 @@
3329 ///Constructors and destructors and internal functions
3330 ////////////////////////////
3331
3332-CRegexAscii_regex::CRegexAscii_regex(CRegexAscii_regex *topregex) : IRegexAtom(topregex?topregex:this)
3333+CRegexXQuery_regex::CRegexXQuery_regex(CRegexXQuery_regex *topregex) : IRegexAtom(topregex?topregex:this)
3334 {
3335 matched_source = NULL;
3336 matched_len = 0;
3337+// backup_matched_source = NULL;
3338+// backup_matched_len = 0;
3339 flags = 128;//set to 0 after initialization
3340 }
3341
3342-CRegexAscii_regex::~CRegexAscii_regex()
3343+CRegexXQuery_regex::~CRegexXQuery_regex()
3344 {
3345- std::list<CRegexAscii_branch*>::iterator branch_it;
3346+ std::list<CRegexXQuery_branch*>::iterator branch_it;
3347
3348 for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++)
3349 {
3350 delete (*branch_it);
3351 }
3352 /*
3353- std::vector<CRegexAscii_regex*>::iterator subregex_it;
3354+ std::vector<CRegexXQuery_regex*>::iterator subregex_it;
3355 for(subregex_it = subregex.begin(); subregex_it != subregex.end(); subregex_it++)
3356 {
3357 delete (*subregex_it);
3358@@ -548,25 +1001,18 @@
3359 */
3360 }
3361
3362-bool CRegexAscii_regex::set_align_begin(bool align_begin)
3363-{
3364- bool prev_align = this->align_begin;
3365- this->align_begin = align_begin;
3366- return prev_align;
3367-}
3368-
3369-void CRegexAscii_regex::add_branch(CRegexAscii_branch *branch)
3370+void CRegexXQuery_regex::add_branch(CRegexXQuery_branch *branch)
3371 {
3372 branch_list.push_back(branch);
3373 }
3374
3375-bool CRegexAscii_regex::get_indexed_match(int index,
3376+bool CRegexXQuery_regex::get_indexed_match(int index,
3377 const char **matched_source,
3378 int *matched_len)
3379 {
3380 if(!index || index > (int)subregex.size())
3381 return false;
3382- CRegexAscii_regex *subr = subregex[index-1];
3383+ CRegexXQuery_regex *subr = subregex[index-1];
3384 *matched_source = subr->matched_source;
3385 if(!*matched_source)
3386 return false;
3387@@ -574,145 +1020,209 @@
3388 return true;
3389 }
3390
3391-unsigned int CRegexAscii_regex::get_indexed_regex_count()
3392+unsigned int CRegexXQuery_regex::get_indexed_regex_count()
3393 {
3394 return subregex.size();
3395 }
3396
3397-CRegexAscii_branch::CRegexAscii_branch(CRegexAscii_regex* regex) :
3398- IRegexMatcher(regex)
3399+CRegexXQuery_branch::CRegexXQuery_branch(CRegexXQuery_regex* regex)
3400+ //:
3401+ //IRegexMatcher(regex)
3402 {
3403 }
3404
3405-CRegexAscii_branch::~CRegexAscii_branch()
3406+CRegexXQuery_branch::~CRegexXQuery_branch()
3407 {
3408- std::list<CRegexAscii_piece*>::iterator piece_it;
3409+ std::list<RegexAscii_pieceinfo>::iterator piece_it;
3410
3411 for(piece_it = piece_list.begin(); piece_it != piece_list.end(); piece_it++)
3412 {
3413- delete (*piece_it);
3414+ delete (*piece_it).piece;
3415 }
3416 }
3417
3418-void CRegexAscii_branch::add_piece(CRegexAscii_piece *piece)
3419+void CRegexXQuery_branch::add_piece(CRegexXQuery_piece *piece)
3420 {
3421 piece_list.push_back(piece);
3422 }
3423
3424-CRegexAscii_piece::CRegexAscii_piece()
3425+CRegexXQuery_piece::CRegexXQuery_piece()
3426 {
3427+ atom = NULL;
3428+ regex_atom = NULL;
3429 }
3430
3431-CRegexAscii_piece::~CRegexAscii_piece()
3432+CRegexXQuery_piece::~CRegexXQuery_piece()
3433 {
3434 delete atom;
3435 }
3436
3437-void CRegexAscii_piece::set_atom(IRegexAtom *atom)
3438+void CRegexXQuery_piece::set_atom(IRegexAtom *atom)
3439 {
3440 this->atom = atom;
3441+ this->regex_atom = dynamic_cast<CRegexXQuery_regex*>(atom);
3442 }
3443
3444-void CRegexAscii_piece::set_quantifier_min_max(int min, int max, bool strict_max)
3445+void CRegexXQuery_piece::set_quantifier_min_max(int min, int max, bool strict_max)
3446 {
3447 this->min = min;
3448 this->max = max;
3449 this->strict_max = strict_max;
3450 }
3451-void CRegexAscii_piece::set_is_reluctant(bool is_reluctant)
3452+void CRegexXQuery_piece::set_is_reluctant(bool is_reluctant)
3453 {
3454 this->is_reluctant = is_reluctant;
3455 }
3456-void CRegexAscii_piece::get_quantifier(int *min, int *max, bool *strict_max)
3457+void CRegexXQuery_piece::get_quantifier(int *min, int *max, bool *strict_max)
3458 {
3459 *min = this->min;
3460 *max = this->max;
3461 *strict_max = this->strict_max;
3462 }
3463-bool CRegexAscii_piece::get_is_reluctant()
3464+bool CRegexXQuery_piece::get_is_reluctant()
3465 {
3466+ if(atom->regex_intern->flags & REGEX_ASCII_MINIMAL_MATCH)
3467+ return true;
3468 return is_reluctant;
3469 }
3470
3471
3472-CRegexAscii_chargroup::CRegexAscii_chargroup(CRegexAscii_regex* regex) :
3473+CRegexXQuery_charmatch::CRegexXQuery_charmatch(CRegexXQuery_regex* regex) :
3474+ IRegexAtom(regex)
3475+{
3476+}
3477+CRegexXQuery_multicharP::CRegexXQuery_multicharP(CRegexXQuery_regex* regex, char type, bool is_reverse) :
3478+ CRegexXQuery_charmatch(regex)
3479+{
3480+ this->multichar_type = type; this->is_reverse = is_reverse;
3481+}
3482+CRegexXQuery_multicharIs::CRegexXQuery_multicharIs(CRegexXQuery_regex* regex, int block_index, bool is_reverse) :
3483+ CRegexXQuery_charmatch(regex)
3484+{
3485+ this->block_index = block_index; this->is_reverse = is_reverse;
3486+}
3487+CRegexXQuery_multicharOther::CRegexXQuery_multicharOther(CRegexXQuery_regex* regex, char type) :
3488+ CRegexXQuery_charmatch(regex)
3489+{
3490+ this->multichar_type = type;
3491+}
3492+CRegexXQuery_char_ascii::CRegexXQuery_char_ascii(CRegexXQuery_regex* regex, char c) :
3493+ CRegexXQuery_charmatch(regex)
3494+{
3495+ this->c = c;
3496+}
3497+CRegexXQuery_char_ascii_i::CRegexXQuery_char_ascii_i(CRegexXQuery_regex* regex, char c) :
3498+ CRegexXQuery_char_ascii(regex, toupper(c))
3499+{
3500+}
3501+CRegexXQuery_char_range_ascii::CRegexXQuery_char_range_ascii(CRegexXQuery_regex* regex, char c1, char c2) :
3502+ CRegexXQuery_charmatch(regex)
3503+{
3504+ this->c1 = c1; this->c2 = c2;
3505+}
3506+CRegexXQuery_char_range_ascii_i::CRegexXQuery_char_range_ascii_i(CRegexXQuery_regex* regex, char c1, char c2) :
3507+ CRegexXQuery_char_range_ascii(regex, toupper(c1), toupper(c2))
3508+{
3509+}
3510+CRegexXQuery_char_unicode::CRegexXQuery_char_unicode(CRegexXQuery_regex* regex, const char *source, int len) :
3511+ CRegexXQuery_charmatch(regex)
3512+{
3513+ this->len = len;
3514+ memcpy(c, source, len);
3515+}
3516+CRegexXQuery_char_unicode_cp::CRegexXQuery_char_unicode_cp(CRegexXQuery_regex* regex, unicode::code_point c) :
3517+ CRegexXQuery_charmatch(regex)
3518+{
3519+ this->c = c;
3520+}
3521+CRegexXQuery_char_unicode_i::CRegexXQuery_char_unicode_i(CRegexXQuery_regex* regex, unicode::code_point c) :
3522+ CRegexXQuery_char_unicode_cp(regex, unicode::to_upper(c))
3523+{
3524+}
3525+CRegexXQuery_char_range_unicode::CRegexXQuery_char_range_unicode(CRegexXQuery_regex* regex, unicode::code_point c1, unicode::code_point c2) :
3526+ CRegexXQuery_charmatch(regex)
3527+{
3528+ this->c1 = c1; this->c2 = c2;
3529+}
3530+CRegexXQuery_char_range_unicode_i::CRegexXQuery_char_range_unicode_i(CRegexXQuery_regex* regex, unicode::code_point c1, unicode::code_point c2) :
3531+ CRegexXQuery_char_range_unicode(regex, unicode::to_upper(c1), unicode::to_upper(c2))
3532+{
3533+}
3534+CRegexXQuery_endline::CRegexXQuery_endline(CRegexXQuery_regex* regex) :
3535+ CRegexXQuery_charmatch(regex)
3536+{
3537+}
3538+
3539+unicode::code_point CRegexXQuery_char_unicode::get_c()
3540+{
3541+ const char *temp_c = (const char*)c;
3542+ return utf8::next_char(temp_c);
3543+}
3544+
3545+
3546+CRegexXQuery_chargroup::CRegexXQuery_chargroup(CRegexXQuery_regex* regex) :
3547 IRegexAtom(regex)
3548 {
3549 classsub = NULL;
3550 }
3551
3552-CRegexAscii_chargroup::~CRegexAscii_chargroup()
3553+CRegexXQuery_chargroup::~CRegexXQuery_chargroup()
3554 {
3555 delete classsub;
3556-}
3557-
3558-void CRegexAscii_chargroup::addMultiChar(char c)
3559-{
3560- chargroup_t cgt;
3561- cgt.flags = CHARGROUP_FLAGS_MULTICHAR;
3562- cgt.c1 = c;
3563- cgt.c2 = 0;
3564- chargroup_list.push_back(cgt);
3565-}
3566-
3567-void CRegexAscii_chargroup::addEndLine()
3568-{
3569- chargroup_t cgt;
3570- cgt.flags = CHARGROUP_FLAGS_ENDLINE;
3571- cgt.c1 = '$';
3572- cgt.c2 = 0;
3573- chargroup_list.push_back(cgt);
3574-}
3575-
3576-void CRegexAscii_chargroup::addCharRange(char c1, char c2)
3577-{
3578- chargroup_t cgt;
3579- cgt.flags = 0;
3580- cgt.c1 = c1;
3581- cgt.c2 = c2;
3582- chargroup_list.push_back(cgt);
3583-}
3584-
3585-void CRegexAscii_chargroup::addClassSub(CRegexAscii_chargroup* classsub)
3586+ std::list<CRegexXQuery_charmatch* >::iterator charmatch_it;
3587+ for(charmatch_it=chargroup_list.begin(); charmatch_it != chargroup_list.end(); charmatch_it++)
3588+ delete (*charmatch_it);
3589+}
3590+
3591+void CRegexXQuery_chargroup::addCharMatch(CRegexXQuery_charmatch *charmatch)
3592+{
3593+ chargroup_list.push_back(charmatch);
3594+}
3595+void CRegexXQuery_chargroup::addClassSub(CRegexXQuery_chargroup* classsub)
3596 {
3597 this->classsub = classsub;
3598 }
3599
3600-CRegexAscii_negchargroup::CRegexAscii_negchargroup(CRegexAscii_regex* regex) :
3601- CRegexAscii_chargroup(regex)
3602-{
3603-}
3604-
3605-CRegexAscii_negchargroup::~CRegexAscii_negchargroup()
3606-{
3607-}
3608-
3609-CRegexAscii_wildchar::CRegexAscii_wildchar(CRegexAscii_regex* regex) :
3610+CRegexXQuery_negchargroup::CRegexXQuery_negchargroup(CRegexXQuery_regex* regex) :
3611+ CRegexXQuery_chargroup(regex)
3612+{
3613+}
3614+
3615+CRegexXQuery_negchargroup::~CRegexXQuery_negchargroup()
3616+{
3617+}
3618+
3619+CRegexXQuery_wildchar::CRegexXQuery_wildchar(CRegexXQuery_regex* regex) :
3620 IRegexAtom(regex)
3621 {
3622 }
3623
3624-CRegexAscii_wildchar::~CRegexAscii_wildchar()
3625+CRegexXQuery_wildchar::~CRegexXQuery_wildchar()
3626 {
3627 }
3628
3629-CRegexAscii_backref::CRegexAscii_backref(CRegexAscii_regex* regex, unsigned int backref_) :
3630+CRegexXQuery_backref::CRegexXQuery_backref(CRegexXQuery_regex* regex, unsigned int backref_) :
3631 IRegexAtom(regex),
3632 backref(backref_)
3633 {
3634 }
3635
3636-CRegexAscii_backref::~CRegexAscii_backref()
3637-{
3638-}
3639-
3640-CRegexAscii_parser::CRegexAscii_parser()
3641+CRegexXQuery_backref::~CRegexXQuery_backref()
3642+{
3643+}
3644+
3645+CRegexXQuery_pinstart::CRegexXQuery_pinstart(CRegexXQuery_regex* regex):
3646+ IRegexAtom(regex)
3647+{
3648+}
3649+
3650+CRegexXQuery_parser::CRegexXQuery_parser()
3651 {
3652 current_regex = NULL;
3653 regex_depth = 0;
3654 }
3655
3656-CRegexAscii_parser::~CRegexAscii_parser()
3657+CRegexXQuery_parser::~CRegexXQuery_parser()
3658 {
3659 }
3660
3661@@ -720,9 +1230,68 @@
3662 //////////////////////////////////////////
3663 ////Matching the pattern on a string
3664 /////////////////////////////////////////
3665+static std::list<RegexAscii_pieceinfo> empty_pieces;//empty list of pieces
3666+/*
3667+std::list<RegexAscii_pieceinfo>::iterator
3668+IRegexAtom::choose_next_piece(const char *source, int *matched_len,
3669+ std::list<RegexAscii_pieceinfo>::iterator this_piece,
3670+ std::list<RegexAscii_pieceinfo>::iterator end_piece)
3671+{
3672+ //if this_piece is repetition, repeat until max, then go to next piece
3673+ int min, max;
3674+ bool strict_max;
3675+ while(this_piece != end_piece)
3676+ {
3677+ (*this_piece).piece->get_quantifier(&min, &max, &strict_max);
3678+ if(max <= ((*this_piece).nr_matches))//finished this piece
3679+ {
3680+ this_piece++;
3681+ }
3682+ else
3683+ break;
3684+ }
3685+ return this_piece;
3686+}
3687+*/
3688+
3689+bool IRegexAtom::match(const char *source, int *start_from_branch, int *matched_len,
3690+ std::list<RegexAscii_pieceinfo>::iterator this_piece,
3691+ std::list<RegexAscii_pieceinfo>::iterator end_piece)
3692+{
3693+ *start_from_branch = 0;
3694+ bool retmatch;
3695+ retmatch = match_internal(source, start_from_branch, matched_len);
3696+ if(!retmatch)
3697+ return false;
3698+
3699+ if(this_piece == end_piece)
3700+ return true;
3701+
3702+ (*this_piece).nr_matches++;
3703+ int min,max;
3704+ bool strict_max;
3705+ (*this_piece).piece->get_quantifier(&min, &max, &strict_max);
3706+ std::list<RegexAscii_pieceinfo>::iterator init_piece = this_piece;
3707+ if(((min == 1) && (max == 1)) || //the simple common case
3708+ ((*matched_len == 0) && ((*this_piece).nr_matches>=min)))//to avoid infinite loop
3709+ {
3710+ this_piece++;
3711+ if(this_piece == end_piece)
3712+ return true;
3713+ }
3714+ int matched_len2;
3715+ retmatch = (*this_piece).piece->match_piece(this_piece, end_piece, source + *matched_len, &matched_len2);
3716+ if(!retmatch)
3717+ {
3718+ (*init_piece).nr_matches--;
3719+ return false;
3720+ }
3721+ *matched_len += matched_len2;
3722+ return true;
3723+}
3724
3725 //try every position in source to match the pattern
3726-bool CRegexAscii_regex::match_anywhere(const char *source, unsigned int flags,
3727+bool CRegexXQuery_regex::match_anywhere(const char *source, unsigned int flags,
3728 int *match_pos, int *matched_len)
3729 {
3730 *match_pos = 0;
3731@@ -730,43 +1299,66 @@
3732 return match_from(source, flags, match_pos, matched_len);
3733 }
3734
3735-bool CRegexAscii_regex::match_from(const char *source, unsigned int flags,
3736+bool CRegexXQuery_regex::match_from(const char *source, unsigned int flags,
3737 int *match_pos, int *matched_len)
3738 {
3739 this->flags = flags;
3740+ this->source_start = source;
3741 reachedEnd = false;
3742
3743- std::vector<CRegexAscii_regex*>::iterator regex_it;
3744+ std::vector<CRegexXQuery_regex*>::iterator regex_it;
3745 for(regex_it = subregex.begin(); regex_it != subregex.end(); regex_it++)
3746 {
3747 (*regex_it)->matched_source = NULL;
3748 }
3749-// if(!source[0])
3750-// {
3751-// if(branch_list.empty())
3752-// return true;
3753-// else
3754-// return false;
3755-// }
3756-
3757- bool skip_first_match = false;
3758- if(*match_pos && align_begin)
3759- skip_first_match = true;
3760+
3761+ std::vector<std::pair<const char*, int> > saved_subregex;
3762+
3763+ if(*match_pos && (flags & REGEX_ASCII_WHOLE_MATCH))
3764+ return false;
3765+
3766 do
3767 {
3768- if(!skip_first_match)
3769- {
3770- if(match(source + *match_pos, matched_len))
3771- return true;
3772- }
3773- skip_first_match = false;
3774- if(align_begin)
3775+ int start_from_branch = 0;
3776+ int longest_match = -1;
3777+ while(1)
3778+ {
3779+ if(!match(source + *match_pos, &start_from_branch, matched_len, empty_pieces.begin(), empty_pieces.end()))
3780+ break;
3781+ if(longest_match < *matched_len)
3782+ {
3783+ longest_match = *matched_len;
3784+ if(start_from_branch && (flags & REGEX_ASCII_GET_LONGEST_BRANCH))
3785+ save_subregex_list(saved_subregex);
3786+ }
3787+ if(!start_from_branch || !(flags & REGEX_ASCII_GET_LONGEST_BRANCH))
3788+ break;
3789+ //else try the other branches to see which is longer
3790+ }
3791+ if(longest_match != -1)
3792+ {
3793+ *matched_len = longest_match;
3794+ if(saved_subregex.size())
3795+ load_subregex_list(saved_subregex);
3796+ if(flags & REGEX_ASCII_WHOLE_MATCH)
3797+ {
3798+ if(!source[*match_pos+*matched_len])
3799+ return true;
3800+ if((flags & REGEX_ASCII_MULTILINE) &&
3801+ ((source[*match_pos+*matched_len] == '\n') || (source[*match_pos+*matched_len] == '\r')))
3802+ return true;
3803+ return false;
3804+ }
3805+ return true;
3806+ }
3807+
3808+ if(flags & REGEX_ASCII_WHOLE_MATCH)
3809 {
3810 if(flags & REGEX_ASCII_MULTILINE)
3811 {
3812- //goto the next line
3813+ //go to next line
3814 while(source[*match_pos] && (source[*match_pos] != '\n') && (source[*match_pos] != '\r'))
3815- (*match_pos)++;
3816+ (*match_pos) += myutf8len(source);
3817 if(source[*match_pos] == '\n')
3818 {
3819 (*match_pos)++;
3820@@ -780,190 +1372,1039 @@
3821 (*match_pos)++;
3822 }
3823 if(!source[*match_pos])
3824- return false;
3825+ break;
3826 continue;
3827 }
3828- return false;
3829+ break;
3830 }
3831 if(!source[*match_pos])
3832 break;
3833- (*match_pos)++;
3834+ (*match_pos) += myutf8len(source);
3835 }
3836 while(source[*match_pos]);
3837+// if(!source[*match_pos])
3838+// {
3839+// reachedEnd = true;
3840+// }
3841 return false;
3842 }
3843
3844+void CRegexXQuery_regex::reset_match()
3845+{
3846+// this->backup_matched_source = this->matched_source;
3847+// this->backup_matched_len = this->matched_len;
3848+ this->matched_source = NULL;
3849+ this->matched_len = 0;
3850+ std::list<CRegexXQuery_branch*>::iterator branch_it;
3851+ for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++)
3852+ {
3853+ (*branch_it)->reset();
3854+ }
3855+}
3856+/*
3857+void CRegexXQuery_regex::restore_match()
3858+{
3859+ this->matched_source = this->backup_matched_source;
3860+ this->matched_len = this->backup_matched_len;
3861+ std::list<CRegexXQuery_branch*>::iterator branch_it;
3862+ for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++)
3863+ {
3864+ (*branch_it)->restore();
3865+ }
3866+}
3867+*/
3868 //match any of the branches
3869-bool CRegexAscii_regex::match(const char *source, int *matched_len)
3870+bool CRegexXQuery_regex::match(const char *source, int *start_from_branch, int *matched_len,
3871+ std::list<RegexAscii_pieceinfo>::iterator next_piece,
3872+ std::list<RegexAscii_pieceinfo>::iterator end_piece)
3873 {
3874 reachedEnd = false;
3875- std::list<CRegexAscii_branch*>::iterator branch_it;
3876-
3877- for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++)
3878- {
3879- if((*branch_it)->match(source, matched_len))
3880- {
3881- matched_source = source;
3882- this->matched_len = *matched_len;
3883+ if(!(flags & REGEX_ASCII_GROUPING_LEN_WHOLE_PIECE) ||
3884+ (this->matched_source == NULL) || ((this->matched_source + this->matched_len) != source))
3885+ this->matched_source = source;
3886+ *matched_len = 0;
3887+ std::list<CRegexXQuery_branch*>::iterator branch_it;
3888+
3889+ if(*start_from_branch == 0)
3890+ {
3891+ for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++)
3892+ {
3893+ (*branch_it)->reset();
3894+ }
3895+ }
3896+
3897+ branch_it = branch_list.begin();
3898+ if(*start_from_branch)
3899+ {
3900+ for(int i=0;i<*start_from_branch;i++)
3901+ branch_it++;
3902+ }
3903+ (*start_from_branch)++;
3904+ for(; branch_it != branch_list.end(); branch_it++,(*start_from_branch)++)
3905+ {
3906+ if((*branch_it)->match(source, matched_len, this, next_piece, end_piece))
3907+ {
3908+ //matched_source = source;
3909+ //this->matched_len = *matched_len;
3910 return true;
3911 }
3912 }
3913- matched_source = NULL;
3914- matched_len = 0;
3915+ *start_from_branch = 0;
3916+ if(this->matched_source == source)
3917+ this->matched_source = NULL;
3918+ *matched_len = 0;
3919 return false;
3920 }
3921
3922+void CRegexXQuery_regex::save_subregex_list(std::vector<std::pair<const char*, int> > &saved_subregex)
3923+{
3924+ saved_subregex.resize(0);
3925+ saved_subregex.reserve(subregex.size());
3926+ std::vector<CRegexXQuery_regex*>::iterator it;
3927+ for(it=subregex.begin(); it != subregex.end(); it++)
3928+ {
3929+ saved_subregex.push_back(std::pair<const char*, int>((*it)->matched_source, (*it)->matched_len));
3930+ }
3931+}
3932+
3933+void CRegexXQuery_regex::load_subregex_list(std::vector<std::pair<const char*, int> > &saved_subregex)
3934+{
3935+ std::vector<std::pair<const char*, int> >::iterator it;
3936+ std::vector<CRegexXQuery_regex*>::iterator subit;
3937+ for(it=saved_subregex.begin(), subit = subregex.begin(); it != saved_subregex.end(); it++, subit++)
3938+ {
3939+ (*subit)->matched_source = (*it).first;
3940+ (*subit)->matched_len = (*it).second;
3941+ }
3942+}
3943+
3944+void CRegexXQuery_branch::reset()
3945+{
3946+ std::list<RegexAscii_pieceinfo>::iterator piece_it;
3947+ for(piece_it = piece_list.begin(); piece_it != piece_list.end(); piece_it++)
3948+ {
3949+ (*piece_it).piece->atom->reset_match();
3950+ }
3951+}
3952+/*
3953+void CRegexXQuery_branch::restore()
3954+{
3955+ std::list<RegexAscii_pieceinfo>::iterator piece_it;
3956+ for(piece_it = piece_list.begin(); piece_it != piece_list.end(); piece_it++)
3957+ {
3958+ (*piece_it).piece->atom->restore_match();
3959+ }
3960+}
3961+*/
3962 //match all the pieces
3963-bool CRegexAscii_branch::match(const char *source, int *matched_len)
3964+bool CRegexXQuery_branch::match(const char *source, int *matched_len,
3965+ CRegexXQuery_regex* group_regex,
3966+ std::list<RegexAscii_pieceinfo>::iterator next_piece,
3967+ std::list<RegexAscii_pieceinfo>::iterator end_piece)
3968 {
3969- std::list<CRegexAscii_piece*>::iterator piece_it;
3970+ std::list<RegexAscii_pieceinfo>::iterator piece_it;
3971
3972 piece_it = piece_list.begin();
3973+ //if(piece_it == piece_list.end())
3974+ //if(!source[0])
3975+ // return true;
3976+ //else
3977+ // return false;
3978 if(piece_it == piece_list.end())
3979- if(source[0])
3980- return false;
3981+ {
3982+ piece_it = next_piece;
3983+ if(next_piece == end_piece)
3984+ {
3985+ group_regex->matched_len = 0;
3986+ return true;
3987+ }
3988+ }
3989+
3990+ std::list<RegexAscii_pieceinfo> temp_pieces(piece_list);
3991+ temp_pieces.push_back(group_regex);//this will be used to store the group match
3992+ temp_pieces.insert(temp_pieces.end(), next_piece, end_piece);
3993+
3994+ return (*piece_it).piece->match_piece(temp_pieces.begin(), temp_pieces.end(), source, matched_len);
3995+}
3996+
3997+bool CRegexXQuery_piece::match_piece(std::list<RegexAscii_pieceinfo>::iterator piece_it,
3998+ std::list<RegexAscii_pieceinfo>::iterator end_it,
3999+ const char *source, int *matched_len)
4000+{
4001+ if((*piece_it).nr_matches < 0)
4002+ {
4003+ //special case, store the group match
4004+ (*piece_it).group_regex->matched_len = source - (*piece_it).group_regex->matched_source;
4005+ piece_it++;
4006+ if(piece_it == end_it)
4007+ return true;
4008 else
4009- return true;
4010- if(!(*piece_it)->get_is_reluctant())
4011- return match_piece_iter_normal(piece_it, source, matched_len);
4012+ return (*piece_it).piece->match_piece(piece_it, end_it, source, matched_len);
4013+ }
4014+
4015+ if(!get_is_reluctant())
4016+ return match_piece_iter_normal(piece_it, end_it, source, matched_len);
4017 else
4018- return match_piece_iter_reluctant(piece_it, source, matched_len);
4019-}
4020-
4021-//match as less as possible
4022-bool CRegexAscii_branch::match_piece_iter_reluctant(
4023- std::list<CRegexAscii_piece*>::iterator piece_it,
4024+ return match_piece_iter_reluctant(piece_it, end_it, source, matched_len);
4025+}
4026+
4027+int CRegexXQuery_piece::choose_another_branch(std::vector<std::pair<int,int> > &match_lens)
4028+{
4029+ int i = match_lens.size()-1;
4030+ i--;
4031+ while((i >= 0) && (match_lens.at(i).second == 0))
4032+ i--;
4033+ if(i < 0)
4034+ return -1;//no more branches
4035+ match_lens.resize(i+1);
4036+ i++;
4037+ return i;
4038+}
4039+
4040+bool CRegexXQuery_piece::is_regex_atom()
4041+{
4042+ return regex_atom != NULL;
4043+}
4044+
4045+//match as less as possible (shortest string)
4046+bool CRegexXQuery_piece::match_piece_iter_reluctant(
4047+ std::list<RegexAscii_pieceinfo>::iterator piece_it,
4048+ std::list<RegexAscii_pieceinfo>::iterator end_it,
4049 const char *source, int *matched_len)
4050 {
4051 *matched_len = 0;
4052- if(piece_it == piece_list.end())
4053+ if(piece_it == end_it)
4054 return true;
4055
4056 int min, max;
4057 bool strict_max;
4058 //std::vector<int> match_lens;
4059- (*piece_it)->get_quantifier(&min, &max, &strict_max);
4060- if(strict_max && (max >= 0))
4061+ (*piece_it).piece->get_quantifier(&min, &max, &strict_max);
4062+
4063+ std::vector<std::pair<const char*, int> > saved_subregex;
4064+
4065+ if(is_regex_atom())
4066 {
4067- int timeslen;
4068- //check if the piece doesn't exceed the max match
4069- if((*piece_it)->match_piece_times(source, &timeslen, max+1, NULL))
4070- return false;///too many matches
4071+ //recursive
4072+ bool retmatch;
4073+ atom->regex_intern->save_subregex_list(saved_subregex);
4074+ if((*piece_it).nr_matches >= min)
4075+ {
4076+ //go to next piece
4077+ std::list<RegexAscii_pieceinfo>::iterator next_it = piece_it;
4078+ next_it++;
4079+ if(next_it == end_it)
4080+ return true;
4081+ retmatch = (*next_it).piece->match_piece(next_it, end_it, source, matched_len);
4082+ if(retmatch)
4083+ return true;
4084+ }
4085+ if(((max == -1) || ((*piece_it).nr_matches < max)) &&//try further with this piece
4086+ (((*piece_it).nr_matches < min) || ((*piece_it).nr_matches == 0) || ((*piece_it).piece->regex_atom->matched_len)))//if matched_len is zero, avoid infinite loop
4087+ {
4088+ int start_from_branch = 0;
4089+ int shortest_len = -1;
4090+ bool branch_saved = false;
4091+ //try all branches to get the shortest len
4092+ (*piece_it).nr_matches++;
4093+ while(atom->match(source, &start_from_branch, matched_len, piece_it, end_it))
4094+ {
4095+ if((shortest_len == -1) || (shortest_len > *matched_len))
4096+ {
4097+ shortest_len = *matched_len;
4098+ if(start_from_branch && (atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH))
4099+ {
4100+ atom->regex_intern->save_subregex_list(saved_subregex);
4101+ branch_saved = true;
4102+ }
4103+ }
4104+ if(!start_from_branch || !(atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH))
4105+ break;
4106+ }
4107+ if(shortest_len != -1)
4108+ {
4109+ *matched_len = shortest_len;
4110+ if(branch_saved)
4111+ atom->regex_intern->load_subregex_list(saved_subregex);
4112+ return true;
4113+ }
4114+ else
4115+ {
4116+ (*piece_it).nr_matches--;
4117+ atom->regex_intern->load_subregex_list(saved_subregex);
4118+ return false;
4119+ }
4120+ }
4121+ else
4122+ {
4123+ atom->regex_intern->load_subregex_list(saved_subregex);
4124+ return false;
4125+ }
4126 }
4127
4128- int i=min;
4129- std::list<CRegexAscii_piece*>::iterator next_it = piece_it;
4130+ int i=0;
4131+ int shortest_len = -1;
4132+ int otherpieces_shortest = -1;
4133+ int i_shortest = -1;
4134+ std::list<RegexAscii_pieceinfo>::iterator next_it = piece_it;
4135+ std::vector<std::pair<int,int> > match_lens;
4136 next_it++;
4137 int pieceslen = 0;
4138 while(1)
4139 {
4140- if((max > 0) && (i>max))
4141- break;
4142- int piecelen = 0;
4143- if((*piece_it)->match_piece_times(source+pieceslen, &piecelen, !pieceslen ? i : 1, NULL))
4144- {
4145- pieceslen += piecelen;
4146+ int piecelen = 0;
4147+ bool retmatch;
4148+ retmatch = match_piece_times(source, &piecelen, i < min ? min : i, &match_lens);
4149+ i = match_lens.size()-1;//number of matches
4150+ if(i<0)
4151+ i = 0;
4152+ if((i>=min))
4153+ {
4154+ pieceslen = piecelen;
4155+ if((shortest_len >= 0) && (shortest_len <= pieceslen))//this branch is longer
4156+ {//try another branch
4157+ i = choose_another_branch(match_lens);
4158+ if(i >= 0)
4159+ continue;//try another branch
4160+ else
4161+ break;
4162+ }
4163 int otherpieces = 0;
4164- if((next_it == piece_list.end()) ||
4165- ((*next_it)->get_is_reluctant() && match_piece_iter_reluctant(next_it, source+pieceslen, &otherpieces)) ||
4166- (!(*next_it)->get_is_reluctant() && match_piece_iter_normal(next_it, source+pieceslen, &otherpieces)))
4167- {
4168- *matched_len = pieceslen + otherpieces;
4169- return true;
4170- }
4171+ if((next_it == end_it) ||
4172+ (*next_it).piece->match_piece(next_it, end_it, source+pieceslen, &otherpieces)
4173+ )
4174+ {
4175+ if((i == pieceslen) || (match_lens.at(0).second == 0) ||//minimum achieved already, cannot go lower than that
4176+ !(atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH))
4177+ {
4178+ *matched_len = pieceslen + otherpieces;
4179+ return true;
4180+ }
4181+ if((shortest_len < 0) || (shortest_len > pieceslen))
4182+ {
4183+ shortest_len = pieceslen;
4184+ otherpieces_shortest = otherpieces;
4185+ i_shortest = i;
4186+ if(match_lens.at(0).second != 0)
4187+ atom->regex_intern->save_subregex_list(saved_subregex);
4188+ }
4189+ i = choose_another_branch(match_lens);
4190+ if(i >= 0)
4191+ continue;//try another branch
4192+ else
4193+ break;
4194+ }
4195+ else
4196+ {
4197+ //try further
4198+ if(retmatch)
4199+ {
4200+ i++;
4201+ if((max < 0) || (i<=max))
4202+ continue;
4203+ i--;
4204+ }
4205+ }
4206+ }
4207+
4208+ if(i==0)
4209+ {
4210+ break;
4211 }
4212 else
4213- break;
4214- i++;
4215+ {
4216+ i = choose_another_branch(match_lens);
4217+ if(i >= 0)
4218+ continue;//try another branch
4219+ else
4220+ break;
4221+ }
4222 }
4223
4224+ if(shortest_len >= 0)
4225+ {
4226+ if(strict_max && (max>=0) && (i_shortest > max))
4227+ return false;
4228+ *matched_len = shortest_len + otherpieces_shortest;
4229+ if(saved_subregex.size())
4230+ atom->regex_intern->load_subregex_list(saved_subregex);
4231+ return true;
4232+ }
4233 return false;
4234 }
4235
4236 //match as much as possible
4237-bool CRegexAscii_branch::match_piece_iter_normal(
4238- std::list<CRegexAscii_piece*>::iterator piece_it,
4239+bool CRegexXQuery_piece::match_piece_iter_normal(
4240+ std::list<RegexAscii_pieceinfo>::iterator piece_it,
4241+ std::list<RegexAscii_pieceinfo>::iterator end_it,
4242 const char *source, int *matched_len)
4243 {
4244 *matched_len = 0;
4245
4246 int min, max;
4247 bool strict_max;
4248- std::vector<int> match_lens;
4249- (*piece_it)->get_quantifier(&min, &max, &strict_max);
4250- int timeslen;
4251- if(strict_max && (max >= 0))
4252+ std::vector<std::pair<int,int> > match_lens;
4253+ (*piece_it).piece->get_quantifier(&min, &max, &strict_max);
4254+ int timeslen = 0;
4255+ std::vector<std::pair<const char*, int> > saved_subregex;
4256+
4257+ if(is_regex_atom())
4258 {
4259- //check if the piece doesn't exceed the max match
4260- //if((*piece_it)->match_piece_times(source, &timeslen, max+1, &match_lens))
4261- // return false;///too many matches
4262- (*piece_it)->match_piece_times(source, &timeslen, max, &match_lens);
4263+ //recursive
4264+ bool retmatch;
4265+ atom->regex_intern->save_subregex_list(saved_subregex);
4266+ if(((max == -1) || ((*piece_it).nr_matches < max)) && //try further with this piece
4267+ (((*piece_it).nr_matches < min) || ((*piece_it).nr_matches == 0) || ((*piece_it).piece->regex_atom->matched_len)))//if matched_len is zero, avoid infinite loop
4268+ {
4269+ int start_from_branch = 0;
4270+ int longest_len = -1;
4271+ bool branch_saved = false;
4272+ //try all branches to get the longest len
4273+ (*piece_it).nr_matches++;
4274+ while(atom->match(source, &start_from_branch, matched_len, piece_it, end_it))
4275+ {
4276+ if((longest_len < *matched_len))
4277+ {
4278+ longest_len = *matched_len;
4279+ if(start_from_branch && (atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH))
4280+ {
4281+ atom->regex_intern->save_subregex_list(saved_subregex);
4282+ branch_saved = true;
4283+ }
4284+ }
4285+ if(!start_from_branch || !(atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH))
4286+ break;
4287+ }
4288+ if(longest_len != -1)
4289+ {
4290+ *matched_len = longest_len;
4291+ if(branch_saved)
4292+ atom->regex_intern->load_subregex_list(saved_subregex);
4293+ return true;
4294+ }
4295+ else
4296+ {
4297+ atom->regex_intern->load_subregex_list(saved_subregex);
4298+ (*piece_it).nr_matches--;
4299+ }
4300+ }
4301+ if((*piece_it).nr_matches >= min)
4302+ {
4303+ //go to next piece
4304+ std::list<RegexAscii_pieceinfo>::iterator next_it = piece_it;
4305+ next_it++;
4306+ if(next_it == end_it)
4307+ return true;
4308+ retmatch = (*next_it).piece->match_piece(next_it, end_it, source, matched_len);
4309+ if(!retmatch)
4310+ atom->regex_intern->load_subregex_list(saved_subregex);
4311+ return retmatch;
4312+ }
4313+ else
4314+ {
4315+ // regex_atom->restore_match();
4316+ atom->regex_intern->load_subregex_list(saved_subregex);
4317+ return false;
4318+ }
4319 }
4320- else if(!strict_max && (max >= 0))
4321- (*piece_it)->match_piece_times(source, &timeslen, max, &match_lens);
4322- else
4323- (*piece_it)->match_piece_times(source, &timeslen, -1, &match_lens);
4324
4325- int i;
4326- std::list<CRegexAscii_piece*>::iterator next_it = piece_it;
4327+ int longest_len = -1;
4328+ int otherpieces_longest = -1;
4329+ int i_longest = -1;
4330+ int i = max;
4331+ std::list<RegexAscii_pieceinfo>::iterator next_it = piece_it;
4332 next_it++;
4333- if(next_it == piece_list.end())
4334+
4335+ bool retmatch;
4336+ while(1)
4337 {
4338- if((int)match_lens.size() > min)
4339- {
4340- *matched_len = timeslen;
4341- return true;
4342+ retmatch = match_piece_times(source, &timeslen, i, &match_lens);
4343+ i=match_lens.size()-1;//number of matches
4344+ if((i>=min))
4345+ {
4346+ if(timeslen < longest_len)
4347+ {//this branch is no use
4348+ i = choose_another_branch(match_lens);
4349+ if(i >= 0)
4350+ {
4351+ i = max;
4352+ continue;//try another branch
4353+ }
4354+ else
4355+ break;
4356+ }
4357+ //int piecelen = 0;
4358+ int otherpieces = 0;
4359+ if((next_it == end_it) ||
4360+ (*next_it).piece->match_piece(next_it, end_it, source+timeslen, &otherpieces)
4361+ )
4362+ {
4363+ if(timeslen > longest_len)
4364+ {
4365+ longest_len = timeslen;
4366+ otherpieces_longest = otherpieces;
4367+ i_longest = i;
4368+ if(!(atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH))
4369+ {
4370+ *matched_len = longest_len + otherpieces_longest;
4371+ return true;
4372+ }
4373+ else
4374+ {
4375+ if(match_lens.at(0).second)
4376+ atom->regex_intern->save_subregex_list(saved_subregex);
4377+ }
4378+ }
4379+ }
4380+ else
4381+ {
4382+ if(!match_lens.at(0).second)
4383+ {
4384+ match_lens.resize(match_lens.size()-1);
4385+ i--;
4386+ if(i >= 0)
4387+ continue;//try smaller
4388+ else
4389+ break;
4390+ }
4391+ else
4392+ {
4393+ i = choose_another_branch(match_lens);
4394+ if(i >= 0)
4395+ continue;//try another branch
4396+ else
4397+ break;
4398+ }
4399+ }
4400+ }
4401+ //now try another branch
4402+ i = choose_another_branch(match_lens);
4403+ if(i >= 0)
4404+ {
4405+ i = max;
4406+ continue;//try another branch
4407 }
4408 else
4409- return false;
4410- }
4411- for(i=match_lens.size()-1; i>=min; i--)
4412+ break;
4413+ }//end while
4414+
4415+ if(longest_len >= 0)
4416 {
4417- int piecelen = 0;
4418- int otherpieces = 0;
4419- if(((*next_it)->get_is_reluctant() && match_piece_iter_reluctant(next_it, source+match_lens[i]+piecelen, &otherpieces)) ||
4420- (!(*next_it)->get_is_reluctant() && match_piece_iter_normal(next_it, source+match_lens[i]+piecelen, &otherpieces)))
4421- {
4422- *matched_len = match_lens[i] + piecelen + otherpieces;
4423- return true;
4424- }
4425+ *matched_len = longest_len + otherpieces_longest;
4426+ if(saved_subregex.size())
4427+ atom->regex_intern->load_subregex_list(saved_subregex);
4428+ return true;
4429 }
4430
4431 return false;
4432 }
4433
4434-bool CRegexAscii_piece::match_piece_times(const char *source,
4435+bool CRegexXQuery_piece::match_piece_times(const char *source,
4436 int *piecelen,
4437 int times,
4438- std::vector<int> *match_lens)
4439+ std::vector<std::pair<int,int> > *match_lens)
4440 {
4441- *piecelen = 0;
4442- for(int i=0;(times < 0) || (i<times);i++)
4443- {
4444+ int i=0;
4445+ if(match_lens && match_lens->size())
4446+ {
4447+ i = match_lens->size()-1;
4448+ }
4449+ if(match_lens && match_lens->size())
4450+ *piecelen = match_lens->at(match_lens->size()-1).first;
4451+ else
4452+ *piecelen = 0;
4453+ if((times >= 0) && (i>=times))
4454+ return true;
4455+ for(;(times < 0) || (i<times);i++)
4456+ {
4457+ int atomlen;
4458+ int start_from_branch = 0;
4459+ if(match_lens && (i<(int)match_lens->size()))
4460+ start_from_branch = match_lens->at(i).second;
4461+ bool first_branch = (start_from_branch == 0);
4462+ if(!atom->match(source+*piecelen, &start_from_branch, &atomlen, empty_pieces.begin(), empty_pieces.end()))
4463+ {
4464+ if(match_lens)
4465+ {
4466+ if(i >= (int)match_lens->size())
4467+ match_lens->push_back(std::pair<int,int>(*piecelen, 0));
4468+ else
4469+ (*match_lens)[i] = std::pair<int,int>(*piecelen, 0);
4470+ }
4471+ return false;
4472+ }
4473 if(match_lens)
4474- match_lens->push_back(*piecelen);
4475- int atomlen;
4476- if(!atom->match(source+*piecelen, &atomlen))
4477- return false;
4478+ {
4479+ if(i >= (int)match_lens->size())
4480+ match_lens->push_back(std::pair<int,int>(*piecelen, start_from_branch));
4481+ else
4482+ (*match_lens)[i] = std::pair<int,int>(*piecelen, start_from_branch);
4483+ }
4484 *piecelen += atomlen;
4485 if(!atomlen && !source[*piecelen])
4486 {
4487- atom->regex_intern->reachedEnd = true;
4488+ // atom->regex_intern->set_reachedEnd(source);
4489+ break;
4490+ }
4491+ if(first_branch && (atomlen == 0))//avoid infinite loop
4492+ {
4493 break;
4494 }
4495 }
4496 if(match_lens)
4497- match_lens->push_back(*piecelen);
4498+ {
4499+ // if(i >= match_lens->size())
4500+ match_lens->push_back(std::pair<int,int>(*piecelen, 0));
4501+ // else
4502+ // (*match_lens)[i] = std::pair<int,int>(*piecelen, 0);
4503+ }
4504
4505 return true;
4506 }
4507
4508+bool CRegexXQuery_multicharP::match_internal(const char *source, int *start_from_branch, int *matched_len)
4509+{
4510+ if(!source[0])
4511+ {
4512+ regex_intern->set_reachedEnd(source);
4513+ return false;
4514+ }
4515+ bool found = false;
4516+ const char *temp_source = source;
4517+ unicode::code_point utf8c = utf8::next_char(temp_source);
4518+ switch(multichar_type)
4519+ {
4520+ case unicode::UNICODE_Ll + 50:
4521+ if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Ll) ||
4522+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Lm) ||
4523+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Lo) ||
4524+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Lt) ||
4525+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Lu))
4526+ {
4527+ if(!is_reverse)
4528+ found = true;
4529+ }
4530+ else
4531+ {
4532+ if(is_reverse)
4533+ found = true;
4534+ }
4535+ break;
4536+ case unicode::UNICODE_Mc + 50:
4537+ if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Mn) ||
4538+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Mc) ||
4539+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Me))
4540+ {
4541+ if(!is_reverse)
4542+ found = true;
4543+ }
4544+ else
4545+ {
4546+ if(is_reverse)
4547+ found = true;
4548+ }
4549+ break;
4550+ case unicode::UNICODE_Nd + 50:
4551+ if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Nd) ||
4552+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Nl) ||
4553+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_No))
4554+ {
4555+ if(!is_reverse)
4556+ found = true;
4557+ }
4558+ else
4559+ {
4560+ if(is_reverse)
4561+ found = true;
4562+ }
4563+ break;
4564+ case unicode::UNICODE_Pc + 50:
4565+ if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pc) ||
4566+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pd) ||
4567+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Ps) ||
4568+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pe) ||
4569+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pi) ||
4570+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pf) ||
4571+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Po))
4572+ {
4573+ if(!is_reverse)
4574+ found = true;
4575+ }
4576+ else
4577+ {
4578+ if(is_reverse)
4579+ found = true;
4580+ }
4581+ break;
4582+ case unicode::UNICODE_Zl + 50:
4583+ if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zs) ||
4584+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zl) ||
4585+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zp))
4586+ {
4587+ if(!is_reverse)
4588+ found = true;
4589+ }
4590+ else
4591+ {
4592+ if(is_reverse)
4593+ found = true;
4594+ }
4595+ break;
4596+ case unicode::UNICODE_Sc + 50:
4597+ if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Sm) ||
4598+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Sc) ||
4599+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Sk) ||
4600+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_So))
4601+ {
4602+ if(!is_reverse)
4603+ found = true;
4604+ }
4605+ else
4606+ {
4607+ if(is_reverse)
4608+ found = true;
4609+ }
4610+ break;
4611+ case unicode::UNICODE_Cc + 50:
4612+ if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Cc) ||
4613+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Cf) ||
4614+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Co))//ignore unicode::UNICODE_Cn
4615+ {
4616+ if(!is_reverse)
4617+ found = true;
4618+ }
4619+ else
4620+ {
4621+ if(is_reverse)
4622+ found = true;
4623+ }
4624+ break;
4625+ default:
4626+ if(unicode::check_codepoint_category(utf8c, (unicode::category)multichar_type))
4627+ {
4628+ if(!is_reverse)
4629+ found = true;
4630+ }
4631+ else
4632+ {
4633+ if(is_reverse)
4634+ found = true;
4635+ }
4636+ break;
4637+ }
4638+
4639+ if(found)
4640+ {
4641+ *matched_len = temp_source - source;
4642+ }
4643+ return found;
4644+}
4645+
4646+bool CRegexXQuery_multicharIs::match_internal(const char *source, int *start_from_branch, int *matched_len)
4647+{
4648+ if(!source[0])
4649+ {
4650+ regex_intern->set_reachedEnd(source);
4651+ return false;
4652+ }
4653+ bool found = false;
4654+ const char *temp_source = source;
4655+ unicode::code_point utf8c = utf8::next_char(temp_source);
4656+ const unicode::code_point *cp = block_escape[block_index].cp;
4657+ if((utf8c >= cp[0]) && (utf8c <= cp[1]))
4658+ {
4659+ if(!is_reverse)
4660+ found = true;
4661+ }
4662+ else if(block_escape[block_index].ext_cp)
4663+ {
4664+ cp = block_escape[block_index].ext_cp;
4665+ while(*cp)
4666+ {
4667+ if((utf8c >= cp[0]) && (utf8c <= cp[1]))
4668+ break;
4669+ cp += 2;
4670+ }
4671+ if(*cp)
4672+ {
4673+ if(!is_reverse)
4674+ found = true;
4675+ }
4676+ else
4677+ {
4678+ if(is_reverse)
4679+ found = true;
4680+ }
4681+ }
4682+ else
4683+ {
4684+ if(is_reverse)
4685+ found = true;
4686+ }
4687+ if(found)
4688+ {
4689+ *matched_len = temp_source - source;
4690+ }
4691+ return found;
4692+}
4693+
4694+bool CRegexXQuery_multicharOther::match_internal(const char *source, int *start_from_branch, int *matched_len)
4695+{
4696+ if(!source[0])
4697+ {
4698+ regex_intern->set_reachedEnd(source);
4699+ return false;
4700+ }
4701+ bool found = false;
4702+ bool value_true = true;
4703+ const char *temp_source = source;
4704+ unicode::code_point utf8c = utf8::next_char(temp_source);
4705+ switch(multichar_type)
4706+ {
4707+ case 'S':value_true = false;//[^\s]
4708+ case 's'://[#x20\t\n\r]
4709+ switch(utf8c)
4710+ {
4711+ case '\t':
4712+ case '\r':
4713+ case '\n':
4714+ case ' ':
4715+ found = true;
4716+ default:
4717+ break;
4718+ }
4719+ break;
4720+ case 'I':value_true = false;//[^\i]
4721+ case 'i'://the set of initial name characters, those matched by Letter | '_' | ':'
4722+ if((utf8c == '_') ||
4723+ (utf8c == ':') ||
4724+ XQCharType::isLetter(utf8c))
4725+ {
4726+ found = true;
4727+ }
4728+ break;
4729+ case 'C':value_true = false;//[^\c]
4730+ case 'c'://the set of name characters, those matched by NameChar
4731+ if(XQCharType::isNameChar(utf8c))
4732+ {
4733+ found = true;
4734+ }
4735+ break;
4736+ case 'D':value_true = false;//[^\d]
4737+ case 'd':
4738+ if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Nd))
4739+ found = true;
4740+ break;
4741+ case 'W':value_true = false;//[^\w]
4742+ case 'w':
4743+ found = !(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pc) ||
4744+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pd) ||
4745+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Ps) ||
4746+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pe) ||
4747+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pi) ||
4748+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pf) ||
4749+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Po) ||
4750+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zs) ||
4751+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zl) ||
4752+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zp) ||
4753+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Cc) ||
4754+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Cf) ||
4755+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Co));//ignore unicode::UNICODE_Cn
4756+ break;
4757+ default:
4758+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(source, ZED(REGEX_UNIMPLEMENTED)) );
4759+ }
4760+ if((found && value_true) || (!found && !value_true))
4761+ {
4762+ *matched_len = temp_source - source;
4763+ return true;
4764+ }
4765+ else
4766+ {
4767+ return false;
4768+ }
4769+}
4770+
4771+bool CRegexXQuery_char_ascii::match_internal(const char *source, int *start_from_branch, int *matched_len)
4772+{
4773+ if(!source[0])
4774+ {
4775+ regex_intern->set_reachedEnd(source);
4776+ return false;
4777+ }
4778+ if(source[0] == c)
4779+ {
4780+ *matched_len = 1;
4781+ return true;
4782+ }
4783+ else
4784+ return false;
4785+}
4786+
4787+bool CRegexXQuery_char_ascii_i::match_internal(const char *source, int *start_from_branch, int *matched_len)
4788+{
4789+ if(!source[0])
4790+ {
4791+ regex_intern->set_reachedEnd(source);
4792+ return false;
4793+ }
4794+ char sup = toupper(source[0]);
4795+ if(sup == c)
4796+ {
4797+ *matched_len = 1;
4798+ return true;
4799+ }
4800+ else
4801+ return false;
4802+}
4803+
4804+bool CRegexXQuery_char_range_ascii::match_internal(const char *source, int *start_from_branch, int *matched_len)
4805+{
4806+ if(!source[0])
4807+ {
4808+ regex_intern->set_reachedEnd(source);
4809+ return false;
4810+ }
4811+ if((source[0] >= c1) && (source[0] <= c2))
4812+ {
4813+ *matched_len = 1;
4814+ return true;
4815+ }
4816+ else
4817+ return false;
4818+}
4819+
4820+bool CRegexXQuery_char_range_ascii_i::match_internal(const char *source, int *start_from_branch, int *matched_len)
4821+{
4822+ if(!source[0])
4823+ {
4824+ regex_intern->set_reachedEnd(source);
4825+ return false;
4826+ }
4827+ char sup = toupper(source[0]);
4828+ if((sup >= c1) && (sup <= c2))
4829+ {
4830+ *matched_len = 1;
4831+ return true;
4832+ }
4833+ else
4834+ return false;
4835+}
4836+
4837+bool CRegexXQuery_char_unicode::match_internal(const char *source, int *start_from_branch, int *matched_len)
4838+{
4839+ if(!source[0])
4840+ {
4841+ regex_intern->set_reachedEnd(source);
4842+ return false;
4843+ }
4844+ if(!memcmp(source, c, len))
4845+ {
4846+ *matched_len = len;
4847+ return true;
4848+ }
4849+ else
4850+ return false;
4851+}
4852+
4853+bool CRegexXQuery_char_unicode_cp::match_internal(const char *source, int *start_from_branch, int *matched_len)
4854+{
4855+ if(!source[0])
4856+ {
4857+ regex_intern->set_reachedEnd(source);
4858+ return false;
4859+ }
4860+ const char *temp_source = source;
4861+ unicode::code_point utf8c = utf8::next_char(temp_source);
4862+ if(utf8c == c)
4863+ {
4864+ *matched_len = temp_source - source;
4865+ return true;
4866+ }
4867+ else
4868+ return false;
4869+}
4870+
4871+bool CRegexXQuery_char_unicode_i::match_internal(const char *source, int *start_from_branch, int *matched_len)
4872+{
4873+ if(!source[0])
4874+ {
4875+ regex_intern->set_reachedEnd(source);
4876+ return false;
4877+ }
4878+ const char *temp_source = source;
4879+ unicode::code_point sup = unicode::to_upper(utf8::next_char(temp_source));
4880+ if(sup == c)
4881+ {
4882+ *matched_len = temp_source - source;
4883+ return true;
4884+ }
4885+ else
4886+ return false;
4887+}
4888+
4889+bool CRegexXQuery_char_range_unicode::match_internal(const char *source, int *start_from_branch, int *matched_len)
4890+{
4891+ if(!source[0])
4892+ {
4893+ regex_intern->set_reachedEnd(source);
4894+ return false;
4895+ }
4896+ const char *temp_source = source;
4897+ unicode::code_point utf8c = utf8::next_char(temp_source);
4898+ if((utf8c >= c1) && (utf8c <= c2))
4899+ {
4900+ *matched_len = temp_source - source;
4901+ return true;
4902+ }
4903+ else
4904+ return false;
4905+}
4906+
4907+bool CRegexXQuery_char_range_unicode_i::match_internal(const char *source, int *start_from_branch, int *matched_len)
4908+{
4909+ if(!source[0])
4910+ {
4911+ regex_intern->set_reachedEnd(source);
4912+ return false;
4913+ }
4914+ const char *temp_source = source;
4915+ unicode::code_point sup = unicode::to_upper(utf8::next_char(temp_source));
4916+ if((sup >= c1) && (sup <= c2))
4917+ {
4918+ *matched_len = temp_source - source;
4919+ return true;
4920+ }
4921+ else
4922+ return false;
4923+}
4924+
4925+bool CRegexXQuery_endline::match_internal(const char *source, int *start_from_branch, int *matched_len)
4926+{
4927+ *matched_len = 0;
4928+ if(!source[0])
4929+ {
4930+ // regex_intern->reachedEnd = true;
4931+ return true;
4932+ }
4933+ if((source[0] == 0x0A) || ((source[0] == 0x0D) && (source[1] == 0x0A)))
4934+ {
4935+ if(regex_intern->get_flags() & REGEX_ASCII_MULTILINE)
4936+ {
4937+ // regex_intern->reachedEnd = true;
4938+ return true;
4939+ }
4940+ }
4941+ return false;
4942+}
4943+
4944+
4945 //match any of chargroups
4946-bool CRegexAscii_chargroup::match(const char *source, int *matched_len)
4947+bool CRegexXQuery_chargroup::match_internal(const char *source, int *start_from_branch, int *matched_len)
4948 {
4949 *matched_len = 0;
4950- std::list<chargroup_t>::iterator cgt_it;
4951-
4952+ std::list<CRegexXQuery_charmatch* >::iterator cgt_it;
4953+/*
4954 if(!source[0])
4955 {
4956 regex_intern->reachedEnd = true;
4957@@ -975,113 +2416,21 @@
4958 return false;
4959 }
4960
4961- if(source[0] == 0x0A)
4962+ if((source[0] == 0x0A) || ((source[0] == 0x0D) && (source[1] == 0x0A)))
4963 {
4964 if((regex_intern->flags & REGEX_ASCII_MULTILINE) &&
4965 (chargroup_list.size() == 1) && (chargroup_list.begin()->flags == CHARGROUP_FLAGS_ENDLINE))
4966 {
4967- *matched_len = 1;
4968+ // *matched_len = 1;
4969 return true;
4970 }
4971 }
4972-
4973+*/
4974+ //bool found = false;
4975 for(cgt_it = chargroup_list.begin(); cgt_it != chargroup_list.end(); cgt_it++)
4976 {
4977- if(cgt_it->flags == CHARGROUP_FLAGS_MULTICHAR)
4978- {
4979- switch(cgt_it->c1)
4980- {
4981- case 'p'://catEsc
4982- case 'P'://complEsc
4983- //ignore the prop for now
4984- throw XQUERY_EXCEPTION( err::FORX0002 );
4985- case 's'://[#x20\t\n\r]
4986- switch(source[0])
4987- {
4988- case '\t':
4989- case '\r':
4990- case '\n':
4991- case ' ':
4992- *matched_len = 1;
4993- return true;
4994- default:
4995- return false;
4996- }
4997- case 'S'://[^\s]
4998- switch(source[0])
4999- {
5000- case 0:
The diff has been truncated for viewing.

Subscribers

People subscribed via source and target branches