Merge lp:~zorba-coders/zorba/no_unicode into lp:zorba

Proposed by Paul J. Lucas
Status: Superseded
Proposed branch: lp:~zorba-coders/zorba/no_unicode
Merge into: lp:zorba
Diff against target: 9029 lines (+3908/-1422)
270 files modified
CMakeConfiguration.txt (+5/-5)
CMakeLists.txt (+6/-2)
ChangeLog (+7/-0)
KNOWN_ISSUES.txt (+1/-1)
doc/cxx/examples/context.cpp (+4/-0)
include/zorba/config.h.cmake (+3/-1)
include/zorba/static_context.h (+4/-0)
include/zorba/util/time.h (+1/-1)
src/CMakeLists.txt (+4/-0)
src/api/serialization/serializer.cpp (+36/-33)
src/api/serialization/serializer.h (+2/-4)
src/diagnostics/diagnostic_en.xml (+116/-27)
src/diagnostics/pregenerated/dict_en.cpp (+98/-20)
src/precompiled/stdafx.h (+74/-356)
src/runtime/full_text/CMakeLists.txt (+3/-3)
src/runtime/full_text/default_tokenizer.cpp (+4/-4)
src/runtime/full_text/latin_tokenizer.cpp (+3/-2)
src/runtime/full_text/latin_tokenizer.h (+9/-8)
src/runtime/numerics/format_integer_impl.cpp (+1/-1)
src/runtime/numerics/numerics_impl.cpp (+1/-1)
src/runtime/strings/strings_impl.cpp (+58/-20)
src/store/api/store.h (+1/-1)
src/store/naive/simple_store.h (+7/-3)
src/store/naive/store.cpp (+1/-1)
src/store/naive/store.h (+12/-11)
src/system/globalenv.cpp (+7/-7)
src/unit_tests/CMakeLists.txt (+2/-2)
src/unit_tests/string.cpp (+8/-0)
src/unit_tests/unit_test_list.h (+2/-2)
src/unit_tests/unit_tests.cpp (+2/-2)
src/util/CMakeLists.txt (+4/-4)
src/util/icu_streambuf.h (+1/-0)
src/util/passthru_streambuf.cpp (+2/-2)
src/util/passthru_streambuf.h (+10/-2)
src/util/regex.cpp (+96/-82)
src/util/regex.h (+22/-34)
src/util/regex_xquery.cpp (+1860/-489)
src/util/regex_xquery.h (+359/-123)
src/util/transcode_streambuf.h (+5/-5)
src/util/unicode_categories.cpp (+3/-3)
src/util/unicode_categories.h (+44/-37)
src/util/unicode_util.cpp (+20/-2)
src/util/unicode_util.h (+47/-15)
src/util/utf8_util.cpp (+6/-6)
src/util/utf8_util.h (+29/-13)
src/util/utf8_util.tcc (+10/-2)
src/zorbatypes/collation_manager.cpp (+17/-17)
src/zorbatypes/collation_manager.h (+3/-3)
src/zorbatypes/libicu.h (+0/-32)
src/zorbatypes/transcoder.cpp (+8/-4)
src/zorbatypes/transcoder.h (+9/-9)
src/zorbautils/hashmap_itemh.h (+4/-0)
src/zorbautils/string_util.cpp (+19/-18)
src/zorbautils/string_util.h (+15/-1)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a1.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a10.xml.res (+242/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a11.xml.res (+6/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a2.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a3.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a5.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a6.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a7.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a8.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a9.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m1.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m10.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m11.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m12.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m13.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m14.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m15.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m16.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m17.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m18.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m19.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m2.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m20.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m21.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m22.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m23.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m24.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m25.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m26.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m27.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m28.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m29.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m3.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m30.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m31.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m32.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m33.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m34.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m35.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m36.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m37.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m38.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m39.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m4.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m40.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m41.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m42.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m43.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m44.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m45.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m46.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m47.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m48.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m49.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m5.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m50.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m51.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m52.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m53.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m6.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m7.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m8.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m9.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_prime1.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r1.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r10.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r11.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r12.xml.res (+5/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r2.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r3.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r4.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r5.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r6.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r9.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_t1.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_t4.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_t5.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/testdriver/bom_bug.xml.res (+1/-0)
test/rbkt/Queries/CMakeLists.txt (+16/-1)
test/rbkt/Queries/zorba/string/Regex/regex_a1.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a10.xq (+11/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a11.xq (+9/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a2.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a3.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a5.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a6.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a7.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a8.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a9.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err1.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err1.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err10.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err10.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err11.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err11.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err12.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err12.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err13.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err13.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err14.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err14.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err15.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err15.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err16.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err16.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err17.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err17.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err18.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err18.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err19.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err19.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err2.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err2.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err20.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err20.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err21.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err21.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err22.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err22.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err23.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err23.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err24.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err24.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err25.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err25.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err3.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err3.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err4.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err4.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err5.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err5.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err7.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err7.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err8.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err8.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err9.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err9.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m1.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m10.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m11.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m12.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m13.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m14.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m15.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m16.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m17.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m18.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m19.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m2.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m20.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m21.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m22.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m23.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m24.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m25.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m26.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m27.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m28.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m29.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m3.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m30.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m31.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m32.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m33.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m34.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m35.xq (+4/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m36.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m37.xq (+4/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m38.xq (+4/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m39.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m4.xq (+6/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m40.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m41.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m42.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m43.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m44.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m45.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m46.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m47.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m48.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m49.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m5.xq (+6/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m50.xq (+2/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m51.xq (+2/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m52.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m53.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m6.xq (+6/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m7.xq (+6/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m8.xq (+7/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m9.xq (+7/-0)
test/rbkt/Queries/zorba/string/Regex/regex_prime1.xq (+17/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r1.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r10.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r11.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r12.xq (+7/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r2.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r3.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r4.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r5.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r6.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r7_err.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r7_err.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r8_err.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r8_err.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r9.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_t1.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_t2.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_t3_err.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_t3_err.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_t4.xq (+2/-0)
test/rbkt/Queries/zorba/string/Regex/regex_t5.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/zorba.html (+242/-0)
test/rbkt/Queries/zorba/string/Regex/zorba2.html (+5/-0)
test/rbkt/Queries/zorba/testdriver/bom_bug.xq (+1/-0)
test/unit/static_context.cpp (+2/-0)
test/update/CMakeLists.txt (+9/-0)
To merge this branch: bzr merge lp:~zorba-coders/zorba/no_unicode
Reviewer Review Type Date Requested Status
Matthias Brantner Pending
Markos Zaharioudakis Pending
Review via email: mp+101052@code.launchpad.net

This proposal supersedes a proposal from 2012-01-18.

This proposal has been superseded by a proposal from 2012-04-07.

Commit message

"No Unicode" is now "No ICU."

Description of the change

"No Unicode" is now "No ICU."

To post a comment you must log in.
Revision history for this message
Matthias Brantner (matthias-brantner) wrote : Posted in a previous version of this proposal

Compiling with ZORBA_NO_ICU=ON fails on Linux:

[ 1%] Building CXX object src/CMakeFiles/zorba_simplestore.dir/api/zorba_string.cpp.o
In file included from /home/mbrantner/zorba/sandbox/src/util/regex.h:501:0,
                 from /home/mbrantner/zorba/sandbox/src/api/zorba_string.cpp:23:
/home/mbrantner/zorba/sandbox/src/util/regex_xquery.h:209:3: error: a class-key must be used when declaring a friend
/home/mbrantner/zorba/sandbox/src/util/regex_xquery.h:209:3: error: friend declaration does not name a class or function
/home/mbrantner/zorba/sandbox/src/util/regex_xquery.h:253:3: error: a class-key must be used when declaring a friend
/home/mbrantner/zorba/sandbox/src/util/regex_xquery.h:253:3: error: friend declaration does not name a class or function
make[2]: *** [src/CMakeFiles/zorba_simplestore.dir/api/zorba_string.cpp.o] Erro

Revision history for this message
Matthias Brantner (matthias-brantner) : Posted in a previous version of this proposal
review: Needs Fixing
Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote : Posted in a previous version of this proposal

There are additional revisions which have not been approved in review. Please seek review and approval of these new revisions.

Revision history for this message
Matthias Brantner (matthias-brantner) wrote : Posted in a previous version of this proposal

The test suite doesn't run clean on my system (Linux) without ICU. This prevents us from adding the build to the remote queue. For example, the following tests fail without ICU (some of them also seem to fail with ICU):

 1294 - test/rbkt/zorba/string/Regex/regex_a10 (Failed)
 1548 - test/rbkt/zorba/fulltext/ft-wildcard-true-2 (Failed)
 1560 - test/rbkt/zorba/fulltext/ft-wildcard-true-4 (Failed)
 1574 - test/rbkt/zorba/fulltext/ft-same-sentence-true-4 (Failed)
 1581 - test/rbkt/zorba/fulltext/ft-wildcard-true-3 (Failed)
 1587 - test/rbkt/zorba/fulltext/ft-wildcard-true-9 (Failed)
 1600 - test/rbkt/zorba/fulltext/ft-diacritics-insensitive-true-1 (Failed)
 1605 - test/rbkt/zorba/fulltext/ft-wildcard-true-8 (Failed)
 1612 - test/rbkt/zorba/fulltext/ft-wildcard-true-10 (Failed)
 1635 - test/rbkt/zorba/fulltext/ft-wildcard-true-7 (Failed)
 1637 - test/rbkt/zorba/fulltext/ft-wildcard-true-11 (Failed)
 1643 - test/rbkt/zorba/fulltext/ft-wildcard-FTDY0020-3 (Failed)
 1789 - test/rbkt/zorba/index/numbers (Failed)
 2345 - test/unit/string_test (Failed)
 2534 - test/update/zorba/store/sc3 (Failed)
 2544 - doc/cxx/examples/context.cpp (Failed)

Please make sure the test suite runs clean.

review: Needs Fixing
Revision history for this message
Paul J. Lucas (paul-lucas) wrote : Posted in a previous version of this proposal

Try it now.

Revision history for this message
Daniel Turcanu (danielturcanu) wrote : Posted in a previous version of this proposal

Before committing this branch, the branch lp:~danielturcanu/zorba/my_conv_module should be merged.

Revision history for this message
Chris Hillery (ceejatec) wrote : Posted in a previous version of this proposal

FWIW, I've skimmed the change for CMake-related changes, and they all look fine (mostly quite trivial).

Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote : Posted in a previous version of this proposal

Attempt to merge into lp:zorba failed due to conflicts:

text conflict in ChangeLog

Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote : Posted in a previous version of this proposal
Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote : Posted in a previous version of this proposal

The attempt to merge lp:~zorba-coders/zorba/no_unicode into lp:zorba failed. Below is the output from the failed tests.

CMake Error at /home/ceej/zo/testing/zorbatest/tester/TarmacLander.cmake:274 (message):
  Validation queue job no_unicode-2012-03-30T19-15-23.23Z is finished. The
  final status was:

  6 tests did not succeed - changes not commited.

Error in read script: /home/ceej/zo/testing/zorbatest/tester/TarmacLander.cmake

Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote : Posted in a previous version of this proposal
Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote : Posted in a previous version of this proposal

The attempt to merge lp:~zorba-coders/zorba/no_unicode into lp:zorba failed. Below is the output from the failed tests.

CMake Error at /home/ceej/zo/testing/zorbatest/tester/TarmacLander.cmake:274 (message):
  Validation queue job no_unicode-2012-04-03T15-17-37.639Z is finished. The
  final status was:

  6 tests did not succeed - changes not commited.

Error in read script: /home/ceej/zo/testing/zorbatest/tester/TarmacLander.cmake

Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :
Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :

The attempt to merge lp:~zorba-coders/zorba/no_unicode into lp:zorba failed. Below is the output from the failed tests.

CMake Error at /home/ceej/zo/testing/zorbatest/tester/TarmacLander.cmake:274 (message):
  Validation queue job no_unicode-2012-04-06T00-21-13.829Z is finished. The
  final status was:

  6 tests did not succeed - changes not commited.

Error in read script: /home/ceej/zo/testing/zorbatest/tester/TarmacLander.cmake

lp:~zorba-coders/zorba/no_unicode updated
10534. By Paul J. Lucas

No longer doing some stuff when q_flag is set.

10535. By Paul J. Lucas

Tweaked one error message.

10536. By Paul J. Lucas

Merge from trunk.

10537. By Rodolfo Ochoa

Merge from trunk

10538. By Rodolfo Ochoa

Strange error on include guards

10539. By Rodolfo Ochoa

merge from trunk

10540. By Rodolfo Ochoa

fix for regex errors in RQ

Unmerged revisions

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== modified file 'CMakeConfiguration.txt'
--- CMakeConfiguration.txt 2012-03-28 05:19:57 +0000
+++ CMakeConfiguration.txt 2012-04-07 00:45:26 +0000
@@ -135,14 +135,14 @@
135SET (ZORBA_DEBUG_STRING ${ZORBA_DEBUG_STRING} CACHE BOOL "debug strings")135SET (ZORBA_DEBUG_STRING ${ZORBA_DEBUG_STRING} CACHE BOOL "debug strings")
136MESSAGE (STATUS "ZORBA_DEBUG_STRING: " ${ZORBA_DEBUG_STRING})136MESSAGE (STATUS "ZORBA_DEBUG_STRING: " ${ZORBA_DEBUG_STRING})
137137
138SET(ZORBA_NO_UNICODE OFF CACHE BOOL "disable ICU")138SET(ZORBA_NO_ICU OFF CACHE BOOL "disable ICU")
139MESSAGE(STATUS "ZORBA_NO_UNICODE: " ${ZORBA_NO_UNICODE})139MESSAGE(STATUS "ZORBA_NO_ICU: " ${ZORBA_NO_ICU})
140140
141IF (ZORBA_NO_UNICODE)141IF (ZORBA_NO_ICU)
142 SET (no_full_text ON)142 SET (no_full_text ON)
143ELSE (ZORBA_NO_UNICODE)143ELSE (ZORBA_NO_ICU)
144 SET (no_full_text OFF)144 SET (no_full_text OFF)
145ENDIF (ZORBA_NO_UNICODE)145ENDIF (ZORBA_NO_ICU)
146SET (ZORBA_NO_FULL_TEXT ${no_full_text} CACHE BOOL "disable XQuery Full-Text support")146SET (ZORBA_NO_FULL_TEXT ${no_full_text} CACHE BOOL "disable XQuery Full-Text support")
147MESSAGE(STATUS "ZORBA_NO_FULL_TEXT: " ${ZORBA_NO_FULL_TEXT})147MESSAGE(STATUS "ZORBA_NO_FULL_TEXT: " ${ZORBA_NO_FULL_TEXT})
148148
149149
=== modified file 'CMakeLists.txt'
--- CMakeLists.txt 2012-03-28 05:19:57 +0000
+++ CMakeLists.txt 2012-04-07 00:45:26 +0000
@@ -123,10 +123,14 @@
123CHECK_TYPE_SIZE("int64_t" ZORBA_HAVE_INT64_T) 123CHECK_TYPE_SIZE("int64_t" ZORBA_HAVE_INT64_T)
124124
125CHECK_CXX_SOURCE_COMPILES ("#include <type_traits>\nint main() { std::enable_if<true,int> x; }" ZORBA_CXX_ENABLE_IF)125CHECK_CXX_SOURCE_COMPILES ("#include <type_traits>\nint main() { std::enable_if<true,int> x; }" ZORBA_CXX_ENABLE_IF)
126CHECK_CXX_SOURCE_COMPILES ("int main() { int *p = nullptr; }" ZORBA_CXX_NULLPTR)126SET(CMAKE_EXTRA_INCLUDE_FILES wchar.h)
127CHECK_CXX_SOURCE_COMPILES ("int main() { static_assert(1,\"\"); }" ZORBA_CXX_STATIC_ASSERT)127CHECK_TYPE_SIZE("wchar_t" ZORBA_SIZEOF_WCHAR_T)
128SET(CMAKE_EXTRA_INCLUDE_FILES)
128CHECK_CXX_SOURCE_COMPILES ("#include <memory>\nint main() { std::unique_ptr<int> p; }" ZORBA_CXX_UNIQUE_PTR)129CHECK_CXX_SOURCE_COMPILES ("#include <memory>\nint main() { std::unique_ptr<int> p; }" ZORBA_CXX_UNIQUE_PTR)
129130
131CHECK_CXX_SOURCE_COMPILES("int main() { int *p = nullptr; }" ZORBA_CXX_NULLPTR)
132CHECK_CXX_SOURCE_COMPILES("int main() { static_assert(1,\"\"); }" ZORBA_CXX_STATIC_ASSERT)
133
130################################################################################134################################################################################
131# Various cmake macros135# Various cmake macros
132136
133137
=== modified file 'ChangeLog'
--- ChangeLog 2012-04-04 15:59:01 +0000
+++ ChangeLog 2012-04-07 00:45:26 +0000
@@ -4,6 +4,7 @@
44
5New Features:5New Features:
6 * Extended API for Python, Java, PHP and Ruby.6 * Extended API for Python, Java, PHP and Ruby.
7 * Added support for NO_ICU (to not use ICU for unicode processing)
78
8Bug Fixes/Other Changes:9Bug Fixes/Other Changes:
9 * Fixed bug #967864 (var substitution did not update theFreeVars property)10 * Fixed bug #967864 (var substitution did not update theFreeVars property)
@@ -148,7 +149,9 @@
148 * Fixed bug when parsing a document with a base-uri attribute.149 * Fixed bug when parsing a document with a base-uri attribute.
149 * Fixed bug #863320 (Sentence is incorrectly incremented when token characters end without sentence terminator)150 * Fixed bug #863320 (Sentence is incorrectly incremented when token characters end without sentence terminator)
150 * Fixed bug #863730 (static delete-node* functions don't raise ZDDY0012)151 * Fixed bug #863730 (static delete-node* functions don't raise ZDDY0012)
152 * Implemented the probe-index-range-value for general indexes
151 * Removed ZSTR0005 and ZSTR0006 error codes153 * Removed ZSTR0005 and ZSTR0006 error codes
154 * Fixed bug #867662 ("nullptr" warning)
152 * Fixed bug #868258 (Assertion failure with two delete collection)155 * Fixed bug #868258 (Assertion failure with two delete collection)
153 * Fixed bug #871623 and #871629 (assertion failures with insertions in dynamic collections)156 * Fixed bug #871623 and #871629 (assertion failures with insertions in dynamic collections)
154 * Fixed bug #867262 (allow reuse of iterator over ExtFuncArgItemSequence)157 * Fixed bug #867262 (allow reuse of iterator over ExtFuncArgItemSequence)
@@ -157,6 +160,8 @@
157 * New node-reference module. References can be obtained for any node, and160 * New node-reference module. References can be obtained for any node, and
158 different nodes cannot have the same identifier.161 different nodes cannot have the same identifier.
159 * Fixed bug #872697 (segmentation fault with validation of NMTOKENS)162 * Fixed bug #872697 (segmentation fault with validation of NMTOKENS)
163 * General index cannot be declared as unique if the type of its key is
164 xs:anyAtomicType or xs:untypedAtomic.
160 * Added undo for node revalidation165 * Added undo for node revalidation
161 * Optimization for count(collection()) expressions166 * Optimization for count(collection()) expressions
162 * Fixed bug #872796 (validate-in-place can interfere with other update primitives)167 * Fixed bug #872796 (validate-in-place can interfere with other update primitives)
@@ -175,6 +180,8 @@
175 * Fixed bug #855715 (Invalid escaped characters in regex not caught)180 * Fixed bug #855715 (Invalid escaped characters in regex not caught)
176 * Fixed bug #862089 (Split binary/xq install directories for modules) by181 * Fixed bug #862089 (Split binary/xq install directories for modules) by
177 splitting "module path" into separate URI and Library paths182 splitting "module path" into separate URI and Library paths
183 * New node-position module. This module allows to obtain a representation of a node position, which
184 can be used to assess structural relationships with other nodes.
178 * Fixed bug #872502 (validation of the JSON module xqdoc fails)185 * Fixed bug #872502 (validation of the JSON module xqdoc fails)
179 * Fixed bug #897619 (testdriver_mt can not run the XQueryX tests)186 * Fixed bug #897619 (testdriver_mt can not run the XQueryX tests)
180 * Fixed bug #867107 (xqdoc dependency to zorba is wrong)187 * Fixed bug #867107 (xqdoc dependency to zorba is wrong)
181188
=== modified file 'KNOWN_ISSUES.txt'
--- KNOWN_ISSUES.txt 2012-03-28 05:19:57 +0000
+++ KNOWN_ISSUES.txt 2012-04-07 00:45:26 +0000
@@ -37,7 +37,7 @@
37* The serializer currently doesn't implement character maps as specified37* The serializer currently doesn't implement character maps as specified
38 (http://www.w3.org/TR/xslt-xquery-serialization/#character-maps)38 (http://www.w3.org/TR/xslt-xquery-serialization/#character-maps)
3939
40* In the 2.0 release, setting the CMake variables ZORBA_NO_UNICODE to40* In the 2.0 release, setting the CMake variables ZORBA_NO_ICU to
41 ON is not supported.41 ON is not supported.
4242
43* The PHP language binding is not supported on Mac OS X. For details,43* The PHP language binding is not supported on Mac OS X. For details,
4444
=== modified file 'doc/cxx/examples/context.cpp'
--- doc/cxx/examples/context.cpp 2012-03-28 05:19:57 +0000
+++ doc/cxx/examples/context.cpp 2012-04-07 00:45:26 +0000
@@ -149,7 +149,11 @@
149 outStream2 << lQuery << std::endl;149 outStream2 << lQuery << std::endl;
150 std::cout << outStream2.str() << std::endl;150 std::cout << outStream2.str() << std::endl;
151151
152#ifndef ZORBA_NO_ICU
152 if (outStream2.str() != "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\nBook 1.1\n")153 if (outStream2.str() != "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\nBook 1.1\n")
154#else
155 if (outStream2.str() != "<?xml version=\"1.0\"?>\nBook 1.1\n")
156#endif /* ZORBA_NO_ICU */
153 {157 {
154 std::cerr << "Test 4 failed with a wrong result : " << std::endl158 std::cerr << "Test 4 failed with a wrong result : " << std::endl
155 << outStream2.str() << std::endl;159 << outStream2.str() << std::endl;
156160
=== modified file 'include/zorba/config.h.cmake'
--- include/zorba/config.h.cmake 2012-03-28 05:19:57 +0000
+++ include/zorba/config.h.cmake 2012-04-07 00:45:26 +0000
@@ -96,6 +96,8 @@
96typedef __int64 int64_t;96typedef __int64 int64_t;
97#endif /* ZORBA_HAVE_INT64_T */97#endif /* ZORBA_HAVE_INT64_T */
9898
99#cmakedefine ZORBA_SIZEOF_WCHAR_T @ZORBA_SIZEOF_WCHAR_T@
100
99// Compiler101// Compiler
100#cmakedefine CLANG102#cmakedefine CLANG
101#cmakedefine MSVC103#cmakedefine MSVC
@@ -148,7 +150,7 @@
148150
149// Zorba features151// Zorba features
150#cmakedefine ZORBA_NO_FULL_TEXT152#cmakedefine ZORBA_NO_FULL_TEXT
151#cmakedefine ZORBA_NO_UNICODE153#cmakedefine ZORBA_NO_ICU
152#cmakedefine ZORBA_NO_XMLSCHEMA154#cmakedefine ZORBA_NO_XMLSCHEMA
153#cmakedefine ZORBA_NUMERIC_OPTIMIZATION155#cmakedefine ZORBA_NUMERIC_OPTIMIZATION
154#cmakedefine ZORBA_VERIFY_PEER_SSL_CERTIFICATE156#cmakedefine ZORBA_VERIFY_PEER_SSL_CERTIFICATE
155157
=== modified file 'include/zorba/static_context.h'
--- include/zorba/static_context.h 2012-03-28 05:19:57 +0000
+++ include/zorba/static_context.h 2012-04-07 00:45:26 +0000
@@ -26,9 +26,13 @@
26#include <zorba/function.h>26#include <zorba/function.h>
27#include <zorba/annotation.h>27#include <zorba/annotation.h>
28#include <zorba/smart_ptr.h>28#include <zorba/smart_ptr.h>
29#include <zorba/smart_ptr.h>
29#ifndef ZORBA_NO_FULL_TEXT30#ifndef ZORBA_NO_FULL_TEXT
30#include <zorba/thesaurus.h>31#include <zorba/thesaurus.h>
31#endif /* ZORBA_NO_FULL_TEXT */32#endif /* ZORBA_NO_FULL_TEXT */
33#include <zorba/zorba.h>
34#include <zorba/store_manager.h>
35#include <zorba/zorba_exception.h>
3236
33namespace zorba {37namespace zorba {
3438
3539
=== modified file 'include/zorba/util/time.h'
--- include/zorba/util/time.h 2012-03-28 05:19:57 +0000
+++ include/zorba/util/time.h 2012-04-07 00:45:26 +0000
@@ -178,7 +178,7 @@
178 178
179 inline long get_walltime_in_millis(const walltime& t)179 inline long get_walltime_in_millis(const walltime& t)
180 {180 {
181 return t.time * 1000 + t.millitm;181 return (long)(t.time * 1000 + t.millitm);
182 }182 }
183183
184#else /* not Windows, and no clock_gettime() */184#else /* not Windows, and no clock_gettime() */
185185
=== modified file 'src/CMakeLists.txt'
--- src/CMakeLists.txt 2012-03-28 05:19:57 +0000
+++ src/CMakeLists.txt 2012-04-07 00:45:26 +0000
@@ -59,7 +59,10 @@
59#59#
60# Next, add the files to be compiled into the library60# Next, add the files to be compiled into the library
61#61#
62
63MESSAGE(STATUS "PRECOMPILED HEADERS: " ${ZORBA_PRECOMPILED_HEADERS})
62SET(ZORBA_PRECOMPILED_HEADERS OFF CACHE BOOL "Activate Zorba precompiled headers.")64SET(ZORBA_PRECOMPILED_HEADERS OFF CACHE BOOL "Activate Zorba precompiled headers.")
65MESSAGE(STATUS "PRECOMPILED HEADERS: " ${ZORBA_PRECOMPILED_HEADERS})
6366
64SET(ZORBA_SRCS)67SET(ZORBA_SRCS)
65ADD_SRC_SUBFOLDER(ZORBA_SRCS api API_SRCS)68ADD_SRC_SUBFOLDER(ZORBA_SRCS api API_SRCS)
@@ -97,6 +100,7 @@
97ENDIF(ZORBA_WITH_DEBUGGER)100ENDIF(ZORBA_WITH_DEBUGGER)
98ADD_SRC_SUBFOLDER(ZORBA_SRCS unit_tests UNIT_TEST_SRCS)101ADD_SRC_SUBFOLDER(ZORBA_SRCS unit_tests UNIT_TEST_SRCS)
99102
103MESSAGE(STATUS "PRECOMPILED HEADERS: " ${ZORBA_PRECOMPILED_HEADERS})
100IF(ZORBA_PRECOMPILED_HEADERS)104IF(ZORBA_PRECOMPILED_HEADERS)
101 ADD_SRC_SUBFOLDER(ZORBA_SRCS precompiled ZORBAMISC_SRCS)105 ADD_SRC_SUBFOLDER(ZORBA_SRCS precompiled ZORBAMISC_SRCS)
102 INCLUDE_DIRECTORIES("${CMAKE_SOURCE_DIR}/src/precompiled")106 INCLUDE_DIRECTORIES("${CMAKE_SOURCE_DIR}/src/precompiled")
103107
=== modified file 'src/api/serialization/serializer.cpp'
--- src/api/serialization/serializer.cpp 2012-03-28 05:19:57 +0000
+++ src/api/serialization/serializer.cpp 2012-04-07 00:45:26 +0000
@@ -180,7 +180,6 @@
180 for (; chars < chars_end; chars++ )180 for (; chars < chars_end; chars++ )
181 {181 {
182182
183#ifndef ZORBA_NO_UNICODE
184 // the input string is UTF-8183 // the input string is UTF-8
185 int char_length = utf8::char_length(*chars);184 int char_length = utf8::char_length(*chars);
186 if (char_length == 0)185 if (char_length == 0)
@@ -217,7 +216,6 @@
217216
218 continue;217 continue;
219 }218 }
220#endif//ZORBA_NO_UNICODE
221219
222 // raise an error iff (1) the serialization format is XML 1.0 and (2) the given character is an invalid XML 1.0 character220 // raise an error iff (1) the serialization format is XML 1.0 and (2) the given character is an invalid XML 1.0 character
223 if (ser && ser->method == PARAMETER_VALUE_XML &&221 if (ser && ser->method == PARAMETER_VALUE_XML &&
@@ -332,14 +330,12 @@
332 {330 {
333 tr << (char)0xEF << (char)0xBB << (char)0xBF;331 tr << (char)0xEF << (char)0xBB << (char)0xBF;
334 }332 }
335#ifndef ZORBA_NO_UNICODE
336 else if (ser->encoding == PARAMETER_VALUE_UTF_16)333 else if (ser->encoding == PARAMETER_VALUE_UTF_16)
337 {334 {
338 // Little-endian335 // Little-endian
339 tr.verbatim((char)0xFF);336 tr.verbatim((char)0xFF);
340 tr.verbatim((char)0xFE);337 tr.verbatim((char)0xFE);
341 }338 }
342#endif
343 }339 }
344}340}
345341
@@ -862,13 +858,17 @@
862 emitter::emit_declaration();858 emitter::emit_declaration();
863859
864 if (ser->omit_xml_declaration == PARAMETER_VALUE_NO) {860 if (ser->omit_xml_declaration == PARAMETER_VALUE_NO) {
865 tr << "<?xml version=\"" << ser->version << "\" encoding=\"";861 tr << "<?xml version=\"" << ser->version;
866 if (ser->encoding == PARAMETER_VALUE_UTF_8) {862 switch (ser->encoding) {
867 tr << "UTF-8";863 case PARAMETER_VALUE_UTF_8:
868#ifndef ZORBA_NO_UNICODE864 case PARAMETER_VALUE_UTF_16:
869 } else if (ser->encoding == PARAMETER_VALUE_UTF_16) {865 tr << "\" encoding=\"";
870 tr << "UTF-16";866 switch (ser->encoding) {
871#endif867 case PARAMETER_VALUE_UTF_8 : tr << "UTF-8" ; break;
868 case PARAMETER_VALUE_UTF_16: tr << "UTF-16"; break;
869 default : ZORBA_ASSERT(false);
870 }
871 break;
872 }872 }
873 tr << "\"";873 tr << "\"";
874874
@@ -1174,14 +1174,18 @@
1174 }1174 }
11751175
1176 tr << "<meta http-equiv=\"content-type\" content=\""1176 tr << "<meta http-equiv=\"content-type\" content=\""
1177 << ser->media_type << "; charset=";1177 << ser->media_type;
11781178 switch (ser->encoding) {
1179 if (ser->encoding == PARAMETER_VALUE_UTF_8)1179 case PARAMETER_VALUE_UTF_8:
1180 tr << "UTF-8";1180 case PARAMETER_VALUE_UTF_16:
1181#ifndef ZORBA_NO_UNICODE1181 tr << "\" charset=\"";
1182 else if (ser->encoding == PARAMETER_VALUE_UTF_16)1182 switch (ser->encoding) {
1183 tr << "UTF-16";1183 case PARAMETER_VALUE_UTF_8 : tr << "UTF-8" ; break;
1184#endif1184 case PARAMETER_VALUE_UTF_16: tr << "UTF-16"; break;
1185 default : ZORBA_ASSERT(false);
1186 }
1187 break;
1188 }
1185 tr << "\"";1189 tr << "\"";
1186 // closed_parent_tag = 1;1190 // closed_parent_tag = 1;
1187 }1191 }
@@ -1371,14 +1375,18 @@
1371 }1375 }
13721376
1373 tr << "<meta http-equiv=\"content-type\" content=\""1377 tr << "<meta http-equiv=\"content-type\" content=\""
1374 << ser->media_type << "; charset=";1378 << ser->media_type;
13751379 switch (ser->encoding) {
1376 if (ser->encoding == PARAMETER_VALUE_UTF_8)1380 case PARAMETER_VALUE_UTF_8:
1377 tr << "UTF-8";1381 case PARAMETER_VALUE_UTF_16:
1378#ifndef ZORBA_NO_UNICODE1382 tr << "\" charset=\"";
1379 else if (ser->encoding == PARAMETER_VALUE_UTF_16)1383 switch (ser->encoding) {
1380 tr << "UTF-16";1384 case PARAMETER_VALUE_UTF_8 : tr << "UTF-8" ; break;
1381#endif1385 case PARAMETER_VALUE_UTF_16: tr << "UTF-16"; break;
1386 default : ZORBA_ASSERT(false);
1387 }
1388 break;
1389 }
1382 tr << "\"/";1390 tr << "\"/";
1383 //closed_parent_tag = 1;1391 //closed_parent_tag = 1;
1384 }1392 }
@@ -2098,10 +2106,8 @@
2098 {2106 {
2099 if (!strcmp(aValue, "UTF-8"))2107 if (!strcmp(aValue, "UTF-8"))
2100 encoding = PARAMETER_VALUE_UTF_8;2108 encoding = PARAMETER_VALUE_UTF_8;
2101#ifndef ZORBA_NO_UNICODE
2102 else if (!strcmp(aValue, "UTF-16"))2109 else if (!strcmp(aValue, "UTF-16"))
2103 encoding = PARAMETER_VALUE_UTF_16;2110 encoding = PARAMETER_VALUE_UTF_16;
2104#endif
2105 else2111 else
2106 throw XQUERY_EXCEPTION(2112 throw XQUERY_EXCEPTION(
2107 err::SEPM0016, ERROR_PARAMS( aValue, aName, ZED( GoodValuesAreUTF8 ) )2113 err::SEPM0016, ERROR_PARAMS( aValue, aName, ZED( GoodValuesAreUTF8 ) )
@@ -2210,16 +2216,13 @@
2210 {2216 {
2211 tr = new transcoder(os, false);2217 tr = new transcoder(os, false);
2212 }2218 }
2213#ifndef ZORBA_NO_UNICODE
2214 else if (encoding == PARAMETER_VALUE_UTF_16)2219 else if (encoding == PARAMETER_VALUE_UTF_16)
2215 {2220 {
2216 tr = new transcoder(os, true);2221 tr = new transcoder(os, true);
2217 }2222 }
2218#endif
2219 else2223 else
2220 {2224 {
2221 ZORBA_ASSERT(0);2225 ZORBA_ASSERT(false);
2222 return false;
2223 }2226 }
22242227
2225 if (method == PARAMETER_VALUE_XML)2228 if (method == PARAMETER_VALUE_XML)
22262229
=== modified file 'src/api/serialization/serializer.h'
--- src/api/serialization/serializer.h 2012-03-28 05:19:57 +0000
+++ src/api/serialization/serializer.h 2012-04-07 00:45:26 +0000
@@ -70,10 +70,8 @@
70 PARAMETER_VALUE_TEXT,70 PARAMETER_VALUE_TEXT,
71 PARAMETER_VALUE_BINARY,71 PARAMETER_VALUE_BINARY,
7272
73 PARAMETER_VALUE_UTF_873 PARAMETER_VALUE_UTF_8,
74#ifndef ZORBA_NO_UNICODE74 PARAMETER_VALUE_UTF_16
75 ,PARAMETER_VALUE_UTF_16
76#endif
77 } PARAMETER_VALUE_TYPE;75 } PARAMETER_VALUE_TYPE;
7876
79protected:77protected:
8078
=== modified file 'src/diagnostics/diagnostic_en.xml'
--- src/diagnostics/diagnostic_en.xml 2012-03-28 05:19:57 +0000
+++ src/diagnostics/diagnostic_en.xml 2012-04-07 00:45:26 +0000
@@ -2517,11 +2517,11 @@
2517 <value>attribute node</value>2517 <value>attribute node</value>
2518 </entry>2518 </entry>
25192519
2520 <entry key="BackRef0Illegal">2520 <entry key="BackRef0Illegal" if="!defined(ZORBA_NO_ICU)">
2521 <value>"0": illegal backreference</value>2521 <value>"0": illegal backreference</value>
2522 </entry>2522 </entry>
25232523
2524 <entry key="BackRefIllegalInCharClass">2524 <entry key="BackRefIllegalInCharClass" if="!defined(ZORBA_NO_ICU)">
2525 <value>backreference illegal in character class</value>2525 <value>backreference illegal in character class</value>
2526 </entry>2526 </entry>
25272527
@@ -2569,7 +2569,7 @@
2569 <value>invalid library module</value>2569 <value>invalid library module</value>
2570 </entry>2570 </entry>
25712571
2572 <entry key="BadRegexEscape_3">2572 <entry key="BadRegexEscape_3" if="!defined(ZORBA_NO_ICU)">
2573 <value>"$3": illegal escape character</value>2573 <value>"$3": illegal escape character</value>
2574 </entry>2574 </entry>
25752575
@@ -3029,7 +3029,7 @@
3029 <value>nodeid component too big for encoding</value>3029 <value>nodeid component too big for encoding</value>
3030 </entry>3030 </entry>
30313031
3032 <entry key="NonClosedBackRef_3">3032 <entry key="NonClosedBackRef_3" if="!defined(ZORBA_NO_ICU)">
3033 <value>'$$3': non-closed backreference</value>3033 <value>'$$3': non-closed backreference</value>
3034 </entry>3034 </entry>
30353035
@@ -3041,7 +3041,7 @@
3041 <value>non-localhost authority</value>3041 <value>non-localhost authority</value>
3042 </entry>3042 </entry>
30433043
3044 <entry key="NonexistentBackRef_3">3044 <entry key="NonexistentBackRef_3" if="!defined(ZORBA_NO_ICU)">
3045 <value>'$$3': non-existent backreference</value>3045 <value>'$$3': non-existent backreference</value>
3046 </entry>3046 </entry>
30473047
@@ -3193,94 +3193,183 @@
3193 <value>item type is not a subtype of "$3"</value>3193 <value>item type is not a subtype of "$3"</value>
3194 </entry>3194 </entry>
31953195
3196 <entry key="U_REGEX_BAD_ESCAPE_SEQUENCE" if="!defined(ZORBA_NO_UNICODE)">3196 <entry key="U_REGEX_BAD_ESCAPE_SEQUENCE" if="!defined(ZORBA_NO_ICU)">
3197 <value>unrecognized backslash escape sequence</value>3197 <value>unrecognized backslash escape sequence</value>
3198 </entry>3198 </entry>
31993199
3200 <entry key="U_REGEX_BAD_INTERVAL" if="!defined(ZORBA_NO_UNICODE)">3200 <entry key="U_REGEX_BAD_INTERVAL" if="!defined(ZORBA_NO_ICU)">
3201 <value>error in {min,max} interval</value>3201 <value>error in {min,max} interval</value>
3202 </entry>3202 </entry>
32033203
3204 <entry key="U_REGEX_INTERNAL_ERROR" if="!defined(ZORBA_NO_UNICODE)">3204 <entry key="U_REGEX_INTERNAL_ERROR" if="!defined(ZORBA_NO_ICU)">
3205 <value>an internal ICU error (bug) was detected</value>3205 <value>an internal ICU error (bug) was detected</value>
3206 </entry>3206 </entry>
32073207
3208 <entry key="U_REGEX_INVALID_BACK_REF" if="!defined(ZORBA_NO_UNICODE)">3208 <entry key="U_REGEX_INVALID_BACK_REF" if="!defined(ZORBA_NO_ICU)">
3209 <value>backreference to a non-existent capture group</value>3209 <value>backreference to a non-existent capture group</value>
3210 </entry>3210 </entry>
32113211
3212 <entry key="U_REGEX_INVALID_FLAG" if="!defined(ZORBA_NO_UNICODE)">3212 <entry key="U_REGEX_INVALID_FLAG" if="!defined(ZORBA_NO_ICU)">
3213 <value>invalid value for match mode flags</value>3213 <value>invalid value for match mode flags</value>
3214 </entry>3214 </entry>
32153215
3216 <entry key="U_REGEX_INVALID_RANGE" if="!defined(ZORBA_NO_UNICODE)">3216 <entry key="U_REGEX_INVALID_RANGE" if="!defined(ZORBA_NO_ICU)">
3217 <value>in character range [x-y], x is greater than y</value>3217 <value>in character range [x-y], x is greater than y</value>
3218 </entry>3218 </entry>
32193219
3220 <entry key="U_REGEX_INVALID_STATE" if="!defined(ZORBA_NO_UNICODE)">3220 <entry key="U_REGEX_INVALID_STATE" if="!defined(ZORBA_NO_ICU)">
3221 <value>RegexMatcher in invalid state for requested operation</value>3221 <value>RegexMatcher in invalid state for requested operation</value>
3222 </entry>3222 </entry>
32233223
3224 <entry key="U_REGEX_LOOK_BEHIND_LIMIT" if="!defined(ZORBA_NO_UNICODE)">3224 <entry key="U_REGEX_LOOK_BEHIND_LIMIT" if="!defined(ZORBA_NO_ICU)">
3225 <value>look-behind pattern matches must have a bounded maximum length</value>3225 <value>look-behind pattern matches must have a bounded maximum length</value>
3226 </entry>3226 </entry>
32273227
3228 <entry key="U_REGEX_MAX_LT_MIN" if="!defined(ZORBA_NO_UNICODE)">3228 <entry key="U_REGEX_MAX_LT_MIN" if="!defined(ZORBA_NO_ICU)">
3229 <value>in {min,max}, max is less than min</value>3229 <value>in {min,max}, max is less than min</value>
3230 </entry>3230 </entry>
32313231
3232 <entry key="U_REGEX_MISMATCHED_PAREN" if="!defined(ZORBA_NO_UNICODE)">3232 <entry key="U_REGEX_MISMATCHED_PAREN" if="!defined(ZORBA_NO_ICU)">
3233 <value>incorrectly nested parentheses</value>3233 <value>incorrectly nested parentheses</value>
3234 </entry>3234 </entry>
32353235
3236 <entry key="U_REGEX_MISSING_CLOSE_BRACKET" if="!defined(ZORBA_NO_UNICODE)">3236 <entry key="U_REGEX_MISSING_CLOSE_BRACKET" if="!defined(ZORBA_NO_ICU)">
3237 <value>missing ']'</value>3237 <value>missing ']'</value>
3238 </entry>3238 </entry>
32393239
3240 <entry key="U_REGEX_NUMBER_TOO_BIG" if="!defined(ZORBA_NO_UNICODE)">3240 <entry key="U_REGEX_NUMBER_TOO_BIG" if="!defined(ZORBA_NO_ICU)">
3241 <value>decimal number is too large</value>3241 <value>decimal number is too large</value>
3242 </entry>3242 </entry>
32433243
3244 <entry key="U_REGEX_OCTAL_TOO_BIG" if="!defined(ZORBA_NO_UNICODE)">3244 <entry key="U_REGEX_OCTAL_TOO_BIG" if="!defined(ZORBA_NO_ICU)">
3245 <value>octal character constants must be &lt;= 0377</value>3245 <value>octal character constants must be &lt;= 0377</value>
3246 </entry>3246 </entry>
32473247
3248 <entry key="U_REGEX_PROPERTY_SYNTAX" if="!defined(ZORBA_NO_UNICODE)">3248 <entry key="U_REGEX_PROPERTY_SYNTAX" if="!defined(ZORBA_NO_ICU)">
3249 <value>incorrect Unicode property</value>3249 <value>incorrect Unicode property</value>
3250 </entry>3250 </entry>
32513251
3252 <entry key="U_REGEX_RULE_SYNTAX" if="!defined(ZORBA_NO_UNICODE)">3252 <entry key="U_REGEX_RULE_SYNTAX" if="!defined(ZORBA_NO_ICU)">
3253 <value>syntax error</value>3253 <value>syntax error</value>
3254 </entry>3254 </entry>
32553255
3256 <entry key="U_REGEX_SET_CONTAINS_STRING" if="!defined(ZORBA_NO_UNICODE)">3256 <entry key="U_REGEX_SET_CONTAINS_STRING" if="!defined(ZORBA_NO_ICU)">
3257 <value>can not have UnicodeSets containing strings</value>3257 <value>can not have UnicodeSets containing strings</value>
3258 </entry>3258 </entry>
32593259
3260 <entry key="U_REGEX_STACK_OVERFLOW" if="!defined(ZORBA_NO_UNICODE)">3260 <entry key="U_REGEX_STACK_OVERFLOW" if="!defined(ZORBA_NO_ICU)">
3261 <value>backtrack stack overflow</value>3261 <value>backtrack stack overflow</value>
3262 </entry>3262 </entry>
32633263
3264 <entry key="U_REGEX_STOPPED_BY_CALLER" if="!defined(ZORBA_NO_UNICODE)">3264 <entry key="U_REGEX_STOPPED_BY_CALLER" if="!defined(ZORBA_NO_ICU)">
3265 <value>matching operation aborted by user callback fn</value>3265 <value>matching operation aborted by user callback fn</value>
3266 </entry>3266 </entry>
32673267
3268 <entry key="U_REGEX_TIME_OUT" if="!defined(ZORBA_NO_UNICODE)">3268 <entry key="U_REGEX_TIME_OUT" if="!defined(ZORBA_NO_ICU)">
3269 <value>maximum allowed match time exceeded</value>3269 <value>maximum allowed match time exceeded</value>
3270 </entry>3270 </entry>
32713271
3272 <entry key="U_REGEX_UNIMPLEMENTED" if="!defined(ZORBA_NO_UNICODE)">3272 <entry key="U_REGEX_UNIMPLEMENTED" if="!defined(ZORBA_NO_ICU)">
3273 <value>use of regular expression feature that is not yet implemented</value>3273 <value>use of regular expression feature that is not yet implemented</value>
3274 </entry>
3275
3276 <!-- Regex Ascii error messages-->
3277 <entry key="REGEX_UNIMPLEMENTED" if="defined(ZORBA_NO_ICU)">
3278 <value>use of regular expression feature that is not yet implemented</value>
3279 </entry>
3280
3281 <entry key="REGEX_MISMATCHED_PAREN" if="defined(ZORBA_NO_ICU)">
3282 <value>incorrectly nested parentheses</value>
3283 </entry>
3284
3285 <entry key="REGEX_BROKEN_P_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
3286 <value>broken \\p construct</value>
3287 </entry>
3288
3289 <entry key="REGEX_UNKNOWN_PL_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
3290 <value>unknown \\p{L?} category; supported categories: L, Lu, Ll, Lt, Lm, Lo</value>
3291 </entry>
3292
3293 <entry key="REGEX_UNKNOWN_PM_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
3294 <value>unknown \\p{M?} category; supported categories: M, Mn, Mc, Me</value>
3295 </entry>
3296
3297 <entry key="REGEX_UNKNOWN_PN_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
3298 <value>unknown \\p{N?} category; supported categories: N, Nd, Nl, No</value>
3299 </entry>
3300
3301 <entry key="REGEX_UNKNOWN_PP_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
3302 <value>unknown \\p{P?} category; supported categories: P, Pc, Pd, Ps, Pe, Pi, Pf, Po</value>
3303 </entry>
3304
3305 <entry key="REGEX_UNKNOWN_PZ_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
3306 <value>unknown \\p{Z?} category; supported categories: Z, Zs, Zl, Zp</value>
3307 </entry>
3308
3309 <entry key="REGEX_UNKNOWN_PS_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
3310 <value>unknown \\p{S?} category; supported categories: S, Sm, Sc, Sk, So</value>
3311 </entry>
3312
3313 <entry key="REGEX_UNKNOWN_PC_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
3314 <value>unknown \\p{C?} category; supported categories: C, Cc, Cf, Co, Cn(for not assigned)</value>
3315 </entry>
3316
3317 <entry key="REGEX_BROKEN_PIs_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
3318 <value>broken \\p{Is} construct; valid characters are [a-zA-Z0-9-]</value>
3319 </entry>
3320
3321 <entry key="REGEX_UNKNOWN_PIs_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
3322 <value>unknown \\p{Is} category block; see supported block escapes here: http://www.w3.org/TR/xmlschema-2/#charcter-classes</value>
3323 </entry>
3324
3325 <entry key="REGEX_INVALID_UNICODE_CODEPOINT_u" if="defined(ZORBA_NO_ICU)">
3326 <value>invalid unicode hex, should be in form \\uXXXX or \\UXXXXXXXX</value>
3327 </entry>
3328
3329 <entry key="REGEX_UNKNOWN_ESC_CHAR" if="defined(ZORBA_NO_ICU)">
3330 <value>unknown \\? escape char; supported escapes are: \\[nrt\\|.?*+(){}[]-^$] for char escapes, \\[pP] for categories and \\[sSiIcCdDwW] for multichar groups</value>
3331 </entry>
3332
3333 <entry key="REGEX_INVALID_BACK_REF" if="defined(ZORBA_NO_ICU)">
3334 <value>\\$3 backreference to a non-existent capture group ($4 groups so far)</value>
3335 </entry>
3336
3337 <entry key="REGEX_INVALID_ATOM_CHAR" if="defined(ZORBA_NO_ICU)">
3338 <value>'$3': invalid character for an atom; forbidden characters are: [{}?*+|^]</value>
3339 </entry>
3340
3341 <entry key="REGEX_INVALID_SUBCLASS" if="defined(ZORBA_NO_ICU)">
3342 <value>malformed class subtraction</value>
3343 </entry>
3344
3345 <entry key="REGEX_INVALID_USE_OF_SUBCLASS" if="defined(ZORBA_NO_ICU)">
3346 <value>improper use of class subtraction: it must be the last construct in a class group [xxx-[yyy]]</value>
3347 </entry>
3348
3349 <entry key="REGEX_MULTICHAR_IN_CHAR_RANGE" if="defined(ZORBA_NO_ICU)">
3350 <value>multichars or char categories cannot be part of a char range</value>
3351 </entry>
3352
3353 <entry key="REGEX_MISSING_CLOSE_BRACKET" if="defined(ZORBA_NO_ICU)">
3354 <value>missing ']' in character group</value>
3355 </entry>
3356
3357 <entry key="REGEX_MAX_LT_MIN" if="defined(ZORBA_NO_ICU)">
3358 <value>in {min,max}, max is less than min</value>
3274 </entry>3359 </entry>
32753360
3276 <entry key="UnaryArithOp">3361 <entry key="UnaryArithOp">
3277 <value>unary arithmetic operator</value>3362 <value>unary arithmetic operator</value>
3278 </entry>3363 </entry>
32793364
3280 <entry key="UnbalancedChar_3">3365 <entry key="UnbalancedChar_3" if="!defined(ZORBA_NO_ICU)">
3281 <value>missing '$3'</value>3366 <value>missing '$3'</value>
3282 </entry>3367 </entry>
32833368
3369 <entry key="UnescapedChar_3" if="!defined(ZORBA_NO_ICU)">
3370 <value>character '$3' must be escaped here</value>
3371 </entry>
3372
3284 <entry key="UnexpectedElement">3373 <entry key="UnexpectedElement">
3285 <value>unexpected element</value>3374 <value>unexpected element</value>
3286 </entry>3375 </entry>
32873376
=== modified file 'src/diagnostics/pregenerated/dict_en.cpp'
--- src/diagnostics/pregenerated/dict_en.cpp 2012-03-28 05:19:57 +0000
+++ src/diagnostics/pregenerated/dict_en.cpp 2012-04-07 00:45:26 +0000
@@ -437,8 +437,12 @@
437 { "~AtomizationOfGroupByMakesMoreThanOneItem", "atomization of groupby variable produces more than one item" },437 { "~AtomizationOfGroupByMakesMoreThanOneItem", "atomization of groupby variable produces more than one item" },
438 { "~AttributeName", "attribute name" },438 { "~AttributeName", "attribute name" },
439 { "~AttributeNode", "attribute node" },439 { "~AttributeNode", "attribute node" },
440#if !defined(ZORBA_NO_ICU)
440 { "~BackRef0Illegal", "\"0\": illegal backreference" },441 { "~BackRef0Illegal", "\"0\": illegal backreference" },
442#endif
443#if !defined(ZORBA_NO_ICU)
441 { "~BackRefIllegalInCharClass", "backreference illegal in character class" },444 { "~BackRefIllegalInCharClass", "backreference illegal in character class" },
445#endif
442 { "~BadAnyURI", "invalid xs:anyURI" },446 { "~BadAnyURI", "invalid xs:anyURI" },
443 { "~BadArgTypeForFn_2o34o", "${\"2\": }invalid argument type for function $3()${: 4}" },447 { "~BadArgTypeForFn_2o34o", "${\"2\": }invalid argument type for function $3()${: 4}" },
444 { "~BadCharAfter_34", "'$3': illegal character after '$4'" },448 { "~BadCharAfter_34", "'$3': illegal character after '$4'" },
@@ -451,7 +455,9 @@
451 { "~BadIterator", "invalid iterator" },455 { "~BadIterator", "invalid iterator" },
452 { "~BadLibraryModule", "invalid library module" },456 { "~BadLibraryModule", "invalid library module" },
453 { "~BadPath", "invalid path" },457 { "~BadPath", "invalid path" },
458#if !defined(ZORBA_NO_ICU)
454 { "~BadRegexEscape_3", "\"$3\": illegal escape character" },459 { "~BadRegexEscape_3", "\"$3\": illegal escape character" },
460#endif
455 { "~BadStreamState", "bad I/O stream state" },461 { "~BadStreamState", "bad I/O stream state" },
456 { "~BadTokenInBraces_3", "\"$3\": illegal token within { }" },462 { "~BadTokenInBraces_3", "\"$3\": illegal token within { }" },
457 { "~BadTraceStream", "trace stream not retrievable using SerializationCallback" },463 { "~BadTraceStream", "trace stream not retrievable using SerializationCallback" },
@@ -567,10 +573,14 @@
567 { "~NoUntypedKeyNodeValue_2", "node with untyped key value found during probe on index \"$2\"" },573 { "~NoUntypedKeyNodeValue_2", "node with untyped key value found during probe on index \"$2\"" },
568 { "~NodeIDNeedsBytes_2", "nodeid requires more than $2 bytes" },574 { "~NodeIDNeedsBytes_2", "nodeid requires more than $2 bytes" },
569 { "~NodeIDTooBig", "nodeid component too big for encoding" },575 { "~NodeIDTooBig", "nodeid component too big for encoding" },
576#if !defined(ZORBA_NO_ICU)
570 { "~NonClosedBackRef_3", "'$$3': non-closed backreference" },577 { "~NonClosedBackRef_3", "'$$3': non-closed backreference" },
578#endif
571 { "~NonFileThesaurusURI", "non-file thesaurus URI" },579 { "~NonFileThesaurusURI", "non-file thesaurus URI" },
572 { "~NonLocalhostAuthority", "non-localhost authority" },580 { "~NonLocalhostAuthority", "non-localhost authority" },
581#if !defined(ZORBA_NO_ICU)
573 { "~NonexistentBackRef_3", "'$$3': non-existent backreference" },582 { "~NonexistentBackRef_3", "'$$3': non-existent backreference" },
583#endif
574 { "~NotAllowedForTypeName", "not allowed for typeName (use xsd:untyped instead)" },584 { "~NotAllowedForTypeName", "not allowed for typeName (use xsd:untyped instead)" },
575 { "~NotAmongInScopeSchemaTypes", "not among in-scope schema types" },585 { "~NotAmongInScopeSchemaTypes", "not among in-scope schema types" },
576 { "~NotDefInDynamicCtx", "not defined in dynamic context" },586 { "~NotDefInDynamicCtx", "not defined in dynamic context" },
@@ -589,6 +599,69 @@
589 { "~ParserNoCreateTree", "XML tree creation failed" },599 { "~ParserNoCreateTree", "XML tree creation failed" },
590 { "~PromotionImpossible", "promotion not possible" },600 { "~PromotionImpossible", "promotion not possible" },
591 { "~QuotedColon_23", "\"$2\": $3" },601 { "~QuotedColon_23", "\"$2\": $3" },
602#if defined(ZORBA_NO_ICU)
603 { "~REGEX_BROKEN_PIs_CONSTRUCT", "broken \\p{Is} construct; valid characters are [a-zA-Z0-9-]" },
604#endif
605#if defined(ZORBA_NO_ICU)
606 { "~REGEX_BROKEN_P_CONSTRUCT", "broken \\p construct" },
607#endif
608#if defined(ZORBA_NO_ICU)
609 { "~REGEX_INVALID_ATOM_CHAR", "'$3': invalid character for an atom; forbidden characters are: [{}?*+|^]" },
610#endif
611#if defined(ZORBA_NO_ICU)
612 { "~REGEX_INVALID_BACK_REF", "\\$3 backreference to a non-existent capture group ($4 groups so far)" },
613#endif
614#if defined(ZORBA_NO_ICU)
615 { "~REGEX_INVALID_SUBCLASS", "malformed class subtraction" },
616#endif
617#if defined(ZORBA_NO_ICU)
618 { "~REGEX_INVALID_UNICODE_CODEPOINT_u", "invalid unicode hex, should be in form \\uXXXX or \\UXXXXXXXX" },
619#endif
620#if defined(ZORBA_NO_ICU)
621 { "~REGEX_INVALID_USE_OF_SUBCLASS", "improper use of class subtraction: it must be the last construct in a class group [xxx-[yyy]]" },
622#endif
623#if defined(ZORBA_NO_ICU)
624 { "~REGEX_MAX_LT_MIN", "in {min,max}, max is less than min" },
625#endif
626#if defined(ZORBA_NO_ICU)
627 { "~REGEX_MISMATCHED_PAREN", "incorrectly nested parentheses" },
628#endif
629#if defined(ZORBA_NO_ICU)
630 { "~REGEX_MISSING_CLOSE_BRACKET", "missing ']' in character group" },
631#endif
632#if defined(ZORBA_NO_ICU)
633 { "~REGEX_MULTICHAR_IN_CHAR_RANGE", "multichars or char categories cannot be part of a char range" },
634#endif
635#if defined(ZORBA_NO_ICU)
636 { "~REGEX_UNIMPLEMENTED", "use of regular expression feature that is not yet implemented" },
637#endif
638#if defined(ZORBA_NO_ICU)
639 { "~REGEX_UNKNOWN_ESC_CHAR", "unknown \\? escape char; supported escapes are: \\[nrt\\|.?*+(){}[]-^$] for char escapes, \\[pP] for categories and \\[sSiIcCdDwW] for multichar groups" },
640#endif
641#if defined(ZORBA_NO_ICU)
642 { "~REGEX_UNKNOWN_PC_CONSTRUCT", "unknown \\p{C?} category; supported categories: C, Cc, Cf, Co, Cn(for not assigned)" },
643#endif
644#if defined(ZORBA_NO_ICU)
645 { "~REGEX_UNKNOWN_PIs_CONSTRUCT", "unknown \\p{Is} category block; see supported block escapes here: http://www.w3.org/TR/xmlschema-2/#charcter-classes" },
646#endif
647#if defined(ZORBA_NO_ICU)
648 { "~REGEX_UNKNOWN_PL_CONSTRUCT", "unknown \\p{L?} category; supported categories: L, Lu, Ll, Lt, Lm, Lo" },
649#endif
650#if defined(ZORBA_NO_ICU)
651 { "~REGEX_UNKNOWN_PM_CONSTRUCT", "unknown \\p{M?} category; supported categories: M, Mn, Mc, Me" },
652#endif
653#if defined(ZORBA_NO_ICU)
654 { "~REGEX_UNKNOWN_PN_CONSTRUCT", "unknown \\p{N?} category; supported categories: N, Nd, Nl, No" },
655#endif
656#if defined(ZORBA_NO_ICU)
657 { "~REGEX_UNKNOWN_PP_CONSTRUCT", "unknown \\p{P?} category; supported categories: P, Pc, Pd, Ps, Pe, Pi, Pf, Po" },
658#endif
659#if defined(ZORBA_NO_ICU)
660 { "~REGEX_UNKNOWN_PS_CONSTRUCT", "unknown \\p{S?} category; supported categories: S, Sm, Sc, Sk, So" },
661#endif
662#if defined(ZORBA_NO_ICU)
663 { "~REGEX_UNKNOWN_PZ_CONSTRUCT", "unknown \\p{Z?} category; supported categories: Z, Zs, Zl, Zp" },
664#endif
592 { "~SEPM0009_Not10", "the version parameter has a value other than \"1.0\" and the doctype-system parameter is specified" },665 { "~SEPM0009_Not10", "the version parameter has a value other than \"1.0\" and the doctype-system parameter is specified" },
593 { "~SEPM0009_NotOmit", "the standalone attribute has a value other than \"omit\"" },666 { "~SEPM0009_NotOmit", "the standalone attribute has a value other than \"omit\"" },
594 { "~SchemaAttributeName", "schema-attribute name" },667 { "~SchemaAttributeName", "schema-attribute name" },
@@ -610,68 +683,73 @@
610 { "~TwoDecimalFormatsSameName_2", "\"$2\": two decimal formats with this name" },683 { "~TwoDecimalFormatsSameName_2", "\"$2\": two decimal formats with this name" },
611 { "~TwoDefaultDecimalFormats", "two default decimal formats" },684 { "~TwoDefaultDecimalFormats", "two default decimal formats" },
612 { "~TypeIsNotSubtype", "item type is not a subtype of \"$3\"" },685 { "~TypeIsNotSubtype", "item type is not a subtype of \"$3\"" },
613#if !defined(ZORBA_NO_UNICODE)686#if !defined(ZORBA_NO_ICU)
614 { "~U_REGEX_BAD_ESCAPE_SEQUENCE", "unrecognized backslash escape sequence" },687 { "~U_REGEX_BAD_ESCAPE_SEQUENCE", "unrecognized backslash escape sequence" },
615#endif688#endif
616#if !defined(ZORBA_NO_UNICODE)689#if !defined(ZORBA_NO_ICU)
617 { "~U_REGEX_BAD_INTERVAL", "error in {min,max} interval" },690 { "~U_REGEX_BAD_INTERVAL", "error in {min,max} interval" },
618#endif691#endif
619#if !defined(ZORBA_NO_UNICODE)692#if !defined(ZORBA_NO_ICU)
620 { "~U_REGEX_INTERNAL_ERROR", "an internal ICU error (bug) was detected" },693 { "~U_REGEX_INTERNAL_ERROR", "an internal ICU error (bug) was detected" },
621#endif694#endif
622#if !defined(ZORBA_NO_UNICODE)695#if !defined(ZORBA_NO_ICU)
623 { "~U_REGEX_INVALID_BACK_REF", "backreference to a non-existent capture group" },696 { "~U_REGEX_INVALID_BACK_REF", "backreference to a non-existent capture group" },
624#endif697#endif
625#if !defined(ZORBA_NO_UNICODE)698#if !defined(ZORBA_NO_ICU)
626 { "~U_REGEX_INVALID_FLAG", "invalid value for match mode flags" },699 { "~U_REGEX_INVALID_FLAG", "invalid value for match mode flags" },
627#endif700#endif
628#if !defined(ZORBA_NO_UNICODE)701#if !defined(ZORBA_NO_ICU)
629 { "~U_REGEX_INVALID_RANGE", "in character range [x-y], x is greater than y" },702 { "~U_REGEX_INVALID_RANGE", "in character range [x-y], x is greater than y" },
630#endif703#endif
631#if !defined(ZORBA_NO_UNICODE)704#if !defined(ZORBA_NO_ICU)
632 { "~U_REGEX_INVALID_STATE", "RegexMatcher in invalid state for requested operation" },705 { "~U_REGEX_INVALID_STATE", "RegexMatcher in invalid state for requested operation" },
633#endif706#endif
634#if !defined(ZORBA_NO_UNICODE)707#if !defined(ZORBA_NO_ICU)
635 { "~U_REGEX_LOOK_BEHIND_LIMIT", "look-behind pattern matches must have a bounded maximum length" },708 { "~U_REGEX_LOOK_BEHIND_LIMIT", "look-behind pattern matches must have a bounded maximum length" },
636#endif709#endif
637#if !defined(ZORBA_NO_UNICODE)710#if !defined(ZORBA_NO_ICU)
638 { "~U_REGEX_MAX_LT_MIN", "in {min,max}, max is less than min" },711 { "~U_REGEX_MAX_LT_MIN", "in {min,max}, max is less than min" },
639#endif712#endif
640#if !defined(ZORBA_NO_UNICODE)713#if !defined(ZORBA_NO_ICU)
641 { "~U_REGEX_MISMATCHED_PAREN", "incorrectly nested parentheses" },714 { "~U_REGEX_MISMATCHED_PAREN", "incorrectly nested parentheses" },
642#endif715#endif
643#if !defined(ZORBA_NO_UNICODE)716#if !defined(ZORBA_NO_ICU)
644 { "~U_REGEX_MISSING_CLOSE_BRACKET", "missing ']'" },717 { "~U_REGEX_MISSING_CLOSE_BRACKET", "missing ']'" },
645#endif718#endif
646#if !defined(ZORBA_NO_UNICODE)719#if !defined(ZORBA_NO_ICU)
647 { "~U_REGEX_NUMBER_TOO_BIG", "decimal number is too large" },720 { "~U_REGEX_NUMBER_TOO_BIG", "decimal number is too large" },
648#endif721#endif
649#if !defined(ZORBA_NO_UNICODE)722#if !defined(ZORBA_NO_ICU)
650 { "~U_REGEX_OCTAL_TOO_BIG", "octal character constants must be <= 0377" },723 { "~U_REGEX_OCTAL_TOO_BIG", "octal character constants must be <= 0377" },
651#endif724#endif
652#if !defined(ZORBA_NO_UNICODE)725#if !defined(ZORBA_NO_ICU)
653 { "~U_REGEX_PROPERTY_SYNTAX", "incorrect Unicode property" },726 { "~U_REGEX_PROPERTY_SYNTAX", "incorrect Unicode property" },
654#endif727#endif
655#if !defined(ZORBA_NO_UNICODE)728#if !defined(ZORBA_NO_ICU)
656 { "~U_REGEX_RULE_SYNTAX", "syntax error" },729 { "~U_REGEX_RULE_SYNTAX", "syntax error" },
657#endif730#endif
658#if !defined(ZORBA_NO_UNICODE)731#if !defined(ZORBA_NO_ICU)
659 { "~U_REGEX_SET_CONTAINS_STRING", "can not have UnicodeSets containing strings" },732 { "~U_REGEX_SET_CONTAINS_STRING", "can not have UnicodeSets containing strings" },
660#endif733#endif
661#if !defined(ZORBA_NO_UNICODE)734#if !defined(ZORBA_NO_ICU)
662 { "~U_REGEX_STACK_OVERFLOW", "backtrack stack overflow" },735 { "~U_REGEX_STACK_OVERFLOW", "backtrack stack overflow" },
663#endif736#endif
664#if !defined(ZORBA_NO_UNICODE)737#if !defined(ZORBA_NO_ICU)
665 { "~U_REGEX_STOPPED_BY_CALLER", "matching operation aborted by user callback fn" },738 { "~U_REGEX_STOPPED_BY_CALLER", "matching operation aborted by user callback fn" },
666#endif739#endif
667#if !defined(ZORBA_NO_UNICODE)740#if !defined(ZORBA_NO_ICU)
668 { "~U_REGEX_TIME_OUT", "maximum allowed match time exceeded" },741 { "~U_REGEX_TIME_OUT", "maximum allowed match time exceeded" },
669#endif742#endif
670#if !defined(ZORBA_NO_UNICODE)743#if !defined(ZORBA_NO_ICU)
671 { "~U_REGEX_UNIMPLEMENTED", "use of regular expression feature that is not yet implemented" },744 { "~U_REGEX_UNIMPLEMENTED", "use of regular expression feature that is not yet implemented" },
672#endif745#endif
673 { "~UnaryArithOp", "unary arithmetic operator" },746 { "~UnaryArithOp", "unary arithmetic operator" },
747#if !defined(ZORBA_NO_ICU)
674 { "~UnbalancedChar_3", "missing '$3'" },748 { "~UnbalancedChar_3", "missing '$3'" },
749#endif
750#if !defined(ZORBA_NO_ICU)
751 { "~UnescapedChar_3", "character '$3' must be escaped here" },
752#endif
675 { "~UnexpectedElement", "unexpected element" },753 { "~UnexpectedElement", "unexpected element" },
676 { "~VarValMustBeSingleItem_2", "\"$2\": variable value must be single item" },754 { "~VarValMustBeSingleItem_2", "\"$2\": variable value must be single item" },
677 { "~Variable", "variable" },755 { "~Variable", "variable" },
678756
=== modified file 'src/precompiled/stdafx.h'
--- src/precompiled/stdafx.h 2012-03-28 05:19:57 +0000
+++ src/precompiled/stdafx.h 2012-04-07 00:45:26 +0000
@@ -15,363 +15,81 @@
1515
16 */16 */
17 17
18#if defined STDAFX18#ifdef STDAFX
19#include <iostream>19
20#include <stdexcept>20 #include <fstream>
21#include <cassert>21 #include <iostream>
22#include <cstring>22 #include <stdexcept>
23#include <memory>23 #include <cassert>
2424 #include <cstring>
25#include <sstream>25 #include <memory>
26#include <xfwrap>26
27#include <xfwrap1>27 #include <sstream>
28#include <istream>28 #include <xfwrap>
29#include <cstdio>29 #include <xfwrap1>
30#include <xxshared>30 #include <istream>
31#include <crtdefs.h>31 #include <cstdio>
32#include <map>32 #include <xxshared>
33#include <set>33 #include <crtdefs.h>
34//#include <poppack.h>34 #include <map>
35//#include <xxtype_traits>35 #include <set>
36//#include <xxcallwrap>36
3737 #include "runtime/sequences/sequences.h"
38// #include <xxcallpmf>38 #include "diagnostics/xquery_diagnostics.h"
39// //#include <xxbind0>39 #include "xercesc/util/xercesdefs.hpp"
40// //#include <xxbind1>40 #include "runtime/collections/collections.h"
41// //#include <xxresult>41 #include "unicode/utypes.h"
42// #include <zorba/audit.h>42 #include "zorba/config.h"
43// #include "api/auditimpl.h"43 #include "store/api/store.h"
44// #include <zorba/audit.h>44 #include "zorba/zorba.h"
4545 #include "zorba/api_shared_types.h"
46 //#include "unicode/unistr.h"46 #include "compiler/parsetree/parsenodes.h"
47 #include "runtime/sequences/sequences.h"47 #include "compiler/parser/parse_constants.h"
48 #include "diagnostics/xquery_diagnostics.h"48 #include "zorbautils/checked_vector.h"
49 #include "xercesc/util/xercesdefs.hpp"49 #include "compiler/parser/xquery_driver.h"
50 #include "runtime/collections/collections.h"50 #include "util/sorter.h"
51 #include "unicode/utypes.h"51 #include "compiler/xqueryx/xqueryx_to_xquery.h"
52 #include "zorba/config.h"52 #include <zorba/store_manager.h>
53 #include "store/api/store.h"53 #include <zorba/xquery.h>
54 #include "zorba/zorba.h"54 #include <zorba/xquery_exception.h>
55 #include "zorba/api_shared_types.h"
56 #include "compiler/parsetree/parsenodes.h"
57 #include "compiler/parser/parse_constants.h"
58 //#include "compiler/api/compilercb.h"
59 #include "zorbautils/checked_vector.h"
60 #include "compiler/parser/xquery_driver.h"
61 #include "util/sorter.h"
62 #include "compiler/xqueryx/xqueryx_to_xquery.h"
63// #include "compiler/xqueryx/xqueryx_xslt.h"
64//#include "compiler/parser/xquery_scanner.h"
65//#include "compiler/parsetree/parsenode_base.h"
66//#include "compiler/parsetree/parsenode_visitor.h"
67// #include "runtime/core/flwor_iterator.h"
68// #include "context/static_context.h"
69// #include "zorbautils/fatal.h"
70// #include "runtime/base/unarybase.h"
71// #include "compiler/expression/expr_consts.h"
72// #include "api/iterator_singleton.h"
73// #include "runtime/visitors/printer_visitor_api.h"
74// //#include "compiler/parsetree/parsenode_print_dot_visitor.h"
75// //#include "compiler/parsetree/parsenode_print_dot_visitor.h"
76// //#include "runtime/visitors/planiter_visitor_impl_code.h"
77// //#include "runtime/visitors/planiter_visitor_impl_include.h"
78// //#include "runtime/visitors/printer_visitor_impl.h"
79// //#include "runtime/core/path.h"
80// #include "compiler/expression/ft_expr.h"
81// #include "compiler/expression/ftnode.h"
82// #include "compiler/parser/query_loc.h"
83 #include "util/cxx_util.h"55 #include "util/cxx_util.h"
84// #include "util/indent.h"56 #include "diagnostics/assert.h"
85// #include "util/stl_util.h"57 #include "zorbatypes/mapm/m_apm_lc.h"
86// #include "diagnostics/xquery_diagnostics.h"58 #include "zorbatypes/datetime/parse.h"
87// #include "zorbatypes/numconversions.h"59 #include "zorbatypes/chartype.h"
60 #include "zorbatypes/collation_manager.h"
61 #include "zorbatypes/ft_token.h"
62 #include "zorbatypes/m_apm.h"
63 #include "zorbatypes/rclock.h"
64 #include "zorbatypes/schema_types.h"
65 #include "zorbatypes/timezone.h"
66 #include "zorbatypes/transcoder.h"
67 #include "zorbatypes/URI.h"
68 #include "zorbatypes/xerces_xmlcharray.h"
69 #include "zorbatypes/zorbatypes_decl.h"
70 #include "zorbatypes/zstring.h"
71 #include "zorbautils/condition.h"
72 #include "zorbautils/hashfun.h"
73 #include "zorbautils/hashmap.h"
74 #include "zorbautils/hashmap_itemp.h"
75 #include "zorbautils/hashmap_str_obj.h"
76 #include "zorbautils/hashmap_zstring.h"
77 #include "zorbautils/hashset.h"
78 #include "zorbautils/hashset_itemh.h"
79 #include "zorbautils/latch.h"
80 #include "zorbautils/locale.h"
81 #include "zorbautils/lock.h"
82 #include "zorbautils/mutex.h"
83 #include "zorbautils/runnable.h"
84 #include "zorbautils/SAXParser.h"
85 #include "zorbautils/stack.h"
86 #include "zorbautils/string_util.h"
87 #include "unit_tests/unit_test_list.h"
88 #include "zorba/diagnostic_handler.h"
89 #include "zorba/xquery_warning.h"
90 #include "runtime/full_text/ftcontains_visitor.h"
91 #include "store/api/ft_token_iterator.h"
92 #include "store/naive/ft_token_store.h"
8893
89// #include "api/serialization/serializable.h"
90// #include "api/serialization/serializer.h"
91// #include "api/collectionimpl.h"
92// #include "api/dynamiccontextimpl.h"
93// #include "api/fileimpl.h"
94// #include "api/functionimpl.h"
95// #include "api/invoke_item_sequence.h"
96// #include "api/itemfactoryimpl.h"
97// #include "api/resultiteratorchainer.h"
98// #include "api/resultiteratorimpl.h"
99// #include "api/sax2impl.h"
100// #include "api/serializerimpl.h"
101// #include "api/staticcontextimpl.h"
102// #include "api/storeiteratorimpl.h"
103// #include "api/unmarshaller.h"
104// #include "api/uri_resolver_wrappers.h"
105// #include "api/vectoriterator.h"
106// #include "api/xmldatamanagerimpl.h"
107// //#include "api/xqueryimpl.h"
108// #include "api/zorbaimpl.h"
109// #include "capi/cdynamic_context.h"
110// #include "capi/cexpression.h"
111// #include "capi/cexternal_function.h"
112// #include "capi/cimplementation.h"
113// #include "capi/csequence.h"
114// #include "capi/cstatic_context.h"
115// #include "capi/error.h"
116// #include "capi/external_module.h"
117// #include "capi/single_item_sequence.h"
118// #include "capi/user_item_sequence.h"
119// #include "compiler/parser/flexlexer.h"
120// #include "compiler/parser/ft_types.h"
121// #include "compiler/parser/symbol_table.h"
122// #include "compiler/parser/xqdoc_comment.h"
123// #include "compiler/parsetree/parsenode_print_xml_visitor.h"
124// #include "compiler/parsetree/parsenode_print_xqdoc_visitor.h"
125// #include "compiler/parsetree/parsenode_print_xquery_visitor.h"
126// #include "compiler/parsetree/parsenode_xqdoc_visitor.h"
127// #include "compiler/translator/prolog_graph.h"
128// #include "compiler/translator/translator.h"
129// #include "compiler/codegen/plan_visitor.h"
130// #include "compiler/expression/abstract_expr_visitor.h"
131// #include "compiler/expression/expr.h"
132// #include "compiler/expression/expr_annotations.h"
133// #include "compiler/expression/expr_base.h"
134// #include "compiler/expression/expr_classes.h"
135// #include "compiler/expression/expr_iter.h"
136// #include "compiler/expression/expr_utils.h"
137// #include "compiler/expression/expr_visitor.h"
138// #include "compiler/expression/flwor_expr.h"
139// //#include "compiler/expression/fo_expr.h"
140// #include "compiler/expression/ftnode_classes.h"
141// #include "compiler/expression/ftnode_visitor.h"
142// #include "compiler/expression/function_item_expr.h"
143// #include "compiler/expression/path_expr.h"
144// #include "compiler/expression/script_exprs.h"
145// #include "compiler/expression/update_exprs.h"
146// #include "compiler/expression/var_expr.h"
147// #include "compiler/rewriter/framework/rewriter.h"
148// #include "compiler/rewriter/framework/rewriter_context.h"
149// #include "compiler/rewriter/framework/rule_driver.h"
150// #include "compiler/rewriter/framework/sequential_rewriter.h"
151// #include "compiler/rewriter/rewriters/common_rewriter.h"
152// #include "compiler/rewriter/rewriters/default_optimizer.h"
153// #include "compiler/rewriter/rewriters/phase1_rewriter.h"
154// #include "compiler/rewriter/rules/ruleset.h"
155// #include "compiler/rewriter/rules/rule_base.h"
156// #include "compiler/rewriter/rules/type_rules.h"
157// #include "compiler/rewriter/tools/dataflow_annotations.h"
158// #include "compiler/rewriter/tools/expr_tools.h"
159// #include "compiler/rewriter/tools/udf_graph.h"
160// #include "compiler/xqddf/collection_decl.h"
161// #include "compiler/xqddf/value_ic.h"
162// #include "compiler/xqddf/value_index.h"
163// #include "compiler/semantic_annotations/annotations.h"
164// #include "compiler/semantic_annotations/annotation_holder.h"
165// #include "compiler/semantic_annotations/annotation_keys.h"
166// #include "compiler/api/compiler_api.h"
167// #include "compiler/api/compiler_api_impl.h"
168// #include "system/globalenv.h"
169// #include "system/properties.h"
170// #include "system/zorba_properties.h"
171// #include "context/decimal_format.h"
172// #include "context/default_uri_mappers.h"
173// #include "context/default_url_resolvers.h"
174// #include "context/dynamic_context.h"
175// #include "context/dynamic_loader.h"
176// #include "context/internal_uri_resolvers.h"
177// //#include "context/namespace_context.h"
178// #include "context/root_static_context.h"
179// #include "context/sctx_map_iterator.h"
180// #include "context/standard_uri_resolvers.h"
181// #include "context/static_context_consts.h"
182// #include "context/stemmer_wrappers.h"
183// #include "context/uri_resolver.h"
184// #include "context/uri_resolver_wrapper.h"
185#include "diagnostics/assert.h"
186// #include "diagnostics/diagnostic.h"
187// #include "diagnostics/dict.h"
188// #include "diagnostics/dict_impl.h"
189// #include "diagnostics/StackWalker.h"
190// #include "diagnostics/user_error.h"
191// #include "diagnostics/user_exception.h"
192// #include "diagnostics/xquery_exception.h"
193// #include "diagnostics/xquery_stack_trace.h"
194// #include "diagnostics/xquery_warning.h"
195// #include "diagnostics/zorba_exception.h"
196// //#include "functions/annotation.h"
197// #include "functions/external_function.h"
198// #include "functions/function.h"
199// #include "functions/function_consts.h"
200// #include "functions/function_impl.h"
201// #include "functions/func_accessors_impl.h"
202// #include "functions/func_apply.h"
203// #include "functions/func_arithmetic.h"
204// #include "functions/func_booleans_impl.h"
205// #include "functions/func_durations_dates_times_impl.h"
206// #include "functions/func_enclosed.h"
207// #include "functions/func_eval.h"
208// #include "functions/func_hoist.h"
209// #include "functions/func_index_ddl.h"
210// #include "functions/func_node_sort_distinct.h"
211// #include "functions/func_numerics_impl.h"
212// #include "functions/func_reflection.h"
213// #include "functions/func_sequences_impl.h"
214// #include "functions/func_var_decl.h"
215// #include "functions/library.h"
216// #include "functions/signature.h"
217// #include "functions/udf.h"
218// #include "runtime/full_text/thesauri/decode_base128.h"
219// #include "runtime/full_text/thesauri/encoded_list.h"
220// #include "runtime/full_text/thesauri/iso2788.h"
221// #include "runtime/full_text/thesauri/wn_db_segment.h"
222// #include "runtime/full_text/thesauri/wn_synset.h"
223// #include "runtime/full_text/thesauri/wn_thesaurus.h"
224// #include "runtime/full_text/thesauri/wn_types.h"
225// #include "runtime/full_text/thesauri/xqftts_relationship.h"
226// #include "runtime/full_text/thesauri/xqftts_thesaurus.h"
227// #include "runtime/full_text/ft_match.h"
228// #include "runtime/full_text/ft_query_item.h"
229// #include "runtime/full_text/ft_single_token_iterator.h"
230// #include "runtime/full_text/ft_stop_words_set.h"
231// #include "runtime/full_text/ft_thesaurus.h"
232// #include "runtime/full_text/ft_token_matcher.h"
233// #include "runtime/full_text/ft_token_seq_iterator.h"
234// #include "runtime/full_text/ft_token_span.h"
235// #include "runtime/full_text/ft_wildcard.h"
236// #include "runtime/full_text/full_text.h"
237// #include "runtime/full_text/apply.h"
238// #include "runtime/full_text/ft_util.h"
239// #include "runtime/collections/collections_base.h"
240// #include "runtime/core/apply_updates.h"
241// #include "runtime/core/arithmetic_impl.h"
242// #include "runtime/core/constructors.h"
243// #include "runtime/core/fncall_iterator.h"
244// #include "runtime/core/internal_operators.h"
245// #include "runtime/core/item_iterator.h"
246// #include "runtime/core/nodeid_iterators.h"
247// #include "runtime/core/path_iterators.h"
248// #include "runtime/core/sequencetypes.h"
249// #include "runtime/core/trycatch.h"
250// #include "runtime/core/var_iterators.h"
251// #include "runtime/numerics/NumericsImpl.h"
252// #include "runtime/booleans/BooleanImpl.h"
253// #include "runtime/base/binarybase.h"
254// #include "runtime/base/narybase.h"
255// #include "runtime/base/noarybase.h"
256// #include "runtime/base/plan_iterator.h"
257// #include "runtime/sequences/SequencesImpl.h"
258// #include "runtime/visitors/iterprinter.h"
259// #include "runtime/misc/materialize.h"
260// #include "runtime/scripting/scripting.h"
261// #include "types/schema/EventSchemaValidator.h"
262// #include "types/schema/LoadSchemaErrorHandler.h"
263// #include "types/schema/PrintSchema.h"
264// #include "types/schema/revalidateUtils.h"
265// #include "types/schema/schema.h"
266// #include "types/schema/SchemaValidatorFilter.h"
267// #include "types/schema/StrX.h"
268// #include "types/schema/validate.h"
269// #include "types/schema/ValidationEventHandler.h"
270// #include "types/schema/xercesIncludes.h"
271// #include "types/schema/XercesParseUtils.h"
272// #include "types/schema/XercSchemaValidator.h"
273// #include "types/casting.h"
274// #include "types/collation.h"
275// #include "types/node_test.h"
276// #include "types/root_typemanager.h"
277// #include "types/typeconstants.h"
278// #include "types/typeimpl.h"
279// #include "types/typemanager.h"
280// #include "types/typemanagerimpl.h"
281// #include "types/typeops.h"
282// #include "util/fx/fxarray.h"
283// #include "util/fx/fxcharheap.h"
284// #include "util/ascii_util.h"
285// #include "util/atomic_int.h"
286// #include "util/auto_vector.h"
287// #include "util/curl_util.h"
288// #include "util/dir.h"
289// #include "util/dynamic_bitset.h"
290// #include "util/empty.h"
291// #include "util/error_util.h"
292// #include "util/fs_util.h"
293// #include "util/hashmap.h"
294// //#include "util/hashmap32.h"
295// #include "util/less.h"
296// #include "util/mmap_file.h"
297// #include "util/nonatomic_int.h"
298// #include "util/omanip.h"
299// #include "util/oseparator.h"
300// #include "util/regex.h"
301// #include "util/singleton.h"
302// #include "util/string_util.h"
303// #include "util/threads.h"
304// #include "util/tokenbuf.h"
305// #include "util/tracer.h"
306// #include "util/triple.h"
307// #include "util/unicode_categories.h"
308// #include "util/unicode_util.h"
309// #include "util/uri_util.h"
310// #include "util/utf8_string.h"
311// #include "util/utf8_util.h"
312// #include "util/utf8_util_base.h"
313// #include "util/void_int.h"
314// #include "util/xml_util.h"
315// #include "zorbamisc/config/platform.h"
316// //#include "zorbaserialization/archiver.h"
317// #include "zorbaserialization/base64impl.h"
318// #include "zorbaserialization/bin_archiver.h"
319// //#include "zorbaserialization/class_serializer.h"
320// #include "zorbaserialization/mem_archiver.h"
321// #include "zorbaserialization/serialization_engine.h"
322// #include "zorbaserialization/template_serializer.h"
323// #include "zorbaserialization/xml_archiver.h"
324// #include "zorbaserialization/zorba_class_serializer.h"
325 #include "zorbatypes/mapm/m_apm_lc.h"
326 #include "zorbatypes/datetime/parse.h"
327 //#include "zorbatypes/binary.h"
328 #include "zorbatypes/chartype.h"
329 #include "zorbatypes/collation_manager.h"
330 //#include "zorbatypes/datetime.h"
331 //#include "zorbatypes/decimal.h"
332 //#include "zorbatypes/duration.h"
333 //#include "zorbatypes/floatimpl.h"
334 #include "zorbatypes/ft_token.h"
335 //#include "zorbatypes/integer.h"
336 #include "zorbatypes/libicu.h"
337 #include "zorbatypes/m_apm.h"
338 //#include "zorbatypes/rchandle.h"
339 #include "zorbatypes/rclock.h"
340 //#include "zorbatypes/regex_ascii.h"
341 #include "zorbatypes/schema_types.h"
342 #include "zorbatypes/timezone.h"
343 #include "zorbatypes/transcoder.h"
344 #include "zorbatypes/URI.h"
345 #include "zorbatypes/xerces_xmlcharray.h"
346 #include "zorbatypes/zorbatypes_decl.h"
347 #include "zorbatypes/zstring.h"
348 //#include "zorbautils/stemmer/sb_stemmer.h"
349 #include "zorbautils/condition.h"
350 #include "zorbautils/hashfun.h"
351 #include "zorbautils/hashmap.h"
352 #include "zorbautils/hashmap_itemp.h"
353 #include "zorbautils/hashmap_str_obj.h"
354 #include "zorbautils/hashmap_zstring.h"
355 #include "zorbautils/hashset.h"
356 #include "zorbautils/hashset_itemh.h"
357 //#include "zorbautils/icu_tokenizer.h"
358 #include "zorbautils/latch.h"
359 #include "zorbautils/locale.h"
360 #include "zorbautils/lock.h"
361 #include "zorbautils/mutex.h"
362 #include "zorbautils/runnable.h"
363 #include "zorbautils/SAXParser.h"
364 #include "zorbautils/stack.h"
365// #include "zorbautils/stemmer.h"
366 #include "zorbautils/string_util.h"
367 //#include "zorbautils/synchronous_logger.h"
368 //#include "zorbautils/tokenizer.h"
369 #include "unit_tests/unit_test_list.h"
370 #include "zorba/diagnostic_handler.h"
371 #include "zorba/xquery_warning.h"
372 #include "runtime/full_text/ftcontains_visitor.h"
373 #include "store/naive/naive_ft_token_iterator.h"
374 #include "store/api/ft_token_iterator.h"
375 #include "store/naive/ft_token_store.h"
376#endif94#endif
377/* vim:set et sw=2 ts=2: */95/* vim:set et sw=2 ts=2: */
37896
=== modified file 'src/runtime/full_text/CMakeLists.txt'
--- src/runtime/full_text/CMakeLists.txt 2012-03-28 05:19:57 +0000
+++ src/runtime/full_text/CMakeLists.txt 2012-04-07 00:45:26 +0000
@@ -42,11 +42,11 @@
42 default_tokenizer.cpp42 default_tokenizer.cpp
43 )43 )
4444
45IF (ZORBA_NO_UNICODE)45IF (ZORBA_NO_ICU)
46 LIST(APPEND FULLTEXT_SRCS latin_tokenizer.cpp)46 LIST(APPEND FULLTEXT_SRCS latin_tokenizer.cpp)
47ELSE (ZORBA_NO_UNICODE)47ELSE (ZORBA_NO_ICU)
48 LIST(APPEND FULLTEXT_SRCS icu_tokenizer.cpp)48 LIST(APPEND FULLTEXT_SRCS icu_tokenizer.cpp)
49ENDIF (ZORBA_NO_UNICODE)49ENDIF (ZORBA_NO_ICU)
5050
51ADD_SRC_SUBFOLDER(FULLTEXT_SRCS stemmer LIBSTEMMER_SRCS)51ADD_SRC_SUBFOLDER(FULLTEXT_SRCS stemmer LIBSTEMMER_SRCS)
5252
5353
=== modified file 'src/runtime/full_text/default_tokenizer.cpp'
--- src/runtime/full_text/default_tokenizer.cpp 2012-03-28 05:19:57 +0000
+++ src/runtime/full_text/default_tokenizer.cpp 2012-04-07 00:45:26 +0000
@@ -19,22 +19,22 @@
19#include <zorba/config.h>19#include <zorba/config.h>
2020
21#include "default_tokenizer.h"21#include "default_tokenizer.h"
22#ifdef ZORBA_NO_UNICODE22#ifdef ZORBA_NO_ICU
23# include "latin_tokenizer.h"23# include "latin_tokenizer.h"
24#else24#else
25# include "icu_tokenizer.h"25# include "icu_tokenizer.h"
26#endif /* ZORBA_NO_UNICODE */26#endif /* ZORBA_NO_ICU */
2727
28namespace zorba {28namespace zorba {
2929
30///////////////////////////////////////////////////////////////////////////////30///////////////////////////////////////////////////////////////////////////////
3131
32TokenizerProvider const& default_tokenizer_provider() {32TokenizerProvider const& default_tokenizer_provider() {
33#ifdef ZORBA_NO_UNICODE33#ifdef ZORBA_NO_ICU
34 static LatinTokenizerProvider const instance;34 static LatinTokenizerProvider const instance;
35#else35#else
36 static ICU_TokenizerProvider const instance;36 static ICU_TokenizerProvider const instance;
37#endif /* ZORBA_NO_UNICODE */37#endif /* ZORBA_NO_ICU */
38 return instance;38 return instance;
39};39};
4040
4141
=== modified file 'src/runtime/full_text/latin_tokenizer.cpp'
--- src/runtime/full_text/latin_tokenizer.cpp 2012-03-28 05:19:57 +0000
+++ src/runtime/full_text/latin_tokenizer.cpp 2012-04-07 00:45:26 +0000
@@ -18,8 +18,9 @@
18#include <functional>18#include <functional>
1919
20#include <zorba/diagnostic_list.h>20#include <zorba/diagnostic_list.h>
21#include <zorba/xquery_exception.h>21
22#include <zorba/zorba.h>22#include "diagnostics/dict.h"
23#include "diagnostics/xquery_exception.h"
2324
24#include "latin_tokenizer.h"25#include "latin_tokenizer.h"
2526
2627
=== modified file 'src/runtime/full_text/latin_tokenizer.h'
--- src/runtime/full_text/latin_tokenizer.h 2012-03-28 05:19:57 +0000
+++ src/runtime/full_text/latin_tokenizer.h 2012-04-07 00:45:26 +0000
@@ -14,12 +14,12 @@
14 * limitations under the License.14 * limitations under the License.
15 */15 */
1616
17#ifndef ZORBA_WESTERN_TOKENIZER_H17#ifndef ZORBA_LATIN_TOKENIZER_H
18#define ZORBA_WESTERN_TOKENIZER_H18#define ZORBA_LATIN_TOKENIZER_H
1919
20#include <zorba/config.h>20#include <zorba/config.h>
2121
22#ifdef ZORBA_NO_FULL_TEXT22#ifdef ZORBA_NO_ICU
2323
24#include <zorba/tokenizer.h>24#include <zorba/tokenizer.h>
25#include "zorbatypes/zstring.h"25#include "zorbatypes/zstring.h"
@@ -38,8 +38,8 @@
3838
39 // inherited39 // inherited
40 void destroy() const;40 void destroy() const;
41 void tokenize( char const*, size_type, iso639_1::type, bool, Callback&,41 void tokenize( char const*, size_type, locale::iso639_1::type, bool,
42 void* );42 Callback&, void* );
4343
44private:44private:
45 typedef zstring string_type;45 typedef zstring string_type;
@@ -64,13 +64,14 @@
64class LatinTokenizerProvider : public TokenizerProvider {64class LatinTokenizerProvider : public TokenizerProvider {
65public:65public:
66 // inherited66 // inherited
67 Tokenizer::ptr getTokenizer( iso639_1::type, Tokenizer::Numbers& ) const;67 Tokenizer::ptr getTokenizer( locale::iso639_1::type,
68 Tokenizer::Numbers& ) const;
68};69};
6970
70///////////////////////////////////////////////////////////////////////////////71///////////////////////////////////////////////////////////////////////////////
7172
72} // namespace zorba73} // namespace zorba
7374
74#endif /* ZORBA_NO_FULL_TEXT */75#endif /* ZORBA_NO_ICU */
75#endif /* ZORBA_WESTERN_TOKENIZER_H */76#endif /* ZORBA_LATIN_TOKENIZER_H */
76/* vim:set et sw=2 ts=2: */77/* vim:set et sw=2 ts=2: */
7778
=== modified file 'src/runtime/numerics/format_integer_impl.cpp'
--- src/runtime/numerics/format_integer_impl.cpp 2012-03-28 05:19:57 +0000
+++ src/runtime/numerics/format_integer_impl.cpp 2012-04-07 00:45:26 +0000
@@ -881,7 +881,7 @@
881 utf8_result += (*valueit);881 utf8_result += (*valueit);
882 }882 }
883 else883 else
884 utf8_result += (0x2080 + *valueit - '0');884 utf8_result += (unicode::code_point)(0x2080 + *valueit - '0');
885 }885 }
886 }886 }
887 else if((c0 == 0x2460) || //CIRCLED DIGIT ONE (1-20)887 else if((c0 == 0x2460) || //CIRCLED DIGIT ONE (1-20)
888888
=== modified file 'src/runtime/numerics/numerics_impl.cpp'
--- src/runtime/numerics/numerics_impl.cpp 2012-03-28 05:19:57 +0000
+++ src/runtime/numerics/numerics_impl.cpp 2012-04-07 00:45:26 +0000
@@ -462,7 +462,7 @@
462 minus( "-" )462 minus( "-" )
463 {463 {
464 utf8_string<zstring> u_per_mille( per_mille );464 utf8_string<zstring> u_per_mille( per_mille );
465 u_per_mille = 0x2030;465 u_per_mille = (unicode::code_point)0x2030;
466 }466 }
467467
468 void readFormat(const DecimalFormat_t& df_t)468 void readFormat(const DecimalFormat_t& df_t)
469469
=== modified file 'src/runtime/strings/strings_impl.cpp'
--- src/runtime/strings/strings_impl.cpp 2012-03-28 05:19:57 +0000
+++ src/runtime/strings/strings_impl.cpp 2012-04-07 00:45:26 +0000
@@ -810,7 +810,9 @@
810 zstring normForm;810 zstring normForm;
811 zstring resStr;811 zstring resStr;
812 unicode::normalization::type normType;812 unicode::normalization::type normType;
813#ifndef ZORBA_NO_ICU
813 bool success;814 bool success;
815#endif /* ZORBA_NO_ICU */
814816
815 PlanIteratorState* state;817 PlanIteratorState* state;
816 DEFAULT_STACK_INIT(PlanIteratorState, state, planState);818 DEFAULT_STACK_INIT(PlanIteratorState, state, planState);
@@ -860,10 +862,10 @@
860 }862 }
861863
862 item0->getStringValue2(resStr);864 item0->getStringValue2(resStr);
863#ifndef ZORBA_NO_UNICODE865#ifndef ZORBA_NO_ICU
864 success = utf8::normalize(resStr, normType, &resStr);866 success = utf8::normalize(resStr, normType, &resStr);
865 ZORBA_ASSERT(success);867 ZORBA_ASSERT(success);
866#endif//#ifndef ZORBA_NO_UNICODE868#endif//#ifndef ZORBA_NO_ICU
867 STACK_PUSH(GENV_ITEMFACTORY->createString(result, resStr), state );869 STACK_PUSH(GENV_ITEMFACTORY->createString(result, resStr), state );
868 }870 }
869 else871 else
@@ -992,7 +994,7 @@
992 trans_map[ *map_i ] = *trans_i;994 trans_map[ *map_i ] = *trans_i;
993995
994 for ( ; map_i != map_end; ++map_i )996 for ( ; map_i != map_end; ++map_i )
995 trans_map[ *map_i ] = ~0;997 trans_map[ *map_i ] = static_cast<unicode::code_point>( ~0 );
996 }998 }
997999
998 utf8_string<zstring> u_result_string( result_string );1000 utf8_string<zstring> u_result_string( result_string );
@@ -1007,7 +1009,7 @@
1007 cp_map_type::const_iterator const found_i = trans_map.find( cp );1009 cp_map_type::const_iterator const found_i = trans_map.find( cp );
1008 if ( found_i != trans_map.end() ) {1010 if ( found_i != trans_map.end() ) {
1009 cp = found_i->second;1011 cp = found_i->second;
1010 if ( cp == ~0 )1012 if ( cp == static_cast<unicode::code_point>( ~0 ) )
1011 continue;1013 continue;
1012 }1014 }
1013 u_result_string += cp;1015 u_result_string += cp;
@@ -1795,16 +1797,33 @@
1795 int &utf8start,1797 int &utf8start,
1796 unsigned int &bytestart,1798 unsigned int &bytestart,
1797 int utf8end,1799 int utf8end,
1800 unsigned int byteend,
1798 zstring &out)1801 zstring &out)
1799{1802{
1803#ifndef ZORBA_NO_ICU
1800 utf8::size_type clen;1804 utf8::size_type clen;
1801 while(utf8start < utf8end)1805 if(utf8end)
1802 {1806 {
1803 clen = utf8::char_length(*sin);1807 while(utf8start < utf8end)
1804 out.append(sin, clen);1808 {
1805 utf8start++;1809 clen = utf8::char_length(*sin);
1806 bytestart += clen;1810 if(clen == 0)
1807 sin += clen;1811 clen = 1;
1812 out.append(sin, clen);
1813 utf8start++;
1814 bytestart += clen;
1815 sin += clen;
1816 }
1817 }
1818 else
1819#endif
1820 {
1821 if(!utf8end)
1822 utf8end = byteend;
1823 out.append(sin, utf8end-bytestart);
1824 sin += utf8end-bytestart;
1825 utf8start = utf8end;
1826 bytestart = utf8end;
1808 }1827 }
1809}1828}
18101829
@@ -1812,6 +1831,7 @@
1812 int &match_end1,1831 int &match_end1,
1813 unsigned int &match_end1_bytes,1832 unsigned int &match_end1_bytes,
1814 int match_start2,1833 int match_start2,
1834 unsigned int match_start2_bytes,
1815 const char *&strin)1835 const char *&strin)
1816{1836{
1817 store::Item_t non_match_elem;1837 store::Item_t non_match_elem;
@@ -1833,7 +1853,7 @@
1833 // utf8_it++;1853 // utf8_it++;
1834 // match_end1++;1854 // match_end1++;
1835 //}1855 //}
1836 copyUtf8Chars(strin, match_end1, match_end1_bytes, match_start2, non_match_str);1856 copyUtf8Chars(strin, match_end1, match_end1_bytes, match_start2, match_start2_bytes, non_match_str);
1837 store::Item_t non_match_text_item;1857 store::Item_t non_match_text_item;
1838 GENV_ITEMFACTORY->createTextNode(non_match_text_item, non_match_elem, non_match_str);1858 GENV_ITEMFACTORY->createTextNode(non_match_text_item, non_match_elem, non_match_str);
1839}1859}
@@ -1864,19 +1884,31 @@
1864 i--;1884 i--;
1865 break;1885 break;
1866 }1886 }
1887#ifndef ZORBA_NO_ICU
1867 match_startg = rx.get_match_start(i+1);1888 match_startg = rx.get_match_start(i+1);
1868 if((match_startg < 0) && (gparent < 0))1889 if((match_startg < 0) && (gparent < 0))
1869 continue;1890 continue;
1891#else
1892 int temp_endg;
1893 match_startg = -1;
1894 temp_endg = -1;
1895 if(!rx.get_match_start_end_bytes(i+1, &match_startg, &temp_endg) && (gparent < 0))
1896 continue;
1897#endif
1870 if(match_endgood < match_startg)1898 if(match_endgood < match_startg)
1871 {1899 {
1872 //add non-group match text1900 //add non-group match text
1873 zstring non_group_str;1901 zstring non_group_str;
18741902
1875 copyUtf8Chars(sin, match_endgood, match_end1_bytes, match_startg, non_group_str);1903 copyUtf8Chars(sin, match_endgood, match_end1_bytes, match_startg, 0, non_group_str);
1876 store::Item_t non_group_text_item;1904 store::Item_t non_group_text_item;
1877 GENV_ITEMFACTORY->createTextNode(non_group_text_item, parent.getp(), non_group_str);1905 GENV_ITEMFACTORY->createTextNode(non_group_text_item, parent.getp(), non_group_str);
1878 }1906 }
1907#ifndef ZORBA_NO_ICU
1879 match_endg = rx.get_match_end(i+1);1908 match_endg = rx.get_match_end(i+1);
1909#else
1910 match_endg = temp_endg;
1911#endif
1880 //add group match text1912 //add group match text
1881 GENV_ITEMFACTORY->createQName(group_element_name,1913 GENV_ITEMFACTORY->createQName(group_element_name,
1882 static_context::W3C_FN_NS, "fn", "group");1914 static_context::W3C_FN_NS, "fn", "group");
@@ -1907,7 +1939,7 @@
1907 }1939 }
1908 zstring group_str;1940 zstring group_str;
19091941
1910 copyUtf8Chars(sin, match_startg, match_end1_bytes, match_endg, group_str);1942 copyUtf8Chars(sin, match_startg, match_end1_bytes, match_endg, 0, group_str);
1911 store::Item_t group_text_item;1943 store::Item_t group_text_item;
1912 GENV_ITEMFACTORY->createTextNode(group_text_item, group_elem.getp(), group_str);1944 GENV_ITEMFACTORY->createTextNode(group_text_item, group_elem.getp(), group_str);
1913 }1945 }
@@ -1916,7 +1948,7 @@
1916 {1948 {
1917 zstring non_group_str;1949 zstring non_group_str;
19181950
1919 copyUtf8Chars(sin, match_endgood, match_end1_bytes, match_end2, non_group_str);1951 copyUtf8Chars(sin, match_endgood, match_end1_bytes, match_end2, 0, non_group_str);
1920 store::Item_t non_group_text_item;1952 store::Item_t non_group_text_item;
1921 GENV_ITEMFACTORY->createTextNode(non_group_text_item, parent, non_group_str);1953 GENV_ITEMFACTORY->createTextNode(non_group_text_item, parent, non_group_str);
1922 }1954 }
@@ -2144,8 +2176,14 @@
2144 reachedEnd = false;2176 reachedEnd = false;
2145 while(rx.find_next_match(&reachedEnd))2177 while(rx.find_next_match(&reachedEnd))
2146 {2178 {
2147 int match_start2 = rx.get_match_start();2179 int match_start2;
2148 int match_end2 = rx.get_match_end();2180 int match_end2;
2181#ifndef ZORBA_NO_ICU
2182 match_start2 = rx.get_match_start();
2183 match_end2 = rx.get_match_end();
2184#else
2185 rx.get_match_start_end_bytes(0, &match_start2, &match_end2);
2186#endif
2149 ZORBA_ASSERT(match_start2 >= 0);2187 ZORBA_ASSERT(match_start2 >= 0);
21502188
2151 if(is_input_stream && reachedEnd && !instream->eof())2189 if(is_input_stream && reachedEnd && !instream->eof())
@@ -2157,7 +2195,7 @@
2157 //construct the fn:non-match2195 //construct the fn:non-match
2158 if(match_start2 > match_end1)2196 if(match_start2 > match_end1)
2159 {2197 {
2160 addNonMatchElement(result, match_end1, match_end1_bytes, match_start2, instr);2198 addNonMatchElement(result, match_end1, match_end1_bytes, match_start2, 0, instr);
2161 }2199 }
21622200
2163 //construct the fn:match2201 //construct the fn:match
@@ -2165,7 +2203,7 @@
2165 match_end1 = match_end2;2203 match_end1 = match_end2;
2166 }2204 }
21672205
2168 if(is_input_stream && reachedEnd && !instream->eof())2206 if(is_input_stream && !instream->eof())
2169 {2207 {
2170 //load some more data, maybe the match will be different2208 //load some more data, maybe the match will be different
2171 if(match_end1_bytes)2209 if(match_end1_bytes)
@@ -2213,7 +2251,7 @@
2213 else2251 else
2214 {2252 {
2215 if(match_end1_bytes < streambuf_read)2253 if(match_end1_bytes < streambuf_read)
2216 addNonMatchElement(result, match_end1, match_end1_bytes, streambuf_read, instr);2254 addNonMatchElement(result, match_end1, match_end1_bytes, 0, streambuf_read, instr);
2217 if(is_input_stream && instream->eof())2255 if(is_input_stream && instream->eof())
2218 reachedEnd = true;2256 reachedEnd = true;
2219 }2257 }
22202258
=== modified file 'src/store/api/store.h'
--- src/store/api/store.h 2012-03-28 05:19:57 +0000
+++ src/store/api/store.h 2012-04-07 00:45:26 +0000
@@ -16,7 +16,7 @@
16#ifndef ZORBA_STORE_STORE_H16#ifndef ZORBA_STORE_STORE_H
17#define ZORBA_STORE_STORE_H17#define ZORBA_STORE_STORE_H
1818
19#include <zorba/config.h>19#include "zorba/config.h"
20#include "zorbatypes/schema_types.h"20#include "zorbatypes/schema_types.h"
2121
22#include "store/api/shared_types.h"22#include "store/api/shared_types.h"
2323
=== modified file 'src/store/naive/simple_store.h'
--- src/store/naive/simple_store.h 2012-03-28 23:58:23 +0000
+++ src/store/naive/simple_store.h 2012-04-07 00:45:26 +0000
@@ -16,7 +16,11 @@
16#ifndef ZORBA_SIMPLE_STORE16#ifndef ZORBA_SIMPLE_STORE
17#define ZORBA_SIMPLE_STORE17#define ZORBA_SIMPLE_STORE
1818
19#include "store.h"19#include "store/naive/store.h"
20
21#include "store/naive/node_factory.h"
22#include "store/naive/pul_primitive_factory.h"
23#include "store/naive/tree_id_generator.h"
2024
21namespace zorba {25namespace zorba {
22namespace simplestore {26namespace simplestore {
@@ -72,7 +76,7 @@
7276
73 NodeFactory* createNodeFactory() const;77 NodeFactory* createNodeFactory() const;
7478
75 void destroyNodeFactory(NodeFactory*) const;79 void destroyNodeFactory(zorba::simplestore::NodeFactory*) const;
7680
77 store::ItemFactory* createItemFactory() const;81 store::ItemFactory* createItemFactory() const;
7882
@@ -84,7 +88,7 @@
8488
85 PULPrimitiveFactory* createPULFactory() const;89 PULPrimitiveFactory* createPULFactory() const;
8690
87 void destroyPULFactory(PULPrimitiveFactory*) const;91 void destroyPULFactory(zorba::simplestore::PULPrimitiveFactory*) const;
8892
89 CollectionSet* createCollectionSet() const;93 CollectionSet* createCollectionSet() const;
9094
9195
=== modified file 'src/store/naive/store.cpp'
--- src/store/naive/store.cpp 2012-03-28 22:09:36 +0000
+++ src/store/naive/store.cpp 2012-04-07 00:45:26 +0000
@@ -33,7 +33,7 @@
3333
34#include "properties.h"34#include "properties.h"
35#include "string_pool.h"35#include "string_pool.h"
36#include "store.h"36#include "simple_store.h"
37#include "simple_temp_seq.h"37#include "simple_temp_seq.h"
38#include "simple_lazy_temp_seq.h"38#include "simple_lazy_temp_seq.h"
39#include "collection.h"39#include "collection.h"
4040
=== modified file 'src/store/naive/store.h'
--- src/store/naive/store.h 2012-03-28 22:09:36 +0000
+++ src/store/naive/store.h 2012-04-07 00:45:26 +0000
@@ -16,10 +16,18 @@
16#ifndef ZORBA_SIMPLESTORE_STORE_H16#ifndef ZORBA_SIMPLESTORE_STORE_H
17#define ZORBA_SIMPLESTORE_STORE_H17#define ZORBA_SIMPLESTORE_STORE_H
1818
19#include "store/api/store.h"
20
19#include "shared_types.h"21#include "shared_types.h"
20#include "store_defs.h"22#include "store_defs.h"
21#include "hashmap_nodep.h"23#include "hashmap_nodep.h"
22#include "tree_id.h"24#include "tree_id.h"
25#include "store/util/hashmap_stringbuf.h"
26#include "zorbautils/mutex.h"
27#include "zorbautils/lock.h"
28#include "zorbautils/hashmap.h"
29#include "zorbautils/hashmap_itemp.h"
30#include "zorbautils/hashmap_zstring_nonserializable.h"
2331
24#if (defined (WIN32) || defined (WINCE))32#if (defined (WIN32) || defined (WINCE))
25#include "node_items.h"33#include "node_items.h"
@@ -28,14 +36,7 @@
28#include "store/api/ic.h"36#include "store/api/ic.h"
29#endif37#endif
3038
31#include "store/api/store.h"39using namespace zorba;
32
33#include "store/util/hashmap_stringbuf.h"
34
35#include "zorbautils/mutex.h"
36#include "zorbautils/lock.h"
37#include "zorbautils/hashmap_itemp.h"
38#include "zorbautils/hashmap_zstring_nonserializable.h"
3940
40namespace zorba41namespace zorba
41{42{
@@ -63,9 +64,9 @@
63class TreeIdGeneratorFactory;64class TreeIdGeneratorFactory;
64class TreeIdGenerator;65class TreeIdGenerator;
6566
66typedef zorba::HashMapZString<XmlNode_t> DocumentSet;67typedef HashMapZString<XmlNode_t> DocumentSet;
67typedef ItemPointerHashMap<store::Index_t> IndexSet;68typedef zorba::ItemPointerHashMap<store::Index_t> IndexSet;
68typedef ItemPointerHashMap<store::IC_t> ICSet;69typedef zorba::ItemPointerHashMap<store::IC_t> ICSet;
6970
7071
7172
7273
=== modified file 'src/system/globalenv.cpp'
--- src/system/globalenv.cpp 2012-03-28 05:19:57 +0000
+++ src/system/globalenv.cpp 2012-04-07 00:45:26 +0000
@@ -17,11 +17,11 @@
1717
18#include "common/common.h"18#include "common/common.h"
1919
20#ifndef ZORBA_NO_UNICODE20#ifndef ZORBA_NO_ICU
21# include <unicode/uclean.h>21# include <unicode/uclean.h>
22# include <unicode/utypes.h>22# include <unicode/utypes.h>
23# include <unicode/udata.h>23# include <unicode/udata.h>
24#endif /* ZORBA_NO_UNICODE */24#endif /* ZORBA_NO_ICU */
2525
26#ifdef ZORBA_WITH_BIG_INTEGER26#ifdef ZORBA_WITH_BIG_INTEGER
27# include "zorbatypes/m_apm.h"27# include "zorbatypes/m_apm.h"
@@ -208,7 +208,7 @@
208 // from one thread only208 // from one thread only
209 // see http://www.icu-project.org/userguide/design.html#Init_and_Termination209 // see http://www.icu-project.org/userguide/design.html#Init_and_Termination
210 // and http://www.icu-project.org/apiref/icu4c/uclean_8h.html210 // and http://www.icu-project.org/apiref/icu4c/uclean_8h.html
211#ifndef ZORBA_NO_UNICODE211#ifndef ZORBA_NO_ICU
212# if defined U_STATIC_IMPLEMENTATION && (defined WIN32 || defined WINCE)212# if defined U_STATIC_IMPLEMENTATION && (defined WIN32 || defined WINCE)
213 {213 {
214 TCHAR self_path[1024];214 TCHAR self_path[1024];
@@ -238,13 +238,13 @@
238 udata_setCommonData(icu_appdata, &data_err);238 udata_setCommonData(icu_appdata, &data_err);
239 ZORBA_ASSERT(data_err == U_ZERO_ERROR);239 ZORBA_ASSERT(data_err == U_ZERO_ERROR);
240 240
241 // u_setDataDirectory(self_path);241 // u_setDataDirectory(self_path);
242 }242 }
243# endif243# endif
244 UErrorCode lICUInitStatus = U_ZERO_ERROR;244 UErrorCode lICUInitStatus = U_ZERO_ERROR;
245 u_init(&lICUInitStatus);245 u_init(&lICUInitStatus);
246 ZORBA_ASSERT(lICUInitStatus == U_ZERO_ERROR);246 ZORBA_ASSERT(lICUInitStatus == U_ZERO_ERROR);
247#endif//ifndef ZORBA_NO_UNICODE247#endif /* ZORBA_NO_ICU */
248}248}
249249
250250
@@ -256,12 +256,12 @@
256 // releases statically initialized memory and prevents256 // releases statically initialized memory and prevents
257 // valgrind from reporting those problems at the end257 // valgrind from reporting those problems at the end
258 // see http://www.icu-project.org/apiref/icu4c/uclean_8h.html#93f27d0ddc7c196a1da864763f2d8920258 // see http://www.icu-project.org/apiref/icu4c/uclean_8h.html#93f27d0ddc7c196a1da864763f2d8920
259#ifndef ZORBA_NO_UNICODE259#ifndef ZORBA_NO_ICU
260 u_cleanup();260 u_cleanup();
261# if defined U_STATIC_IMPLEMENTATION && (defined WIN32 || defined WINCE)261# if defined U_STATIC_IMPLEMENTATION && (defined WIN32 || defined WINCE)
262 delete[] icu_appdata;262 delete[] icu_appdata;
263# endif263# endif
264#endif//ifndef ZORBA_NO_UNICODE264#endif /* ZORBA_NO_ICU */
265}265}
266266
267267
268268
=== modified file 'src/unit_tests/CMakeLists.txt'
--- src/unit_tests/CMakeLists.txt 2012-03-28 05:19:57 +0000
+++ src/unit_tests/CMakeLists.txt 2012-04-07 00:45:26 +0000
@@ -29,9 +29,9 @@
29 tokenizer.cpp)29 tokenizer.cpp)
30ENDIF (NOT ZORBA_NO_FULL_TEXT)30ENDIF (NOT ZORBA_NO_FULL_TEXT)
3131
32IF (NOT ZORBA_NO_UNICODE)32IF (NOT ZORBA_NO_ICU)
33 LIST (APPEND UNIT_TEST_SRCS33 LIST (APPEND UNIT_TEST_SRCS
34 test_icu_streambuf.cpp)34 test_icu_streambuf.cpp)
35ENDIF (NOT ZORBA_NO_UNICODE)35ENDIF (NOT ZORBA_NO_ICU)
3636
37# vim:set et sw=2 tw=2:37# vim:set et sw=2 tw=2:
3838
=== modified file 'src/unit_tests/string.cpp'
--- src/unit_tests/string.cpp 2012-03-28 05:19:57 +0000
+++ src/unit_tests/string.cpp 2012-04-07 00:45:26 +0000
@@ -569,6 +569,7 @@
569 ASSERT_TRUE( t == s );569 ASSERT_TRUE( t == s );
570}570}
571571
572#ifndef ZORBA_NO_ICU
572template<class StringType>573template<class StringType>
573static void test_to_string_from_wchar_t() {574static void test_to_string_from_wchar_t() {
574 wchar_t const w[] = L"hello";575 wchar_t const w[] = L"hello";
@@ -578,6 +579,7 @@
578 for ( string::size_type i = 0; i < s.length(); ++i )579 for ( string::size_type i = 0; i < s.length(); ++i )
579 ASSERT_TRUE( s[i] == w[i] );580 ASSERT_TRUE( s[i] == w[i] );
580}581}
582#endif /* ZORBA_NO_ICU */
581583
582template<class StringType>584template<class StringType>
583static void test_to_upper() {585static void test_to_upper() {
@@ -605,6 +607,7 @@
605 }607 }
606}608}
607609
610#ifndef ZORBA_NO_ICU
608static void test_to_wchar_t() {611static void test_to_wchar_t() {
609 string const s = "hello";612 string const s = "hello";
610 wchar_t *w;613 wchar_t *w;
@@ -616,6 +619,7 @@
616 ASSERT_TRUE( w[i] == s[i] );619 ASSERT_TRUE( w[i] == s[i] );
617 delete[] w;620 delete[] w;
618}621}
622#endif /* ZORBA_NO_ICU */
619623
620static void test_trim_start() {624static void test_trim_start() {
621 char const *s;625 char const *s;
@@ -873,16 +877,20 @@
873 test_to_string_from_utf8<zstring>();877 test_to_string_from_utf8<zstring>();
874 test_to_string_from_utf8<zstring_p>();878 test_to_string_from_utf8<zstring_p>();
875879
880#ifndef ZORBA_NO_ICU
876 test_to_string_from_wchar_t<string>();881 test_to_string_from_wchar_t<string>();
877 test_to_string_from_wchar_t<zstring>();882 test_to_string_from_wchar_t<zstring>();
878 test_to_string_from_wchar_t<zstring_p>();883 test_to_string_from_wchar_t<zstring_p>();
884#endif /* ZORBA_NO_ICU */
879885
880 test_to_upper<string>();886 test_to_upper<string>();
881 test_to_upper<zstring>();887 test_to_upper<zstring>();
882 test_to_upper<zstring_p>();888 test_to_upper<zstring_p>();
883 test_to_upper<String>();889 test_to_upper<String>();
884890
891#ifndef ZORBA_NO_ICU
885 test_to_wchar_t();892 test_to_wchar_t();
893#endif /* ZORBA_NO_ICU */
886894
887 test_trim_start();895 test_trim_start();
888 test_trim_end();896 test_trim_end();
889897
=== modified file 'src/unit_tests/unit_test_list.h'
--- src/unit_tests/unit_test_list.h 2012-03-28 05:19:57 +0000
+++ src/unit_tests/unit_test_list.h 2012-04-07 00:45:26 +0000
@@ -36,9 +36,9 @@
36 /**36 /**
37 * ADD NEW UNIT TESTS HERE37 * ADD NEW UNIT TESTS HERE
38 */38 */
39#ifndef ZORBA_NO_UNICODE39#ifndef ZORBA_NO_ICU
40 int test_icu_streambuf( int, char*[] );40 int test_icu_streambuf( int, char*[] );
41#endif /* ZORBA_NO_UNICODE */41#endif /* ZORBA_NO_ICU */
42 int json_parser( int, char*[] );42 int json_parser( int, char*[] );
4343
44 void initializeTestList();44 void initializeTestList();
4545
=== modified file 'src/unit_tests/unit_tests.cpp'
--- src/unit_tests/unit_tests.cpp 2012-03-28 05:19:57 +0000
+++ src/unit_tests/unit_tests.cpp 2012-04-07 00:45:26 +0000
@@ -39,9 +39,9 @@
39 void initializeTestList() {39 void initializeTestList() {
40 libunittests["string"] = test_string;40 libunittests["string"] = test_string;
41 libunittests["uri"] = runUriTest;41 libunittests["uri"] = runUriTest;
42#ifndef ZORBA_NO_UNICODE42#ifndef ZORBA_NO_ICU
43 libunittests["icu_streambuf"] = test_icu_streambuf;43 libunittests["icu_streambuf"] = test_icu_streambuf;
44#endif /* ZORBA_NO_UNICODE */44#endif /* ZORBA_NO_ICU */
45 libunittests["json_parser"] = json_parser;45 libunittests["json_parser"] = json_parser;
46 libunittests["unique_ptr"] = test_unique_ptr;46 libunittests["unique_ptr"] = test_unique_ptr;
47#ifndef ZORBA_NO_FULL_TEXT47#ifndef ZORBA_NO_FULL_TEXT
4848
=== modified file 'src/util/CMakeLists.txt'
--- src/util/CMakeLists.txt 2012-03-28 05:19:57 +0000
+++ src/util/CMakeLists.txt 2012-04-07 00:45:26 +0000
@@ -40,14 +40,14 @@
40 LIST(APPEND UTIL_SRCS mmap_file.cpp)40 LIST(APPEND UTIL_SRCS mmap_file.cpp)
41ENDIF(ZORBA_WITH_FILE_ACCESS)41ENDIF(ZORBA_WITH_FILE_ACCESS)
4242
43IF(ZORBA_NO_UNICODE)43IF(ZORBA_NO_ICU)
44 LIST(APPEND UTIL_SRCS44 LIST(APPEND UTIL_SRCS
45 regex_ascii.cpp45 regex_xquery.cpp
46 passthru_streambuf.cpp)46 passthru_streambuf.cpp)
47ELSE(ZORBA_NO_UNICODE)47ELSE(ZORBA_NO_ICU)
48 LIST(APPEND UTIL_SRCS48 LIST(APPEND UTIL_SRCS
49 icu_streambuf.cpp)49 icu_streambuf.cpp)
50ENDIF(ZORBA_NO_UNICODE)50ENDIF(ZORBA_NO_ICU)
5151
52HEADER_GROUP_SUBFOLDER(UTIL_SRCS fx)52HEADER_GROUP_SUBFOLDER(UTIL_SRCS fx)
53HEADER_GROUP_SUBFOLDER(UTIL_SRCS win32)53HEADER_GROUP_SUBFOLDER(UTIL_SRCS win32)
5454
=== modified file 'src/util/icu_streambuf.h'
--- src/util/icu_streambuf.h 2012-02-04 01:26:18 +0000
+++ src/util/icu_streambuf.h 2012-04-07 00:45:26 +0000
@@ -17,6 +17,7 @@
17#ifndef ZORBA_ICU_STREAMBUF_H17#ifndef ZORBA_ICU_STREAMBUF_H
18#define ZORBA_ICU_STREAMBUF_H18#define ZORBA_ICU_STREAMBUF_H
1919
20#include <unicode/ucnv.h>
20#include <zorba/transcode_stream.h>21#include <zorba/transcode_stream.h>
2122
22#include "util/utf8_util.h"23#include "util/utf8_util.h"
2324
=== modified file 'src/util/passthru_streambuf.cpp'
--- src/util/passthru_streambuf.cpp 2012-02-04 01:26:18 +0000
+++ src/util/passthru_streambuf.cpp 2012-04-07 00:45:26 +0000
@@ -14,8 +14,8 @@
14 * limitations under the License.14 * limitations under the License.
15 */15 */
1616
17#include "stdafx.h"
17#include "passthru_streambuf.h"18#include "passthru_streambuf.h"
18
19using namespace std;19using namespace std;
2020
21namespace zorba {21namespace zorba {
@@ -47,7 +47,7 @@
47}47}
4848
49bool passthru_streambuf::is_supported( char const *cc_charset ) {49bool passthru_streambuf::is_supported( char const *cc_charset ) {
50 return !is_necessary( charset );50 return !is_necessary( cc_charset );
51}51}
5252
53passthru_streambuf::pos_type53passthru_streambuf::pos_type
5454
=== modified file 'src/util/passthru_streambuf.h'
--- src/util/passthru_streambuf.h 2012-02-02 18:37:24 +0000
+++ src/util/passthru_streambuf.h 2012-04-07 00:45:26 +0000
@@ -17,8 +17,9 @@
17#ifndef ZORBA_PASSTHRU_STREAMBUF_H17#ifndef ZORBA_PASSTHRU_STREAMBUF_H
18#define ZORBA_PASSTHRU_STREAMBUF_H18#define ZORBA_PASSTHRU_STREAMBUF_H
1919
20#include <zorba/transcode_streambuf.h>20#include <zorba/transcode_stream.h>
2121#include "zorbatypes/zstring.h"
22#include "util/ascii_util.h"
22namespace zorba {23namespace zorba {
2324
24///////////////////////////////////////////////////////////////////////////////25///////////////////////////////////////////////////////////////////////////////
@@ -48,6 +49,13 @@
48 * @return \c true only if the character encoding is supported.49 * @return \c true only if the character encoding is supported.
49 */50 */
50 static bool is_supported( char const *charset );51 static bool is_supported( char const *charset );
52 static bool is_necessary( char const *cc_charset );
53
54 typedef std::streambuf::char_type char_type;
55 typedef std::streambuf::int_type int_type;
56 typedef std::streambuf::off_type off_type;
57 typedef std::streambuf::pos_type pos_type;
58 typedef std::streambuf::traits_type traits_type;
5159
52protected:60protected:
53 void imbue( std::locale const& );61 void imbue( std::locale const& );
5462
=== modified file 'src/util/regex.cpp'
--- src/util/regex.cpp 2012-03-28 05:19:57 +0000
+++ src/util/regex.cpp 2012-04-07 00:45:26 +0000
@@ -15,8 +15,6 @@
15 */15 */
16#include "stdafx.h"16#include "stdafx.h"
1717
18#include "regex.h"
19
20#include <cstring>18#include <cstring>
21#include <vector>19#include <vector>
2220
@@ -28,13 +26,13 @@
2826
29#include "ascii_util.h"27#include "ascii_util.h"
30#include "cxx_util.h"28#include "cxx_util.h"
29#include "regex.h"
31#include "stl_util.h"30#include "stl_util.h"
3231
33#define INVALID_RE_EXCEPTION(...) \32#define INVALID_RE_EXCEPTION(...) \
34 XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS( __VA_ARGS__ ) )33 XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS( __VA_ARGS__ ) )
3534
3635#ifndef ZORBA_NO_ICU
37#ifndef ZORBA_NO_UNICODE
38# include <unicode/uversion.h>36# include <unicode/uversion.h>
39U_NAMESPACE_USE37U_NAMESPACE_USE
4038
@@ -103,6 +101,7 @@
103101
104 bool got_backslash = false;102 bool got_backslash = false;
105 bool in_char_class = false; // within [...]103 bool in_char_class = false; // within [...]
104 bool is_first_char = true; // to check ^ placement
106105
107 bool in_backref = false; // '\'[1-9][0-9]*106 bool in_backref = false; // '\'[1-9][0-9]*
108 unsigned backref_no = 0; // 1-based107 unsigned backref_no = 0; // 1-based
@@ -231,6 +230,8 @@
231 ++open_cap_subs;230 ++open_cap_subs;
232 cap_sub.push_back( true );231 cap_sub.push_back( true );
233 cur_cap_sub = cap_sub.size();232 cur_cap_sub = cap_sub.size();
233 is_first_char = true;
234 goto append;
234 }235 }
235 break;236 break;
236 case ')':237 case ')':
@@ -245,8 +246,10 @@
245 case '[':246 case '[':
246 if ( q_flag )247 if ( q_flag )
247 *icu_re += '\\';248 *icu_re += '\\';
248 else249 else {
249 in_char_class = true;250 in_char_class = true;
251 goto append;
252 }
250 break;253 break;
251 case ']':254 case ']':
252 if ( q_flag )255 if ( q_flag )
@@ -254,6 +257,19 @@
254 else257 else
255 in_char_class = false;258 in_char_class = false;
256 break;259 break;
260 case '^':
261 if ( q_flag )
262 *icu_re += '\\';
263 else if ( !is_first_char && !in_char_class )
264 throw INVALID_RE_EXCEPTION( xq_re, ZED( UnescapedChar_3 ), *xq_c );
265 break;
266 case '|':
267 if ( q_flag )
268 *icu_re += '\\';
269 else {
270 is_first_char = true;
271 goto append;
272 }
257 default:273 default:
258 if ( x_flag && ascii::is_space( *xq_c ) ) {274 if ( x_flag && ascii::is_space( *xq_c ) ) {
259 if ( !in_char_class )275 if ( !in_char_class )
@@ -265,37 +281,42 @@
265 //281 //
266 *icu_re += '\\';282 *icu_re += '\\';
267 }283 }
268 }284 } // switch
269 }285 } // else
286 is_first_char = false;
287append:
270 *icu_re += *xq_c;288 *icu_re += *xq_c;
271 } // FOR_EACH289 } // FOR_EACH
272290
273 if ( i_flag ) {291 if ( !q_flag ) {
274 //292 if ( i_flag ) {
275 // XQuery 3.0 F&O 5.6.1.1: All other constructs are unaffected by the "i"293 //
276 // flag. For example, "\p{Lu}" continues to match upper-case letters only.294 // XQuery 3.0 F&O 5.6.1.1: All other constructs are unaffected by the "i"
277 //295 // flag. For example, "\p{Lu}" continues to match upper-case letters
278 // However, ICU lower-cases everything for the 'i' flag; hence we have to296 // only.
279 // turn off the 'i' flag for just the \p{Lu}.297 //
280 //298 // However, ICU lower-cases everything for the 'i' flag; hence we have to
281 // Note that the "6" and "12" below are correct since "\\" represents a299 // turn off the 'i' flag for just the \p{Lu}.
282 // single '\'.300 //
283 //301 // Note that the "6" and "12" below are correct since "\\" represents a
284 ascii::replace_all( *icu_re, "\\p{Lu}", 6, "(?-i:\\p{Lu})", 12 );302 // single '\'.
285 }303 //
304 ascii::replace_all( *icu_re, "\\p{Lu}", 6, "(?-i:\\p{Lu})", 12 );
305 }
286306
287 //307 //
288 // XML Schema Part 2 F.1.1: [Unicode Database] groups code points into a308 // XML Schema Part 2 F.1.1: [Unicode Database] groups code points into a
289 // number of blocks such as Basic Latin (i.e., ASCII), Latin-1 Supplement,309 // number of blocks such as Basic Latin (i.e., ASCII), Latin-1 Supplement,
290 // Hangul Jamo, CJK Compatibility, etc. The set containing all characters310 // Hangul Jamo, CJK Compatibility, etc. The set containing all characters
291 // that have block name X (with all white space stripped out), can be311 // that have block name X (with all white space stripped out), can be
292 // identified with a block escape \p{IsX}.312 // identified with a block escape \p{IsX}.
293 //313 //
294 // However, ICU uses \p{InX} rather than \p{IsX}.314 // However, ICU uses \p{InX} rather than \p{IsX}.
295 //315 //
296 // Note that the "5" below is correct since "\\" represents a single '\'.316 // Note that the "5" below is correct since "\\" represents a single '\'.
297 //317 //
298 ascii::replace_all( *icu_re, "\\p{Is", 5, "\\p{In", 5 );318 ascii::replace_all( *icu_re, "\\p{Is", 5, "\\p{In", 5 );
319 } // q_flag
299}320}
300321
301///////////////////////////////////////////////////////////////////////////////322///////////////////////////////////////////////////////////////////////////////
@@ -442,11 +463,11 @@
442}463}
443464
444} // namespace unicode465} // namespace unicode
445466} // namespace zorba
446}//namespace zorba467
447468///////////////////////////////////////////////////////////////////////////////
448469
449#else /* ZORBA_NO_UNICODE */470#else /* ZORBA_NO_ICU */
450471
451#include "zorbatypes/zstring.h"472#include "zorbatypes/zstring.h"
452473
@@ -470,7 +491,7 @@
470 case 'i': flags |= REGEX_ASCII_CASE_INSENSITIVE; break;491 case 'i': flags |= REGEX_ASCII_CASE_INSENSITIVE; break;
471 case 's': flags |= REGEX_ASCII_DOTALL; break;492 case 's': flags |= REGEX_ASCII_DOTALL; break;
472 case 'm': flags |= REGEX_ASCII_MULTILINE; break;493 case 'm': flags |= REGEX_ASCII_MULTILINE; break;
473 case 'x': flags |= REGEX_ASCII_COMMENTS; break;494 case 'x': flags |= REGEX_ASCII_NO_WHITESPACE; break;
474 case 'q': flags |= REGEX_ASCII_LITERAL; break;495 case 'q': flags |= REGEX_ASCII_LITERAL; break;
475 default:496 default:
476 throw XQUERY_EXCEPTION( err::FORX0001, ERROR_PARAMS( *p ) );497 throw XQUERY_EXCEPTION( err::FORX0001, ERROR_PARAMS( *p ) );
@@ -483,6 +504,7 @@
483void regex::compile( char const *pattern, char const *flags)504void regex::compile( char const *pattern, char const *flags)
484{505{
485 parsed_flags = parse_regex_flags(flags);506 parsed_flags = parse_regex_flags(flags);
507 regex_xquery::CRegexXQuery_parser regex_parser;
486 regex_matcher = regex_parser.parse(pattern, parsed_flags);508 regex_matcher = regex_parser.parse(pattern, parsed_flags);
487 if(!regex_matcher)509 if(!regex_matcher)
488 throw INVALID_RE_EXCEPTION(pattern);510 throw INVALID_RE_EXCEPTION(pattern);
@@ -517,6 +539,8 @@
517bool regex::next_token( char const *s, size_type *pos, zstring *token,539bool regex::next_token( char const *s, size_type *pos, zstring *token,
518 bool *matched)540 bool *matched)
519{541{
542 if(!s[*pos])
543 return false;
520 bool retval;544 bool retval;
521 int match_pos;545 int match_pos;
522 int matched_len;546 int matched_len;
@@ -528,14 +552,8 @@
528 token->assign(s+*pos, match_pos);552 token->assign(s+*pos, match_pos);
529 *pos += match_pos + matched_len;553 *pos += match_pos + matched_len;
530 if(matched)554 if(matched)
531 if(match_pos)555 *matched = true;
532 *matched = true;556 return true;
533 else
534 *matched = false;
535 if(match_pos)
536 return true;
537 else
538 return false;
539 }557 }
540 else558 else
541 {559 {
@@ -544,7 +562,7 @@
544 *pos += strlen(s+*pos);562 *pos += strlen(s+*pos);
545 if(matched)563 if(matched)
546 *matched = false;564 *matched = false;
547 return s[*pos] != 0;565 return true;
548 }566 }
549}567}
550568
@@ -554,13 +572,9 @@
554 int matched_pos;572 int matched_pos;
555 int matched_len;573 int matched_len;
556574
557 bool prev_align = regex_matcher->set_align_begin(true);575 retval = regex_matcher->match_anywhere(s, parsed_flags|REGEX_ASCII_WHOLE_MATCH, &matched_pos, &matched_len);
558 retval = regex_matcher->match_from(s, parsed_flags, &matched_pos, &matched_len);
559 regex_matcher->set_align_begin(prev_align);
560 if(!retval)576 if(!retval)
561 return false;577 return false;
562 if(matched_len != strlen(s))
563 return false;
564 return true;578 return true;
565}579}
566580
@@ -587,14 +601,19 @@
587 //look for dollars601 //look for dollars
588 if(*temprepl == '\\')602 if(*temprepl == '\\')
589 {603 {
590 temprepl++;604 if(!(parsed_flags & REGEX_ASCII_LITERAL))
591 if(!*temprepl || (*temprepl != '\\') || (*temprepl != '$'))//Invalid replacement string.605 {
592 throw XQUERY_EXCEPTION( err::FORX0004, ERROR_PARAMS( replacement ) );606 temprepl++;
607 if(!*temprepl)
608 temprepl--;
609 else if((*temprepl != '\\') && (*temprepl != '$'))//Invalid replacement string.
610 throw XQUERY_EXCEPTION( err::FORX0004, ERROR_PARAMS( replacement ) );
611 }
593 result->append(1, *temprepl);612 result->append(1, *temprepl);
594 temprepl++;613 temprepl++;
595 continue;614 continue;
596 }615 }
597 if(*temprepl == '$')616 if((*temprepl == '$') && !(parsed_flags & REGEX_ASCII_LITERAL))
598 {617 {
599 temprepl++;618 temprepl++;
600 index = 0;619 index = 0;
@@ -648,7 +667,7 @@
648 if(retval)667 if(retval)
649 {668 {
650 m_match_pos += m_pos;669 m_match_pos += m_pos;
651 m_pos = m_match_pos = m_matched_len;670 m_pos = m_match_pos + m_matched_len;
652 }671 }
653 else672 else
654 {673 {
@@ -666,35 +685,30 @@
666 return (int)regex_matcher->get_indexed_regex_count();685 return (int)regex_matcher->get_indexed_regex_count();
667}686}
668687
669int regex::get_match_start( int groupId )688bool regex::get_match_start_end_bytes( int groupId, int *start, int *end )
670{689{
671 if(groupId == 0)690 *start = -1;
672 return m_match_pos;691 *end = -1;
673 if(groupId > (int)regex_matcher->get_indexed_regex_count())692 if(groupId == 0)
674 return -1;693 {
675 const char *submatched_source;694 *start = m_match_pos;
676 int submatched_len;695 *end = m_match_pos + m_matched_len;
677 if(!regex_matcher->get_indexed_match(groupId, &submatched_source, &submatched_len))696 return true;
678 return -1;697 }
679 return submatched_source - s_in_.c_str();698 if(groupId > (int)regex_matcher->get_indexed_regex_count())
680}699 return false;
681700 const char *submatched_source;
682int regex::get_match_end( int groupId )701 int submatched_len;
683{702 if(!regex_matcher->get_indexed_match(groupId, &submatched_source, &submatched_len))
684 if(groupId == 0)703 return false;
685 return m_match_pos + m_matched_len;704 *start = submatched_source - s_in_.c_str();
686 if(groupId > (int)regex_matcher->get_indexed_regex_count())705 *end = *start + submatched_len;
687 return -1;706 return true;
688 const char *submatched_source;
689 int submatched_len;
690 if(!regex_matcher->get_indexed_match(groupId, &submatched_source, &submatched_len))
691 return -1;
692 return submatched_source - s_in_.c_str() + submatched_len;
693}707}
694708
695} // namespace unicode709} // namespace unicode
696} // namespace zorba710} // namespace zorba
697#endif /* ZORBA_NO_UNICODE */711#endif /* ZORBA_NO_ICU */
698712
699///////////////////////////////////////////////////////////////////////////////713///////////////////////////////////////////////////////////////////////////////
700714
701715
=== modified file 'src/util/regex.h'
--- src/util/regex.h 2012-03-28 05:19:57 +0000
+++ src/util/regex.h 2012-04-07 00:45:26 +0000
@@ -17,15 +17,13 @@
17#ifndef ZORBA_REGEX_H17#ifndef ZORBA_REGEX_H
18#define ZORBA_REGEX_H18#define ZORBA_REGEX_H
1919
20#ifndef ZORBA_NO_UNICODE
21#include <unicode/regex.h>
22#endif
23
24#include "cxx_util.h"20#include "cxx_util.h"
25#include "unicode_util.h"21#include "unicode_util.h"
26#include "zorbatypes/zstring.h"22#include "zorbatypes/zstring.h"
2723
28#ifndef ZORBA_NO_UNICODE24#ifndef ZORBA_NO_ICU
25
26#include <unicode/regex.h>
2927
30namespace zorba {28namespace zorba {
3129
@@ -496,15 +494,17 @@
496} // namespace unicode494} // namespace unicode
497} // namespace zorba495} // namespace zorba
498496
499#else ///ZORBA_NO_UNICODE (ascii part:)497///////////////////////////////////////////////////////////////////////////////
500498
501#include "util/regex_ascii.h"499#else /* ZORBA_NO_ICU */
500
501#include "util/regex_xquery.h"
502#include <string>502#include <string>
503503
504namespace zorba{504namespace zorba{
505/**505/**
506 * Converts an XQuery regular expression to the form used by the regular506 * Converts an XQuery regular expression to the form used by the regular
507 * expression library Zorba is using (here regex_ascii).507 * expression library Zorba is using (here regex_xquery).
508 *508 *
509 * @param xq_re The XQuery regular expression.509 * @param xq_re The XQuery regular expression.
510 * @param lib_re A pointer to the resuling library regular expression.510 * @param lib_re A pointer to the resuling library regular expression.
@@ -525,7 +525,7 @@
525 /**525 /**
526 * Constructs a %regex.526 * Constructs a %regex.
527 */527 */
528 regex() : regex_matcher( NULL ) { }528 regex() : regex_matcher( nullptr ) { }
529529
530 /**530 /**
531 * Destroys a %regex.531 * Destroys a %regex.
@@ -835,31 +835,21 @@
835835
836 /**836 /**
837 * Get the start position of the matched group.837 * Get the start position of the matched group.
838 * If groupId is zero, then the start position of the whole match is returned.838 * If groupId is zero, then the start and end position of the whole match is returned.
839 * If groupId is non-zero, then the start position of that group is returned.839 * If groupId is non-zero, then the start and end position of that group is returned.
840 * If that group has not been matched, -1 is returned.840 * If that group has not been matched, false is returned.
841 *841 *
842 * @param groupId the id of the group, either zero for the entire regex,842 * @param groupId the id of the group, either zero for the entire regex,
843 * or [1 .. group_count] for that specific group843 * or [1 .. group_count] for that specific group
844 * @return the start position, zero based, or -1 if that group didn't match844 * @param start to return start position in bytes
845 * @param end to return end position in bytes
846 * @return true if that group exists and has been matched
845 */847 */
846 int get_match_start( int groupId = 0 );848 bool get_match_start_end_bytes( int groupId, int *start, int *end );
847849
848 /**
849 * Get the end position of the matched group.
850 * If groupId is zero, then the end position of the whole match is returned.
851 * If groupId is non-zero, then the end position of that group is returned.
852 * If that group has not been matched, -1 is returned.
853 *
854 * @param groupId the id of the group, either zero for the entire regex,
855 * or [1 .. group_count] for that specific group
856 * @return the end position, zero based, or -1 if that group didn't match
857 */
858 int get_match_end( int groupId = 0 );
859850
860private:851private:
861 regex_ascii::CRegexAscii_parser regex_parser;852 regex_xquery::CRegexXQuery_regex *regex_matcher;
862 regex_ascii::CRegexAscii_regex *regex_matcher;
863 uint32_t parsed_flags;853 uint32_t parsed_flags;
864854
865 zstring s_in_;855 zstring s_in_;
@@ -873,15 +863,13 @@
873 regex( regex const& );863 regex( regex const& );
874 regex& operator=( regex const& );864 regex& operator=( regex const& );
875};865};
866
867///////////////////////////////////////////////////////////////////////////////
868
876} // namespace unicode869} // namespace unicode
877} // namespace zorba870} // namespace zorba
878871
879#endif /* ZORBA_NO_UNICODE */872#endif /* ZORBA_NO_ICU */
880
881
882///////////////////////////////////////////////////////////////////////////////
883
884
885#endif /* ZORBA_REGEX_H */873#endif /* ZORBA_REGEX_H */
886/*874/*
887 * Local variables:875 * Local variables:
888876
=== renamed file 'src/util/regex_ascii.cpp' => 'src/util/regex_xquery.cpp'
--- src/util/regex_ascii.cpp 2012-03-28 05:19:57 +0000
+++ src/util/regex_xquery.cpp 2012-04-07 00:45:26 +0000
@@ -1,4 +1,4 @@
1a/*1/*
2 * Copyright 2006-2008 The FLWOR Foundation.2 * Copyright 2006-2008 The FLWOR Foundation.
3 * 3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");4 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,12 +18,15 @@
1818
19#include "diagnostics/xquery_diagnostics.h"19#include "diagnostics/xquery_diagnostics.h"
2020
21#include "regex_ascii.h"21#include "regex_xquery.h"
22#include <string.h>22#include <string.h>
23#include "zorbatypes/chartype.h"23#include "zorbatypes/chartype.h"
24#include "util/unicode_categories.h"
25#include "util/ascii_util.h"
26#include "util/utf8_string.h"
2427
25namespace zorba {28namespace zorba {
26 namespace regex_ascii{29 namespace regex_xquery{
27//ascii regular expression matching30//ascii regular expression matching
2831
29/*http://www.w3.org/TR/xmlschema-2/#regexs32/*http://www.w3.org/TR/xmlschema-2/#regexs
@@ -62,96 +65,138 @@
62+ http://www.w3.org/TR/xquery-operators/#regex-syntax (not implemented)65+ http://www.w3.org/TR/xquery-operators/#regex-syntax (not implemented)
63*/66*/
6467
68
69static bool compare_ascii_i(const char *str1, const char *str2)
70{
71 while(*str1 && *str2)
72 {
73 if(ascii::to_lower(*str1) != ascii::to_lower(*str2))
74 return false;
75 str1++;
76 str2++;
77 }
78 if(*str1 || *str2)
79 return false;
80 return true;
81}
82
83static bool compare_unicode_ni(const char *str1, const char *str2, int len)
84{
85 while(len > 0)
86 {
87 const char *temp_str1 = str1;
88 const char *temp_str2 = str2;
89 unicode::code_point cp1 = unicode::to_upper(utf8::next_char(temp_str1));
90 unicode::code_point cp2 = unicode::to_upper(utf8::next_char(temp_str2));
91 if(cp1 != cp2)
92 return false;
93 len -= temp_str1-str1;
94 str1 = temp_str1;
95 str2 = temp_str2;
96 }
97 return true;
98}
99static utf8::size_type myutf8len(const char *source)
100{
101 utf8::size_type len = utf8::char_length(*source);
102 if(!len)
103 return 1;
104 else
105 return len;
106}
65////////////////////////////////////107////////////////////////////////////
66////Regular expression parsing and building of the tree108////Regular expression parsing and building of the tree
67////////////////////////////////////109////////////////////////////////////
68110
69CRegexAscii_regex* CRegexAscii_parser::parse(const char *pattern, unsigned int flags)111CRegexXQuery_regex* CRegexXQuery_parser::parse(const char *pattern, unsigned int flags)
70{112{
71 this->flags = flags;113 this->flags = flags;
72 bool align_begin = false;
73 114
74 if(!(flags & REGEX_ASCII_LITERAL) && (pattern[0] == '^'))
75 align_begin = true;
76
77 int regex_len;115 int regex_len;
78 CRegexAscii_regex* regex = parse_regexp(pattern + (align_begin?1:0), &regex_len);116 CRegexXQuery_regex* regex = parse_regexp(pattern, &regex_len);
79 117
80 if(regex)
81 regex->set_align_begin(align_begin);
82
83 return regex;118 return regex;
84}119}
85120
86//until '\0' or ')'121//until '\0' or ')'
87CRegexAscii_regex* CRegexAscii_parser::parse_regexp(const char *pattern, 122CRegexXQuery_regex* CRegexXQuery_parser::parse_regexp(const char *pattern,
88 int *regex_len)123 int *regex_len)
89{124{
90 *regex_len = 0;125 *regex_len = 0;
91 int branch_len;126 int branch_len;
92 regex_depth++;127 regex_depth++;
93 CRegexAscii_regex *regex = new CRegexAscii_regex(current_regex);128 std::auto_ptr<CRegexXQuery_regex> regex(new CRegexXQuery_regex(current_regex));
94 if(!current_regex)129 if(!current_regex)
95 current_regex = regex;130 current_regex = regex.get();
96 if(regex_depth >= 2)131 if(regex_depth >= 2)
97 {132 {
98 //mark this as group if it does not start with ?:133 //mark this as group if it does not start with ?:
99 if(pattern[0] != '?' || pattern[1] != ':')134 if(pattern[0] != '?' || pattern[1] != ':')
100 current_regex->subregex.push_back(regex);135 current_regex->subregex.push_back(regex.get());
101 else136 else
102 *regex_len = 2;137 *regex_len = 2;
103 }138 }
104 CRegexAscii_branch *branch;139 CRegexXQuery_branch *branch;
140 bool must_read_another_branch = true;
105 while(pattern[*regex_len] && (pattern[*regex_len] != ')'))141 while(pattern[*regex_len] && (pattern[*regex_len] != ')'))
106 {142 {
107 branch = parse_branch(pattern+*regex_len, &branch_len);143 branch = parse_branch(pattern+*regex_len, &branch_len);
108 if(!branch)144 if(!branch)
109 {145 {
110 regex_depth--;146 regex_depth--;
111 delete regex;
112 return NULL;147 return NULL;
113 }148 }
114 regex->add_branch(branch);149 regex->add_branch(branch);
115 *regex_len += branch_len;150 *regex_len += branch_len;
151 if(pattern[*regex_len] == '|')
152 (*regex_len)++;
153 else
154 must_read_another_branch = false;
116 }155 }
117 if((current_regex == regex) && (pattern[*regex_len] == ')'))156 if((current_regex == regex.get()) && (pattern[*regex_len] == ')'))
118 {157 {
119 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(U_REGEX_MISMATCHED_PAREN)) );158 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MISMATCHED_PAREN)) );
120 }159 }
121 if(pattern[*regex_len])160 if(pattern[*regex_len])
122 (*regex_len)++;161 (*regex_len)++;
162 if(must_read_another_branch)
163 regex->add_branch(new CRegexXQuery_branch(current_regex));//add empty branch
123 regex->flags = 0;//finished initialization164 regex->flags = 0;//finished initialization
124 regex_depth--;165 regex_depth--;
125 return regex;166 return regex.release();
126}167}
127168
128CRegexAscii_branch* CRegexAscii_parser::parse_branch(const char *pattern, int *branch_len)169CRegexXQuery_branch* CRegexXQuery_parser::parse_branch(const char *pattern, int *branch_len)
129{170{
130 int piece_len;171 int piece_len;
131172
132 CRegexAscii_branch *branch = new CRegexAscii_branch(current_regex);173 std::auto_ptr<CRegexXQuery_branch> branch(new CRegexXQuery_branch(current_regex));
133 CRegexAscii_piece *piece;174 CRegexXQuery_piece *piece;
134 *branch_len = 0;175 *branch_len = 0;
135 while(pattern[*branch_len] && (pattern[*branch_len] != '|') && (pattern[*branch_len] != ')'))176 while(pattern[*branch_len] && (pattern[*branch_len] != '|') && (pattern[*branch_len] != ')'))
136 {177 {
137 piece = parse_piece(pattern+*branch_len, &piece_len);178 piece = parse_piece(pattern+*branch_len, &piece_len);
138 if(!piece)179 if(!piece)
139 {180 {
140 delete branch;
141 return NULL;181 return NULL;
142 }182 }
183 if(branch->piece_list.size() && dynamic_cast<CRegexXQuery_pinstart*>(piece->atom))
184 {
185 //found ^ that is not at the beginning of branch
186 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_ATOM_CHAR), '^') );
187 }
143 branch->add_piece(piece);188 branch->add_piece(piece);
144 *branch_len += piece_len;189 *branch_len += piece_len;
145 }190 }
146 if(pattern[*branch_len] == '|')191 //if(pattern[*branch_len] == '|')
147 (*branch_len)++;192 // (*branch_len)++;
148 return branch;193 return branch.release();
149}194}
150195
151//piece = atom + quantifier196//piece = atom + quantifier
152CRegexAscii_piece* CRegexAscii_parser::parse_piece(const char *pattern, int *piece_len)197CRegexXQuery_piece* CRegexXQuery_parser::parse_piece(const char *pattern, int *piece_len)
153{198{
154 CRegexAscii_piece *piece = new CRegexAscii_piece;199 std::auto_ptr<CRegexXQuery_piece> piece(new CRegexXQuery_piece);
155 IRegexAtom *atom;200 IRegexAtom *atom;
156 *piece_len = 0;201 *piece_len = 0;
157202
@@ -160,19 +205,18 @@
160 atom = read_atom(pattern, &atom_len);205 atom = read_atom(pattern, &atom_len);
161 if(!atom)206 if(!atom)
162 {207 {
163 delete piece;
164 return NULL;208 return NULL;
165 }209 }
166 piece->set_atom(atom);210 piece->set_atom(atom);
167 if(!(flags & REGEX_ASCII_LITERAL))211 if(!(flags & REGEX_ASCII_LITERAL))
168 read_quantifier(piece, pattern+atom_len, &quantif_len);212 read_quantifier(piece.get(), pattern+atom_len, &quantif_len);
169213
170 *piece_len += atom_len + quantif_len;214 *piece_len += atom_len + quantif_len;
171215
172 return piece;216 return piece.release();
173}217}
174218
175char CRegexAscii_parser::myishex(char c)219char CRegexXQuery_parser::myishex(char c)
176{220{
177 if((c >= '0') && (c <= '9'))221 if((c >= '0') && (c <= '9'))
178 return c-'0'+1;222 return c-'0'+1;
@@ -183,26 +227,125 @@
183 return 0;//not a hex227 return 0;//not a hex
184}228}
185229
186bool CRegexAscii_parser::myisdigit(char c)230bool CRegexXQuery_parser::myisdigit(char c)
187{231{
188 return (c >= '0') || (c <= '9');232 return (c >= '0') && (c <= '9');
189}233}
190234
191char CRegexAscii_parser::readChar(const char *pattern, int *char_len, bool *is_multichar)235bool CRegexXQuery_parser::myisletterAZ(char c)
236{
237 return ((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z'));
238}
239
//Additional code points referenced only by the "Specials" entry of block_escape.
//NOTE(review): the trailing 0 looks like a terminator and the two values look
//like an extra range - confirm against the code that consumes block_escape_t.
240static const unicode::code_point specials_extcp[] = {0xFFF0, 0xFFFD, 0};
241
//Table of the Unicode block names accepted in \p{Is<Name>} / \P{Is<Name>}.
//readChar() compares the name found in the pattern against the third field
//(group_name) with compare_ascii_i() and hands the index of the matching
//entry to CRegexXQuery_multicharIs; an unknown name raises FORX0002.
//NOTE(review): the first field presumably is the block's first/last code
//point and the second an optional list of extra code points (NULL when
//unused) - confirm against the declaration of block_escape_t.
242static CRegexXQuery_parser::block_escape_t block_escape[] =
243{
244{{0x0000, 0x007F}, NULL, "BasicLatin"},
245{{0x0080, 0x00FF}, NULL, "Latin-1Supplement"},
246{{0x0100, 0x017F}, NULL, "LatinExtended-A"},
247{{0x0180, 0x024F}, NULL, "LatinExtended-B"},
248{{0x0250, 0x02AF}, NULL, "IPAExtensions"},
249{{0x02B0, 0x02FF}, NULL, "SpacingModifierLetters"},
250{{0x0300, 0x036F}, NULL, "CombiningDiacriticalMarks"},
251{{0x0370, 0x03FF}, NULL, "Greek"},
252{{0x0400, 0x04FF}, NULL, "Cyrillic"},
253{{0x0530, 0x058F}, NULL, "Armenian"},
254{{0x0590, 0x05FF}, NULL, "Hebrew"},
255{{0x0600, 0x06FF}, NULL, "Arabic"},
256{{0x0700, 0x074F}, NULL, "Syriac"},
257{{0x0780, 0x07BF}, NULL, "Thaana"},
258{{0x0900, 0x097F}, NULL, "Devanagari"},
259{{0x0980, 0x09FF}, NULL, "Bengali"},
260{{0x0A00, 0x0A7F}, NULL, "Gurmukhi"},
261{{0x0A80, 0x0AFF}, NULL, "Gujarati"},
262{{0x0B00, 0x0B7F}, NULL, "Oriya"},
263{{0x0B80, 0x0BFF}, NULL, "Tamil"},
264{{0x0C00, 0x0C7F}, NULL, "Telugu"},
265{{0x0C80, 0x0CFF}, NULL, "Kannada"},
266{{0x0D00, 0x0D7F}, NULL, "Malayalam"},
267{{0x0D80, 0x0DFF}, NULL, "Sinhala"},
268{{0x0E00, 0x0E7F}, NULL, "Thai"},
269{{0x0E80, 0x0EFF}, NULL, "Lao"},
270{{0x0F00, 0x0FFF}, NULL, "Tibetan"},
271{{0x1000, 0x109F}, NULL, "Myanmar"},
272{{0x10A0, 0x10FF}, NULL, "Georgian"},
273{{0x1100, 0x11FF}, NULL, "HangulJamo"},
274{{0x1200, 0x137F}, NULL, "Ethiopic"},
275{{0x13A0, 0x13FF}, NULL, "Cherokee"},
276{{0x1400, 0x167F}, NULL, "UnifiedCanadianAboriginalSyllabics"},
277{{0x1680, 0x169F}, NULL, "Ogham"},
278{{0x16A0, 0x16FF}, NULL, "Runic"},
279{{0x1780, 0x17FF}, NULL, "Khmer"},
280{{0x1800, 0x18AF}, NULL, "Mongolian"},
281{{0x1E00, 0x1EFF}, NULL, "LatinExtendedAdditional"},
282{{0x1F00, 0x1FFF}, NULL, "GreekExtended"},
283{{0x2000, 0x206F}, NULL, "GeneralPunctuation"},
284{{0x2070, 0x209F}, NULL, "SuperscriptsandSubscripts"},
285{{0x20A0, 0x20CF}, NULL, "CurrencySymbols"},
286{{0x20D0, 0x20FF}, NULL, "CombiningMarksforSymbols"},
287{{0x2100, 0x214F}, NULL, "LetterlikeSymbols"},
288{{0x2150, 0x218F}, NULL, "NumberForms"},
289{{0x2190, 0x21FF}, NULL, "Arrows"},
290{{0x2200, 0x22FF}, NULL, "MathematicalOperators"},
291{{0x2300, 0x23FF}, NULL, "MiscellaneousTechnical"},
292{{0x2400, 0x243F}, NULL, "ControlPictures"},
293{{0x2440, 0x245F}, NULL, "OpticalCharacterRecognition"},
294{{0x2460, 0x24FF}, NULL, "EnclosedAlphanumerics"},
295{{0x2500, 0x257F}, NULL, "BoxDrawing"},
296{{0x2580, 0x259F}, NULL, "BlockElements"},
297{{0x25A0, 0x25FF}, NULL, "GeometricShapes"},
298{{0x2600, 0x26FF}, NULL, "MiscellaneousSymbols"},
299{{0x2700, 0x27BF}, NULL, "Dingbats"},
300{{0x2800, 0x28FF}, NULL, "BraillePatterns"},
301{{0x2E80, 0x2EFF}, NULL, "CJKRadicalsSupplement"},
302{{0x2F00, 0x2FDF}, NULL, "KangxiRadicals"},
303{{0x2FF0, 0x2FFF}, NULL, "IdeographicDescriptionCharacters"},
304{{0x3000, 0x303F}, NULL, "CJKSymbolsandPunctuation"},
305{{0x3040, 0x309F}, NULL, "Hiragana"},
306{{0x30A0, 0x30FF}, NULL, "Katakana"},
307{{0x3100, 0x312F}, NULL, "Bopomofo"},
308{{0x3130, 0x318F}, NULL, "HangulCompatibilityJamo"},
309{{0x3190, 0x319F}, NULL, "Kanbun"},
310{{0x31A0, 0x31BF}, NULL, "BopomofoExtended"},
311{{0x3200, 0x32FF}, NULL, "EnclosedCJKLettersandMonths"},
312{{0x3300, 0x33FF}, NULL, "CJKCompatibility"},
313{{0x3400, 0x4DB5}, NULL, "CJKUnifiedIdeographsExtensionA"},
314{{0x4E00, 0x9FFF}, NULL, "CJKUnifiedIdeographs"},
315{{0xA000, 0xA48F}, NULL, "YiSyllables"},
316{{0xA490, 0xA4CF}, NULL, "YiRadicals"},
317{{0xAC00, 0xD7A3}, NULL, "HangulSyllables"},
318{{0xE000, 0xF8FF}, NULL, "PrivateUse"},
319{{0xF900, 0xFAFF}, NULL, "CJKCompatibilityIdeographs"},
320{{0xFB00, 0xFB4F}, NULL, "AlphabeticPresentationForms"},
321{{0xFB50, 0xFDFF}, NULL, "ArabicPresentationForms-A"},
322{{0xFE20, 0xFE2F}, NULL, "CombiningHalfMarks"},
323{{0xFE30, 0xFE4F}, NULL, "CJKCompatibilityForms"},
324{{0xFE50, 0xFE6F}, NULL, "SmallFormVariants"},
325{{0xFE70, 0xFEFE}, NULL, "ArabicPresentationForms-B"},
326{{0xFEFF, 0xFEFF}, specials_extcp, "Specials"},
327{{0xFF00, 0xFFEF}, NULL, "HalfwidthandFullwidthForms"}
328};
329
330CRegexXQuery_charmatch* CRegexXQuery_parser::readChar(const char *pattern,
331 int *char_len,
332 enum CHARGROUP_t *multichar_type)
192{333{
193 char c = 0;334 char c = 0;
194 *char_len = 0;335 *char_len = 0;
195 *is_multichar = false;336 *multichar_type = CHARGROUP_NO_MULTICHAR;
196 switch(pattern[*char_len])337 switch(pattern[*char_len])
197 {338 {
198 case '\\':339 case '\\':
199 { (*char_len)++;340 {
341 (*char_len)++;
200 switch(pattern[*char_len])342 switch(pattern[*char_len])
201 {343 {
202 case 'n': c = '\n';break;344 case 'n': c = '\n';(*char_len)++;return new CRegexXQuery_char_ascii(current_regex, c);
203 case 'r': c = '\r';break;345 case 'r': c = '\r';(*char_len)++;return new CRegexXQuery_char_ascii(current_regex, c);
204 case 't': c = '\t';break;346 case 't': c = '\t';(*char_len)++;return new CRegexXQuery_char_ascii(current_regex, c);
205 case '\\':347 case '\\':
348 case '/'://+
206 case '|':349 case '|':
207 case '.':350 case '.':
208 case '?':351 case '?':
@@ -216,19 +359,205 @@
216 case '['://#x5B359 case '['://#x5B
217 case ']'://#x5D360 case ']'://#x5D
218 case '^'://#x5E361 case '^'://#x5E
362 case '$'://+
219 c = pattern[*char_len];363 c = pattern[*char_len];
220 break;364 (*char_len)++;
365 *multichar_type = CHARGROUP_FLAGS_ONECHAR_ASCII;
366 return new CRegexXQuery_char_ascii(current_regex, c);
221 case 'p'://catEsc367 case 'p'://catEsc
222 case 'P'://complEsc368 case 'P'://complEsc
369 {
223 //ignore the prop for now370 //ignore the prop for now
224 c = pattern[*char_len];371 *multichar_type = CHARGROUP_FLAGS_MULTICHAR_p;//(CHARGROUP_t)((pattern[*char_len] == 'P') ? 128 : 0);
225 *is_multichar = true;372 bool is_reverse = (pattern[*char_len] == 'P');
226 if(pattern[*char_len+1] == '{')373 c = 0;
227 {374 if(pattern[(*char_len)+1] != '{')
228 while(pattern[*char_len] != '}')375 {
376 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_P_CONSTRUCT)) );
377 }
378 (*char_len) += 2;
379 switch(pattern[*char_len])
380 {//IsCategory
381 case 'L':
382 {
383 switch(pattern[(*char_len)+1])
384 {
385 case '}':
386 c = unicode::UNICODE_Ll + 50;break;
387 case 'u':
388 c = unicode::UNICODE_Lu; (*char_len)++;break;
389 case 'l':
390 c = unicode::UNICODE_Ll; (*char_len)++;break;
391 case 't':
392 c = unicode::UNICODE_Lt; (*char_len)++;break;
393 case 'm':
394 c = unicode::UNICODE_Lm; (*char_len)++;break;
395 case 'o':
396 c = unicode::UNICODE_Lo; (*char_len)++;break;
397 default:
398 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PL_CONSTRUCT)) );
399 }
400 }break;
401 case 'M':
402 {
403 switch(pattern[(*char_len)+1])
404 {
405 case '}':
406 c = unicode::UNICODE_Mc + 50;break;
407 case 'n':
408 c = unicode::UNICODE_Mn; (*char_len)++;break;
409 case 'c':
410 c = unicode::UNICODE_Mc; (*char_len)++;break;
411 case 'e':
412 c = unicode::UNICODE_Me; (*char_len)++;break;
413 default:
414 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PM_CONSTRUCT)) );
415 }
416 }break;
417 case 'N':
418 {
419 switch(pattern[(*char_len)+1])
420 {
421 case '}':
422 c = unicode::UNICODE_Nd + 50;break;
423 case 'd':
424 c = unicode::UNICODE_Nd; (*char_len)++;break;
425 case 'l':
426 c = unicode::UNICODE_Nl; (*char_len)++;break;
427 case 'o':
428 c = unicode::UNICODE_No; (*char_len)++;break;
429 default:
430 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PN_CONSTRUCT)) );
431 }
432 }break;
433 case 'P':
434 {
435 switch(pattern[(*char_len)+1])
436 {
437 case '}':
438 c = unicode::UNICODE_Pc + 50;break;
439 case 'c':
440 c = unicode::UNICODE_Pc; (*char_len)++;break;
441 case 'd':
442 c = unicode::UNICODE_Pd; (*char_len)++;break;
443 case 's':
444 c = unicode::UNICODE_Ps; (*char_len)++;break;
445 case 'e':
446 c = unicode::UNICODE_Pe; (*char_len)++;break;
447 case 'i':
448 c = unicode::UNICODE_Pi; (*char_len)++;break;
449 case 'f':
450 c = unicode::UNICODE_Pf; (*char_len)++;break;
451 case 'o':
452 c = unicode::UNICODE_Po; (*char_len)++;break;
453 default:
454 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PP_CONSTRUCT)) );
455 }
456 }break;
457 case 'Z':
458 {
459 switch(pattern[(*char_len)+1])
460 {
461 case '}':
462 c = unicode::UNICODE_Zl + 50;break;
463 case 's':
464 c = unicode::UNICODE_Zs; (*char_len)++;break;
465 case 'l':
466 c = unicode::UNICODE_Zl; (*char_len)++;break;
467 case 'p':
468 c = unicode::UNICODE_Zp; (*char_len)++;break;
469 default:
470 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PZ_CONSTRUCT)) );
471 }
472 }break;
473 case 'S':
474 {
475 switch(pattern[(*char_len)+1])
476 {
477 case '}':
478 c = unicode::UNICODE_Sc + 50;break;
479 case 'm':
480 c = unicode::UNICODE_Sm; (*char_len)++;break;
481 case 'c':
482 c = unicode::UNICODE_Sc; (*char_len)++;break;
483 case 'k':
484 c = unicode::UNICODE_Sk; (*char_len)++;break;
485 case 'o':
486 c = unicode::UNICODE_So; (*char_len)++;break;
487 default:
488 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PS_CONSTRUCT)) );
489 }
490 }break;
491 case 'C':
492 {
493 switch(pattern[(*char_len)+1])
494 {
495 case '}':
496 c = unicode::UNICODE_Cc + 50;break;
497 case 'c':
498 c = unicode::UNICODE_Cc; (*char_len)++;break;
499 case 'f':
500 c = unicode::UNICODE_Cf; (*char_len)++;break;
501 case 'o':
502 c = unicode::UNICODE_Co; (*char_len)++;break;
503 case 'n':
504 c = unicode::UNICODE_Cn; (*char_len)++;break;
505 default:
506 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PC_CONSTRUCT)) );
507 }
508 }break;
509 }//end switch
510 if(c)
511 {
512 if(pattern[(*char_len) + 1] != '}')
513 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_P_CONSTRUCT)) );
514 (*char_len)++;
515 (*char_len)++;
516 return new CRegexXQuery_multicharP(current_regex, c, is_reverse);
517 }
518 if(pattern[*char_len] == 'I')
519 {
520 if(pattern[(*char_len)+1] == 's')//IsBlock
521 {
522 *multichar_type = CHARGROUP_FLAGS_MULTICHAR_Is;
523 (*char_len) += 2;
524 zstring block_name;
525 char tempc = pattern[(*char_len)];
526 while(tempc && (tempc != '}'))
527 {
528 if(!myisletterAZ(tempc) && !myisdigit(tempc) && (tempc != '-'))
529 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_PIs_CONSTRUCT)) );
530 block_name.append(1, tempc);
531 (*char_len)++;
532 tempc = pattern[(*char_len)];
533 }
534 if(!tempc)
535 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_PIs_CONSTRUCT)) );
536 //search for the block name
537 int i;
538 int nr_blocks = sizeof(block_escape)/sizeof(CRegexXQuery_parser::block_escape_t);
539 for(i=0;i<nr_blocks;i++)
540 {
541 if(compare_ascii_i(block_name.c_str(), block_escape[i].group_name))
542 {
543 c = i;
544 break;
545 }
546 }
547 if(i==nr_blocks)
548 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PIs_CONSTRUCT)) );
229 (*char_len)++;549 (*char_len)++;
230 }550 return new CRegexXQuery_multicharIs(current_regex, i, is_reverse);
231 break;551 }
552 else
553 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_PIs_CONSTRUCT)) );
554 }
555 else
556 {
557 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_P_CONSTRUCT)) );
558 }
559 break;//unreachable
560 }//end case 'p'
232 //multiCharEsc561 //multiCharEsc
233 case 's':562 case 's':
234 case 'S':563 case 'S':
@@ -240,40 +569,104 @@
240 case 'D':569 case 'D':
241 case 'w':570 case 'w':
242 case 'W':571 case 'W':
243 *is_multichar = true;572 *multichar_type = CHARGROUP_FLAGS_MULTICHAR_OTHER;
244 c = pattern[*char_len];573 c = pattern[*char_len];
245 break;574 (*char_len)++;
246 }575 return new CRegexXQuery_multicharOther(current_regex, c);
247 break;576 case 'u'://unicode codepoint \uXXXX
248 }577 {
249 case '#':///might be #xXX578 unicode::code_point utf8c = 0;
250 {579 (*char_len)++;
251 if((pattern[*char_len+1] == 'x') &&580 for(int i=0;i<4;i++)
252 myishex(pattern[*char_len+2]) && myishex(pattern[*char_len+3]))581 {
253 {582 char hex = myishex(pattern[*char_len]);
254 c = (myishex(pattern[*char_len+2])-1)<<4 | (myishex(pattern[*char_len+3])-1);583 if(!hex)
255 *char_len += 3;584 {
256 break;585 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_UNICODE_CODEPOINT_u)) );
257 }586 }
258 }587 utf8c <<= 4;
588 utf8c |= (hex-1) & 0x0f;
589 (*char_len)++;
590 }
591 return create_charmatch(utf8c, NULL, 0, multichar_type);
592 }
593 case 'U'://unicode codepoint \UXXXXXXXX
594 {
595 unicode::code_point utf8c = 0;
596 (*char_len)++;
597 for(int i=0;i<8;i++)
598 {
599 char hex = myishex(pattern[*char_len]);
600 if(!hex)
601 {
602 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_UNICODE_CODEPOINT_u)) );
603 }
604 utf8c <<= 4;
605 utf8c |= (hex-1) & 0x0f;
606 (*char_len)++;
607 }
608 return create_charmatch(utf8c, NULL, 0, multichar_type);
609 }
610 default:
611 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_ESC_CHAR)) );
612 }
613 assert(false);
614 break;//unreachable
615 }//end case '\'
259 default:616 default:
260 c = pattern[*char_len];617 {
261 break;618 const char *temp_pattern = pattern;
262 }619 unicode::code_point utf8c = utf8::next_char(temp_pattern);
263620 (*char_len) = temp_pattern - pattern;
264 (*char_len)++;621 return create_charmatch(utf8c, pattern, *char_len, multichar_type);
265 return c;622 }
266}623 }
267624 return NULL;
268625}
269626
270IRegexAtom* CRegexAscii_parser::read_atom(const char *pattern, int *atom_len)627CRegexXQuery_charmatch *CRegexXQuery_parser::create_charmatch(unicode::code_point utf8c,
628 const char *pattern, int utf8len,
629 enum CHARGROUP_t *multichar_type)
630{
631 if(utf8c <= 0x7F)
632 {
633 *multichar_type = CHARGROUP_FLAGS_ONECHAR_ASCII;
634 if(flags & REGEX_ASCII_CASE_INSENSITIVE)
635 return new CRegexXQuery_char_ascii_i(current_regex, (char)utf8c);
636 else
637 return new CRegexXQuery_char_ascii(current_regex, (char)utf8c);
638 }
639 else
640 {
641 *multichar_type = CHARGROUP_FLAGS_ONECHAR_UNICODE;
642 if(flags & REGEX_ASCII_CASE_INSENSITIVE)
643 return new CRegexXQuery_char_unicode_i(current_regex, utf8c);
644 else
645 {
646 if(pattern)
647 return new CRegexXQuery_char_unicode(current_regex, pattern, utf8len);
648 else
649 return new CRegexXQuery_char_unicode_cp(current_regex, utf8c);
650 }
651 }
652}
653
654IRegexAtom* CRegexXQuery_parser::read_atom(const char *pattern, int *atom_len)
271{655{
272 *atom_len = 0;656 *atom_len = 0;
273 char c;657 if(flags & REGEX_ASCII_LITERAL)
274 bool is_end_line = false;658 {
275 c = pattern[*atom_len];659 unicode::code_point utf8c;
276 if((!(flags & REGEX_ASCII_LITERAL)) && (c == '\\'))660 //bool is_end_line = false;
661 const char *temp_pattern = pattern;
662 utf8c = utf8::next_char(temp_pattern);
663 *atom_len = temp_pattern - pattern;
664 enum CHARGROUP_t multichar_type;
665 return create_charmatch(utf8c, pattern, *atom_len, &multichar_type);
666 }
667
668 char c = *pattern;
669 if(c == '\\')
277 {670 {
278 //check for back reference671 //check for back reference
279 if(myisdigit(pattern[(*atom_len)+1]))672 if(myisdigit(pattern[(*atom_len)+1]))
@@ -281,13 +674,13 @@
281 (*atom_len)++;674 (*atom_len)++;
282 if(pattern[*atom_len] == '0')675 if(pattern[*atom_len] == '0')
283 {676 {
284 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(U_REGEX_INVALID_BACK_REF)) );677 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_BACK_REF), 0, current_regex->subregex.size()) );
285 }678 }
286 unsigned int backref = pattern[*atom_len] - '0';679 unsigned int backref = pattern[*atom_len] - '0';
287 if((backref > current_regex->subregex.size()) ||680 if((backref > current_regex->subregex.size()) ||
288 (current_regex->subregex.at(backref-1)->flags != 0))681 (current_regex->subregex.at(backref-1)->flags != 0))
289 {682 {
290 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(U_REGEX_INVALID_BACK_REF)) );683 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_BACK_REF), backref, current_regex->subregex.size()) );
291 }684 }
292 while(current_regex->subregex.size() >= backref*10)685 while(current_regex->subregex.size() >= backref*10)
293 {686 {
@@ -303,70 +696,86 @@
303 break;696 break;
304 }697 }
305 }698 }
306 return new CRegexAscii_backref(current_regex, backref);699 (*atom_len)++;
700 return new CRegexXQuery_backref(current_regex, backref);
307 }701 }
308 }702 }
703 if(c == '^')
704 {
705 (*atom_len)++;
706 return new CRegexXQuery_pinstart(current_regex);
707 }
708 if((c == '}') || (c == '{') || (c == '?') || (c == '*') || (c == '+') || (c == '|'))
709 {
710 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_ATOM_CHAR), c) );
711 }
309 switch(c)712 switch(c)
310 {713 {
311 case '[':714 case '[':
312 {715 {
313 if(!(flags & REGEX_ASCII_LITERAL))716 (*atom_len)++;
314 {717 CRegexXQuery_chargroup *chargroup = NULL;
315 (*atom_len)++;718 int chargroup_len;
316 CRegexAscii_chargroup *chargroup = NULL;719 chargroup = readchargroup(pattern+*atom_len, &chargroup_len);
317 int chargroup_len;720 *atom_len += chargroup_len;
318 chargroup = readchargroup(pattern+*atom_len, &chargroup_len);721 return chargroup;
319 *atom_len += chargroup_len;
320 return chargroup;
321 }
322 }722 }
323 case '.'://WildCharEsc723 case '.'://WildCharEsc
324 {724 {
325 if(!(flags & REGEX_ASCII_LITERAL))725 (*atom_len)++;
326 {726 return new CRegexXQuery_wildchar(current_regex);
327 CRegexAscii_wildchar *wildchar = new CRegexAscii_wildchar(current_regex);
328 (*atom_len)++;
329 return wildchar;
330 }
331 }727 }
332 case '('://begin an embedded reg exp728 case '('://begin an embedded reg exp
333 { 729 {
334 if(!(flags & REGEX_ASCII_LITERAL))730 (*atom_len)++;
335 {731 CRegexXQuery_regex *emb_regex = NULL;
336 (*atom_len)++;732 int regex_len;
337 CRegexAscii_regex *emb_regex = NULL;733 emb_regex = parse_regexp(pattern + *atom_len, &regex_len);
338 int regex_len;734 *atom_len += regex_len;
339 emb_regex = parse_regexp(pattern + *atom_len, &regex_len);735 return emb_regex;
340 *atom_len += regex_len;
341 return emb_regex;
342 }
343 }736 }
344 case '$'://end line737 case '$'://end line
345 if(!(flags & REGEX_ASCII_LITERAL))738 //is_end_line = true;
346 {739 (*atom_len)++;
347 is_end_line = true;740 return new CRegexXQuery_endline(current_regex);
348 }
349 default:741 default:
350 { 742 {
351 char c;743 //char c;
744 CRegexXQuery_charmatch *charmatch = NULL;
352 int c_len;745 int c_len;
353 bool is_multichar = false;746 CHARGROUP_t multichar_type = CHARGROUP_NO_MULTICHAR;
354 if(!(flags & REGEX_ASCII_LITERAL))747 *atom_len = 0;
355 c = readChar(pattern+*atom_len, &c_len, &is_multichar);748 while(pattern[*atom_len])
356 else
357 {749 {
358 c = pattern[*atom_len];750 charmatch = readChar(pattern+*atom_len, &c_len, &multichar_type);
359 c_len = 1;751 *atom_len += c_len;
752 if((flags & REGEX_ASCII_NO_WHITESPACE) && (multichar_type == CHARGROUP_FLAGS_ONECHAR_ASCII))
753 {
754 char c = (char)charmatch->get_c();
755 if((c == ' ') || (c == '\t') || (c == '\r') || (c == '\n'))
756 {
757 //ignore this whitespace
758 delete charmatch;
759 continue;
760 }
761 else
762 break;
763 }
764 else
765 break;
360 }766 }
361 CRegexAscii_chargroup *chargroup = new CRegexAscii_chargroup(current_regex);767 /*
362 if(is_multichar)768 std::auto_ptr<CRegexXQuery_chargroup> chargroup(new CRegexXQuery_chargroup(current_regex));
363 chargroup->addMultiChar(c);769 if(multichar_type)
770 chargroup->addMultiChar(c, multichar_type);
364 else if(is_end_line)771 else if(is_end_line)
365 chargroup->addEndLine();772 chargroup->addEndLine();
366 else773 else
367 chargroup->addCharRange(c, c);774 chargroup->addOneChar(c);
368 *atom_len += c_len;775 *atom_len += c_len;
369 return chargroup;776 return chargroup.release();
777 */
778 return charmatch;
370 }779 }
371 }780 }
372}781}
@@ -374,81 +783,119 @@
374//read until ']'783//read until ']'
375//posCharGroup ::= ( charRange | charClassEsc )+ 784//posCharGroup ::= ( charRange | charClassEsc )+
376//charRange ::= seRange | XmlCharIncDash785//charRange ::= seRange | XmlCharIncDash
377CRegexAscii_chargroup* CRegexAscii_parser::readchargroup(const char *pattern, int *chargroup_len)786CRegexXQuery_chargroup* CRegexXQuery_parser::readchargroup(const char *pattern, int *chargroup_len)
378{787{
379 CRegexAscii_chargroup *chargroup = NULL;788 std::auto_ptr<CRegexXQuery_chargroup> chargroup;
380 *chargroup_len = 0;789 *chargroup_len = 0;
381 if(pattern[*chargroup_len] == '^')//negative group790 if(pattern[*chargroup_len] == '^')//negative group
382 {791 {
383 (*chargroup_len)++;792 (*chargroup_len)++;
384 chargroup = new CRegexAscii_negchargroup(current_regex);793 chargroup.reset(new CRegexXQuery_negchargroup(current_regex));
385 }794 }
386 else795 else
387 chargroup = new CRegexAscii_chargroup(current_regex);796 chargroup.reset(new CRegexXQuery_chargroup(current_regex));
388 while(pattern[*chargroup_len] && (pattern[*chargroup_len]!=']'))797 while(pattern[*chargroup_len] && (pattern[*chargroup_len]!=']'))
389 {798 {
390 char c1, c2;799 //char c1, c2;
391 bool is_multichar;800 CHARGROUP_t multichar_type = CHARGROUP_NO_MULTICHAR;
392 int c1_len;801 int c1_len;
393 c1 = pattern[*chargroup_len];802 if((pattern[*chargroup_len] == '-') && (pattern[(*chargroup_len)+1] == '['))//charClassSub
394 c2 = pattern[*chargroup_len+1];
395 if((c1 == '-') && (c2 == '['))//charClassSub
396 {803 {
397 int classsub_len;804 int classsub_len;
398 CRegexAscii_chargroup *classsub = readchargroup(pattern + *chargroup_len+1 + 1, &classsub_len);805 CRegexXQuery_chargroup *classsub = readchargroup(pattern + (*chargroup_len)+1 + 1, &classsub_len);
399 if(!classsub)806 if(!classsub)
400 {807 {
401 delete chargroup;808 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_SUBCLASS)) );
402 return NULL;
403 }809 }
404 chargroup->addClassSub(classsub);810 chargroup->addClassSub(classsub);
405 *chargroup_len += 2 + classsub_len + 1;811 *chargroup_len += 2 + classsub_len + 1;
406 if(pattern[*chargroup_len-1] != ']')812 if(pattern[*chargroup_len-1] != ']')
407 {813 {
408 delete chargroup;814 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_USE_OF_SUBCLASS)) );
409 return NULL;
410 }815 }
411 return chargroup;816 return chargroup.release();
412 }817 }
413818
414 c1 = readChar(pattern+*chargroup_len, &c1_len, &is_multichar);819 std::unique_ptr<CRegexXQuery_charmatch> charmatch(readChar(pattern+*chargroup_len, &c1_len, &multichar_type));
415 if(is_multichar)//first char is multichar820 if((multichar_type == CHARGROUP_FLAGS_MULTICHAR_p) ||
821 (multichar_type == CHARGROUP_FLAGS_MULTICHAR_Is) ||
822 (multichar_type == CHARGROUP_FLAGS_MULTICHAR_OTHER))//first char is multichar
416 {823 {
417 chargroup->addMultiChar(c1);824 if((pattern[*chargroup_len+c1_len] == '-') &&///should not be a range
825 (pattern[*chargroup_len+c1_len+1] != ']'))
826 {
827 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MULTICHAR_IN_CHAR_RANGE)) );
828 }
829 //chargroup->addMultiChar(c1, multichar_type);
830 chargroup->addCharMatch(charmatch.release());
418 *chargroup_len += c1_len;831 *chargroup_len += c1_len;
419 continue;832 continue;
420 }833 }
421 if(pattern[*chargroup_len+c1_len] == '-')///might be a range834 (*chargroup_len) += c1_len;
835 if(pattern[*chargroup_len] == '-')///might be a range
422 {836 {
423 if(pattern[*chargroup_len+c1_len+1] == ']')//no range, just the last char is '-'837 if(pattern[(*chargroup_len)+1] == ']')//no range, just the last char is '-'
424 {838 {
425 chargroup->addCharRange(c1, c1);839 //chargroup->addOneChar(c1);
426 chargroup->addCharRange('-', '-');840 //chargroup->addOneChar('-');
427 *chargroup_len += c1_len + 1;841 chargroup->addCharMatch(charmatch.release());
842 chargroup->addCharMatch(new CRegexXQuery_char_ascii(current_regex, '-'));
843 (*chargroup_len)++;
428 continue;844 continue;
429 }845 }
430 else846 else if(pattern[(*chargroup_len)+1] != '[')
431 {847 {
432 //it is a range848 //it is a range
433 char c3;849 (*chargroup_len)++;
434 int c3_len;850 std::unique_ptr<CRegexXQuery_charmatch> charmatch2;
435 c3 = readChar(pattern+*chargroup_len+c1_len+1, &c3_len, &is_multichar);851 CHARGROUP_t multichar_type2 = CHARGROUP_NO_MULTICHAR;
436 if(is_multichar)852 int c2_len;
437 return NULL;//error853 charmatch2.reset(readChar(pattern+(*chargroup_len), &c2_len, &multichar_type2));
438 chargroup->addCharRange(c1, c3);854 if((multichar_type2 != CHARGROUP_FLAGS_ONECHAR_ASCII) &&
439 *chargroup_len += c1_len + 1 + c3_len;855 (multichar_type2 != CHARGROUP_FLAGS_ONECHAR_ASCII))//second char in range is multichar
856 {
857 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MULTICHAR_IN_CHAR_RANGE)) );
858 }
859 //chargroup->addCharRange(c1, c3);
860 if((multichar_type == CHARGROUP_FLAGS_ONECHAR_ASCII) && (multichar_type2 == CHARGROUP_FLAGS_ONECHAR_ASCII))
861 {
862 if(flags & REGEX_ASCII_CASE_INSENSITIVE)
863 chargroup->addCharMatch(new CRegexXQuery_char_range_ascii_i(current_regex,
864 (char)charmatch->get_c(),
865 (char)charmatch2->get_c()));
866 else
867 chargroup->addCharMatch(new CRegexXQuery_char_range_ascii(current_regex,
868 (char)charmatch->get_c(),
869 (char)charmatch2->get_c()));
870 }
871 else
872 {
873 if(flags & REGEX_ASCII_CASE_INSENSITIVE)
874 chargroup->addCharMatch(new CRegexXQuery_char_range_unicode_i(current_regex,
875 charmatch->get_c(),
876 charmatch2->get_c()));
877 else
878 chargroup->addCharMatch(new CRegexXQuery_char_range_unicode(current_regex,
879 charmatch->get_c(),
880 charmatch2->get_c()));
881 }
882 *chargroup_len += c2_len;
440 continue;883 continue;
441 }884 }
442 }885 }
443 chargroup->addCharRange(c1, c1);886 //chargroup->addOneChar(c1);
444 *chargroup_len += c1_len;887 chargroup->addCharMatch(charmatch.release());
445 }888 }
446 if(pattern[*chargroup_len])889 if(pattern[*chargroup_len])
447 (*chargroup_len)++;890 (*chargroup_len)++;
448 return chargroup;891 else
892 {
893 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MISSING_CLOSE_BRACKET)) );
894 }
895 return chargroup.release();
449}896}
450897
451void CRegexAscii_parser::read_quantifier(CRegexAscii_piece *piece,898void CRegexXQuery_parser::read_quantifier(CRegexXQuery_piece *piece,
452 const char *pattern, int *quantif_len)899 const char *pattern, int *quantif_len)
453{900{
454 *quantif_len = 0;901 *quantif_len = 0;
@@ -496,6 +943,10 @@
496 max = max*10 + pattern[*quantif_len] - '0';943 max = max*10 + pattern[*quantif_len] - '0';
497 (*quantif_len)++;944 (*quantif_len)++;
498 }945 }
946 if(max < min)
947 {
948 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MAX_LT_MIN)) );
949 }
499 piece->set_quantifier_min_max(min, max, true);950 piece->set_quantifier_min_max(min, max, true);
500 }951 }
501 while(pattern[*quantif_len] && (pattern[*quantif_len] != '}'))952 while(pattern[*quantif_len] && (pattern[*quantif_len] != '}'))
@@ -524,23 +975,25 @@
524///Constructors and destructors and internal functions975///Constructors and destructors and internal functions
525////////////////////////////976////////////////////////////
526977
527CRegexAscii_regex::CRegexAscii_regex(CRegexAscii_regex *topregex) : IRegexAtom(topregex?topregex:this)978CRegexXQuery_regex::CRegexXQuery_regex(CRegexXQuery_regex *topregex) : IRegexAtom(topregex?topregex:this)
528{979{
529 matched_source = NULL;980 matched_source = NULL;
530 matched_len = 0;981 matched_len = 0;
982// backup_matched_source = NULL;
983// backup_matched_len = 0;
531 flags = 128;//set to 0 after initialization984 flags = 128;//set to 0 after initialization
532}985}
533986
534CRegexAscii_regex::~CRegexAscii_regex()987CRegexXQuery_regex::~CRegexXQuery_regex()
535{988{
536 std::list<CRegexAscii_branch*>::iterator branch_it;989 std::list<CRegexXQuery_branch*>::iterator branch_it;
537990
538 for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++)991 for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++)
539 {992 {
540 delete (*branch_it);993 delete (*branch_it);
541 }994 }
542/*995/*
543 std::vector<CRegexAscii_regex*>::iterator subregex_it;996 std::vector<CRegexXQuery_regex*>::iterator subregex_it;
544 for(subregex_it = subregex.begin(); subregex_it != subregex.end(); subregex_it++)997 for(subregex_it = subregex.begin(); subregex_it != subregex.end(); subregex_it++)
545 {998 {
546 delete (*subregex_it);999 delete (*subregex_it);
@@ -548,25 +1001,18 @@
548*/1001*/
549}1002}
5501003
551bool CRegexAscii_regex::set_align_begin(bool align_begin)1004void CRegexXQuery_regex::add_branch(CRegexXQuery_branch *branch)
552{
553 bool prev_align = this->align_begin;
554 this->align_begin = align_begin;
555 return prev_align;
556}
557
558void CRegexAscii_regex::add_branch(CRegexAscii_branch *branch)
559{1005{
560 branch_list.push_back(branch);1006 branch_list.push_back(branch);
561}1007}
5621008
563bool CRegexAscii_regex::get_indexed_match(int index, 1009bool CRegexXQuery_regex::get_indexed_match(int index,
564 const char **matched_source, 1010 const char **matched_source,
565 int *matched_len)1011 int *matched_len)
566{1012{
567 if(!index || index > (int)subregex.size())1013 if(!index || index > (int)subregex.size())
568 return false;1014 return false;
569 CRegexAscii_regex *subr = subregex[index-1];1015 CRegexXQuery_regex *subr = subregex[index-1];
570 *matched_source = subr->matched_source;1016 *matched_source = subr->matched_source;
571 if(!*matched_source)1017 if(!*matched_source)
572 return false;1018 return false;
@@ -574,145 +1020,209 @@
574 return true;1020 return true;
575}1021}
5761022
577unsigned int CRegexAscii_regex::get_indexed_regex_count()1023unsigned int CRegexXQuery_regex::get_indexed_regex_count()
578{1024{
579 return subregex.size();1025 return subregex.size();
580}1026}
5811027
582CRegexAscii_branch::CRegexAscii_branch(CRegexAscii_regex* regex) :1028CRegexXQuery_branch::CRegexXQuery_branch(CRegexXQuery_regex* regex)
583 IRegexMatcher(regex)1029 //:
1030 //IRegexMatcher(regex)
584{1031{
585}1032}
5861033
587CRegexAscii_branch::~CRegexAscii_branch()1034CRegexXQuery_branch::~CRegexXQuery_branch()
588{1035{
589 std::list<CRegexAscii_piece*>::iterator piece_it;1036 std::list<RegexAscii_pieceinfo>::iterator piece_it;
5901037
591 for(piece_it = piece_list.begin(); piece_it != piece_list.end(); piece_it++)1038 for(piece_it = piece_list.begin(); piece_it != piece_list.end(); piece_it++)
592 {1039 {
593 delete (*piece_it);1040 delete (*piece_it).piece;
594 }1041 }
595}1042}
5961043
597void CRegexAscii_branch::add_piece(CRegexAscii_piece *piece)1044void CRegexXQuery_branch::add_piece(CRegexXQuery_piece *piece)
598{1045{
599 piece_list.push_back(piece);1046 piece_list.push_back(piece);
600}1047}
6011048
602CRegexAscii_piece::CRegexAscii_piece()1049CRegexXQuery_piece::CRegexXQuery_piece()
603{1050{
1051 atom = NULL;
1052 regex_atom = NULL;
604}1053}
6051054
606CRegexAscii_piece::~CRegexAscii_piece()1055CRegexXQuery_piece::~CRegexXQuery_piece()
607{1056{
608 delete atom;1057 delete atom;
609}1058}
6101059
611void CRegexAscii_piece::set_atom(IRegexAtom *atom)1060void CRegexXQuery_piece::set_atom(IRegexAtom *atom)
612{1061{
613 this->atom = atom;1062 this->atom = atom;
1063 this->regex_atom = dynamic_cast<CRegexXQuery_regex*>(atom);
614}1064}
6151065
616void CRegexAscii_piece::set_quantifier_min_max(int min, int max, bool strict_max)1066void CRegexXQuery_piece::set_quantifier_min_max(int min, int max, bool strict_max)
617{1067{
618 this->min = min;1068 this->min = min;
619 this->max = max;1069 this->max = max;
620 this->strict_max = strict_max;1070 this->strict_max = strict_max;
621}1071}
622void CRegexAscii_piece::set_is_reluctant(bool is_reluctant)1072void CRegexXQuery_piece::set_is_reluctant(bool is_reluctant)
623{1073{
624 this->is_reluctant = is_reluctant;1074 this->is_reluctant = is_reluctant;
625}1075}
626void CRegexAscii_piece::get_quantifier(int *min, int *max, bool *strict_max)1076void CRegexXQuery_piece::get_quantifier(int *min, int *max, bool *strict_max)
627{1077{
628 *min = this->min;1078 *min = this->min;
629 *max = this->max;1079 *max = this->max;
630 *strict_max = this->strict_max;1080 *strict_max = this->strict_max;
631}1081}
632bool CRegexAscii_piece::get_is_reluctant()1082bool CRegexXQuery_piece::get_is_reluctant()
633{1083{
1084 if(atom->regex_intern->flags & REGEX_ASCII_MINIMAL_MATCH)
1085 return true;
634 return is_reluctant;1086 return is_reluctant;
635}1087}
6361088
6371089
638CRegexAscii_chargroup::CRegexAscii_chargroup(CRegexAscii_regex* regex) :1090CRegexXQuery_charmatch::CRegexXQuery_charmatch(CRegexXQuery_regex* regex) :
1091 IRegexAtom(regex)
1092{
1093}
1094CRegexXQuery_multicharP::CRegexXQuery_multicharP(CRegexXQuery_regex* regex, char type, bool is_reverse) :
1095 CRegexXQuery_charmatch(regex)
1096{
1097 this->multichar_type = type; this->is_reverse = is_reverse;
1098}
1099CRegexXQuery_multicharIs::CRegexXQuery_multicharIs(CRegexXQuery_regex* regex, int block_index, bool is_reverse) :
1100 CRegexXQuery_charmatch(regex)
1101{
1102 this->block_index = block_index; this->is_reverse = is_reverse;
1103}
1104CRegexXQuery_multicharOther::CRegexXQuery_multicharOther(CRegexXQuery_regex* regex, char type) :
1105 CRegexXQuery_charmatch(regex)
1106{
1107 this->multichar_type = type;
1108}
1109CRegexXQuery_char_ascii::CRegexXQuery_char_ascii(CRegexXQuery_regex* regex, char c) :
1110 CRegexXQuery_charmatch(regex)
1111{
1112 this->c = c;
1113}
1114CRegexXQuery_char_ascii_i::CRegexXQuery_char_ascii_i(CRegexXQuery_regex* regex, char c) :
1115 CRegexXQuery_char_ascii(regex, toupper(c))
1116{
1117}
1118CRegexXQuery_char_range_ascii::CRegexXQuery_char_range_ascii(CRegexXQuery_regex* regex, char c1, char c2) :
1119 CRegexXQuery_charmatch(regex)
1120{
1121 this->c1 = c1; this->c2 = c2;
1122}
1123CRegexXQuery_char_range_ascii_i::CRegexXQuery_char_range_ascii_i(CRegexXQuery_regex* regex, char c1, char c2) :
1124 CRegexXQuery_char_range_ascii(regex, toupper(c1), toupper(c2))
1125{
1126}
1127CRegexXQuery_char_unicode::CRegexXQuery_char_unicode(CRegexXQuery_regex* regex, const char *source, int len) :
1128 CRegexXQuery_charmatch(regex)
1129{
1130 this->len = len;
1131 memcpy(c, source, len);
1132}
1133CRegexXQuery_char_unicode_cp::CRegexXQuery_char_unicode_cp(CRegexXQuery_regex* regex, unicode::code_point c) :
1134 CRegexXQuery_charmatch(regex)
1135{
1136 this->c = c;
1137}
1138CRegexXQuery_char_unicode_i::CRegexXQuery_char_unicode_i(CRegexXQuery_regex* regex, unicode::code_point c) :
1139 CRegexXQuery_char_unicode_cp(regex, unicode::to_upper(c))
1140{
1141}
1142CRegexXQuery_char_range_unicode::CRegexXQuery_char_range_unicode(CRegexXQuery_regex* regex, unicode::code_point c1, unicode::code_point c2) :
1143 CRegexXQuery_charmatch(regex)
1144{
1145 this->c1 = c1; this->c2 = c2;
1146}
1147CRegexXQuery_char_range_unicode_i::CRegexXQuery_char_range_unicode_i(CRegexXQuery_regex* regex, unicode::code_point c1, unicode::code_point c2) :
1148 CRegexXQuery_char_range_unicode(regex, unicode::to_upper(c1), unicode::to_upper(c2))
1149{
1150}
1151CRegexXQuery_endline::CRegexXQuery_endline(CRegexXQuery_regex* regex) :
1152 CRegexXQuery_charmatch(regex)
1153{
1154}
1155
1156unicode::code_point CRegexXQuery_char_unicode::get_c()
1157{
1158 const char *temp_c = (const char*)c;
1159 return utf8::next_char(temp_c);
1160}
1161
1162
1163CRegexXQuery_chargroup::CRegexXQuery_chargroup(CRegexXQuery_regex* regex) :
639 IRegexAtom(regex)1164 IRegexAtom(regex)
640{1165{
641 classsub = NULL;1166 classsub = NULL;
642}1167}
6431168
644CRegexAscii_chargroup::~CRegexAscii_chargroup()1169CRegexXQuery_chargroup::~CRegexXQuery_chargroup()
645{1170{
646 delete classsub;1171 delete classsub;
647}1172 std::list<CRegexXQuery_charmatch* >::iterator charmatch_it;
6481173 for(charmatch_it=chargroup_list.begin(); charmatch_it != chargroup_list.end(); charmatch_it++)
649void CRegexAscii_chargroup::addMultiChar(char c)1174 delete (*charmatch_it);
650{1175}
651 chargroup_t cgt;1176
652 cgt.flags = CHARGROUP_FLAGS_MULTICHAR;1177void CRegexXQuery_chargroup::addCharMatch(CRegexXQuery_charmatch *charmatch)
653 cgt.c1 = c;1178{
654 cgt.c2 = 0;1179 chargroup_list.push_back(charmatch);
655 chargroup_list.push_back(cgt);1180}
656}1181void CRegexXQuery_chargroup::addClassSub(CRegexXQuery_chargroup* classsub)
657
658void CRegexAscii_chargroup::addEndLine()
659{
660 chargroup_t cgt;
661 cgt.flags = CHARGROUP_FLAGS_ENDLINE;
662 cgt.c1 = '$';
663 cgt.c2 = 0;
664 chargroup_list.push_back(cgt);
665}
666
667void CRegexAscii_chargroup::addCharRange(char c1, char c2)
668{
669 chargroup_t cgt;
670 cgt.flags = 0;
671 cgt.c1 = c1;
672 cgt.c2 = c2;
673 chargroup_list.push_back(cgt);
674}
675
676void CRegexAscii_chargroup::addClassSub(CRegexAscii_chargroup* classsub)
677{1182{
678 this->classsub = classsub;1183 this->classsub = classsub;
679}1184}
6801185
681CRegexAscii_negchargroup::CRegexAscii_negchargroup(CRegexAscii_regex* regex) :1186CRegexXQuery_negchargroup::CRegexXQuery_negchargroup(CRegexXQuery_regex* regex) :
682 CRegexAscii_chargroup(regex)1187 CRegexXQuery_chargroup(regex)
683{1188{
684}1189}
6851190
686CRegexAscii_negchargroup::~CRegexAscii_negchargroup()1191CRegexXQuery_negchargroup::~CRegexXQuery_negchargroup()
687{1192{
688}1193}
6891194
690CRegexAscii_wildchar::CRegexAscii_wildchar(CRegexAscii_regex* regex) :1195CRegexXQuery_wildchar::CRegexXQuery_wildchar(CRegexXQuery_regex* regex) :
691 IRegexAtom(regex)1196 IRegexAtom(regex)
692{1197{
693}1198}
6941199
695CRegexAscii_wildchar::~CRegexAscii_wildchar()1200CRegexXQuery_wildchar::~CRegexXQuery_wildchar()
696{1201{
697}1202}
6981203
699CRegexAscii_backref::CRegexAscii_backref(CRegexAscii_regex* regex, unsigned int backref_) :1204CRegexXQuery_backref::CRegexXQuery_backref(CRegexXQuery_regex* regex, unsigned int backref_) :
700 IRegexAtom(regex),1205 IRegexAtom(regex),
701 backref(backref_)1206 backref(backref_)
702{1207{
703}1208}
7041209
705CRegexAscii_backref::~CRegexAscii_backref()1210CRegexXQuery_backref::~CRegexXQuery_backref()
706{1211{
707}1212}
7081213
709CRegexAscii_parser::CRegexAscii_parser()1214CRegexXQuery_pinstart::CRegexXQuery_pinstart(CRegexXQuery_regex* regex):
1215 IRegexAtom(regex)
1216{
1217}
1218
1219CRegexXQuery_parser::CRegexXQuery_parser()
710{1220{
711 current_regex = NULL;1221 current_regex = NULL;
712 regex_depth = 0;1222 regex_depth = 0;
713}1223}
7141224
715CRegexAscii_parser::~CRegexAscii_parser()1225CRegexXQuery_parser::~CRegexXQuery_parser()
716{1226{
717}1227}
7181228
@@ -720,9 +1230,68 @@
720//////////////////////////////////////////1230//////////////////////////////////////////
721////Matching the pattern on a string1231////Matching the pattern on a string
722/////////////////////////////////////////1232/////////////////////////////////////////
1233static std::list<RegexAscii_pieceinfo> empty_pieces;//empty list of pieces
1234/*
1235std::list<RegexAscii_pieceinfo>::iterator
1236IRegexAtom::choose_next_piece(const char *source, int *matched_len,
1237 std::list<RegexAscii_pieceinfo>::iterator this_piece,
1238 std::list<RegexAscii_pieceinfo>::iterator end_piece)
1239{
1240 //if this_piece is repetition, repeat until max, then go to next piece
1241 int min, max;
1242 bool strict_max;
1243 while(this_piece != end_piece)
1244 {
1245 (*this_piece).piece->get_quantifier(&min, &max, &strict_max);
1246 if(max <= ((*this_piece).nr_matches))//finished this piece
1247 {
1248 this_piece++;
1249 }
1250 else
1251 break;
1252 }
1253 return this_piece;
1254}
1255*/
1256
1257bool IRegexAtom::match(const char *source, int *start_from_branch, int *matched_len,
1258 std::list<RegexAscii_pieceinfo>::iterator this_piece,
1259 std::list<RegexAscii_pieceinfo>::iterator end_piece)
1260{
1261 *start_from_branch = 0;
1262 bool retmatch;
1263 retmatch = match_internal(source, start_from_branch, matched_len);
1264 if(!retmatch)
1265 return false;
1266
1267 if(this_piece == end_piece)
1268 return true;
1269
1270 (*this_piece).nr_matches++;
1271 int min,max;
1272 bool strict_max;
1273 (*this_piece).piece->get_quantifier(&min, &max, &strict_max);
1274 std::list<RegexAscii_pieceinfo>::iterator init_piece = this_piece;
1275 if(((min == 1) && (max == 1)) || //the simple common case
1276 ((*matched_len == 0) && ((*this_piece).nr_matches>=min)))//to avoid infinite loop
1277 {
1278 this_piece++;
1279 if(this_piece == end_piece)
1280 return true;
1281 }
1282 int matched_len2;
1283 retmatch = (*this_piece).piece->match_piece(this_piece, end_piece, source + *matched_len, &matched_len2);
1284 if(!retmatch)
1285 {
1286 (*init_piece).nr_matches--;
1287 return false;
1288 }
1289 *matched_len += matched_len2;
1290 return true;
1291}
7231292
724//try every position in source to match the pattern1293//try every position in source to match the pattern
725bool CRegexAscii_regex::match_anywhere(const char *source, unsigned int flags,1294bool CRegexXQuery_regex::match_anywhere(const char *source, unsigned int flags,
726 int *match_pos, int *matched_len)1295 int *match_pos, int *matched_len)
727{1296{
728 *match_pos = 0;1297 *match_pos = 0;
@@ -730,43 +1299,66 @@
730 return match_from(source, flags, match_pos, matched_len);1299 return match_from(source, flags, match_pos, matched_len);
731}1300}
7321301
733bool CRegexAscii_regex::match_from(const char *source, unsigned int flags,1302bool CRegexXQuery_regex::match_from(const char *source, unsigned int flags,
734 int *match_pos, int *matched_len)1303 int *match_pos, int *matched_len)
735{1304{
736 this->flags = flags;1305 this->flags = flags;
1306 this->source_start = source;
737 reachedEnd = false;1307 reachedEnd = false;
7381308
739 std::vector<CRegexAscii_regex*>::iterator regex_it;1309 std::vector<CRegexXQuery_regex*>::iterator regex_it;
740 for(regex_it = subregex.begin(); regex_it != subregex.end(); regex_it++)1310 for(regex_it = subregex.begin(); regex_it != subregex.end(); regex_it++)
741 {1311 {
742 (*regex_it)->matched_source = NULL;1312 (*regex_it)->matched_source = NULL;
743 }1313 }
744// if(!source[0])1314
745// {1315 std::vector<std::pair<const char*, int> > saved_subregex;
746// if(branch_list.empty())1316
747// return true;1317 if(*match_pos && (flags & REGEX_ASCII_WHOLE_MATCH))
748// else1318 return false;
749// return false;1319
750// }
751
752 bool skip_first_match = false;
753 if(*match_pos && align_begin)
754 skip_first_match = true;
755 do1320 do
756 {1321 {
757 if(!skip_first_match)1322 int start_from_branch = 0;
758 {1323 int longest_match = -1;
759 if(match(source + *match_pos, matched_len))1324 while(1)
760 return true;1325 {
761 }1326 if(!match(source + *match_pos, &start_from_branch, matched_len, empty_pieces.begin(), empty_pieces.end()))
762 skip_first_match = false;1327 break;
763 if(align_begin)1328 if(longest_match < *matched_len)
1329 {
1330 longest_match = *matched_len;
1331 if(start_from_branch && (flags & REGEX_ASCII_GET_LONGEST_BRANCH))
1332 save_subregex_list(saved_subregex);
1333 }
1334 if(!start_from_branch || !(flags & REGEX_ASCII_GET_LONGEST_BRANCH))
1335 break;
1336 //else try the other branches to see which is longer
1337 }
1338 if(longest_match != -1)
1339 {
1340 *matched_len = longest_match;
1341 if(saved_subregex.size())
1342 load_subregex_list(saved_subregex);
1343 if(flags & REGEX_ASCII_WHOLE_MATCH)
1344 {
1345 if(!source[*match_pos+*matched_len])
1346 return true;
1347 if((flags & REGEX_ASCII_MULTILINE) &&
1348 ((source[*match_pos+*matched_len] == '\n') || (source[*match_pos+*matched_len] == '\r')))
1349 return true;
1350 return false;
1351 }
1352 return true;
1353 }
1354
1355 if(flags & REGEX_ASCII_WHOLE_MATCH)
764 {1356 {
765 if(flags & REGEX_ASCII_MULTILINE)1357 if(flags & REGEX_ASCII_MULTILINE)
766 {1358 {
767 //goto the next line1359 //go to next line
768 while(source[*match_pos] && (source[*match_pos] != '\n') && (source[*match_pos] != '\r'))1360 while(source[*match_pos] && (source[*match_pos] != '\n') && (source[*match_pos] != '\r'))
769 (*match_pos)++;1361 (*match_pos) += myutf8len(source);
770 if(source[*match_pos] == '\n')1362 if(source[*match_pos] == '\n')
771 {1363 {
772 (*match_pos)++;1364 (*match_pos)++;
@@ -780,190 +1372,1039 @@
780 (*match_pos)++;1372 (*match_pos)++;
781 }1373 }
782 if(!source[*match_pos])1374 if(!source[*match_pos])
783 return false;1375 break;
784 continue;1376 continue;
785 }1377 }
786 return false;1378 break;
787 }1379 }
788 if(!source[*match_pos])1380 if(!source[*match_pos])
789 break;1381 break;
790 (*match_pos)++;1382 (*match_pos) += myutf8len(source);
791 }1383 }
792 while(source[*match_pos]);1384 while(source[*match_pos]);
1385// if(!source[*match_pos])
1386// {
1387// reachedEnd = true;
1388// }
793 return false;1389 return false;
794}1390}
7951391
1392void CRegexXQuery_regex::reset_match()
1393{
1394// this->backup_matched_source = this->matched_source;
1395// this->backup_matched_len = this->matched_len;
1396 this->matched_source = NULL;
1397 this->matched_len = 0;
1398 std::list<CRegexXQuery_branch*>::iterator branch_it;
1399 for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++)
1400 {
1401 (*branch_it)->reset();
1402 }
1403}
1404/*
1405void CRegexXQuery_regex::restore_match()
1406{
1407 this->matched_source = this->backup_matched_source;
1408 this->matched_len = this->backup_matched_len;
1409 std::list<CRegexXQuery_branch*>::iterator branch_it;
1410 for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++)
1411 {
1412 (*branch_it)->restore();
1413 }
1414}
1415*/
796//match any of the branches1416//match any of the branches
797bool CRegexAscii_regex::match(const char *source, int *matched_len)1417bool CRegexXQuery_regex::match(const char *source, int *start_from_branch, int *matched_len,
1418 std::list<RegexAscii_pieceinfo>::iterator next_piece,
1419 std::list<RegexAscii_pieceinfo>::iterator end_piece)
798{1420{
799 reachedEnd = false;1421 reachedEnd = false;
800 std::list<CRegexAscii_branch*>::iterator branch_it;1422 if(!(flags & REGEX_ASCII_GROUPING_LEN_WHOLE_PIECE) ||
8011423 (this->matched_source == NULL) || ((this->matched_source + this->matched_len) != source))
802 for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++)1424 this->matched_source = source;
803 {1425 *matched_len = 0;
804 if((*branch_it)->match(source, matched_len))1426 std::list<CRegexXQuery_branch*>::iterator branch_it;
805 {1427
806 matched_source = source;1428 if(*start_from_branch == 0)
807 this->matched_len = *matched_len;1429 {
1430 for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++)
1431 {
1432 (*branch_it)->reset();
1433 }
1434 }
1435
1436 branch_it = branch_list.begin();
1437 if(*start_from_branch)
1438 {
1439 for(int i=0;i<*start_from_branch;i++)
1440 branch_it++;
1441 }
1442 (*start_from_branch)++;
1443 for(; branch_it != branch_list.end(); branch_it++,(*start_from_branch)++)
1444 {
1445 if((*branch_it)->match(source, matched_len, this, next_piece, end_piece))
1446 {
1447 //matched_source = source;
1448 //this->matched_len = *matched_len;
808 return true;1449 return true;
809 }1450 }
810 }1451 }
811 matched_source = NULL;1452 *start_from_branch = 0;
812 matched_len = 0;1453 if(this->matched_source == source)
1454 this->matched_source = NULL;
1455 *matched_len = 0;
813 return false;1456 return false;
814}1457}
8151458
1459void CRegexXQuery_regex::save_subregex_list(std::vector<std::pair<const char*, int> > &saved_subregex)
1460{
1461 saved_subregex.resize(0);
1462 saved_subregex.reserve(subregex.size());
1463 std::vector<CRegexXQuery_regex*>::iterator it;
1464 for(it=subregex.begin(); it != subregex.end(); it++)
1465 {
1466 saved_subregex.push_back(std::pair<const char*, int>((*it)->matched_source, (*it)->matched_len));
1467 }
1468}
1469
1470void CRegexXQuery_regex::load_subregex_list(std::vector<std::pair<const char*, int> > &saved_subregex)
1471{
1472 std::vector<std::pair<const char*, int> >::iterator it;
1473 std::vector<CRegexXQuery_regex*>::iterator subit;
1474 for(it=saved_subregex.begin(), subit = subregex.begin(); it != saved_subregex.end(); it++, subit++)
1475 {
1476 (*subit)->matched_source = (*it).first;
1477 (*subit)->matched_len = (*it).second;
1478 }
1479}
1480
1481void CRegexXQuery_branch::reset()
1482{
1483 std::list<RegexAscii_pieceinfo>::iterator piece_it;
1484 for(piece_it = piece_list.begin(); piece_it != piece_list.end(); piece_it++)
1485 {
1486 (*piece_it).piece->atom->reset_match();
1487 }
1488}
1489/*
1490void CRegexXQuery_branch::restore()
1491{
1492 std::list<RegexAscii_pieceinfo>::iterator piece_it;
1493 for(piece_it = piece_list.begin(); piece_it != piece_list.end(); piece_it++)
1494 {
1495 (*piece_it).piece->atom->restore_match();
1496 }
1497}
1498*/
816//match all the pieces1499//match all the pieces
817bool CRegexAscii_branch::match(const char *source, int *matched_len)1500bool CRegexXQuery_branch::match(const char *source, int *matched_len,
1501 CRegexXQuery_regex* group_regex,
1502 std::list<RegexAscii_pieceinfo>::iterator next_piece,
1503 std::list<RegexAscii_pieceinfo>::iterator end_piece)
818{1504{
819 std::list<CRegexAscii_piece*>::iterator piece_it;1505 std::list<RegexAscii_pieceinfo>::iterator piece_it;
8201506
821 piece_it = piece_list.begin(); 1507 piece_it = piece_list.begin();
1508 //if(piece_it == piece_list.end())
1509 //if(!source[0])
1510 // return true;
1511 //else
1512 // return false;
822 if(piece_it == piece_list.end())1513 if(piece_it == piece_list.end())
823 if(source[0])1514 {
824 return false;1515 piece_it = next_piece;
1516 if(next_piece == end_piece)
1517 {
1518 group_regex->matched_len = 0;
1519 return true;
1520 }
1521 }
1522
1523 std::list<RegexAscii_pieceinfo> temp_pieces(piece_list);
1524 temp_pieces.push_back(group_regex);//this will be used to store the group match
1525 temp_pieces.insert(temp_pieces.end(), next_piece, end_piece);
1526
1527 return (*piece_it).piece->match_piece(temp_pieces.begin(), temp_pieces.end(), source, matched_len);
1528}
1529
1530bool CRegexXQuery_piece::match_piece(std::list<RegexAscii_pieceinfo>::iterator piece_it,
1531 std::list<RegexAscii_pieceinfo>::iterator end_it,
1532 const char *source, int *matched_len)
1533{
1534 if((*piece_it).nr_matches < 0)
1535 {
1536 //special case, store the group match
1537 (*piece_it).group_regex->matched_len = source - (*piece_it).group_regex->matched_source;
1538 piece_it++;
1539 if(piece_it == end_it)
1540 return true;
825 else1541 else
826 return true;1542 return (*piece_it).piece->match_piece(piece_it, end_it, source, matched_len);
827 if(!(*piece_it)->get_is_reluctant())1543 }
828 return match_piece_iter_normal(piece_it, source, matched_len);1544
1545 if(!get_is_reluctant())
1546 return match_piece_iter_normal(piece_it, end_it, source, matched_len);
829 else1547 else
830 return match_piece_iter_reluctant(piece_it, source, matched_len);1548 return match_piece_iter_reluctant(piece_it, end_it, source, matched_len);
831}1549}
8321550
833//match as less as possible1551int CRegexXQuery_piece::choose_another_branch(std::vector<std::pair<int,int> > &match_lens)
834bool CRegexAscii_branch::match_piece_iter_reluctant(1552{
835 std::list<CRegexAscii_piece*>::iterator piece_it,1553 int i = match_lens.size()-1;
1554 i--;
1555 while((i >= 0) && (match_lens.at(i).second == 0))
1556 i--;
1557 if(i < 0)
1558 return -1;//no more branches
1559 match_lens.resize(i+1);
1560 i++;
1561 return i;
1562}
1563
1564bool CRegexXQuery_piece::is_regex_atom()
1565{
1566 return regex_atom != NULL;
1567}
1568
1569//match as less as possible (shortest string)
1570bool CRegexXQuery_piece::match_piece_iter_reluctant(
1571 std::list<RegexAscii_pieceinfo>::iterator piece_it,
1572 std::list<RegexAscii_pieceinfo>::iterator end_it,
836 const char *source, int *matched_len)1573 const char *source, int *matched_len)
837{1574{
838 *matched_len = 0;1575 *matched_len = 0;
839 if(piece_it == piece_list.end())1576 if(piece_it == end_it)
840 return true;1577 return true;
8411578
842 int min, max;1579 int min, max;
843 bool strict_max;1580 bool strict_max;
844 //std::vector<int> match_lens;1581 //std::vector<int> match_lens;
845 (*piece_it)->get_quantifier(&min, &max, &strict_max);1582 (*piece_it).piece->get_quantifier(&min, &max, &strict_max);
846 if(strict_max && (max >= 0))1583
1584 std::vector<std::pair<const char*, int> > saved_subregex;
1585
1586 if(is_regex_atom())
847 {1587 {
848 int timeslen;1588 //recursive
849 //check if the piece doesn't exceed the max match1589 bool retmatch;
850 if((*piece_it)->match_piece_times(source, &timeslen, max+1, NULL))1590 atom->regex_intern->save_subregex_list(saved_subregex);
851 return false;///too many matches1591 if((*piece_it).nr_matches >= min)
1592 {
1593 //go to next piece
1594 std::list<RegexAscii_pieceinfo>::iterator next_it = piece_it;
1595 next_it++;
1596 if(next_it == end_it)
1597 return true;
1598 retmatch = (*next_it).piece->match_piece(next_it, end_it, source, matched_len);
1599 if(retmatch)
1600 return true;
1601 }
1602 if(((max == -1) || ((*piece_it).nr_matches < max)) &&//try further with this piece
1603 (((*piece_it).nr_matches < min) || ((*piece_it).nr_matches == 0) || ((*piece_it).piece->regex_atom->matched_len)))//if matched_len is zero, avoid infinite loop
1604 {
1605 int start_from_branch = 0;
1606 int shortest_len = -1;
1607 bool branch_saved = false;
1608 //try all branches to get the shortest len
1609 (*piece_it).nr_matches++;
1610 while(atom->match(source, &start_from_branch, matched_len, piece_it, end_it))
1611 {
1612 if((shortest_len == -1) || (shortest_len > *matched_len))
1613 {
1614 shortest_len = *matched_len;
1615 if(start_from_branch && (atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH))
1616 {
1617 atom->regex_intern->save_subregex_list(saved_subregex);
1618 branch_saved = true;
1619 }
1620 }
1621 if(!start_from_branch || !(atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH))
1622 break;
1623 }
1624 if(shortest_len != -1)
1625 {
1626 *matched_len = shortest_len;
1627 if(branch_saved)
1628 atom->regex_intern->load_subregex_list(saved_subregex);
1629 return true;
1630 }
1631 else
1632 {
1633 (*piece_it).nr_matches--;
1634 atom->regex_intern->load_subregex_list(saved_subregex);
1635 return false;
1636 }
1637 }
1638 else
1639 {
1640 atom->regex_intern->load_subregex_list(saved_subregex);
1641 return false;
1642 }
852 }1643 }
8531644
854 int i=min;1645 int i=0;
855 std::list<CRegexAscii_piece*>::iterator next_it = piece_it;1646 int shortest_len = -1;
1647 int otherpieces_shortest = -1;
1648 int i_shortest = -1;
1649 std::list<RegexAscii_pieceinfo>::iterator next_it = piece_it;
1650 std::vector<std::pair<int,int> > match_lens;
856 next_it++;1651 next_it++;
857 int pieceslen = 0;1652 int pieceslen = 0;
858 while(1)1653 while(1)
859 {1654 {
860 if((max > 0) && (i>max))1655 int piecelen = 0;
861 break;1656 bool retmatch;
862 int piecelen = 0;1657 retmatch = match_piece_times(source, &piecelen, i < min ? min : i, &match_lens);
863 if((*piece_it)->match_piece_times(source+pieceslen, &piecelen, !pieceslen ? i : 1, NULL))1658 i = match_lens.size()-1;//number of matches
864 {1659 if(i<0)
865 pieceslen += piecelen;1660 i = 0;
1661 if((i>=min))
1662 {
1663 pieceslen = piecelen;
1664 if((shortest_len >= 0) && (shortest_len <= pieceslen))//this branch is longer
1665 {//try another branch
1666 i = choose_another_branch(match_lens);
1667 if(i >= 0)
1668 continue;//try another branch
1669 else
1670 break;
1671 }
866 int otherpieces = 0;1672 int otherpieces = 0;
867 if((next_it == piece_list.end()) ||1673 if((next_it == end_it) ||
868 ((*next_it)->get_is_reluctant() && match_piece_iter_reluctant(next_it, source+pieceslen, &otherpieces)) ||1674 (*next_it).piece->match_piece(next_it, end_it, source+pieceslen, &otherpieces)
869 (!(*next_it)->get_is_reluctant() && match_piece_iter_normal(next_it, source+pieceslen, &otherpieces)))1675 )
870 {1676 {
871 *matched_len = pieceslen + otherpieces;1677 if((i == pieceslen) || (match_lens.at(0).second == 0) ||//minimum achieved already, cannot go lower than that
872 return true;1678 !(atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH))
873 }1679 {
1680 *matched_len = pieceslen + otherpieces;
1681 return true;
1682 }
1683 if((shortest_len < 0) || (shortest_len > pieceslen))
1684 {
1685 shortest_len = pieceslen;
1686 otherpieces_shortest = otherpieces;
1687 i_shortest = i;
1688 if(match_lens.at(0).second != 0)
1689 atom->regex_intern->save_subregex_list(saved_subregex);
1690 }
1691 i = choose_another_branch(match_lens);
1692 if(i >= 0)
1693 continue;//try another branch
1694 else
1695 break;
1696 }
1697 else
1698 {
1699 //try further
1700 if(retmatch)
1701 {
1702 i++;
1703 if((max < 0) || (i<=max))
1704 continue;
1705 i--;
1706 }
1707 }
1708 }
1709
1710 if(i==0)
1711 {
1712 break;
874 }1713 }
875 else1714 else
876 break;1715 {
877 i++;1716 i = choose_another_branch(match_lens);
1717 if(i >= 0)
1718 continue;//try another branch
1719 else
1720 break;
1721 }
878 }1722 }
8791723
1724 if(shortest_len >= 0)
1725 {
1726 if(strict_max && (max>=0) && (i_shortest > max))
1727 return false;
1728 *matched_len = shortest_len + otherpieces_shortest;
1729 if(saved_subregex.size())
1730 atom->regex_intern->load_subregex_list(saved_subregex);
1731 return true;
1732 }
880 return false;1733 return false;
881}1734}
8821735
883//match as much as possible1736//match as much as possible
// NOTE: this is a rendered diff. Lines carrying a 3-digit number (884-933)
// belong to the old CRegexAscii_branch version, lines carrying a 4-digit
// number (1737-1900) to the new CRegexXQuery_piece version; a line with only
// an old number was removed by the patch.
//
// New version (greedy, "match as much as possible"):
//  - Reads the quantifier (min, max, strict_max) of the piece at piece_it.
//  - If is_regex_atom(): saves the sub-regex list, and while the piece may
//    still repeat (max == -1 or nr_matches < max, and the previous repetition
//    was not zero-length once min is reached) bumps nr_matches and calls
//    atom->match(...) with piece_it/end_it. It keeps the longest length and
//    only keeps trying further branches when REGEX_ASCII_GET_LONGEST_BRANCH
//    is set and start_from_branch is non-zero. On failure nr_matches is
//    decremented and the saved list restored; then, if nr_matches >= min, the
//    next piece is matched (or true is returned when there is none),
//    otherwise the saved list is restored and false is returned.
//  - Otherwise: loops over match_piece_times(source, &timeslen, i, &match_lens)
//    starting from i = max, asks the next piece (if any) to match at
//    source+timeslen and remembers the longest timeslen that lets the rest
//    match. When the rest fails it either drops the last repetition (entry 0
//    of match_lens has branch 0) or asks choose_another_branch(match_lens)
//    for a different branch combination.
//  - On success *matched_len = longest_len + otherpieces_longest.
// NOTE(review): in the non-atom loop retmatch is assigned from
// match_piece_times but not read afterwards in the visible lines.
884bool CRegexAscii_branch::match_piece_iter_normal(1737bool CRegexXQuery_piece::match_piece_iter_normal(
885 std::list<CRegexAscii_piece*>::iterator piece_it,1738 std::list<RegexAscii_pieceinfo>::iterator piece_it,
1739 std::list<RegexAscii_pieceinfo>::iterator end_it,
886 const char *source, int *matched_len)1740 const char *source, int *matched_len)
887{1741{
888 *matched_len = 0;1742 *matched_len = 0;
8891743
890 int min, max;1744 int min, max;
891 bool strict_max;1745 bool strict_max;
892 std::vector<int> match_lens;1746 std::vector<std::pair<int,int> > match_lens;
893 (*piece_it)->get_quantifier(&min, &max, &strict_max);1747 (*piece_it).piece->get_quantifier(&min, &max, &strict_max);
894 int timeslen;1748 int timeslen = 0;
895 if(strict_max && (max >= 0))1749 std::vector<std::pair<const char*, int> > saved_subregex;
1750
1751 if(is_regex_atom())
896 {1752 {
897 //check if the piece doesn't exceed the max match1753 //recursive
898 //if((*piece_it)->match_piece_times(source, &timeslen, max+1, &match_lens))1754 bool retmatch;
899 // return false;///too many matches1755 atom->regex_intern->save_subregex_list(saved_subregex);
900 (*piece_it)->match_piece_times(source, &timeslen, max, &match_lens);1756 if(((max == -1) || ((*piece_it).nr_matches < max)) && //try further with this piece
1757 (((*piece_it).nr_matches < min) || ((*piece_it).nr_matches == 0) || ((*piece_it).piece->regex_atom->matched_len)))//if matched_len is zero, avoid infinite loop
1758 {
1759 int start_from_branch = 0;
1760 int longest_len = -1;
1761 bool branch_saved = false;
1762 //try all branches to get the longest len
1763 (*piece_it).nr_matches++;
1764 while(atom->match(source, &start_from_branch, matched_len, piece_it, end_it))
1765 {
1766 if((longest_len < *matched_len))
1767 {
1768 longest_len = *matched_len;
1769 if(start_from_branch && (atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH))
1770 {
1771 atom->regex_intern->save_subregex_list(saved_subregex);
1772 branch_saved = true;
1773 }
1774 }
1775 if(!start_from_branch || !(atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH))
1776 break;
1777 }
1778 if(longest_len != -1)
1779 {
1780 *matched_len = longest_len;
1781 if(branch_saved)
1782 atom->regex_intern->load_subregex_list(saved_subregex);
1783 return true;
1784 }
1785 else
1786 {
1787 atom->regex_intern->load_subregex_list(saved_subregex);
1788 (*piece_it).nr_matches--;
1789 }
1790 }
1791 if((*piece_it).nr_matches >= min)
1792 {
1793 //go to next piece
1794 std::list<RegexAscii_pieceinfo>::iterator next_it = piece_it;
1795 next_it++;
1796 if(next_it == end_it)
1797 return true;
1798 retmatch = (*next_it).piece->match_piece(next_it, end_it, source, matched_len);
1799 if(!retmatch)
1800 atom->regex_intern->load_subregex_list(saved_subregex);
1801 return retmatch;
1802 }
1803 else
1804 {
1805 // regex_atom->restore_match();
1806 atom->regex_intern->load_subregex_list(saved_subregex);
1807 return false;
1808 }
901 }1809 }
902 else if(!strict_max && (max >= 0))
903 (*piece_it)->match_piece_times(source, &timeslen, max, &match_lens);
904 else
905 (*piece_it)->match_piece_times(source, &timeslen, -1, &match_lens);
9061810
907 int i;1811 int longest_len = -1;
908 std::list<CRegexAscii_piece*>::iterator next_it = piece_it;1812 int otherpieces_longest = -1;
1813 int i_longest = -1;
1814 int i = max;
1815 std::list<RegexAscii_pieceinfo>::iterator next_it = piece_it;
909 next_it++;1816 next_it++;
910 if(next_it == piece_list.end())1817
1818 bool retmatch;
1819 while(1)
911 {1820 {
912 if((int)match_lens.size() > min)1821 retmatch = match_piece_times(source, &timeslen, i, &match_lens);
913 {1822 i=match_lens.size()-1;//number of matches
914 *matched_len = timeslen;1823 if((i>=min))
915 return true;1824 {
1825 if(timeslen < longest_len)
1826 {//this branch is no use
1827 i = choose_another_branch(match_lens);
1828 if(i >= 0)
1829 {
1830 i = max;
1831 continue;//try another branch
1832 }
1833 else
1834 break;
1835 }
1836 //int piecelen = 0;
1837 int otherpieces = 0;
1838 if((next_it == end_it) ||
1839 (*next_it).piece->match_piece(next_it, end_it, source+timeslen, &otherpieces)
1840 )
1841 {
1842 if(timeslen > longest_len)
1843 {
1844 longest_len = timeslen;
1845 otherpieces_longest = otherpieces;
1846 i_longest = i;
1847 if(!(atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH))
1848 {
1849 *matched_len = longest_len + otherpieces_longest;
1850 return true;
1851 }
1852 else
1853 {
1854 if(match_lens.at(0).second)
1855 atom->regex_intern->save_subregex_list(saved_subregex);
1856 }
1857 }
1858 }
1859 else
1860 {
1861 if(!match_lens.at(0).second)
1862 {
1863 match_lens.resize(match_lens.size()-1);
1864 i--;
1865 if(i >= 0)
1866 continue;//try smaller
1867 else
1868 break;
1869 }
1870 else
1871 {
1872 i = choose_another_branch(match_lens);
1873 if(i >= 0)
1874 continue;//try another branch
1875 else
1876 break;
1877 }
1878 }
1879 }
1880 //now try another branch
1881 i = choose_another_branch(match_lens);
1882 if(i >= 0)
1883 {
1884 i = max;
1885 continue;//try another branch
916 }1886 }
917 else1887 else
918 return false;1888 break;
919 }1889 }//end while
920 for(i=match_lens.size()-1; i>=min; i--)1890
1891 if(longest_len >= 0)
921 {1892 {
922 int piecelen = 0;1893 *matched_len = longest_len + otherpieces_longest;
923 int otherpieces = 0;1894 if(saved_subregex.size())
924 if(((*next_it)->get_is_reluctant() && match_piece_iter_reluctant(next_it, source+match_lens[i]+piecelen, &otherpieces)) ||1895 atom->regex_intern->load_subregex_list(saved_subregex);
925 (!(*next_it)->get_is_reluctant() && match_piece_iter_normal(next_it, source+match_lens[i]+piecelen, &otherpieces)))1896 return true;
926 {
927 *matched_len = match_lens[i] + piecelen + otherpieces;
928 return true;
929 }
930 }1897 }
9311898
932 return false;1899 return false;
933}1900}
9341901
// NOTE: this is a rendered diff. 3-digit numbers (935-959) mark lines of the
// old CRegexAscii_piece version, 4-digit numbers (1902-1963) lines of the new
// CRegexXQuery_piece version.
//
// New version: applies atom->match repeatedly, up to `times` repetitions
// (no limit when times < 0), accumulating the consumed length in *piecelen.
//  - If match_lens is non-empty the call resumes from it: i starts at
//    size()-1 and *piecelen at the .first of the last entry; otherwise both
//    start at 0. Returns true immediately when times >= 0 and i >= times.
//  - Each repetition reads its starting branch from match_lens->at(i).second
//    when that entry exists, and on success stores
//    (offset before the match, start_from_branch after atom->match) at index
//    i (overwriting or appending).
//  - If atom->match fails, (*piecelen, 0) is stored at index i and false is
//    returned.
//  - The loop stops early on a zero-length match at the end of source, or on
//    a zero-length match that started from branch 0 (infinite-loop guard).
//  - After the loop a final (*piecelen, 0) entry is appended and true is
//    returned.
935bool CRegexAscii_piece::match_piece_times(const char *source, 1902bool CRegexXQuery_piece::match_piece_times(const char *source,
936 int *piecelen, 1903 int *piecelen,
937 int times,1904 int times,
938 std::vector<int> *match_lens)1905 std::vector<std::pair<int,int> > *match_lens)
939{1906{
940 *piecelen = 0;1907 int i=0;
941 for(int i=0;(times < 0) || (i<times);i++)1908 if(match_lens && match_lens->size())
942 {1909 {
1910 i = match_lens->size()-1;
1911 }
1912 if(match_lens && match_lens->size())
1913 *piecelen = match_lens->at(match_lens->size()-1).first;
1914 else
1915 *piecelen = 0;
1916 if((times >= 0) && (i>=times))
1917 return true;
1918 for(;(times < 0) || (i<times);i++)
1919 {
1920 int atomlen;
1921 int start_from_branch = 0;
1922 if(match_lens && (i<(int)match_lens->size()))
1923 start_from_branch = match_lens->at(i).second;
1924 bool first_branch = (start_from_branch == 0);
1925 if(!atom->match(source+*piecelen, &start_from_branch, &atomlen, empty_pieces.begin(), empty_pieces.end()))
1926 {
1927 if(match_lens)
1928 {
1929 if(i >= (int)match_lens->size())
1930 match_lens->push_back(std::pair<int,int>(*piecelen, 0));
1931 else
1932 (*match_lens)[i] = std::pair<int,int>(*piecelen, 0);
1933 }
1934 return false;
1935 }
943 if(match_lens)1936 if(match_lens)
944 match_lens->push_back(*piecelen);1937 {
945 int atomlen;1938 if(i >= (int)match_lens->size())
946 if(!atom->match(source+*piecelen, &atomlen))1939 match_lens->push_back(std::pair<int,int>(*piecelen, start_from_branch));
947 return false;1940 else
1941 (*match_lens)[i] = std::pair<int,int>(*piecelen, start_from_branch);
1942 }
948 *piecelen += atomlen;1943 *piecelen += atomlen;
949 if(!atomlen && !source[*piecelen])1944 if(!atomlen && !source[*piecelen])
950 {1945 {
951 atom->regex_intern->reachedEnd = true;1946 // atom->regex_intern->set_reachedEnd(source);
1947 break;
1948 }
1949 if(first_branch && (atomlen == 0))//avoid infinite loop
1950 {
952 break;1951 break;
953 }1952 }
954 }1953 }
955 if(match_lens)1954 if(match_lens)
956 match_lens->push_back(*piecelen);1955 {
1956 // if(i >= match_lens->size())
1957 match_lens->push_back(std::pair<int,int>(*piecelen, 0));
1958 // else
1959 // (*match_lens)[i] = std::pair<int,int>(*piecelen, 0);
1960 }
9571961
958 return true;1962 return true;
959}1963}
9601964
// Matches one code point of source against a Unicode general category.
//  source            - text to match; an empty string reports end-of-input
//                      through regex_intern->set_reachedEnd and fails.
//  start_from_branch - not used by this function.
//  matched_len       - on success receives the number of bytes consumed by
//                      utf8::next_char; left untouched on failure.
// multichar_type values of the form "UNICODE_xx + 50" select a whole group of
// categories (any member matches); any other value is cast to
// unicode::category and tested on its own. is_reverse inverts the result.
1965bool CRegexXQuery_multicharP::match_internal(const char *source, int *start_from_branch, int *matched_len)
1966{
1967 if(!source[0])
1968 {
1969 regex_intern->set_reachedEnd(source);
1970 return false;
1971 }
1972 bool found = false;
1973 const char *temp_source = source;
// next_char advances temp_source past the decoded code point.
1974 unicode::code_point utf8c = utf8::next_char(temp_source);
1975 switch(multichar_type)
1976 {
// Letter group: Ll, Lm, Lo, Lt, Lu.
1977 case unicode::UNICODE_Ll + 50:
1978 if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Ll) ||
1979 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Lm) ||
1980 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Lo) ||
1981 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Lt) ||
1982 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Lu))
1983 {
1984 if(!is_reverse)
1985 found = true;
1986 }
1987 else
1988 {
1989 if(is_reverse)
1990 found = true;
1991 }
1992 break;
// Mark group: Mn, Mc, Me.
1993 case unicode::UNICODE_Mc + 50:
1994 if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Mn) ||
1995 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Mc) ||
1996 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Me))
1997 {
1998 if(!is_reverse)
1999 found = true;
2000 }
2001 else
2002 {
2003 if(is_reverse)
2004 found = true;
2005 }
2006 break;
// Number group: Nd, Nl, No.
2007 case unicode::UNICODE_Nd + 50:
2008 if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Nd) ||
2009 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Nl) ||
2010 unicode::check_codepoint_category(utf8c, unicode::UNICODE_No))
2011 {
2012 if(!is_reverse)
2013 found = true;
2014 }
2015 else
2016 {
2017 if(is_reverse)
2018 found = true;
2019 }
2020 break;
// Punctuation group: Pc, Pd, Ps, Pe, Pi, Pf, Po.
2021 case unicode::UNICODE_Pc + 50:
2022 if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pc) ||
2023 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pd) ||
2024 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Ps) ||
2025 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pe) ||
2026 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pi) ||
2027 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pf) ||
2028 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Po))
2029 {
2030 if(!is_reverse)
2031 found = true;
2032 }
2033 else
2034 {
2035 if(is_reverse)
2036 found = true;
2037 }
2038 break;
// Separator group: Zs, Zl, Zp.
2039 case unicode::UNICODE_Zl + 50:
2040 if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zs) ||
2041 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zl) ||
2042 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zp))
2043 {
2044 if(!is_reverse)
2045 found = true;
2046 }
2047 else
2048 {
2049 if(is_reverse)
2050 found = true;
2051 }
2052 break;
// Symbol group: Sm, Sc, Sk, So.
2053 case unicode::UNICODE_Sc + 50:
2054 if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Sm) ||
2055 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Sc) ||
2056 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Sk) ||
2057 unicode::check_codepoint_category(utf8c, unicode::UNICODE_So))
2058 {
2059 if(!is_reverse)
2060 found = true;
2061 }
2062 else
2063 {
2064 if(is_reverse)
2065 found = true;
2066 }
2067 break;
// "Other" group: Cc, Cf, Co (Cn is deliberately not tested).
2068 case unicode::UNICODE_Cc + 50:
2069 if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Cc) ||
2070 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Cf) ||
2071 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Co))//ignore unicode::UNICODE_Cn
2072 {
2073 if(!is_reverse)
2074 found = true;
2075 }
2076 else
2077 {
2078 if(is_reverse)
2079 found = true;
2080 }
2081 break;
// Single category: multichar_type is used directly as the category value.
2082 default:
2083 if(unicode::check_codepoint_category(utf8c, (unicode::category)multichar_type))
2084 {
2085 if(!is_reverse)
2086 found = true;
2087 }
2088 else
2089 {
2090 if(is_reverse)
2091 found = true;
2092 }
2093 break;
2094 }
2095
2096 if(found)
2097 {
2098 *matched_len = temp_source - source;
2099 }
2100 return found;
2101}
2102
// Matches one code point of source against the block described by
// block_escape[block_index].
//  source            - text to match; an empty string reports end-of-input
//                      through regex_intern->set_reachedEnd and fails.
//  start_from_branch - not used by this function.
//  matched_len       - on success receives the number of bytes consumed by
//                      utf8::next_char; left untouched on failure.
// The code point is first tested against the inclusive range cp[0]..cp[1];
// if that fails and ext_cp is set, against each (low, high) pair of ext_cp,
// the list ending at a zero entry. is_reverse inverts the result.
2103bool CRegexXQuery_multicharIs::match_internal(const char *source, int *start_from_branch, int *matched_len)
2104{
2105 if(!source[0])
2106 {
2107 regex_intern->set_reachedEnd(source);
2108 return false;
2109 }
2110 bool found = false;
2111 const char *temp_source = source;
2112 unicode::code_point utf8c = utf8::next_char(temp_source);
2113 const unicode::code_point *cp = block_escape[block_index].cp;
2114 if((utf8c >= cp[0]) && (utf8c <= cp[1]))
2115 {
2116 if(!is_reverse)
2117 found = true;
2118 }
2119 else if(block_escape[block_index].ext_cp)
2120 {
2121 cp = block_escape[block_index].ext_cp;
// Walk the extra ranges two entries at a time; cp stops on the matching
// pair, or on the zero terminator when nothing matched.
2122 while(*cp)
2123 {
2124 if((utf8c >= cp[0]) && (utf8c <= cp[1]))
2125 break;
2126 cp += 2;
2127 }
2128 if(*cp)
2129 {
2130 if(!is_reverse)
2131 found = true;
2132 }
2133 else
2134 {
2135 if(is_reverse)
2136 found = true;
2137 }
2138 }
2139 else
2140 {
2141 if(is_reverse)
2142 found = true;
2143 }
2144 if(found)
2145 {
2146 *matched_len = temp_source - source;
2147 }
2148 return found;
2149}
2150
// Matches one code point of source against the escape selected by
// multichar_type: 's', 'i', 'c', 'd', 'w' or their upper-case negations.
//  source            - text to match; an empty string reports end-of-input
//                      through regex_intern->set_reachedEnd and fails.
//  start_from_branch - not used by this function.
//  matched_len       - on success receives the number of bytes consumed by
//                      utf8::next_char; left untouched on failure.
// Each upper-case case label clears value_true and intentionally falls
// through to the lower-case test, so the final result is inverted.
// Any other multichar_type throws XQUERY_EXCEPTION(err::FORX0002).
2151bool CRegexXQuery_multicharOther::match_internal(const char *source, int *start_from_branch, int *matched_len)
2152{
2153 if(!source[0])
2154 {
2155 regex_intern->set_reachedEnd(source);
2156 return false;
2157 }
2158 bool found = false;
2159 bool value_true = true;
2160 const char *temp_source = source;
2161 unicode::code_point utf8c = utf8::next_char(temp_source);
2162 switch(multichar_type)
2163 {
2164 case 'S':value_true = false;//[^\s]
2165 case 's'://[#x20\t\n\r]
2166 switch(utf8c)
2167 {
2168 case '\t':
2169 case '\r':
2170 case '\n':
2171 case ' ':
2172 found = true;
2173 default:
2174 break;
2175 }
2176 break;
2177 case 'I':value_true = false;//[^\i]
2178 case 'i'://the set of initial name characters, those matched by Letter | '_' | ':'
2179 if((utf8c == '_') ||
2180 (utf8c == ':') ||
2181 XQCharType::isLetter(utf8c))
2182 {
2183 found = true;
2184 }
2185 break;
2186 case 'C':value_true = false;//[^\c]
2187 case 'c'://the set of name characters, those matched by NameChar
2188 if(XQCharType::isNameChar(utf8c))
2189 {
2190 found = true;
2191 }
2192 break;
2193 case 'D':value_true = false;//[^\d]
2194 case 'd':
// \d: decimal digits only (category Nd).
2195 if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Nd))
2196 found = true;
2197 break;
2198 case 'W':value_true = false;//[^\w]
2199 case 'w':
// \w: everything that is NOT in the punctuation (P*), separator (Z*) or
// Cc/Cf/Co categories listed below.
2200 found = !(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pc) ||
2201 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pd) ||
2202 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Ps) ||
2203 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pe) ||
2204 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pi) ||
2205 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pf) ||
2206 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Po) ||
2207 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zs) ||
2208 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zl) ||
2209 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zp) ||
2210 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Cc) ||
2211 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Cf) ||
2212 unicode::check_codepoint_category(utf8c, unicode::UNICODE_Co));//ignore unicode::UNICODE_Cn
2213 break;
2214 default:
2215 throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(source, ZED(REGEX_UNIMPLEMENTED)) );
2216 }
// Succeeds when the test result agrees with the polarity of the escape.
2217 if((found && value_true) || (!found && !value_true))
2218 {
2219 *matched_len = temp_source - source;
2220 return true;
2221 }
2222 else
2223 {
2224 return false;
2225 }
2226}
2227
// Matches the first byte of source against the member c.
//  source            - text to match; an empty string reports end-of-input
//                      through regex_intern->set_reachedEnd and fails.
//  start_from_branch - not used by this function.
//  matched_len       - set to 1 on success; left untouched on failure.
// Returns true only when source[0] == c.
2228bool CRegexXQuery_char_ascii::match_internal(const char *source, int *start_from_branch, int *matched_len)
2229{
2230 if(!source[0])
2231 {
2232 regex_intern->set_reachedEnd(source);
2233 return false;
2234 }
2235 if(source[0] == c)
2236 {
2237 *matched_len = 1;
2238 return true;
2239 }
2240 else
2241 return false;
2242}
2243
// Case-insensitive variant: upper-cases the first byte of source with
// toupper() and compares it with the member c.
//  source            - text to match; an empty string reports end-of-input
//                      through regex_intern->set_reachedEnd and fails.
//  start_from_branch - not used by this function.
//  matched_len       - set to 1 on success; left untouched on failure.
// Presumably c is stored already upper-cased - TODO confirm where c is set.
// NOTE(review): source[0] is a plain char handed to toupper(); where char is
// signed, a byte >= 0x80 becomes a negative argument - confirm only ASCII
// bytes reach this matcher.
2244bool CRegexXQuery_char_ascii_i::match_internal(const char *source, int *start_from_branch, int *matched_len)
2245{
2246 if(!source[0])
2247 {
2248 regex_intern->set_reachedEnd(source);
2249 return false;
2250 }
2251 char sup = toupper(source[0]);
2252 if(sup == c)
2253 {
2254 *matched_len = 1;
2255 return true;
2256 }
2257 else
2258 return false;
2259}
2260
// Matches the first byte of source against the inclusive range c1..c2.
//  source            - text to match; an empty string reports end-of-input
//                      through regex_intern->set_reachedEnd and fails.
//  start_from_branch - not used by this function.
//  matched_len       - set to 1 on success; left untouched on failure.
2261bool CRegexXQuery_char_range_ascii::match_internal(const char *source, int *start_from_branch, int *matched_len)
2262{
2263 if(!source[0])
2264 {
2265 regex_intern->set_reachedEnd(source);
2266 return false;
2267 }
2268 if((source[0] >= c1) && (source[0] <= c2))
2269 {
2270 *matched_len = 1;
2271 return true;
2272 }
2273 else
2274 return false;
2275}
2276
// Case-insensitive range test: upper-cases the first byte of source with
// toupper() and checks it against the inclusive range c1..c2.
//  source            - text to match; an empty string reports end-of-input
//                      through regex_intern->set_reachedEnd and fails.
//  start_from_branch - not used by this function.
//  matched_len       - set to 1 on success; left untouched on failure.
// Presumably c1/c2 are stored already upper-cased - TODO confirm.
// NOTE(review): source[0] is a plain char handed to toupper(); where char is
// signed, a byte >= 0x80 becomes a negative argument - confirm only ASCII
// bytes reach this matcher.
2277bool CRegexXQuery_char_range_ascii_i::match_internal(const char *source, int *start_from_branch, int *matched_len)
2278{
2279 if(!source[0])
2280 {
2281 regex_intern->set_reachedEnd(source);
2282 return false;
2283 }
2284 char sup = toupper(source[0]);
2285 if((sup >= c1) && (sup <= c2))
2286 {
2287 *matched_len = 1;
2288 return true;
2289 }
2290 else
2291 return false;
2292}
2293
// Matches the start of source byte-for-byte against the len bytes stored in
// the member c.
//  source            - text to match; an empty string reports end-of-input
//                      through regex_intern->set_reachedEnd and fails.
//  start_from_branch - not used by this function.
//  matched_len       - set to len on success; left untouched on failure.
// NOTE(review): only source[0] is checked before memcmp is asked to compare
// len bytes; this presumably relies on the terminating NUL differing from c
// before the end of the buffer is passed - TODO confirm.
2294bool CRegexXQuery_char_unicode::match_internal(const char *source, int *start_from_branch, int *matched_len)
2295{
2296 if(!source[0])
2297 {
2298 regex_intern->set_reachedEnd(source);
2299 return false;
2300 }
2301 if(!memcmp(source, c, len))
2302 {
2303 *matched_len = len;
2304 return true;
2305 }
2306 else
2307 return false;
2308}
2309
// Decodes one code point from source with utf8::next_char and compares it
// with the member c.
//  source            - text to match; an empty string reports end-of-input
//                      through regex_intern->set_reachedEnd and fails.
//  start_from_branch - not used by this function.
//  matched_len       - on success receives the number of bytes the decoded
//                      code point occupied; left untouched on failure.
2310bool CRegexXQuery_char_unicode_cp::match_internal(const char *source, int *start_from_branch, int *matched_len)
2311{
2312 if(!source[0])
2313 {
2314 regex_intern->set_reachedEnd(source);
2315 return false;
2316 }
2317 const char *temp_source = source;
2318 unicode::code_point utf8c = utf8::next_char(temp_source);
2319 if(utf8c == c)
2320 {
2321 *matched_len = temp_source - source;
2322 return true;
2323 }
2324 else
2325 return false;
2326}
2327
// Case-insensitive variant: decodes one code point from source, upper-cases
// it with unicode::to_upper and compares the result with the member c.
//  source            - text to match; an empty string reports end-of-input
//                      through regex_intern->set_reachedEnd and fails.
//  start_from_branch - not used by this function.
//  matched_len       - on success receives the number of bytes the decoded
//                      code point occupied; left untouched on failure.
// Presumably c is stored already upper-cased - TODO confirm where c is set.
2328bool CRegexXQuery_char_unicode_i::match_internal(const char *source, int *start_from_branch, int *matched_len)
2329{
2330 if(!source[0])
2331 {
2332 regex_intern->set_reachedEnd(source);
2333 return false;
2334 }
2335 const char *temp_source = source;
2336 unicode::code_point sup = unicode::to_upper(utf8::next_char(temp_source));
2337 if(sup == c)
2338 {
2339 *matched_len = temp_source - source;
2340 return true;
2341 }
2342 else
2343 return false;
2344}
2345
// Decodes one code point from source with utf8::next_char and checks it
// against the inclusive range c1..c2.
//  source            - text to match; an empty string reports end-of-input
//                      through regex_intern->set_reachedEnd and fails.
//  start_from_branch - not used by this function.
//  matched_len       - on success receives the number of bytes the decoded
//                      code point occupied; left untouched on failure.
2346bool CRegexXQuery_char_range_unicode::match_internal(const char *source, int *start_from_branch, int *matched_len)
2347{
2348 if(!source[0])
2349 {
2350 regex_intern->set_reachedEnd(source);
2351 return false;
2352 }
2353 const char *temp_source = source;
2354 unicode::code_point utf8c = utf8::next_char(temp_source);
2355 if((utf8c >= c1) && (utf8c <= c2))
2356 {
2357 *matched_len = temp_source - source;
2358 return true;
2359 }
2360 else
2361 return false;
2362}
2363
// Case-insensitive range test: decodes one code point from source,
// upper-cases it with unicode::to_upper and checks the result against the
// inclusive range c1..c2.
//  source            - text to match; an empty string reports end-of-input
//                      through regex_intern->set_reachedEnd and fails.
//  start_from_branch - not used by this function.
//  matched_len       - on success receives the number of bytes the decoded
//                      code point occupied; left untouched on failure.
// Presumably c1/c2 are stored already upper-cased - TODO confirm.
2364bool CRegexXQuery_char_range_unicode_i::match_internal(const char *source, int *start_from_branch, int *matched_len)
2365{
2366 if(!source[0])
2367 {
2368 regex_intern->set_reachedEnd(source);
2369 return false;
2370 }
2371 const char *temp_source = source;
2372 unicode::code_point sup = unicode::to_upper(utf8::next_char(temp_source));
2373 if((sup >= c1) && (sup <= c2))
2374 {
2375 *matched_len = temp_source - source;
2376 return true;
2377 }
2378 else
2379 return false;
2380}
2381
2382bool CRegexXQuery_endline::match_internal(const char *source, int *start_from_branch, int *matched_len)
2383{
2384 *matched_len = 0;
2385 if(!source[0])
The diff has been truncated for viewing.

Subscribers

People subscribed via source and target branches