Merge lp:~zorba-coders/zorba/no_unicode into lp:zorba

Proposed by Daniel Turcanu
Status: Superseded
Proposed branch: lp:~zorba-coders/zorba/no_unicode
Merge into: lp:zorba
Diff against target: 8036 lines (+3707/-998)
255 files modified
CMakeConfiguration.txt (+5/-5)
CMakeLists.txt (+6/-2)
ChangeLog (+10/-0)
KNOWN_ISSUES.txt (+1/-1)
doc/cxx/examples/context.cpp (+4/-0)
include/zorba/config.h.cmake (+3/-1)
include/zorba/util/time.h (+1/-1)
src/api/serialization/serializer.cpp (+36/-33)
src/api/serialization/serializer.h (+2/-4)
src/diagnostics/diagnostic_en.xml (+108/-22)
src/diagnostics/pregenerated/dict_en.cpp (+83/-20)
src/runtime/full_text/CMakeLists.txt (+3/-3)
src/runtime/full_text/default_tokenizer.cpp (+4/-4)
src/runtime/full_text/latin_tokenizer.cpp (+3/-2)
src/runtime/full_text/latin_tokenizer.h (+9/-8)
src/runtime/numerics/format_integer_impl.cpp (+1/-1)
src/runtime/numerics/numerics_impl.cpp (+1/-1)
src/runtime/strings/strings_impl.cpp (+58/-20)
src/system/globalenv.cpp (+7/-7)
src/util/CMakeLists.txt (+3/-3)
src/util/regex.cpp (+44/-52)
src/util/regex.h (+22/-34)
src/util/regex_xquery.cpp (+1860/-489)
src/util/regex_xquery.h (+360/-122)
src/util/unicode_categories.cpp (+3/-3)
src/util/unicode_categories.h (+44/-37)
src/util/unicode_util.cpp (+20/-2)
src/util/unicode_util.h (+46/-15)
src/util/utf8_util.cpp (+6/-6)
src/util/utf8_util.h (+29/-13)
src/util/utf8_util.tcc (+10/-2)
src/zorbatypes/collation_manager.cpp (+17/-17)
src/zorbatypes/collation_manager.h (+3/-3)
src/zorbatypes/libicu.h (+0/-32)
src/zorbatypes/transcoder.cpp (+8/-4)
src/zorbatypes/transcoder.h (+9/-9)
src/zorbautils/string_util.cpp (+19/-18)
src/zorbautils/string_util.h (+15/-1)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a1.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a10.xml.res (+242/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a11.xml.res (+6/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a2.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a3.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a5.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a6.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a7.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a8.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a9.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m1.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m10.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m11.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m12.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m13.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m14.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m15.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m16.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m17.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m18.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m19.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m2.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m20.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m21.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m22.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m23.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m24.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m25.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m26.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m27.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m28.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m29.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m3.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m30.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m31.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m32.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m33.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m34.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m35.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m36.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m37.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m38.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m39.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m4.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m40.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m41.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m42.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m43.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m44.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m45.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m46.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m47.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m48.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m49.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m5.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m50.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m51.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m52.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m53.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m6.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m7.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m8.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m9.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_prime1.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r1.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r10.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r11.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r12.xml.res (+5/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r2.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r3.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r4.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r5.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r6.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r9.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_t1.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_t4.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_t5.xml.res (+1/-0)
test/rbkt/ExpQueryResults/zorba/testdriver/bom_bug.xml.res (+1/-0)
test/rbkt/Queries/CMakeLists.txt (+17/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a1.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a10.xq (+11/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a11.xq (+9/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a2.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a3.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a5.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a6.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a7.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a8.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_a9.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err1.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err1.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err10.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err10.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err11.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err11.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err12.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err12.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err13.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err13.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err14.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err14.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err15.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err15.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err16.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err16.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err17.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err17.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err18.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err18.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err19.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err19.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err2.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err2.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err20.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err20.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err21.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err21.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err22.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err22.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err23.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err23.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err24.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err24.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err25.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err25.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err3.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err3.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err4.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err4.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err5.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err5.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err7.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err7.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err8.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err8.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err9.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_err9.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m1.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m10.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m11.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m12.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m13.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m14.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m15.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m16.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m17.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m18.xq (+3/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m19.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m2.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m20.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m21.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m22.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m23.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m24.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m25.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m26.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m27.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m28.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m29.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m3.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m30.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m31.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m32.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m33.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m34.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m35.xq (+4/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m36.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m37.xq (+4/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m38.xq (+4/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m39.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m4.xq (+6/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m40.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m41.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m42.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m43.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m44.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m45.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m46.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m47.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m48.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m49.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m5.xq (+6/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m50.xq (+2/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m51.xq (+2/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m52.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m53.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m6.xq (+6/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m7.xq (+6/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m8.xq (+7/-0)
test/rbkt/Queries/zorba/string/Regex/regex_m9.xq (+7/-0)
test/rbkt/Queries/zorba/string/Regex/regex_prime1.xq (+17/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r1.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r10.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r11.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r12.xq (+7/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r2.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r3.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r4.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r5.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r6.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r7_err.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r7_err.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r8_err.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r8_err.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_r9.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_t1.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_t2.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_t3_err.spec (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_t3_err.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/regex_t4.xq (+2/-0)
test/rbkt/Queries/zorba/string/Regex/regex_t5.xq (+1/-0)
test/rbkt/Queries/zorba/string/Regex/zorba.html (+242/-0)
test/rbkt/Queries/zorba/string/Regex/zorba2.html (+5/-0)
test/rbkt/Queries/zorba/testdriver/bom_bug.xq (+1/-0)
test/unit/CMakeLists.txt (+4/-1)
test/unit/string_test.cpp (+8/-0)
test/update/CMakeLists.txt (+9/-0)
To merge this branch: bzr merge lp:~zorba-coders/zorba/no_unicode
Reviewer Review Type Date Requested Status
Matthias Brantner Needs Fixing
Markos Zaharioudakis Pending
Review via email: mp+85142@code.launchpad.net

This proposal supersedes a proposal from 2011-12-09.

This proposal has been superseded by a proposal from 2012-01-18.

Commit message

"No Unicode" is now "No ICU."

Description of the change

"No Unicode" is now "No ICU."

To post a comment you must log in.
Revision history for this message
Matthias Brantner (matthias-brantner) wrote :

Compiling with ZORBA_NO_ICU=ON fails on Linux:

[ 1%] Building CXX object src/CMakeFiles/zorba_simplestore.dir/api/zorba_string.cpp.o
In file included from /home/mbrantner/zorba/sandbox/src/util/regex.h:501:0,
                 from /home/mbrantner/zorba/sandbox/src/api/zorba_string.cpp:23:
/home/mbrantner/zorba/sandbox/src/util/regex_xquery.h:209:3: error: a class-key must be used when declaring a friend
/home/mbrantner/zorba/sandbox/src/util/regex_xquery.h:209:3: error: friend declaration does not name a class or function
/home/mbrantner/zorba/sandbox/src/util/regex_xquery.h:253:3: error: a class-key must be used when declaring a friend
/home/mbrantner/zorba/sandbox/src/util/regex_xquery.h:253:3: error: friend declaration does not name a class or function
make[2]: *** [src/CMakeFiles/zorba_simplestore.dir/api/zorba_string.cpp.o] Erro

Revision history for this message
Matthias Brantner (matthias-brantner) :
review: Needs Fixing
Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :

There are additional revisions which have not been approved in review. Please seek review and approval of these new revisions.

Revision history for this message
Matthias Brantner (matthias-brantner) wrote :

The test suite doesn't run clean on my system (Linux) without ICU. This prevents us from adding the build to the remote queue. For example, the following tests fail without ICU (some of them also seem to fail with ICU):

 1294 - test/rbkt/zorba/string/Regex/regex_a10 (Failed)
 1548 - test/rbkt/zorba/fulltext/ft-wildcard-true-2 (Failed)
 1560 - test/rbkt/zorba/fulltext/ft-wildcard-true-4 (Failed)
 1574 - test/rbkt/zorba/fulltext/ft-same-sentence-true-4 (Failed)
 1581 - test/rbkt/zorba/fulltext/ft-wildcard-true-3 (Failed)
 1587 - test/rbkt/zorba/fulltext/ft-wildcard-true-9 (Failed)
 1600 - test/rbkt/zorba/fulltext/ft-diacritics-insensitive-true-1 (Failed)
 1605 - test/rbkt/zorba/fulltext/ft-wildcard-true-8 (Failed)
 1612 - test/rbkt/zorba/fulltext/ft-wildcard-true-10 (Failed)
 1635 - test/rbkt/zorba/fulltext/ft-wildcard-true-7 (Failed)
 1637 - test/rbkt/zorba/fulltext/ft-wildcard-true-11 (Failed)
 1643 - test/rbkt/zorba/fulltext/ft-wildcard-FTDY0020-3 (Failed)
 1789 - test/rbkt/zorba/index/numbers (Failed)
 2345 - test/unit/string_test (Failed)
 2534 - test/update/zorba/store/sc3 (Failed)
 2544 - doc/cxx/examples/context.cpp (Failed)

Please make sure the test suite runs clean.

review: Needs Fixing
lp:~zorba-coders/zorba/no_unicode updated
10512. By Paul J. Lucas

Merge from trunk.

10513. By Daniel Turcanu

Fix bug in FnAnalyzeStringIterator

10514. By Daniel Turcanu

Removed regex_err12 test from expected failures

10515. By Paul J. Lucas

Merge from trunk.

10516. By Paul J. Lucas

Merge from trunk.

Revision history for this message
Paul J. Lucas (paul-lucas) wrote :

Try it now.

lp:~zorba-coders/zorba/no_unicode updated
10517. By Paul J. Lucas

Merge from trunk.

10518. By Paul J. Lucas

Merge from trunk.

10519. By Paul J. Lucas

Merge from trunk.

10520. By Paul J. Lucas

Merge from trunk.

10521. By Rodolfo Ochoa

-Fixes for windows compiling

10522. By Rodolfo Ochoa

Merge from trunk

10523. By Rodolfo Ochoa

- Fixes for committing on windows with ZORBA_NO_ICU
- Fixes for committing on windows with ZORBA_NO_FULL_TEXT

10524. By Rodolfo Ochoa

-Fix for no precompiled headers usage

10525. By Rodolfo Ochoa

Merge from trunk

10526. By Rodolfo Ochoa

fixes for windows

10527. By Rodolfo Ochoa

Fix for linux

10528. By Rodolfo Ochoa

more fixes for linux

10529. By Rodolfo Ochoa

Final fix for windows

10530. By Paul J. Lucas

1. Added fix for not catching bad regexs like "^^".
2. Added if="!defined(ZORBA_NO_ICU)" for some entries in the diagnostics
   dictionary.

10531. By Paul J. Lucas

Merge from trunk.

10532. By Paul J. Lucas

Fix for '^' bug.

10533. By Paul J. Lucas

1. Fixed yet another '^' bug.
2. Marked some regex tests as expected failure with correct bug numbers.

10534. By Paul J. Lucas

No longer doing some stuff when q_flag is set.

10535. By Paul J. Lucas

Tweaked one error message.

10536. By Paul J. Lucas

Merge from trunk.

10537. By Rodolfo Ochoa

Merge from trunk

10538. By Rodolfo Ochoa

Strange error on include guards

10539. By Rodolfo Ochoa

merge from trunk

10540. By Rodolfo Ochoa

fix for regex errors in RQ

Unmerged revisions

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'CMakeConfiguration.txt'
2--- CMakeConfiguration.txt 2012-01-11 17:30:25 +0000
3+++ CMakeConfiguration.txt 2012-01-18 18:33:36 +0000
4@@ -135,14 +135,14 @@
5 SET (ZORBA_DEBUG_STRING ${ZORBA_DEBUG_STRING} CACHE BOOL "debug strings")
6 MESSAGE (STATUS "ZORBA_DEBUG_STRING: " ${ZORBA_DEBUG_STRING})
7
8-SET(ZORBA_NO_UNICODE OFF CACHE BOOL "disable ICU")
9-MESSAGE(STATUS "ZORBA_NO_UNICODE: " ${ZORBA_NO_UNICODE})
10+SET(ZORBA_NO_ICU OFF CACHE BOOL "disable ICU")
11+MESSAGE(STATUS "ZORBA_NO_ICU: " ${ZORBA_NO_ICU})
12
13-IF (ZORBA_NO_UNICODE)
14+IF (ZORBA_NO_ICU)
15 SET (no_full_text ON)
16-ELSE (ZORBA_NO_UNICODE)
17+ELSE (ZORBA_NO_ICU)
18 SET (no_full_text OFF)
19-ENDIF (ZORBA_NO_UNICODE)
20+ENDIF (ZORBA_NO_ICU)
21 SET (ZORBA_NO_FULL_TEXT ${no_full_text} CACHE BOOL "disable XQuery Full-Text support")
22 MESSAGE(STATUS "ZORBA_NO_FULL_TEXT: " ${ZORBA_NO_FULL_TEXT})
23
24
25=== modified file 'CMakeLists.txt'
26--- CMakeLists.txt 2012-01-04 09:47:54 +0000
27+++ CMakeLists.txt 2012-01-18 18:33:36 +0000
28@@ -123,10 +123,14 @@
29 CHECK_TYPE_SIZE("int64_t" ZORBA_HAVE_INT64_T)
30
31 CHECK_CXX_SOURCE_COMPILES ("#include <type_traits>\nint main() { std::enable_if<true,int> x; }" ZORBA_CXX_ENABLE_IF)
32-CHECK_CXX_SOURCE_COMPILES ("int main() { int *p = nullptr; }" ZORBA_CXX_NULLPTR)
33-CHECK_CXX_SOURCE_COMPILES ("int main() { static_assert(1,\"\"); }" ZORBA_CXX_STATIC_ASSERT)
34+SET(CMAKE_EXTRA_INCLUDE_FILES wchar.h)
35+CHECK_TYPE_SIZE("wchar_t" ZORBA_SIZEOF_WCHAR_T)
36+SET(CMAKE_EXTRA_INCLUDE_FILES)
37 CHECK_CXX_SOURCE_COMPILES ("#include <memory>\nint main() { std::unique_ptr<int> p; }" ZORBA_CXX_UNIQUE_PTR)
38
39+CHECK_CXX_SOURCE_COMPILES("int main() { int *p = nullptr; }" ZORBA_CXX_NULLPTR)
40+CHECK_CXX_SOURCE_COMPILES("int main() { static_assert(1,\"\"); }" ZORBA_CXX_STATIC_ASSERT)
41+
42 ################################################################################
43 # Various cmake macros
44
45
46=== modified file 'ChangeLog'
47--- ChangeLog 2012-01-18 12:18:59 +0000
48+++ ChangeLog 2012-01-18 18:33:36 +0000
49@@ -1,5 +1,9 @@
50 Zorba - The XQuery Processor
51
52+version 2.x
53+
54+ * Added support for NO_ICU (to not use ICU for unicode processing)
55+
56 version 2.2
57
58 * No-copy optimization: avoids copying nodes during node-constructor expressions.
59@@ -78,7 +82,9 @@
60 * Fixed bug when parsing a document with a base-uri attribute.
61 * Fixed bug #863320 (Sentence is incorrectly incremented when token characters end without sentence terminator)
62 * Fixed bug #863730 (static delete-node* functions don't raise ZDDY0012)
63+ * Implemented the probe-index-range-value for general indexes
64 * Removed ZSTR0005 and ZSTR0006 error codes
65+ * Fixed bug #867662 ("nullptr" warning)
66 * Fixed bug #868258 (Assertion failure with two delete collection)
67 * Fixed bug #871623 and #871629 (assertion failures with insertions in dynamic collections)
68 * Fixed bug #867262 (allow reuse of iterator over ExtFuncArgItemSequence)
69@@ -87,6 +93,8 @@
70 * New node-reference module. References can be obtained for any node, and
71 different nodes cannot have the same identifier.
72 * Fixed bug #872697 (segmentation fault with validation of NMTOKENS)
73+ * General index cannot be declared as unique if the type of its key is
74+ xs:anyAtomicType or xs:untypedAtomic.
75 * Added undo for node revalidation
76 * Optimization for count(collection()) expressions
77 * Fixed bug #872796 (validate-in-place can interfere with other update primitives)
78@@ -105,6 +113,8 @@
79 * Fixed bug #855715 (Invalid escaped characters in regex not caught)
80 * Fixed bug #862089 (Split binary/xq install directories for modules) by
81 splitting "module path" into separate URI and Library paths
82+ * New node-position module. This module allows to obtain a representation of a node position, which
83+ can be used to assess structural relationships with other nodes.
84 * Fixed bug #872502 (validation of the JSON module xqdoc fails)
85 * Fixed bug #897619 (testdriver_mt can not run the XQueryX tests)
86 * Fixed bug #867107 (xqdoc dependency to zorba is wrong)
87
88=== modified file 'KNOWN_ISSUES.txt'
89--- KNOWN_ISSUES.txt 2011-10-07 08:28:43 +0000
90+++ KNOWN_ISSUES.txt 2012-01-18 18:33:36 +0000
91@@ -37,7 +37,7 @@
92 * The serializer currently doesn't implement character maps as specified
93 (http://www.w3.org/TR/xslt-xquery-serialization/#character-maps)
94
95-* In the 2.0 release, setting the CMake variables ZORBA_NO_UNICODE to
96+* In the 2.0 release, setting the CMake variables ZORBA_NO_ICU to
97 ON is not supported.
98
99 * The PHP language binding is not supported on Mac OS X. For details,
100
101=== modified file 'doc/cxx/examples/context.cpp'
102--- doc/cxx/examples/context.cpp 2011-07-22 08:12:31 +0000
103+++ doc/cxx/examples/context.cpp 2012-01-18 18:33:36 +0000
104@@ -149,7 +149,11 @@
105 outStream2 << lQuery << std::endl;
106 std::cout << outStream2.str() << std::endl;
107
108+#ifndef ZORBA_NO_ICU
109 if (outStream2.str() != "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\nBook 1.1\n")
110+#else
111+ if (outStream2.str() != "<?xml version=\"1.0\"?>\nBook 1.1\n")
112+#endif /* ZORBA_NO_ICU */
113 {
114 std::cerr << "Test 4 failed with a wrong result : " << std::endl
115 << outStream2.str() << std::endl;
116
117=== modified file 'include/zorba/config.h.cmake'
118--- include/zorba/config.h.cmake 2012-01-11 17:30:25 +0000
119+++ include/zorba/config.h.cmake 2012-01-18 18:33:36 +0000
120@@ -93,6 +93,8 @@
121 typedef __int64 int64_t;
122 #endif /* ZORBA_HAVE_INT64_T */
123
124+#cmakedefine ZORBA_SIZEOF_WCHAR_T @ZORBA_SIZEOF_WCHAR_T@
125+
126 // Compiler
127 #cmakedefine CLANG
128 #cmakedefine MSVC
129@@ -145,7 +147,7 @@
130
131 // Zorba features
132 #cmakedefine ZORBA_NO_FULL_TEXT
133-#cmakedefine ZORBA_NO_UNICODE
134+#cmakedefine ZORBA_NO_ICU
135 #cmakedefine ZORBA_NO_XMLSCHEMA
136 #cmakedefine ZORBA_NUMERIC_OPTIMIZATION
137 #cmakedefine ZORBA_VERIFY_PEER_SSL_CERTIFICATE
138
139=== modified file 'include/zorba/util/time.h'
140--- include/zorba/util/time.h 2011-06-16 16:40:44 +0000
141+++ include/zorba/util/time.h 2012-01-18 18:33:36 +0000
142@@ -178,7 +178,7 @@
143
144 inline long get_walltime_in_millis(const walltime& t)
145 {
146- return t.time * 1000 + t.millitm;
147+ return (long)(t.time * 1000 + t.millitm);
148 }
149
150 #else /* not Windows, and no clock_gettime() */
151
152=== modified file 'src/api/serialization/serializer.cpp'
153--- src/api/serialization/serializer.cpp 2012-01-11 17:30:25 +0000
154+++ src/api/serialization/serializer.cpp 2012-01-18 18:33:36 +0000
155@@ -180,7 +180,6 @@
156 for (; chars < chars_end; chars++ )
157 {
158
159-#ifndef ZORBA_NO_UNICODE
160 // the input string is UTF-8
161 int char_length = utf8::char_length(*chars);
162 if (char_length == 0)
163@@ -217,7 +216,6 @@
164
165 continue;
166 }
167-#endif//ZORBA_NO_UNICODE
168
169 // raise an error iff (1) the serialization format is XML 1.0 and (2) the given character is an invalid XML 1.0 character
170 if (ser && ser->method == PARAMETER_VALUE_XML &&
171@@ -332,14 +330,12 @@
172 {
173 tr << (char)0xEF << (char)0xBB << (char)0xBF;
174 }
175-#ifndef ZORBA_NO_UNICODE
176 else if (ser->encoding == PARAMETER_VALUE_UTF_16)
177 {
178 // Little-endian
179 tr.verbatim((char)0xFF);
180 tr.verbatim((char)0xFE);
181 }
182-#endif
183 }
184 }
185
186@@ -834,13 +830,17 @@
187 emitter::emit_declaration();
188
189 if (ser->omit_xml_declaration == PARAMETER_VALUE_NO) {
190- tr << "<?xml version=\"" << ser->version << "\" encoding=\"";
191- if (ser->encoding == PARAMETER_VALUE_UTF_8) {
192- tr << "UTF-8";
193-#ifndef ZORBA_NO_UNICODE
194- } else if (ser->encoding == PARAMETER_VALUE_UTF_16) {
195- tr << "UTF-16";
196-#endif
197+ tr << "<?xml version=\"" << ser->version;
198+ switch (ser->encoding) {
199+ case PARAMETER_VALUE_UTF_8:
200+ case PARAMETER_VALUE_UTF_16:
201+ tr << "\" encoding=\"";
202+ switch (ser->encoding) {
203+ case PARAMETER_VALUE_UTF_8 : tr << "UTF-8" ; break;
204+ case PARAMETER_VALUE_UTF_16: tr << "UTF-16"; break;
205+ default : ZORBA_ASSERT(false);
206+ }
207+ break;
208 }
209 tr << "\"";
210
211@@ -1146,14 +1146,18 @@
212 }
213
214 tr << "<meta http-equiv=\"content-type\" content=\""
215- << ser->media_type << "; charset=";
216-
217- if (ser->encoding == PARAMETER_VALUE_UTF_8)
218- tr << "UTF-8";
219-#ifndef ZORBA_NO_UNICODE
220- else if (ser->encoding == PARAMETER_VALUE_UTF_16)
221- tr << "UTF-16";
222-#endif
223+ << ser->media_type;
224+ switch (ser->encoding) {
225+ case PARAMETER_VALUE_UTF_8:
226+ case PARAMETER_VALUE_UTF_16:
227+ tr << "\" charset=\"";
228+ switch (ser->encoding) {
229+ case PARAMETER_VALUE_UTF_8 : tr << "UTF-8" ; break;
230+ case PARAMETER_VALUE_UTF_16: tr << "UTF-16"; break;
231+ default : ZORBA_ASSERT(false);
232+ }
233+ break;
234+ }
235 tr << "\"";
236 // closed_parent_tag = 1;
237 }
238@@ -1343,14 +1347,18 @@
239 }
240
241 tr << "<meta http-equiv=\"content-type\" content=\""
242- << ser->media_type << "; charset=";
243-
244- if (ser->encoding == PARAMETER_VALUE_UTF_8)
245- tr << "UTF-8";
246-#ifndef ZORBA_NO_UNICODE
247- else if (ser->encoding == PARAMETER_VALUE_UTF_16)
248- tr << "UTF-16";
249-#endif
250+ << ser->media_type;
251+ switch (ser->encoding) {
252+ case PARAMETER_VALUE_UTF_8:
253+ case PARAMETER_VALUE_UTF_16:
254+ tr << "\" charset=\"";
255+ switch (ser->encoding) {
256+ case PARAMETER_VALUE_UTF_8 : tr << "UTF-8" ; break;
257+ case PARAMETER_VALUE_UTF_16: tr << "UTF-16"; break;
258+ default : ZORBA_ASSERT(false);
259+ }
260+ break;
261+ }
262 tr << "\"/";
263 //closed_parent_tag = 1;
264 }
265@@ -2052,10 +2060,8 @@
266 {
267 if (!strcmp(aValue, "UTF-8"))
268 encoding = PARAMETER_VALUE_UTF_8;
269-#ifndef ZORBA_NO_UNICODE
270 else if (!strcmp(aValue, "UTF-16"))
271 encoding = PARAMETER_VALUE_UTF_16;
272-#endif
273 else
274 throw XQUERY_EXCEPTION(
275 err::SEPM0016, ERROR_PARAMS( aValue, aName, ZED( GoodValuesAreUTF8 ) )
276@@ -2164,16 +2170,13 @@
277 {
278 tr = new transcoder(os, false);
279 }
280-#ifndef ZORBA_NO_UNICODE
281 else if (encoding == PARAMETER_VALUE_UTF_16)
282 {
283 tr = new transcoder(os, true);
284 }
285-#endif
286 else
287 {
288- ZORBA_ASSERT(0);
289- return false;
290+ ZORBA_ASSERT(false);
291 }
292
293 if (method == PARAMETER_VALUE_XML)
294
295=== modified file 'src/api/serialization/serializer.h'
296--- src/api/serialization/serializer.h 2011-11-11 07:44:01 +0000
297+++ src/api/serialization/serializer.h 2012-01-18 18:33:36 +0000
298@@ -70,10 +70,8 @@
299 PARAMETER_VALUE_TEXT,
300 PARAMETER_VALUE_BINARY,
301
302- PARAMETER_VALUE_UTF_8
303-#ifndef ZORBA_NO_UNICODE
304- ,PARAMETER_VALUE_UTF_16
305-#endif
306+ PARAMETER_VALUE_UTF_8,
307+ PARAMETER_VALUE_UTF_16
308 } PARAMETER_VALUE_TYPE;
309
310 protected:
311
312=== modified file 'src/diagnostics/diagnostic_en.xml'
313--- src/diagnostics/diagnostic_en.xml 2011-12-21 14:40:33 +0000
314+++ src/diagnostics/diagnostic_en.xml 2012-01-18 18:33:36 +0000
315@@ -3080,85 +3080,171 @@
316 <value>item type is not a subtype of "$3"</value>
317 </entry>
318
319- <entry key="U_REGEX_BAD_ESCAPE_SEQUENCE" if="!defined(ZORBA_NO_UNICODE)">
320+ <entry key="U_REGEX_BAD_ESCAPE_SEQUENCE" if="!defined(ZORBA_NO_ICU)">
321 <value>unrecognized backslash escape sequence</value>
322 </entry>
323
324- <entry key="U_REGEX_BAD_INTERVAL" if="!defined(ZORBA_NO_UNICODE)">
325+ <entry key="U_REGEX_BAD_INTERVAL" if="!defined(ZORBA_NO_ICU)">
326 <value>error in {min,max} interval</value>
327 </entry>
328
329- <entry key="U_REGEX_INTERNAL_ERROR" if="!defined(ZORBA_NO_UNICODE)">
330+ <entry key="U_REGEX_INTERNAL_ERROR" if="!defined(ZORBA_NO_ICU)">
331 <value>an internal ICU error (bug) was detected</value>
332 </entry>
333
334- <entry key="U_REGEX_INVALID_BACK_REF" if="!defined(ZORBA_NO_UNICODE)">
335+ <entry key="U_REGEX_INVALID_BACK_REF" if="!defined(ZORBA_NO_ICU)">
336 <value>backreference to a non-existent capture group</value>
337 </entry>
338
339- <entry key="U_REGEX_INVALID_FLAG" if="!defined(ZORBA_NO_UNICODE)">
340+ <entry key="U_REGEX_INVALID_FLAG" if="!defined(ZORBA_NO_ICU)">
341 <value>invalid value for match mode flags</value>
342 </entry>
343
344- <entry key="U_REGEX_INVALID_RANGE" if="!defined(ZORBA_NO_UNICODE)">
345+ <entry key="U_REGEX_INVALID_RANGE" if="!defined(ZORBA_NO_ICU)">
346 <value>in character range [x-y], x is greater than y</value>
347 </entry>
348
349- <entry key="U_REGEX_INVALID_STATE" if="!defined(ZORBA_NO_UNICODE)">
350+ <entry key="U_REGEX_INVALID_STATE" if="!defined(ZORBA_NO_ICU)">
351 <value>RegexMatcher in invalid state for requested operation</value>
352 </entry>
353
354- <entry key="U_REGEX_LOOK_BEHIND_LIMIT" if="!defined(ZORBA_NO_UNICODE)">
355+ <entry key="U_REGEX_LOOK_BEHIND_LIMIT" if="!defined(ZORBA_NO_ICU)">
356 <value>look-behind pattern matches must have a bounded maximum length</value>
357 </entry>
358
359- <entry key="U_REGEX_MAX_LT_MIN" if="!defined(ZORBA_NO_UNICODE)">
360+ <entry key="U_REGEX_MAX_LT_MIN" if="!defined(ZORBA_NO_ICU)">
361 <value>in {min,max}, max is less than min</value>
362 </entry>
363
364- <entry key="U_REGEX_MISMATCHED_PAREN" if="!defined(ZORBA_NO_UNICODE)">
365+ <entry key="U_REGEX_MISMATCHED_PAREN" if="!defined(ZORBA_NO_ICU)">
366 <value>incorrectly nested parentheses</value>
367 </entry>
368
369- <entry key="U_REGEX_MISSING_CLOSE_BRACKET" if="!defined(ZORBA_NO_UNICODE)">
370+ <entry key="U_REGEX_MISSING_CLOSE_BRACKET" if="!defined(ZORBA_NO_ICU)">
371 <value>missing ']'</value>
372 </entry>
373
374- <entry key="U_REGEX_NUMBER_TOO_BIG" if="!defined(ZORBA_NO_UNICODE)">
375+ <entry key="U_REGEX_NUMBER_TOO_BIG" if="!defined(ZORBA_NO_ICU)">
376 <value>decimal number is too large</value>
377 </entry>
378
379- <entry key="U_REGEX_OCTAL_TOO_BIG" if="!defined(ZORBA_NO_UNICODE)">
380+ <entry key="U_REGEX_OCTAL_TOO_BIG" if="!defined(ZORBA_NO_ICU)">
381 <value>octal character constants must be &lt;= 0377</value>
382 </entry>
383
384- <entry key="U_REGEX_PROPERTY_SYNTAX" if="!defined(ZORBA_NO_UNICODE)">
385+ <entry key="U_REGEX_PROPERTY_SYNTAX" if="!defined(ZORBA_NO_ICU)">
386 <value>incorrect Unicode property</value>
387 </entry>
388
389- <entry key="U_REGEX_RULE_SYNTAX" if="!defined(ZORBA_NO_UNICODE)">
390+ <entry key="U_REGEX_RULE_SYNTAX" if="!defined(ZORBA_NO_ICU)">
391 <value>syntax error</value>
392 </entry>
393
394- <entry key="U_REGEX_SET_CONTAINS_STRING" if="!defined(ZORBA_NO_UNICODE)">
395+ <entry key="U_REGEX_SET_CONTAINS_STRING" if="!defined(ZORBA_NO_ICU)">
396 <value>can not have UnicodeSets containing strings</value>
397 </entry>
398
399- <entry key="U_REGEX_STACK_OVERFLOW" if="!defined(ZORBA_NO_UNICODE)">
400+ <entry key="U_REGEX_STACK_OVERFLOW" if="!defined(ZORBA_NO_ICU)">
401 <value>backtrack stack overflow</value>
402 </entry>
403
404- <entry key="U_REGEX_STOPPED_BY_CALLER" if="!defined(ZORBA_NO_UNICODE)">
405+ <entry key="U_REGEX_STOPPED_BY_CALLER" if="!defined(ZORBA_NO_ICU)">
406 <value>matching operation aborted by user callback fn</value>
407 </entry>
408
409- <entry key="U_REGEX_TIME_OUT" if="!defined(ZORBA_NO_UNICODE)">
410+ <entry key="U_REGEX_TIME_OUT" if="!defined(ZORBA_NO_ICU)">
411 <value>maximum allowed match time exceeded</value>
412 </entry>
413
414- <entry key="U_REGEX_UNIMPLEMENTED" if="!defined(ZORBA_NO_UNICODE)">
415- <value>use of regular expression feature that is not yet implemented</value>
416- </entry>
417+ <entry key="U_REGEX_UNIMPLEMENTED" if="!defined(ZORBA_NO_ICU)">
418+ <value>use of regular expression feature that is not yet implemented</value>
419+ </entry>
420+
421+ <!-- Regex error messages for the ASCII (non-ICU, ZORBA_NO_ICU) regex engine -->
422+ <entry key="REGEX_UNIMPLEMENTED" if="defined(ZORBA_NO_ICU)">
423+ <value>use of regular expression feature that is not yet implemented</value>
424+ </entry>
425+
426+ <entry key="REGEX_MISMATCHED_PAREN" if="defined(ZORBA_NO_ICU)">
427+ <value>incorrectly nested parentheses</value>
428+ </entry>
429+
430+ <entry key="REGEX_BROKEN_P_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
431+ <value>broken \\p construct</value>
432+ </entry>
433+
434+ <entry key="REGEX_UNKNOWN_PL_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
435+ <value>unknown \\p{L?} category; supported categories: L, Lu, Ll, Lt, Lm, Lo</value>
436+ </entry>
437+
438+ <entry key="REGEX_UNKNOWN_PM_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
439+ <value>unknown \\p{M?} category; supported categories: M, Mn, Mc, Me</value>
440+ </entry>
441+
442+ <entry key="REGEX_UNKNOWN_PN_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
443+ <value>unknown \\p{N?} category; supported categories: N, Nd, Nl, No</value>
444+ </entry>
445+
446+ <entry key="REGEX_UNKNOWN_PP_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
447+ <value>unknown \\p{P?} category; supported categories: P, Pc, Pd, Ps, Pe, Pi, Pf, Po</value>
448+ </entry>
449+
450+ <entry key="REGEX_UNKNOWN_PZ_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
451+ <value>unknown \\p{Z?} category; supported categories: Z, Zs, Zl, Zp</value>
452+ </entry>
453+
454+ <entry key="REGEX_UNKNOWN_PS_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
455+ <value>unknown \\p{S?} category; supported categories: S, Sm, Sc, Sk, So</value>
456+ </entry>
457+
458+ <entry key="REGEX_UNKNOWN_PC_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
459+ <value>unknown \\p{C?} category; supported categories: C, Cc, Cf, Co, Cn(for not assigned)</value>
460+ </entry>
461+
462+ <entry key="REGEX_BROKEN_PIs_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
463+ <value>broken \\p{Is} construct; valid characters are [a-zA-Z0-9-]</value>
464+ </entry>
465+
466+ <entry key="REGEX_UNKNOWN_PIs_CONSTRUCT" if="defined(ZORBA_NO_ICU)">
467+ <value>unknown \\p{Is} category block; see supported block escapes here: http://www.w3.org/TR/xmlschema-2/#charcter-classes</value>
468+ </entry>
469+
470+ <entry key="REGEX_INVALID_UNICODE_CODEPOINT_u" if="defined(ZORBA_NO_ICU)">
471+ <value>invalid unicode hex, should be in form \\uXXXX or \\UXXXXXXXX</value>
472+ </entry>
473+
474+ <entry key="REGEX_UNKNOWN_ESC_CHAR" if="defined(ZORBA_NO_ICU)">
475+ <value>unknown \\? escape char; supported escapes are: \\[nrt\\|.?*+(){}[]-^$] for char escapes, \\[pP] for categories and \\[sSiIcCdDwW] for multichar groups</value>
476+ </entry>
477+
478+ <entry key="REGEX_INVALID_BACK_REF" if="defined(ZORBA_NO_ICU)">
479+ <value>\\$3 backreference to a non-existent capture group ($4 groups so far)</value>
480+ </entry>
481+
482+ <entry key="REGEX_INVALID_ATOM_CHAR" if="defined(ZORBA_NO_ICU)">
483+ <value>$3 - invalid character for an atom; forbidden characters are: [{}?*+|^]</value>
484+ </entry>
485+
486+ <entry key="REGEX_INVALID_SUBCLASS" if="defined(ZORBA_NO_ICU)">
487+ <value>malformed class subtraction</value>
488+ </entry>
489+
490+ <entry key="REGEX_INVALID_USE_OF_SUBCLASS" if="defined(ZORBA_NO_ICU)">
491+ <value>improper use of class subtraction: it must be the last construct in a class group [xxx-[yyy]]</value>
492+ </entry>
493+
494+ <entry key="REGEX_MULTICHAR_IN_CHAR_RANGE" if="defined(ZORBA_NO_ICU)">
495+ <value>multichars or char categories cannot be part of a char range</value>
496+ </entry>
497+
498+ <entry key="REGEX_MISSING_CLOSE_BRACKET" if="defined(ZORBA_NO_ICU)">
499+ <value>missing close bracket in char group</value>
500+ </entry>
501+
502+ <entry key="REGEX_MAX_LT_MIN" if="defined(ZORBA_NO_ICU)">
503+ <value>in {min,max}, max is less than min</value>
504+ </entry>
505+
506
507 <entry key="UnaryArithOp">
508 <value>unary arithmetic operator</value>
509
510=== modified file 'src/diagnostics/pregenerated/dict_en.cpp'
511--- src/diagnostics/pregenerated/dict_en.cpp 2011-12-21 14:40:33 +0000
512+++ src/diagnostics/pregenerated/dict_en.cpp 2012-01-18 18:33:36 +0000
513@@ -565,6 +565,69 @@
514 { "~ParserNoCreateTree", "XML tree creation failed" },
515 { "~PromotionImpossible", "promotion not possible" },
516 { "~QuotedColon_23", "\"$2\": $3" },
517+#if defined(ZORBA_NO_ICU)
518+ { "~REGEX_BROKEN_PIs_CONSTRUCT", "broken \\p{Is} construct; valid characters are [a-zA-Z0-9-]" },
519+#endif
520+#if defined(ZORBA_NO_ICU)
521+ { "~REGEX_BROKEN_P_CONSTRUCT", "broken \\p construct" },
522+#endif
523+#if defined(ZORBA_NO_ICU)
524+ { "~REGEX_INVALID_ATOM_CHAR", "$3 - invalid character for an atom; forbidden characters are: [{}?*+|^]" },
525+#endif
526+#if defined(ZORBA_NO_ICU)
527+ { "~REGEX_INVALID_BACK_REF", "\\$3 backreference to a non-existent capture group ($4 groups so far)" },
528+#endif
529+#if defined(ZORBA_NO_ICU)
530+ { "~REGEX_INVALID_SUBCLASS", "malformed class subtraction" },
531+#endif
532+#if defined(ZORBA_NO_ICU)
533+ { "~REGEX_INVALID_UNICODE_CODEPOINT_u", "invalid unicode hex, should be in form \\uXXXX or \\UXXXXXXXX" },
534+#endif
535+#if defined(ZORBA_NO_ICU)
536+ { "~REGEX_INVALID_USE_OF_SUBCLASS", "improper use of class subtraction: it must be the last construct in a class group [xxx-[yyy]]" },
537+#endif
538+#if defined(ZORBA_NO_ICU)
539+ { "~REGEX_MAX_LT_MIN", "in {min,max}, max is less than min" },
540+#endif
541+#if defined(ZORBA_NO_ICU)
542+ { "~REGEX_MISMATCHED_PAREN", "incorrectly nested parentheses" },
543+#endif
544+#if defined(ZORBA_NO_ICU)
545+ { "~REGEX_MISSING_CLOSE_BRACKET", "missing close bracket in char group" },
546+#endif
547+#if defined(ZORBA_NO_ICU)
548+ { "~REGEX_MULTICHAR_IN_CHAR_RANGE", "multichars or char categories cannot be part of a char range" },
549+#endif
550+#if defined(ZORBA_NO_ICU)
551+ { "~REGEX_UNIMPLEMENTED", "use of regular expression feature that is not yet implemented" },
552+#endif
553+#if defined(ZORBA_NO_ICU)
554+ { "~REGEX_UNKNOWN_ESC_CHAR", "unknown \\? escape char; supported escapes are: \\[nrt\\|.?*+(){}[]-^$] for char escapes, \\[pP] for categories and \\[sSiIcCdDwW] for multichar groups" },
555+#endif
556+#if defined(ZORBA_NO_ICU)
557+ { "~REGEX_UNKNOWN_PC_CONSTRUCT", "unknown \\p{C?} category; supported categories: C, Cc, Cf, Co, Cn(for not assigned)" },
558+#endif
559+#if defined(ZORBA_NO_ICU)
560+ { "~REGEX_UNKNOWN_PIs_CONSTRUCT", "unknown \\p{Is} category block; see supported block escapes here: http://www.w3.org/TR/xmlschema-2/#charcter-classes" },
561+#endif
562+#if defined(ZORBA_NO_ICU)
563+ { "~REGEX_UNKNOWN_PL_CONSTRUCT", "unknown \\p{L?} category; supported categories: L, Lu, Ll, Lt, Lm, Lo" },
564+#endif
565+#if defined(ZORBA_NO_ICU)
566+ { "~REGEX_UNKNOWN_PM_CONSTRUCT", "unknown \\p{M?} category; supported categories: M, Mn, Mc, Me" },
567+#endif
568+#if defined(ZORBA_NO_ICU)
569+ { "~REGEX_UNKNOWN_PN_CONSTRUCT", "unknown \\p{N?} category; supported categories: N, Nd, Nl, No" },
570+#endif
571+#if defined(ZORBA_NO_ICU)
572+ { "~REGEX_UNKNOWN_PP_CONSTRUCT", "unknown \\p{P?} category; supported categories: P, Pc, Pd, Ps, Pe, Pi, Pf, Po" },
573+#endif
574+#if defined(ZORBA_NO_ICU)
575+ { "~REGEX_UNKNOWN_PS_CONSTRUCT", "unknown \\p{S?} category; supported categories: S, Sm, Sc, Sk, So" },
576+#endif
577+#if defined(ZORBA_NO_ICU)
578+ { "~REGEX_UNKNOWN_PZ_CONSTRUCT", "unknown \\p{Z?} category; supported categories: Z, Zs, Zl, Zp" },
579+#endif
580 { "~SEPM0009_Not10", "the version parameter has a value other than \"1.0\" and the doctype-system parameter is specified" },
581 { "~SEPM0009_NotOmit", "the standalone attribute has a value other than \"omit\"" },
582 { "~SchemaAttributeName", "schema-attribute name" },
583@@ -588,64 +651,64 @@
584 { "~TwoDecimalFormatsSameName_2", "\"$2\": two decimal formats with this name" },
585 { "~TwoDefaultDecimalFormats", "two default decimal formats" },
586 { "~TypeIsNotSubtype", "item type is not a subtype of \"$3\"" },
587-#if !defined(ZORBA_NO_UNICODE)
588+#if !defined(ZORBA_NO_ICU)
589 { "~U_REGEX_BAD_ESCAPE_SEQUENCE", "unrecognized backslash escape sequence" },
590 #endif
591-#if !defined(ZORBA_NO_UNICODE)
592+#if !defined(ZORBA_NO_ICU)
593 { "~U_REGEX_BAD_INTERVAL", "error in {min,max} interval" },
594 #endif
595-#if !defined(ZORBA_NO_UNICODE)
596+#if !defined(ZORBA_NO_ICU)
597 { "~U_REGEX_INTERNAL_ERROR", "an internal ICU error (bug) was detected" },
598 #endif
599-#if !defined(ZORBA_NO_UNICODE)
600+#if !defined(ZORBA_NO_ICU)
601 { "~U_REGEX_INVALID_BACK_REF", "backreference to a non-existent capture group" },
602 #endif
603-#if !defined(ZORBA_NO_UNICODE)
604+#if !defined(ZORBA_NO_ICU)
605 { "~U_REGEX_INVALID_FLAG", "invalid value for match mode flags" },
606 #endif
607-#if !defined(ZORBA_NO_UNICODE)
608+#if !defined(ZORBA_NO_ICU)
609 { "~U_REGEX_INVALID_RANGE", "in character range [x-y], x is greater than y" },
610 #endif
611-#if !defined(ZORBA_NO_UNICODE)
612+#if !defined(ZORBA_NO_ICU)
613 { "~U_REGEX_INVALID_STATE", "RegexMatcher in invalid state for requested operation" },
614 #endif
615-#if !defined(ZORBA_NO_UNICODE)
616+#if !defined(ZORBA_NO_ICU)
617 { "~U_REGEX_LOOK_BEHIND_LIMIT", "look-behind pattern matches must have a bounded maximum length" },
618 #endif
619-#if !defined(ZORBA_NO_UNICODE)
620+#if !defined(ZORBA_NO_ICU)
621 { "~U_REGEX_MAX_LT_MIN", "in {min,max}, max is less than min" },
622 #endif
623-#if !defined(ZORBA_NO_UNICODE)
624+#if !defined(ZORBA_NO_ICU)
625 { "~U_REGEX_MISMATCHED_PAREN", "incorrectly nested parentheses" },
626 #endif
627-#if !defined(ZORBA_NO_UNICODE)
628+#if !defined(ZORBA_NO_ICU)
629 { "~U_REGEX_MISSING_CLOSE_BRACKET", "missing ']'" },
630 #endif
631-#if !defined(ZORBA_NO_UNICODE)
632+#if !defined(ZORBA_NO_ICU)
633 { "~U_REGEX_NUMBER_TOO_BIG", "decimal number is too large" },
634 #endif
635-#if !defined(ZORBA_NO_UNICODE)
636+#if !defined(ZORBA_NO_ICU)
637 { "~U_REGEX_OCTAL_TOO_BIG", "octal character constants must be <= 0377" },
638 #endif
639-#if !defined(ZORBA_NO_UNICODE)
640+#if !defined(ZORBA_NO_ICU)
641 { "~U_REGEX_PROPERTY_SYNTAX", "incorrect Unicode property" },
642 #endif
643-#if !defined(ZORBA_NO_UNICODE)
644+#if !defined(ZORBA_NO_ICU)
645 { "~U_REGEX_RULE_SYNTAX", "syntax error" },
646 #endif
647-#if !defined(ZORBA_NO_UNICODE)
648+#if !defined(ZORBA_NO_ICU)
649 { "~U_REGEX_SET_CONTAINS_STRING", "can not have UnicodeSets containing strings" },
650 #endif
651-#if !defined(ZORBA_NO_UNICODE)
652+#if !defined(ZORBA_NO_ICU)
653 { "~U_REGEX_STACK_OVERFLOW", "backtrack stack overflow" },
654 #endif
655-#if !defined(ZORBA_NO_UNICODE)
656+#if !defined(ZORBA_NO_ICU)
657 { "~U_REGEX_STOPPED_BY_CALLER", "matching operation aborted by user callback fn" },
658 #endif
659-#if !defined(ZORBA_NO_UNICODE)
660+#if !defined(ZORBA_NO_ICU)
661 { "~U_REGEX_TIME_OUT", "maximum allowed match time exceeded" },
662 #endif
663-#if !defined(ZORBA_NO_UNICODE)
664+#if !defined(ZORBA_NO_ICU)
665 { "~U_REGEX_UNIMPLEMENTED", "use of regular expression feature that is not yet implemented" },
666 #endif
667 { "~UnaryArithOp", "unary arithmetic operator" },
668
669=== modified file 'src/runtime/full_text/CMakeLists.txt'
670--- src/runtime/full_text/CMakeLists.txt 2011-08-31 13:17:59 +0000
671+++ src/runtime/full_text/CMakeLists.txt 2012-01-18 18:33:36 +0000
672@@ -42,11 +42,11 @@
673 default_tokenizer.cpp
674 )
675
676-IF (ZORBA_NO_UNICODE)
677+IF (ZORBA_NO_ICU)
678 LIST(APPEND FULLTEXT_SRCS latin_tokenizer.cpp)
679-ELSE (ZORBA_NO_UNICODE)
680+ELSE (ZORBA_NO_ICU)
681 LIST(APPEND FULLTEXT_SRCS icu_tokenizer.cpp)
682-ENDIF (ZORBA_NO_UNICODE)
683+ENDIF (ZORBA_NO_ICU)
684
685 ADD_SRC_SUBFOLDER(FULLTEXT_SRCS stemmer LIBSTEMMER_SRCS)
686
687
688=== modified file 'src/runtime/full_text/default_tokenizer.cpp'
689--- src/runtime/full_text/default_tokenizer.cpp 2011-08-31 02:53:07 +0000
690+++ src/runtime/full_text/default_tokenizer.cpp 2012-01-18 18:33:36 +0000
691@@ -19,22 +19,22 @@
692 #include <zorba/config.h>
693
694 #include "default_tokenizer.h"
695-#ifdef ZORBA_NO_UNICODE
696+#ifdef ZORBA_NO_ICU
697 # include "latin_tokenizer.h"
698 #else
699 # include "icu_tokenizer.h"
700-#endif /* ZORBA_NO_UNICODE */
701+#endif /* ZORBA_NO_ICU */
702
703 namespace zorba {
704
705 ///////////////////////////////////////////////////////////////////////////////
706
707 TokenizerProvider const& default_tokenizer_provider() {
708-#ifdef ZORBA_NO_UNICODE
709+#ifdef ZORBA_NO_ICU
710 static LatinTokenizerProvider const instance;
711 #else
712 static ICU_TokenizerProvider const instance;
713-#endif /* ZORBA_NO_UNICODE */
714+#endif /* ZORBA_NO_ICU */
715 return instance;
716 };
717
718
719=== modified file 'src/runtime/full_text/latin_tokenizer.cpp'
720--- src/runtime/full_text/latin_tokenizer.cpp 2011-08-31 03:39:32 +0000
721+++ src/runtime/full_text/latin_tokenizer.cpp 2012-01-18 18:33:36 +0000
722@@ -18,8 +18,9 @@
723 #include <functional>
724
725 #include <zorba/diagnostic_list.h>
726-#include <zorba/xquery_exception.h>
727-#include <zorba/zorba.h>
728+
729+#include "diagnostics/dict.h"
730+#include "diagnostics/xquery_exception.h"
731
732 #include "latin_tokenizer.h"
733
734
735=== modified file 'src/runtime/full_text/latin_tokenizer.h'
736--- src/runtime/full_text/latin_tokenizer.h 2011-08-31 03:39:32 +0000
737+++ src/runtime/full_text/latin_tokenizer.h 2012-01-18 18:33:36 +0000
738@@ -14,12 +14,12 @@
739 * limitations under the License.
740 */
741
742-#ifndef ZORBA_WESTERN_TOKENIZER_H
743-#define ZORBA_WESTERN_TOKENIZER_H
744+#ifndef ZORBA_LATIN_TOKENIZER_H
745+#define ZORBA_LATIN_TOKENIZER_H
746
747 #include <zorba/config.h>
748
749-#ifdef ZORBA_NO_FULL_TEXT
750+#ifdef ZORBA_NO_ICU
751
752 #include <zorba/tokenizer.h>
753 #include "zorbatypes/zstring.h"
754@@ -38,8 +38,8 @@
755
756 // inherited
757 void destroy() const;
758- void tokenize( char const*, size_type, iso639_1::type, bool, Callback&,
759- void* );
760+ void tokenize( char const*, size_type, locale::iso639_1::type, bool,
761+ Callback&, void* );
762
763 private:
764 typedef zstring string_type;
765@@ -64,13 +64,14 @@
766 class LatinTokenizerProvider : public TokenizerProvider {
767 public:
768 // inherited
769- Tokenizer::ptr getTokenizer( iso639_1::type, Tokenizer::Numbers& ) const;
770+ Tokenizer::ptr getTokenizer( locale::iso639_1::type,
771+ Tokenizer::Numbers& ) const;
772 };
773
774 ///////////////////////////////////////////////////////////////////////////////
775
776 } // namespace zorba
777
778-#endif /* ZORBA_NO_FULL_TEXT */
779-#endif /* ZORBA_WESTERN_TOKENIZER_H */
780+#endif /* ZORBA_NO_ICU */
781+#endif /* ZORBA_LATIN_TOKENIZER_H */
782 /* vim:set et sw=2 ts=2: */
783
784=== modified file 'src/runtime/numerics/format_integer_impl.cpp'
785--- src/runtime/numerics/format_integer_impl.cpp 2011-07-07 12:47:14 +0000
786+++ src/runtime/numerics/format_integer_impl.cpp 2012-01-18 18:33:36 +0000
787@@ -881,7 +881,7 @@
788 utf8_result += (*valueit);
789 }
790 else
791- utf8_result += (0x2080 + *valueit - '0');
792+ utf8_result += (unicode::code_point)(0x2080 + *valueit - '0');
793 }
794 }
795 else if((c0 == 0x2460) || //CIRCLED DIGIT ONE (1-20)
796
797=== modified file 'src/runtime/numerics/numerics_impl.cpp'
798--- src/runtime/numerics/numerics_impl.cpp 2011-07-10 14:55:46 +0000
799+++ src/runtime/numerics/numerics_impl.cpp 2012-01-18 18:33:36 +0000
800@@ -490,7 +490,7 @@
801 minus( "-" )
802 {
803 utf8_string<zstring> u_per_mille( per_mille );
804- u_per_mille = 0x2030;
805+ u_per_mille = (unicode::code_point)0x2030;
806 }
807
808 void readFormat(const DecimalFormat_t& df_t)
809
810=== modified file 'src/runtime/strings/strings_impl.cpp'
811--- src/runtime/strings/strings_impl.cpp 2012-01-11 17:30:25 +0000
812+++ src/runtime/strings/strings_impl.cpp 2012-01-18 18:33:36 +0000
813@@ -806,7 +806,9 @@
814 zstring normForm;
815 zstring resStr;
816 unicode::normalization::type normType;
817+#ifndef ZORBA_NO_ICU
818 bool success;
819+#endif /* ZORBA_NO_ICU */
820
821 PlanIteratorState* state;
822 DEFAULT_STACK_INIT(PlanIteratorState, state, planState);
823@@ -856,10 +858,10 @@
824 }
825
826 item0->getStringValue2(resStr);
827-#ifndef ZORBA_NO_UNICODE
828+#ifndef ZORBA_NO_ICU
829 success = utf8::normalize(resStr, normType, &resStr);
830 ZORBA_ASSERT(success);
831-#endif//#ifndef ZORBA_NO_UNICODE
831+#endif /* ZORBA_NO_ICU */
833 STACK_PUSH(GENV_ITEMFACTORY->createString(result, resStr), state );
834 }
835 else
836@@ -988,7 +990,7 @@
837 trans_map[ *map_i ] = *trans_i;
838
839 for ( ; map_i != map_end; ++map_i )
840- trans_map[ *map_i ] = ~0;
841+ trans_map[ *map_i ] = static_cast<unicode::code_point>( ~0 );
842 }
843
844 utf8_string<zstring> u_result_string( result_string );
845@@ -1003,7 +1005,7 @@
846 cp_map_type::const_iterator const found_i = trans_map.find( cp );
847 if ( found_i != trans_map.end() ) {
848 cp = found_i->second;
849- if ( cp == ~0 )
850+ if ( cp == static_cast<unicode::code_point>( ~0 ) )
851 continue;
852 }
853 u_result_string += cp;
854@@ -1791,16 +1793,33 @@
855 int &utf8start,
856 unsigned int &bytestart,
857 int utf8end,
858+ unsigned int byteend,
859 zstring &out)
860 {
861+#ifndef ZORBA_NO_ICU
862 utf8::size_type clen;
863- while(utf8start < utf8end)
864- {
865- clen = utf8::char_length(*sin);
866- out.append(sin, clen);
867- utf8start++;
868- bytestart += clen;
869- sin += clen;
870+ if(utf8end)
871+ {
872+ while(utf8start < utf8end)
873+ {
874+ clen = utf8::char_length(*sin);
875+ if(clen == 0)
876+ clen = 1;
877+ out.append(sin, clen);
878+ utf8start++;
879+ bytestart += clen;
880+ sin += clen;
881+ }
882+ }
883+ else
884+#endif
885+ {
886+ if(!utf8end)
887+ utf8end = byteend;
888+ out.append(sin, utf8end-bytestart);
889+ sin += utf8end-bytestart;
890+ utf8start = utf8end;
891+ bytestart = utf8end;
892 }
893 }
894
895@@ -1808,6 +1827,7 @@
896 int &match_end1,
897 unsigned int &match_end1_bytes,
898 int match_start2,
899+ unsigned int match_start2_bytes,
900 const char *&strin)
901 {
902 store::Item_t non_match_elem;
903@@ -1829,7 +1849,7 @@
904 // utf8_it++;
905 // match_end1++;
906 //}
907- copyUtf8Chars(strin, match_end1, match_end1_bytes, match_start2, non_match_str);
908+ copyUtf8Chars(strin, match_end1, match_end1_bytes, match_start2, match_start2_bytes, non_match_str);
909 store::Item_t non_match_text_item;
910 GENV_ITEMFACTORY->createTextNode(non_match_text_item, non_match_elem, non_match_str);
911 }
912@@ -1860,19 +1880,31 @@
913 i--;
914 break;
915 }
916+#ifndef ZORBA_NO_ICU
917 match_startg = rx.get_match_start(i+1);
918 if((match_startg < 0) && (gparent < 0))
919 continue;
920+#else
921+ int temp_endg;
922+ match_startg = -1;
923+ temp_endg = -1;
924+ if(!rx.get_match_start_end_bytes(i+1, &match_startg, &temp_endg) && (gparent < 0))
925+ continue;
926+#endif
927 if(match_endgood < match_startg)
928 {
929 //add non-group match text
930 zstring non_group_str;
931
932- copyUtf8Chars(sin, match_endgood, match_end1_bytes, match_startg, non_group_str);
933+ copyUtf8Chars(sin, match_endgood, match_end1_bytes, match_startg, 0, non_group_str);
934 store::Item_t non_group_text_item;
935 GENV_ITEMFACTORY->createTextNode(non_group_text_item, parent.getp(), non_group_str);
936 }
937+#ifndef ZORBA_NO_ICU
938 match_endg = rx.get_match_end(i+1);
939+#else
940+ match_endg = temp_endg;
941+#endif
942 //add group match text
943 GENV_ITEMFACTORY->createQName(group_element_name,
944 static_context::W3C_FN_NS, "fn", "group");
945@@ -1903,7 +1935,7 @@
946 }
947 zstring group_str;
948
949- copyUtf8Chars(sin, match_startg, match_end1_bytes, match_endg, group_str);
950+ copyUtf8Chars(sin, match_startg, match_end1_bytes, match_endg, 0, group_str);
951 store::Item_t group_text_item;
952 GENV_ITEMFACTORY->createTextNode(group_text_item, group_elem.getp(), group_str);
953 }
954@@ -1912,7 +1944,7 @@
955 {
956 zstring non_group_str;
957
958- copyUtf8Chars(sin, match_endgood, match_end1_bytes, match_end2, non_group_str);
959+ copyUtf8Chars(sin, match_endgood, match_end1_bytes, match_end2, 0, non_group_str);
960 store::Item_t non_group_text_item;
961 GENV_ITEMFACTORY->createTextNode(non_group_text_item, parent, non_group_str);
962 }
963@@ -2140,8 +2172,14 @@
964 reachedEnd = false;
965 while(rx.find_next_match(&reachedEnd))
966 {
967- int match_start2 = rx.get_match_start();
968- int match_end2 = rx.get_match_end();
969+ int match_start2;
970+ int match_end2;
971+#ifndef ZORBA_NO_ICU
972+ match_start2 = rx.get_match_start();
973+ match_end2 = rx.get_match_end();
974+#else
975+ rx.get_match_start_end_bytes(0, &match_start2, &match_end2);
976+#endif
977 ZORBA_ASSERT(match_start2 >= 0);
978
979 if(is_input_stream && reachedEnd && !instream->eof())
980@@ -2153,7 +2191,7 @@
981 //construct the fn:non-match
982 if(match_start2 > match_end1)
983 {
984- addNonMatchElement(result, match_end1, match_end1_bytes, match_start2, instr);
985+ addNonMatchElement(result, match_end1, match_end1_bytes, match_start2, 0, instr);
986 }
987
988 //construct the fn:match
989@@ -2161,7 +2199,7 @@
990 match_end1 = match_end2;
991 }
992
993- if(is_input_stream && reachedEnd && !instream->eof())
994+ if(is_input_stream && !instream->eof())
995 {
996 //load some more data, maybe the match will be different
997 if(match_end1_bytes)
998@@ -2209,7 +2247,7 @@
999 else
1000 {
1001 if(match_end1_bytes < streambuf_read)
1002- addNonMatchElement(result, match_end1, match_end1_bytes, streambuf_read, instr);
1003+ addNonMatchElement(result, match_end1, match_end1_bytes, 0, streambuf_read, instr);
1004 if(is_input_stream && instream->eof())
1005 reachedEnd = true;
1006 }
1007
1008=== modified file 'src/system/globalenv.cpp'
1009--- src/system/globalenv.cpp 2012-01-11 17:30:25 +0000
1010+++ src/system/globalenv.cpp 2012-01-18 18:33:36 +0000
1011@@ -17,11 +17,11 @@
1012
1013 #include "common/common.h"
1014
1015-#ifndef ZORBA_NO_UNICODE
1016+#ifndef ZORBA_NO_ICU
1017 # include <unicode/uclean.h>
1018 # include <unicode/utypes.h>
1019 # include <unicode/udata.h>
1020-#endif /* ZORBA_NO_UNICODE */
1021+#endif /* ZORBA_NO_ICU */
1022
1023 #ifdef ZORBA_WITH_BIG_INTEGER
1024 # include "zorbatypes/m_apm.h"
1025@@ -208,7 +208,7 @@
1026 // from one thread only
1027 // see http://www.icu-project.org/userguide/design.html#Init_and_Termination
1028 // and http://www.icu-project.org/apiref/icu4c/uclean_8h.html
1029-#ifndef ZORBA_NO_UNICODE
1030+#ifndef ZORBA_NO_ICU
1031 # if defined U_STATIC_IMPLEMENTATION && (defined WIN32 || defined WINCE)
1032 {
1033 TCHAR self_path[1024];
1034@@ -238,13 +238,13 @@
1035 udata_setCommonData(icu_appdata, &data_err);
1036 ZORBA_ASSERT(data_err == U_ZERO_ERROR);
1037
1038- // u_setDataDirectory(self_path);
1039+ // u_setDataDirectory(self_path);
1040 }
1041 # endif
1042 UErrorCode lICUInitStatus = U_ZERO_ERROR;
1043 u_init(&lICUInitStatus);
1044 ZORBA_ASSERT(lICUInitStatus == U_ZERO_ERROR);
1045-#endif//ifndef ZORBA_NO_UNICODE
1046+#endif /* ZORBA_NO_ICU */
1047 }
1048
1049
1050@@ -256,12 +256,12 @@
1051 // releases statically initialized memory and prevents
1052 // valgrind from reporting those problems at the end
1053 // see http://www.icu-project.org/apiref/icu4c/uclean_8h.html#93f27d0ddc7c196a1da864763f2d8920
1054-#ifndef ZORBA_NO_UNICODE
1055+#ifndef ZORBA_NO_ICU
1056 u_cleanup();
1057 # if defined U_STATIC_IMPLEMENTATION && (defined WIN32 || defined WINCE)
1058 delete[] icu_appdata;
1059 # endif
1060-#endif//ifndef ZORBA_NO_UNICODE
1061+#endif /* ZORBA_NO_ICU */
1062 }
1063
1064
1065
1066=== modified file 'src/util/CMakeLists.txt'
1067--- src/util/CMakeLists.txt 2011-07-18 14:25:21 +0000
1068+++ src/util/CMakeLists.txt 2012-01-18 18:33:36 +0000
1069@@ -38,9 +38,9 @@
1070 LIST(APPEND UTIL_SRCS mmap_file.cpp)
1071 ENDIF(ZORBA_WITH_FILE_ACCESS)
1072
1073-IF(ZORBA_NO_UNICODE)
1074- LIST(APPEND UTIL_SRCS regex_ascii.cpp)
1075-ENDIF(ZORBA_NO_UNICODE)
1076+IF(ZORBA_NO_ICU)
1077+ LIST(APPEND UTIL_SRCS regex_xquery.cpp)
1078+ENDIF(ZORBA_NO_ICU)
1079
1080 HEADER_GROUP_SUBFOLDER(UTIL_SRCS fx)
1081 HEADER_GROUP_SUBFOLDER(UTIL_SRCS win32)
1082
1083=== modified file 'src/util/regex.cpp'
1084--- src/util/regex.cpp 2011-09-24 00:16:36 +0000
1085+++ src/util/regex.cpp 2012-01-18 18:33:36 +0000
1086@@ -33,8 +33,7 @@
1087 #define INVALID_RE_EXCEPTION(...) \
1088 XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS( __VA_ARGS__ ) )
1089
1090-
1091-#ifndef ZORBA_NO_UNICODE
1092+#ifndef ZORBA_NO_ICU
1093 # include <unicode/uversion.h>
1094 U_NAMESPACE_USE
1095
1096@@ -442,11 +441,11 @@
1097 }
1098
1099 } // namespace unicode
1100-
1101-}//namespace zorba
1102-
1103-
1104-#else /* ZORBA_NO_UNICODE */
1105+} // namespace zorba
1106+
1107+///////////////////////////////////////////////////////////////////////////////
1108+
1109+#else /* ZORBA_NO_ICU */
1110
1111 #include "zorbatypes/zstring.h"
1112
1113@@ -470,7 +469,7 @@
1114 case 'i': flags |= REGEX_ASCII_CASE_INSENSITIVE; break;
1115 case 's': flags |= REGEX_ASCII_DOTALL; break;
1116 case 'm': flags |= REGEX_ASCII_MULTILINE; break;
1117- case 'x': flags |= REGEX_ASCII_COMMENTS; break;
1118+ case 'x': flags |= REGEX_ASCII_NO_WHITESPACE; break;
1119 case 'q': flags |= REGEX_ASCII_LITERAL; break;
1120 default:
1121 throw XQUERY_EXCEPTION( err::FORX0001, ERROR_PARAMS( *p ) );
1122@@ -483,6 +482,7 @@
1123 void regex::compile( char const *pattern, char const *flags)
1124 {
1125 parsed_flags = parse_regex_flags(flags);
1126+ regex_xquery::CRegexXQuery_parser regex_parser;
1127 regex_matcher = regex_parser.parse(pattern, parsed_flags);
1128 if(!regex_matcher)
1129 throw INVALID_RE_EXCEPTION(pattern);
1130@@ -517,6 +517,8 @@
1131 bool regex::next_token( char const *s, size_type *pos, zstring *token,
1132 bool *matched)
1133 {
1134+ if(!s[*pos])
1135+ return false;
1136 bool retval;
1137 int match_pos;
1138 int matched_len;
1139@@ -528,14 +530,8 @@
1140 token->assign(s+*pos, match_pos);
1141 *pos += match_pos + matched_len;
1142 if(matched)
1143- if(match_pos)
1144- *matched = true;
1145- else
1146- *matched = false;
1147- if(match_pos)
1148- return true;
1149- else
1150- return false;
1151+ *matched = true;
1152+ return true;
1153 }
1154 else
1155 {
1156@@ -544,7 +540,7 @@
1157 *pos += strlen(s+*pos);
1158 if(matched)
1159 *matched = false;
1160- return s[*pos] != 0;
1161+ return true;
1162 }
1163 }
1164
1165@@ -554,13 +550,9 @@
1166 int matched_pos;
1167 int matched_len;
1168
1169- bool prev_align = regex_matcher->set_align_begin(true);
1170- retval = regex_matcher->match_from(s, parsed_flags, &matched_pos, &matched_len);
1171- regex_matcher->set_align_begin(prev_align);
1172+ retval = regex_matcher->match_anywhere(s, parsed_flags|REGEX_ASCII_WHOLE_MATCH, &matched_pos, &matched_len);
1173 if(!retval)
1174 return false;
1175- if(matched_len != strlen(s))
1176- return false;
1177 return true;
1178 }
1179
1180@@ -587,14 +579,19 @@
1181 //look for dollars
1182 if(*temprepl == '\\')
1183 {
1184- temprepl++;
1185- if(!*temprepl || (*temprepl != '\\') || (*temprepl != '$'))//Invalid replacement string.
1186- throw XQUERY_EXCEPTION( err::FORX0004, ERROR_PARAMS( replacement ) );
1187+ if(!(parsed_flags & REGEX_ASCII_LITERAL))
1188+ {
1189+ temprepl++;
1190+ if(!*temprepl)
1191+ temprepl--;
1192+ else if((*temprepl != '\\') && (*temprepl != '$'))//Invalid replacement string.
1193+ throw XQUERY_EXCEPTION( err::FORX0004, ERROR_PARAMS( replacement ) );
1194+ }
1195 result->append(1, *temprepl);
1196 temprepl++;
1197 continue;
1198 }
1199- if(*temprepl == '$')
1200+ if((*temprepl == '$') && !(parsed_flags & REGEX_ASCII_LITERAL))
1201 {
1202 temprepl++;
1203 index = 0;
1204@@ -648,7 +645,7 @@
1205 if(retval)
1206 {
1207 m_match_pos += m_pos;
1208- m_pos = m_match_pos = m_matched_len;
1209+ m_pos = m_match_pos + m_matched_len;
1210 }
1211 else
1212 {
1213@@ -666,35 +663,30 @@
1214 return (int)regex_matcher->get_indexed_regex_count();
1215 }
1216
1217-int regex::get_match_start( int groupId )
1218-{
1219- if(groupId == 0)
1220- return m_match_pos;
1221- if(groupId > (int)regex_matcher->get_indexed_regex_count())
1222- return -1;
1223- const char *submatched_source;
1224- int submatched_len;
1225- if(!regex_matcher->get_indexed_match(groupId, &submatched_source, &submatched_len))
1226- return -1;
1227- return submatched_source - s_in_.c_str();
1228-}
1229-
1230-int regex::get_match_end( int groupId )
1231-{
1232- if(groupId == 0)
1233- return m_match_pos + m_matched_len;
1234- if(groupId > (int)regex_matcher->get_indexed_regex_count())
1235- return -1;
1236- const char *submatched_source;
1237- int submatched_len;
1238- if(!regex_matcher->get_indexed_match(groupId, &submatched_source, &submatched_len))
1239- return -1;
1240- return submatched_source - s_in_.c_str() + submatched_len;
1241+bool regex::get_match_start_end_bytes( int groupId, int *start, int *end )
1242+{
1243+ *start = -1;
1244+ *end = -1;
1245+ if(groupId == 0)
1246+ {
1247+ *start = m_match_pos;
1248+ *end = m_match_pos + m_matched_len;
1249+ return true;
1250+ }
1251+ if(groupId > (int)regex_matcher->get_indexed_regex_count())
1252+ return false;
1253+ const char *submatched_source;
1254+ int submatched_len;
1255+ if(!regex_matcher->get_indexed_match(groupId, &submatched_source, &submatched_len))
1256+ return false;
1257+ *start = submatched_source - s_in_.c_str();
1258+ *end = *start + submatched_len;
1259+ return true;
1260 }
1261
1262 } // namespace unicode
1263 } // namespace zorba
1264-#endif /* ZORBA_NO_UNICODE */
1265+#endif /* ZORBA_NO_ICU */
1266
1267 ///////////////////////////////////////////////////////////////////////////////
1268
1269
1270=== modified file 'src/util/regex.h'
1271--- src/util/regex.h 2011-07-18 14:25:21 +0000
1272+++ src/util/regex.h 2012-01-18 18:33:36 +0000
1273@@ -17,15 +17,13 @@
1274 #ifndef ZORBA_REGEX_H
1275 #define ZORBA_REGEX_H
1276
1277-#ifndef ZORBA_NO_UNICODE
1278-#include <unicode/regex.h>
1279-#endif
1280-
1281 #include "cxx_util.h"
1282 #include "unicode_util.h"
1283 #include "zorbatypes/zstring.h"
1284
1285-#ifndef ZORBA_NO_UNICODE
1286+#ifndef ZORBA_NO_ICU
1287+
1288+#include <unicode/regex.h>
1289
1290 namespace zorba {
1291
1292@@ -496,15 +494,17 @@
1293 } // namespace unicode
1294 } // namespace zorba
1295
1296-#else ///ZORBA_NO_UNICODE (ascii part:)
1297-
1298-#include "util/regex_ascii.h"
1299+///////////////////////////////////////////////////////////////////////////////
1300+
1301+#else /* ZORBA_NO_ICU */
1302+
1303+#include "util/regex_xquery.h"
1304 #include <string>
1305
1306 namespace zorba{
1307 /**
1308 * Converts an XQuery regular expression to the form used by the regular
1309- * expression library Zorba is using (here regex_ascii).
1310+ * expression library Zorba is using (here regex_xquery).
1311 *
1312 * @param xq_re The XQuery regular expression.
1313 * @param lib_re A pointer to the resuling library regular expression.
1314@@ -525,7 +525,7 @@
1315 /**
1316 * Constructs a %regex.
1317 */
1318- regex() : regex_matcher( NULL ) { }
1319+ regex() : regex_matcher( nullptr ) { }
1320
1321 /**
1322 * Destroys a %regex.
1323@@ -835,31 +835,21 @@
1324
1325 /**
1326 * Get the start position of the matched group.
1327- * If groupId is zero, then the start position of the whole match is returned.
1328- * If groupId is non-zero, then the start position of that group is returned.
1329- * If that group has not been matched, -1 is returned.
1330+ * If groupId is zero, then the start and end positions of the whole match are returned.
1331+ * If groupId is non-zero, then the start and end positions of that group are returned.
1332+ * If that group has not been matched, false is returned.
1333 *
1334 * @param groupId the id of the group, either zero for the entire regex,
1335 * or [1 .. group_count] for that specific group
1336- * @return the start position, zero based, or -1 if that group didn't match
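1337+ * @param start receives the start position as a byte offset, or -1 if the group did not match
1338+ * @param end receives the end position (start plus matched length) as a byte offset, or -1 if the group did not match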
1339+ * @return true if that group exists and has been matched
1340 */
1341- int get_match_start( int groupId = 0 );
1342+ bool get_match_start_end_bytes( int groupId, int *start, int *end );
1343
1344- /**
1345- * Get the end position of the matched group.
1346- * If groupId is zero, then the end position of the whole match is returned.
1347- * If groupId is non-zero, then the end position of that group is returned.
1348- * If that group has not been matched, -1 is returned.
1349- *
1350- * @param groupId the id of the group, either zero for the entire regex,
1351- * or [1 .. group_count] for that specific group
1352- * @return the end position, zero based, or -1 if that group didn't match
1353- */
1354- int get_match_end( int groupId = 0 );
1355
1356 private:
1357- regex_ascii::CRegexAscii_parser regex_parser;
1358- regex_ascii::CRegexAscii_regex *regex_matcher;
1359+ regex_xquery::CRegexXQuery_regex *regex_matcher;
1360 uint32_t parsed_flags;
1361
1362 zstring s_in_;
1363@@ -873,15 +863,13 @@
1364 regex( regex const& );
1365 regex& operator=( regex const& );
1366 };
1367+
1368+///////////////////////////////////////////////////////////////////////////////
1369+
1370 } // namespace unicode
1371 } // namespace zorba
1372
1373-#endif /* ZORBA_NO_UNICODE */
1374-
1375-
1376-///////////////////////////////////////////////////////////////////////////////
1377-
1378-
1379+#endif /* ZORBA_NO_ICU */
1380 #endif /* ZORBA_REGEX_H */
1381 /*
1382 * Local variables:
1383
1384=== renamed file 'src/util/regex_ascii.cpp' => 'src/util/regex_xquery.cpp'
1385--- src/util/regex_ascii.cpp 2011-08-05 02:21:55 +0000
1386+++ src/util/regex_xquery.cpp 2012-01-18 18:33:36 +0000
1387@@ -1,4 +1,4 @@
1388-a/*
1389+/*
1390 * Copyright 2006-2008 The FLWOR Foundation.
1391 *
1392 * Licensed under the Apache License, Version 2.0 (the "License");
1393@@ -18,12 +18,15 @@
1394
1395 #include "diagnostics/xquery_diagnostics.h"
1396
1397-#include "regex_ascii.h"
1398+#include "regex_xquery.h"
1399 #include <string.h>
1400 #include "zorbatypes/chartype.h"
1401+#include "util/unicode_categories.h"
1402+#include "util/ascii_util.h"
1403+#include "util/utf8_string.h"
1404
1405 namespace zorba {
1406- namespace regex_ascii{
1407+ namespace regex_xquery{
1408 //ascii regular expression matching
1409
1410 /*http://www.w3.org/TR/xmlschema-2/#regexs
1411@@ -62,96 +65,138 @@
1412 + http://www.w3.org/TR/xquery-operators/#regex-syntax (not implemented)
1413 */
1414
1415+
1416+static bool compare_ascii_i(const char *str1, const char *str2)
1417+{
1418+ while(*str1 && *str2)
1419+ {
1420+ if(ascii::to_lower(*str1) != ascii::to_lower(*str2))
1421+ return false;
1422+ str1++;
1423+ str2++;
1424+ }
1425+ if(*str1 || *str2)
1426+ return false;
1427+ return true;
1428+}
1429+
1430+static bool compare_unicode_ni(const char *str1, const char *str2, int len)
1431+{
1432+ while(len > 0)
1433+ {
1434+ const char *temp_str1 = str1;
1435+ const char *temp_str2 = str2;
1436+ unicode::code_point cp1 = unicode::to_upper(utf8::next_char(temp_str1));
1437+ unicode::code_point cp2 = unicode::to_upper(utf8::next_char(temp_str2));
1438+ if(cp1 != cp2)
1439+ return false;
1440+ len -= temp_str1-str1;
1441+ str1 = temp_str1;
1442+ str2 = temp_str2;
1443+ }
1444+ return true;
1445+}
1446+static utf8::size_type myutf8len(const char *source)
1447+{
1448+ utf8::size_type len = utf8::char_length(*source);
1449+ if(!len)
1450+ return 1;
1451+ else
1452+ return len;
1453+}
1454 ////////////////////////////////////
1455 ////Regular expression parsing and building of the tree
1456 ////////////////////////////////////
1457
1458-CRegexAscii_regex* CRegexAscii_parser::parse(const char *pattern, unsigned int flags)
1459+CRegexXQuery_regex* CRegexXQuery_parser::parse(const char *pattern, unsigned int flags)
1460 {
1461 this->flags = flags;
1462- bool align_begin = false;
1463
1464- if(!(flags & REGEX_ASCII_LITERAL) && (pattern[0] == '^'))
1465- align_begin = true;
1466-
1467 int regex_len;
1468- CRegexAscii_regex* regex = parse_regexp(pattern + (align_begin?1:0), &regex_len);
1469+ CRegexXQuery_regex* regex = parse_regexp(pattern, &regex_len);
1470
1471- if(regex)
1472- regex->set_align_begin(align_begin);
1473-
1474 return regex;
1475 }
1476
1477 //until '\0' or ')'
1478-CRegexAscii_regex* CRegexAscii_parser::parse_regexp(const char *pattern,
1479+CRegexXQuery_regex* CRegexXQuery_parser::parse_regexp(const char *pattern,
1480 int *regex_len)
1481 {
1482 *regex_len = 0;
1483 int branch_len;
1484 regex_depth++;
1485- CRegexAscii_regex *regex = new CRegexAscii_regex(current_regex);
1486+ std::auto_ptr<CRegexXQuery_regex> regex(new CRegexXQuery_regex(current_regex));
1487 if(!current_regex)
1488- current_regex = regex;
1489+ current_regex = regex.get();
1490 if(regex_depth >= 2)
1491 {
1492 //mark this as group if it does not start with ?:
1493 if(pattern[0] != '?' || pattern[1] != ':')
1494- current_regex->subregex.push_back(regex);
1495+ current_regex->subregex.push_back(regex.get());
1496 else
1497 *regex_len = 2;
1498 }
1499- CRegexAscii_branch *branch;
1500+ CRegexXQuery_branch *branch;
1501+ bool must_read_another_branch = true;
1502 while(pattern[*regex_len] && (pattern[*regex_len] != ')'))
1503 {
1504 branch = parse_branch(pattern+*regex_len, &branch_len);
1505 if(!branch)
1506 {
1507 regex_depth--;
1508- delete regex;
1509 return NULL;
1510 }
1511 regex->add_branch(branch);
1512 *regex_len += branch_len;
1513+ if(pattern[*regex_len] == '|')
1514+ (*regex_len)++;
1515+ else
1516+ must_read_another_branch = false;
1517 }
1518- if((current_regex == regex) && (pattern[*regex_len] == ')'))
1519+ if((current_regex == regex.get()) && (pattern[*regex_len] == ')'))
1520 {
1521- throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(U_REGEX_MISMATCHED_PAREN)) );
1522+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MISMATCHED_PAREN)) );
1523 }
1524 if(pattern[*regex_len])
1525 (*regex_len)++;
1526+ if(must_read_another_branch)
1527+ regex->add_branch(new CRegexXQuery_branch(current_regex));//add empty branch
1528 regex->flags = 0;//finished initialization
1529 regex_depth--;
1530- return regex;
1531+ return regex.release();
1532 }
1533
1534-CRegexAscii_branch* CRegexAscii_parser::parse_branch(const char *pattern, int *branch_len)
1535+CRegexXQuery_branch* CRegexXQuery_parser::parse_branch(const char *pattern, int *branch_len)
1536 {
1537 int piece_len;
1538
1539- CRegexAscii_branch *branch = new CRegexAscii_branch(current_regex);
1540- CRegexAscii_piece *piece;
1541+ std::auto_ptr<CRegexXQuery_branch> branch(new CRegexXQuery_branch(current_regex));
1542+ CRegexXQuery_piece *piece;
1543 *branch_len = 0;
1544 while(pattern[*branch_len] && (pattern[*branch_len] != '|') && (pattern[*branch_len] != ')'))
1545 {
1546 piece = parse_piece(pattern+*branch_len, &piece_len);
1547 if(!piece)
1548 {
1549- delete branch;
1550 return NULL;
1551 }
1552+ if(branch->piece_list.size() && dynamic_cast<CRegexXQuery_pinstart*>(piece->atom))
1553+ {
1554+ //found ^ that is not at the beginning of branch
1555+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_ATOM_CHAR), '^') );
1556+ }
1557 branch->add_piece(piece);
1558 *branch_len += piece_len;
1559 }
1560- if(pattern[*branch_len] == '|')
1561- (*branch_len)++;
1562- return branch;
1563+ //if(pattern[*branch_len] == '|')
1564+ // (*branch_len)++;
1565+ return branch.release();
1566 }
1567
1568 //piece = atom + quantifier
1569-CRegexAscii_piece* CRegexAscii_parser::parse_piece(const char *pattern, int *piece_len)
1570+CRegexXQuery_piece* CRegexXQuery_parser::parse_piece(const char *pattern, int *piece_len)
1571 {
1572- CRegexAscii_piece *piece = new CRegexAscii_piece;
1573+ std::auto_ptr<CRegexXQuery_piece> piece(new CRegexXQuery_piece);
1574 IRegexAtom *atom;
1575 *piece_len = 0;
1576
1577@@ -160,19 +205,18 @@
1578 atom = read_atom(pattern, &atom_len);
1579 if(!atom)
1580 {
1581- delete piece;
1582 return NULL;
1583 }
1584 piece->set_atom(atom);
1585 if(!(flags & REGEX_ASCII_LITERAL))
1586- read_quantifier(piece, pattern+atom_len, &quantif_len);
1587+ read_quantifier(piece.get(), pattern+atom_len, &quantif_len);
1588
1589 *piece_len += atom_len + quantif_len;
1590
1591- return piece;
1592+ return piece.release();
1593 }
1594
1595-char CRegexAscii_parser::myishex(char c)
1596+char CRegexXQuery_parser::myishex(char c)
1597 {
1598 if((c >= '0') && (c <= '9'))
1599 return c-'0'+1;
1600@@ -183,26 +227,125 @@
1601 return 0;//not a hex
1602 }
1603
1604-bool CRegexAscii_parser::myisdigit(char c)
1605-{
1606- return (c >= '0') || (c <= '9');
1607-}
1608-
1609-char CRegexAscii_parser::readChar(const char *pattern, int *char_len, bool *is_multichar)
1610+bool CRegexXQuery_parser::myisdigit(char c)
1611+{
1612+ return (c >= '0') && (c <= '9');
1613+}
1614+
1615+bool CRegexXQuery_parser::myisletterAZ(char c)
1616+{
1617+ return ((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z'));
1618+}
1619+
1620+static const unicode::code_point specials_extcp[] = {0xFFF0, 0xFFFD, 0};
1621+
1622+static CRegexXQuery_parser::block_escape_t block_escape[] =
1623+{
1624+{{0x0000, 0x007F}, NULL, "BasicLatin"},
1625+{{0x0080, 0x00FF}, NULL, "Latin-1Supplement"},
1626+{{0x0100, 0x017F}, NULL, "LatinExtended-A"},
1627+{{0x0180, 0x024F}, NULL, "LatinExtended-B"},
1628+{{0x0250, 0x02AF}, NULL, "IPAExtensions"},
1629+{{0x02B0, 0x02FF}, NULL, "SpacingModifierLetters"},
1630+{{0x0300, 0x036F}, NULL, "CombiningDiacriticalMarks"},
1631+{{0x0370, 0x03FF}, NULL, "Greek"},
1632+{{0x0400, 0x04FF}, NULL, "Cyrillic"},
1633+{{0x0530, 0x058F}, NULL, "Armenian"},
1634+{{0x0590, 0x05FF}, NULL, "Hebrew"},
1635+{{0x0600, 0x06FF}, NULL, "Arabic"},
1636+{{0x0700, 0x074F}, NULL, "Syriac"},
1637+{{0x0780, 0x07BF}, NULL, "Thaana"},
1638+{{0x0900, 0x097F}, NULL, "Devanagari"},
1639+{{0x0980, 0x09FF}, NULL, "Bengali"},
1640+{{0x0A00, 0x0A7F}, NULL, "Gurmukhi"},
1641+{{0x0A80, 0x0AFF}, NULL, "Gujarati"},
1642+{{0x0B00, 0x0B7F}, NULL, "Oriya"},
1643+{{0x0B80, 0x0BFF}, NULL, "Tamil"},
1644+{{0x0C00, 0x0C7F}, NULL, "Telugu"},
1645+{{0x0C80, 0x0CFF}, NULL, "Kannada"},
1646+{{0x0D00, 0x0D7F}, NULL, "Malayalam"},
1647+{{0x0D80, 0x0DFF}, NULL, "Sinhala"},
1648+{{0x0E00, 0x0E7F}, NULL, "Thai"},
1649+{{0x0E80, 0x0EFF}, NULL, "Lao"},
1650+{{0x0F00, 0x0FFF}, NULL, "Tibetan"},
1651+{{0x1000, 0x109F}, NULL, "Myanmar"},
1652+{{0x10A0, 0x10FF}, NULL, "Georgian"},
1653+{{0x1100, 0x11FF}, NULL, "HangulJamo"},
1654+{{0x1200, 0x137F}, NULL, "Ethiopic"},
1655+{{0x13A0, 0x13FF}, NULL, "Cherokee"},
1656+{{0x1400, 0x167F}, NULL, "UnifiedCanadianAboriginalSyllabics"},
1657+{{0x1680, 0x169F}, NULL, "Ogham"},
1658+{{0x16A0, 0x16FF}, NULL, "Runic"},
1659+{{0x1780, 0x17FF}, NULL, "Khmer"},
1660+{{0x1800, 0x18AF}, NULL, "Mongolian"},
1661+{{0x1E00, 0x1EFF}, NULL, "LatinExtendedAdditional"},
1662+{{0x1F00, 0x1FFF}, NULL, "GreekExtended"},
1663+{{0x2000, 0x206F}, NULL, "GeneralPunctuation"},
1664+{{0x2070, 0x209F}, NULL, "SuperscriptsandSubscripts"},
1665+{{0x20A0, 0x20CF}, NULL, "CurrencySymbols"},
1666+{{0x20D0, 0x20FF}, NULL, "CombiningMarksforSymbols"},
1667+{{0x2100, 0x214F}, NULL, "LetterlikeSymbols"},
1668+{{0x2150, 0x218F}, NULL, "NumberForms"},
1669+{{0x2190, 0x21FF}, NULL, "Arrows"},
1670+{{0x2200, 0x22FF}, NULL, "MathematicalOperators"},
1671+{{0x2300, 0x23FF}, NULL, "MiscellaneousTechnical"},
1672+{{0x2400, 0x243F}, NULL, "ControlPictures"},
1673+{{0x2440, 0x245F}, NULL, "OpticalCharacterRecognition"},
1674+{{0x2460, 0x24FF}, NULL, "EnclosedAlphanumerics"},
1675+{{0x2500, 0x257F}, NULL, "BoxDrawing"},
1676+{{0x2580, 0x259F}, NULL, "BlockElements"},
1677+{{0x25A0, 0x25FF}, NULL, "GeometricShapes"},
1678+{{0x2600, 0x26FF}, NULL, "MiscellaneousSymbols"},
1679+{{0x2700, 0x27BF}, NULL, "Dingbats"},
1680+{{0x2800, 0x28FF}, NULL, "BraillePatterns"},
1681+{{0x2E80, 0x2EFF}, NULL, "CJKRadicalsSupplement"},
1682+{{0x2F00, 0x2FDF}, NULL, "KangxiRadicals"},
1683+{{0x2FF0, 0x2FFF}, NULL, "IdeographicDescriptionCharacters"},
1684+{{0x3000, 0x303F}, NULL, "CJKSymbolsandPunctuation"},
1685+{{0x3040, 0x309F}, NULL, "Hiragana"},
1686+{{0x30A0, 0x30FF}, NULL, "Katakana"},
1687+{{0x3100, 0x312F}, NULL, "Bopomofo"},
1688+{{0x3130, 0x318F}, NULL, "HangulCompatibilityJamo"},
1689+{{0x3190, 0x319F}, NULL, "Kanbun"},
1690+{{0x31A0, 0x31BF}, NULL, "BopomofoExtended"},
1691+{{0x3200, 0x32FF}, NULL, "EnclosedCJKLettersandMonths"},
1692+{{0x3300, 0x33FF}, NULL, "CJKCompatibility"},
1693+{{0x3400, 0x4DB5}, NULL, "CJKUnifiedIdeographsExtensionA"},
1694+{{0x4E00, 0x9FFF}, NULL, "CJKUnifiedIdeographs"},
1695+{{0xA000, 0xA48F}, NULL, "YiSyllables"},
1696+{{0xA490, 0xA4CF}, NULL, "YiRadicals"},
1697+{{0xAC00, 0xD7A3}, NULL, "HangulSyllables"},
1698+{{0xE000, 0xF8FF}, NULL, "PrivateUse"},
1699+{{0xF900, 0xFAFF}, NULL, "CJKCompatibilityIdeographs"},
1700+{{0xFB00, 0xFB4F}, NULL, "AlphabeticPresentationForms"},
1701+{{0xFB50, 0xFDFF}, NULL, "ArabicPresentationForms-A"},
1702+{{0xFE20, 0xFE2F}, NULL, "CombiningHalfMarks"},
1703+{{0xFE30, 0xFE4F}, NULL, "CJKCompatibilityForms"},
1704+{{0xFE50, 0xFE6F}, NULL, "SmallFormVariants"},
1705+{{0xFE70, 0xFEFE}, NULL, "ArabicPresentationForms-B"},
1706+{{0xFEFF, 0xFEFF}, specials_extcp, "Specials"},
1707+{{0xFF00, 0xFFEF}, NULL, "HalfwidthandFullwidthForms"}
1708+};
1709+
1710+CRegexXQuery_charmatch* CRegexXQuery_parser::readChar(const char *pattern,
1711+ int *char_len,
1712+ enum CHARGROUP_t *multichar_type)
1713 {
1714 char c = 0;
1715 *char_len = 0;
1716- *is_multichar = false;
1717+ *multichar_type = CHARGROUP_NO_MULTICHAR;
1718 switch(pattern[*char_len])
1719 {
1720 case '\\':
1721- { (*char_len)++;
1722+ {
1723+ (*char_len)++;
1724 switch(pattern[*char_len])
1725 {
1726- case 'n': c = '\n';break;
1727- case 'r': c = '\r';break;
1728- case 't': c = '\t';break;
1729+ case 'n': c = '\n';(*char_len)++;return new CRegexXQuery_char_ascii(current_regex, c);
1730+ case 'r': c = '\r';(*char_len)++;return new CRegexXQuery_char_ascii(current_regex, c);
1731+ case 't': c = '\t';(*char_len)++;return new CRegexXQuery_char_ascii(current_regex, c);
1732 case '\\':
1733+ case '/'://+
1734 case '|':
1735 case '.':
1736 case '?':
1737@@ -216,19 +359,205 @@
1738 case '['://#x5B
1739 case ']'://#x5D
1740 case '^'://#x5E
1741+ case '$'://+
1742 c = pattern[*char_len];
1743- break;
1744+ (*char_len)++;
1745+ *multichar_type = CHARGROUP_FLAGS_ONECHAR_ASCII;
1746+ return new CRegexXQuery_char_ascii(current_regex, c);
1747 case 'p'://catEsc
1748 case 'P'://complEsc
1749+ {
1750 //ignore the prop for now
1751- c = pattern[*char_len];
1752- *is_multichar = true;
1753- if(pattern[*char_len+1] == '{')
1754- {
1755- while(pattern[*char_len] != '}')
1756+ *multichar_type = CHARGROUP_FLAGS_MULTICHAR_p;//(CHARGROUP_t)((pattern[*char_len] == 'P') ? 128 : 0);
1757+ bool is_reverse = (pattern[*char_len] == 'P');
1758+ c = 0;
1759+ if(pattern[(*char_len)+1] != '{')
1760+ {
1761+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_P_CONSTRUCT)) );
1762+ }
1763+ (*char_len) += 2;
1764+ switch(pattern[*char_len])
1765+ {//IsCategory
1766+ case 'L':
1767+ {
1768+ switch(pattern[(*char_len)+1])
1769+ {
1770+ case '}':
1771+ c = unicode::UNICODE_Ll + 50;break;
1772+ case 'u':
1773+ c = unicode::UNICODE_Lu; (*char_len)++;break;
1774+ case 'l':
1775+ c = unicode::UNICODE_Ll; (*char_len)++;break;
1776+ case 't':
1777+ c = unicode::UNICODE_Lt; (*char_len)++;break;
1778+ case 'm':
1779+ c = unicode::UNICODE_Lm; (*char_len)++;break;
1780+ case 'o':
1781+ c = unicode::UNICODE_Lo; (*char_len)++;break;
1782+ default:
1783+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PL_CONSTRUCT)) );
1784+ }
1785+ }break;
1786+ case 'M':
1787+ {
1788+ switch(pattern[(*char_len)+1])
1789+ {
1790+ case '}':
1791+ c = unicode::UNICODE_Mc + 50;break;
1792+ case 'n':
1793+ c = unicode::UNICODE_Mn; (*char_len)++;break;
1794+ case 'c':
1795+ c = unicode::UNICODE_Mc; (*char_len)++;break;
1796+ case 'e':
1797+ c = unicode::UNICODE_Me; (*char_len)++;break;
1798+ default:
1799+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PM_CONSTRUCT)) );
1800+ }
1801+ }break;
1802+ case 'N':
1803+ {
1804+ switch(pattern[(*char_len)+1])
1805+ {
1806+ case '}':
1807+ c = unicode::UNICODE_Nd + 50;break;
1808+ case 'd':
1809+ c = unicode::UNICODE_Nd; (*char_len)++;break;
1810+ case 'l':
1811+ c = unicode::UNICODE_Nl; (*char_len)++;break;
1812+ case 'o':
1813+ c = unicode::UNICODE_No; (*char_len)++;break;
1814+ default:
1815+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PN_CONSTRUCT)) );
1816+ }
1817+ }break;
1818+ case 'P':
1819+ {
1820+ switch(pattern[(*char_len)+1])
1821+ {
1822+ case '}':
1823+ c = unicode::UNICODE_Pc + 50;break;
1824+ case 'c':
1825+ c = unicode::UNICODE_Pc; (*char_len)++;break;
1826+ case 'd':
1827+ c = unicode::UNICODE_Pd; (*char_len)++;break;
1828+ case 's':
1829+ c = unicode::UNICODE_Ps; (*char_len)++;break;
1830+ case 'e':
1831+ c = unicode::UNICODE_Pe; (*char_len)++;break;
1832+ case 'i':
1833+ c = unicode::UNICODE_Pi; (*char_len)++;break;
1834+ case 'f':
1835+ c = unicode::UNICODE_Pf; (*char_len)++;break;
1836+ case 'o':
1837+ c = unicode::UNICODE_Po; (*char_len)++;break;
1838+ default:
1839+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PP_CONSTRUCT)) );
1840+ }
1841+ }break;
1842+ case 'Z':
1843+ {
1844+ switch(pattern[(*char_len)+1])
1845+ {
1846+ case '}':
1847+ c = unicode::UNICODE_Zl + 50;break;
1848+ case 's':
1849+ c = unicode::UNICODE_Zs; (*char_len)++;break;
1850+ case 'l':
1851+ c = unicode::UNICODE_Zl; (*char_len)++;break;
1852+ case 'p':
1853+ c = unicode::UNICODE_Zp; (*char_len)++;break;
1854+ default:
1855+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PZ_CONSTRUCT)) );
1856+ }
1857+ }break;
1858+ case 'S':
1859+ {
1860+ switch(pattern[(*char_len)+1])
1861+ {
1862+ case '}':
1863+ c = unicode::UNICODE_Sc + 50;break;
1864+ case 'm':
1865+ c = unicode::UNICODE_Sm; (*char_len)++;break;
1866+ case 'c':
1867+ c = unicode::UNICODE_Sc; (*char_len)++;break;
1868+ case 'k':
1869+ c = unicode::UNICODE_Sk; (*char_len)++;break;
1870+ case 'o':
1871+ c = unicode::UNICODE_So; (*char_len)++;break;
1872+ default:
1873+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PS_CONSTRUCT)) );
1874+ }
1875+ }break;
1876+ case 'C':
1877+ {
1878+ switch(pattern[(*char_len)+1])
1879+ {
1880+ case '}':
1881+ c = unicode::UNICODE_Cc + 50;break;
1882+ case 'c':
1883+ c = unicode::UNICODE_Cc; (*char_len)++;break;
1884+ case 'f':
1885+ c = unicode::UNICODE_Cf; (*char_len)++;break;
1886+ case 'o':
1887+ c = unicode::UNICODE_Co; (*char_len)++;break;
1888+ case 'n':
1889+ c = unicode::UNICODE_Cn; (*char_len)++;break;
1890+ default:
1891+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PC_CONSTRUCT)) );
1892+ }
1893+ }break;
1894+ }//end switch
1895+ if(c)
1896+ {
1897+ if(pattern[(*char_len) + 1] != '}')
1898+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_P_CONSTRUCT)) );
1899+ (*char_len)++;
1900+ (*char_len)++;
1901+ return new CRegexXQuery_multicharP(current_regex, c, is_reverse);
1902+ }
1903+ if(pattern[*char_len] == 'I')
1904+ {
1905+ if(pattern[(*char_len)+1] == 's')//IsBlock
1906+ {
1907+ *multichar_type = CHARGROUP_FLAGS_MULTICHAR_Is;
1908+ (*char_len) += 2;
1909+ zstring block_name;
1910+ char tempc = pattern[(*char_len)];
1911+ while(tempc && (tempc != '}'))
1912+ {
1913+ if(!myisletterAZ(tempc) && !myisdigit(tempc) && (tempc != '-'))
1914+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_PIs_CONSTRUCT)) );
1915+ block_name.append(1, tempc);
1916+ (*char_len)++;
1917+ tempc = pattern[(*char_len)];
1918+ }
1919+ if(!tempc)
1920+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_PIs_CONSTRUCT)) );
1921+ //search for the block name
1922+ int i;
1923+ int nr_blocks = sizeof(block_escape)/sizeof(CRegexXQuery_parser::block_escape_t);
1924+ for(i=0;i<nr_blocks;i++)
1925+ {
1926+ if(compare_ascii_i(block_name.c_str(), block_escape[i].group_name))
1927+ {
1928+ c = i;
1929+ break;
1930+ }
1931+ }
1932+ if(i==nr_blocks)
1933+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PIs_CONSTRUCT)) );
1934 (*char_len)++;
1935- }
1936- break;
1937+ return new CRegexXQuery_multicharIs(current_regex, i, is_reverse);
1938+ }
1939+ else
1940+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_PIs_CONSTRUCT)) );
1941+ }
1942+ else
1943+ {
1944+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_P_CONSTRUCT)) );
1945+ }
1946+ break;//unreachable
1947+ }//end case 'p'
1948 //multiCharEsc
1949 case 's':
1950 case 'S':
1951@@ -240,40 +569,104 @@
1952 case 'D':
1953 case 'w':
1954 case 'W':
1955- *is_multichar = true;
1956+ *multichar_type = CHARGROUP_FLAGS_MULTICHAR_OTHER;
1957 c = pattern[*char_len];
1958- break;
1959- }
1960- break;
1961- }
1962- case '#':///might be #xXX
1963- {
1964- if((pattern[*char_len+1] == 'x') &&
1965- myishex(pattern[*char_len+2]) && myishex(pattern[*char_len+3]))
1966- {
1967- c = (myishex(pattern[*char_len+2])-1)<<4 | (myishex(pattern[*char_len+3])-1);
1968- *char_len += 3;
1969- break;
1970- }
1971- }
1972+ (*char_len)++;
1973+ return new CRegexXQuery_multicharOther(current_regex, c);
1974+ case 'u'://unicode codepoint \uXXXX
1975+ {
1976+ unicode::code_point utf8c = 0;
1977+ (*char_len)++;
1978+ for(int i=0;i<4;i++)
1979+ {
1980+ char hex = myishex(pattern[*char_len]);
1981+ if(!hex)
1982+ {
1983+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_UNICODE_CODEPOINT_u)) );
1984+ }
1985+ utf8c <<= 4;
1986+ utf8c |= (hex-1) & 0x0f;
1987+ (*char_len)++;
1988+ }
1989+ return create_charmatch(utf8c, NULL, 0, multichar_type);
1990+ }
1991+ case 'U'://unicode codepoint \UXXXXXXXX
1992+ {
1993+ unicode::code_point utf8c = 0;
1994+ (*char_len)++;
1995+ for(int i=0;i<8;i++)
1996+ {
1997+ char hex = myishex(pattern[*char_len]);
1998+ if(!hex)
1999+ {
2000+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_UNICODE_CODEPOINT_u)) );
2001+ }
2002+ utf8c <<= 4;
2003+ utf8c |= (hex-1) & 0x0f;
2004+ (*char_len)++;
2005+ }
2006+ return create_charmatch(utf8c, NULL, 0, multichar_type);
2007+ }
2008+ default:
2009+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_ESC_CHAR)) );
2010+ }
2011+ assert(false);
2012+ break;//unreachable
2013+ }//end case '\'
2014 default:
2015- c = pattern[*char_len];
2016- break;
2017- }
2018-
2019- (*char_len)++;
2020- return c;
2021-}
2022-
2023-
2024-
2025-IRegexAtom* CRegexAscii_parser::read_atom(const char *pattern, int *atom_len)
2026+ {
2027+ const char *temp_pattern = pattern;
2028+ unicode::code_point utf8c = utf8::next_char(temp_pattern);
2029+ (*char_len) = temp_pattern - pattern;
2030+ return create_charmatch(utf8c, pattern, *char_len, multichar_type);
2031+ }
2032+ }
2033+ return NULL;
2034+}
2035+
2036+CRegexXQuery_charmatch *CRegexXQuery_parser::create_charmatch(unicode::code_point utf8c,
2037+ const char *pattern, int utf8len,
2038+ enum CHARGROUP_t *multichar_type)
2039+{
2040+ if(utf8c <= 0x7F)
2041+ {
2042+ *multichar_type = CHARGROUP_FLAGS_ONECHAR_ASCII;
2043+ if(flags & REGEX_ASCII_CASE_INSENSITIVE)
2044+ return new CRegexXQuery_char_ascii_i(current_regex, (char)utf8c);
2045+ else
2046+ return new CRegexXQuery_char_ascii(current_regex, (char)utf8c);
2047+ }
2048+ else
2049+ {
2050+ *multichar_type = CHARGROUP_FLAGS_ONECHAR_UNICODE;
2051+ if(flags & REGEX_ASCII_CASE_INSENSITIVE)
2052+ return new CRegexXQuery_char_unicode_i(current_regex, utf8c);
2053+ else
2054+ {
2055+ if(pattern)
2056+ return new CRegexXQuery_char_unicode(current_regex, pattern, utf8len);
2057+ else
2058+ return new CRegexXQuery_char_unicode_cp(current_regex, utf8c);
2059+ }
2060+ }
2061+}
2062+
2063+IRegexAtom* CRegexXQuery_parser::read_atom(const char *pattern, int *atom_len)
2064 {
2065 *atom_len = 0;
2066- char c;
2067- bool is_end_line = false;
2068- c = pattern[*atom_len];
2069- if((!(flags & REGEX_ASCII_LITERAL)) && (c == '\\'))
2070+ if(flags & REGEX_ASCII_LITERAL)
2071+ {
2072+ unicode::code_point utf8c;
2073+ //bool is_end_line = false;
2074+ const char *temp_pattern = pattern;
2075+ utf8c = utf8::next_char(temp_pattern);
2076+ *atom_len = temp_pattern - pattern;
2077+ enum CHARGROUP_t multichar_type;
2078+ return create_charmatch(utf8c, pattern, *atom_len, &multichar_type);
2079+ }
2080+
2081+ char c = *pattern;
2082+ if(c == '\\')
2083 {
2084 //check for back reference
2085 if(myisdigit(pattern[(*atom_len)+1]))
2086@@ -281,13 +674,13 @@
2087 (*atom_len)++;
2088 if(pattern[*atom_len] == '0')
2089 {
2090- throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(U_REGEX_INVALID_BACK_REF)) );
2091+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_BACK_REF), 0, current_regex->subregex.size()) );
2092 }
2093 unsigned int backref = pattern[*atom_len] - '0';
2094 if((backref > current_regex->subregex.size()) ||
2095 (current_regex->subregex.at(backref-1)->flags != 0))
2096 {
2097- throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(U_REGEX_INVALID_BACK_REF)) );
2098+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_BACK_REF), backref, current_regex->subregex.size()) );
2099 }
2100 while(current_regex->subregex.size() >= backref*10)
2101 {
2102@@ -303,70 +696,86 @@
2103 break;
2104 }
2105 }
2106- return new CRegexAscii_backref(current_regex, backref);
2107+ (*atom_len)++;
2108+ return new CRegexXQuery_backref(current_regex, backref);
2109 }
2110 }
2111+ if(c == '^')
2112+ {
2113+ (*atom_len)++;
2114+ return new CRegexXQuery_pinstart(current_regex);
2115+ }
2116+ if((c == '}') || (c == '{') || (c == '?') || (c == '*') || (c == '+') || (c == '|'))
2117+ {
2118+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_ATOM_CHAR), c) );
2119+ }
2120 switch(c)
2121 {
2122 case '[':
2123 {
2124- if(!(flags & REGEX_ASCII_LITERAL))
2125- {
2126- (*atom_len)++;
2127- CRegexAscii_chargroup *chargroup = NULL;
2128- int chargroup_len;
2129- chargroup = readchargroup(pattern+*atom_len, &chargroup_len);
2130- *atom_len += chargroup_len;
2131- return chargroup;
2132- }
2133+ (*atom_len)++;
2134+ CRegexXQuery_chargroup *chargroup = NULL;
2135+ int chargroup_len;
2136+ chargroup = readchargroup(pattern+*atom_len, &chargroup_len);
2137+ *atom_len += chargroup_len;
2138+ return chargroup;
2139 }
2140 case '.'://WildCharEsc
2141 {
2142- if(!(flags & REGEX_ASCII_LITERAL))
2143- {
2144- CRegexAscii_wildchar *wildchar = new CRegexAscii_wildchar(current_regex);
2145- (*atom_len)++;
2146- return wildchar;
2147- }
2148+ (*atom_len)++;
2149+ return new CRegexXQuery_wildchar(current_regex);
2150 }
2151 case '('://begin an embedded reg exp
2152 {
2153- if(!(flags & REGEX_ASCII_LITERAL))
2154- {
2155- (*atom_len)++;
2156- CRegexAscii_regex *emb_regex = NULL;
2157- int regex_len;
2158- emb_regex = parse_regexp(pattern + *atom_len, &regex_len);
2159- *atom_len += regex_len;
2160- return emb_regex;
2161- }
2162+ (*atom_len)++;
2163+ CRegexXQuery_regex *emb_regex = NULL;
2164+ int regex_len;
2165+ emb_regex = parse_regexp(pattern + *atom_len, &regex_len);
2166+ *atom_len += regex_len;
2167+ return emb_regex;
2168 }
2169 case '$'://end line
2170- if(!(flags & REGEX_ASCII_LITERAL))
2171- {
2172- is_end_line = true;
2173- }
2174+ //is_end_line = true;
2175+ (*atom_len)++;
2176+ return new CRegexXQuery_endline(current_regex);
2177 default:
2178 {
2179- char c;
2180+ //char c;
2181+ CRegexXQuery_charmatch *charmatch = NULL;
2182 int c_len;
2183- bool is_multichar = false;
2184- if(!(flags & REGEX_ASCII_LITERAL))
2185- c = readChar(pattern+*atom_len, &c_len, &is_multichar);
2186- else
2187+ CHARGROUP_t multichar_type = CHARGROUP_NO_MULTICHAR;
2188+ *atom_len = 0;
2189+ while(pattern[*atom_len])
2190 {
2191- c = pattern[*atom_len];
2192- c_len = 1;
2193+ charmatch = readChar(pattern+*atom_len, &c_len, &multichar_type);
2194+ *atom_len += c_len;
2195+ if((flags & REGEX_ASCII_NO_WHITESPACE) && (multichar_type == CHARGROUP_FLAGS_ONECHAR_ASCII))
2196+ {
2197+ char c = (char)charmatch->get_c();
2198+ if((c == ' ') || (c == '\t') || (c == '\r') || (c == '\n'))
2199+ {
2200+ //ignore this whitespace
2201+ delete charmatch;
2202+ continue;
2203+ }
2204+ else
2205+ break;
2206+ }
2207+ else
2208+ break;
2209 }
2210- CRegexAscii_chargroup *chargroup = new CRegexAscii_chargroup(current_regex);
2211- if(is_multichar)
2212- chargroup->addMultiChar(c);
2213+ /*
2214+ std::auto_ptr<CRegexXQuery_chargroup> chargroup(new CRegexXQuery_chargroup(current_regex));
2215+ if(multichar_type)
2216+ chargroup->addMultiChar(c, multichar_type);
2217 else if(is_end_line)
2218 chargroup->addEndLine();
2219 else
2220- chargroup->addCharRange(c, c);
2221+ chargroup->addOneChar(c);
2222 *atom_len += c_len;
2223- return chargroup;
2224+ return chargroup.release();
2225+ */
2226+ return charmatch;
2227 }
2228 }
2229 }
2230@@ -374,81 +783,119 @@
2231 //read until ']'
2232 //posCharGroup ::= ( charRange | charClassEsc )+
2233 //charRange ::= seRange | XmlCharIncDash
2234-CRegexAscii_chargroup* CRegexAscii_parser::readchargroup(const char *pattern, int *chargroup_len)
2235+CRegexXQuery_chargroup* CRegexXQuery_parser::readchargroup(const char *pattern, int *chargroup_len)
2236 {
2237- CRegexAscii_chargroup *chargroup = NULL;
2238+ std::auto_ptr<CRegexXQuery_chargroup> chargroup;
2239 *chargroup_len = 0;
2240 if(pattern[*chargroup_len] == '^')//negative group
2241 {
2242 (*chargroup_len)++;
2243- chargroup = new CRegexAscii_negchargroup(current_regex);
2244+ chargroup.reset(new CRegexXQuery_negchargroup(current_regex));
2245 }
2246 else
2247- chargroup = new CRegexAscii_chargroup(current_regex);
2248+ chargroup.reset(new CRegexXQuery_chargroup(current_regex));
2249 while(pattern[*chargroup_len] && (pattern[*chargroup_len]!=']'))
2250 {
2251- char c1, c2;
2252- bool is_multichar;
2253+ //char c1, c2;
2254+ CHARGROUP_t multichar_type = CHARGROUP_NO_MULTICHAR;
2255 int c1_len;
2256- c1 = pattern[*chargroup_len];
2257- c2 = pattern[*chargroup_len+1];
2258- if((c1 == '-') && (c2 == '['))//charClassSub
2259+ if((pattern[*chargroup_len] == '-') && (pattern[(*chargroup_len)+1] == '['))//charClassSub
2260 {
2261 int classsub_len;
2262- CRegexAscii_chargroup *classsub = readchargroup(pattern + *chargroup_len+1 + 1, &classsub_len);
2263+ CRegexXQuery_chargroup *classsub = readchargroup(pattern + (*chargroup_len)+1 + 1, &classsub_len);
2264 if(!classsub)
2265 {
2266- delete chargroup;
2267- return NULL;
2268+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_SUBCLASS)) );
2269 }
2270 chargroup->addClassSub(classsub);
2271 *chargroup_len += 2 + classsub_len + 1;
2272 if(pattern[*chargroup_len-1] != ']')
2273 {
2274- delete chargroup;
2275- return NULL;
2276+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_USE_OF_SUBCLASS)) );
2277 }
2278- return chargroup;
2279+ return chargroup.release();
2280 }
2281
2282- c1 = readChar(pattern+*chargroup_len, &c1_len, &is_multichar);
2283- if(is_multichar)//first char is multichar
2284+ std::unique_ptr<CRegexXQuery_charmatch> charmatch(readChar(pattern+*chargroup_len, &c1_len, &multichar_type));
2285+ if((multichar_type == CHARGROUP_FLAGS_MULTICHAR_p) ||
2286+ (multichar_type == CHARGROUP_FLAGS_MULTICHAR_Is) ||
2287+ (multichar_type == CHARGROUP_FLAGS_MULTICHAR_OTHER))//first char is multichar
2288 {
2289- chargroup->addMultiChar(c1);
2290+ if((pattern[*chargroup_len+c1_len] == '-') &&///should not be a range
2291+ (pattern[*chargroup_len+c1_len+1] != ']'))
2292+ {
2293+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MULTICHAR_IN_CHAR_RANGE)) );
2294+ }
2295+ //chargroup->addMultiChar(c1, multichar_type);
2296+ chargroup->addCharMatch(charmatch.release());
2297 *chargroup_len += c1_len;
2298 continue;
2299 }
2300- if(pattern[*chargroup_len+c1_len] == '-')///might be a range
2301+ (*chargroup_len) += c1_len;
2302+ if(pattern[*chargroup_len] == '-')///might be a range
2303 {
2304- if(pattern[*chargroup_len+c1_len+1] == ']')//no range, just the last char is '-'
2305+ if(pattern[(*chargroup_len)+1] == ']')//no range, just the last char is '-'
2306 {
2307- chargroup->addCharRange(c1, c1);
2308- chargroup->addCharRange('-', '-');
2309- *chargroup_len += c1_len + 1;
2310+ //chargroup->addOneChar(c1);
2311+ //chargroup->addOneChar('-');
2312+ chargroup->addCharMatch(charmatch.release());
2313+ chargroup->addCharMatch(new CRegexXQuery_char_ascii(current_regex, '-'));
2314+ (*chargroup_len)++;
2315 continue;
2316 }
2317- else
2318+ else if(pattern[(*chargroup_len)+1] != '[')
2319 {
2320 //it is a range
2321- char c3;
2322- int c3_len;
2323- c3 = readChar(pattern+*chargroup_len+c1_len+1, &c3_len, &is_multichar);
2324- if(is_multichar)
2325- return NULL;//error
2326- chargroup->addCharRange(c1, c3);
2327- *chargroup_len += c1_len + 1 + c3_len;
2328+ (*chargroup_len)++;
2329+ std::unique_ptr<CRegexXQuery_charmatch> charmatch2;
2330+ CHARGROUP_t multichar_type2 = CHARGROUP_NO_MULTICHAR;
2331+ int c2_len;
2332+ charmatch2.reset(readChar(pattern+(*chargroup_len), &c2_len, &multichar_type2));
2333+ if((multichar_type2 != CHARGROUP_FLAGS_ONECHAR_ASCII) &&
2334+ (multichar_type2 != CHARGROUP_FLAGS_ONECHAR_ASCII))//second char in range is multichar
2335+ {
2336+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MULTICHAR_IN_CHAR_RANGE)) );
2337+ }
2338+ //chargroup->addCharRange(c1, c3);
2339+ if((multichar_type == CHARGROUP_FLAGS_ONECHAR_ASCII) && (multichar_type2 == CHARGROUP_FLAGS_ONECHAR_ASCII))
2340+ {
2341+ if(flags & REGEX_ASCII_CASE_INSENSITIVE)
2342+ chargroup->addCharMatch(new CRegexXQuery_char_range_ascii_i(current_regex,
2343+ (char)charmatch->get_c(),
2344+ (char)charmatch2->get_c()));
2345+ else
2346+ chargroup->addCharMatch(new CRegexXQuery_char_range_ascii(current_regex,
2347+ (char)charmatch->get_c(),
2348+ (char)charmatch2->get_c()));
2349+ }
2350+ else
2351+ {
2352+ if(flags & REGEX_ASCII_CASE_INSENSITIVE)
2353+ chargroup->addCharMatch(new CRegexXQuery_char_range_unicode_i(current_regex,
2354+ charmatch->get_c(),
2355+ charmatch2->get_c()));
2356+ else
2357+ chargroup->addCharMatch(new CRegexXQuery_char_range_unicode(current_regex,
2358+ charmatch->get_c(),
2359+ charmatch2->get_c()));
2360+ }
2361+ *chargroup_len += c2_len;
2362 continue;
2363 }
2364 }
2365- chargroup->addCharRange(c1, c1);
2366- *chargroup_len += c1_len;
2367+ //chargroup->addOneChar(c1);
2368+ chargroup->addCharMatch(charmatch.release());
2369 }
2370 if(pattern[*chargroup_len])
2371 (*chargroup_len)++;
2372- return chargroup;
2373+ else
2374+ {
2375+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MISSING_CLOSE_BRACKET)) );
2376+ }
2377+ return chargroup.release();
2378 }
2379
2380-void CRegexAscii_parser::read_quantifier(CRegexAscii_piece *piece,
2381+void CRegexXQuery_parser::read_quantifier(CRegexXQuery_piece *piece,
2382 const char *pattern, int *quantif_len)
2383 {
2384 *quantif_len = 0;
2385@@ -496,6 +943,10 @@
2386 max = max*10 + pattern[*quantif_len] - '0';
2387 (*quantif_len)++;
2388 }
2389+ if(max < min)
2390+ {
2391+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MAX_LT_MIN)) );
2392+ }
2393 piece->set_quantifier_min_max(min, max, true);
2394 }
2395 while(pattern[*quantif_len] && (pattern[*quantif_len] != '}'))
2396@@ -524,23 +975,25 @@
2397 ///Constructors and destructors and internal functions
2398 ////////////////////////////
2399
2400-CRegexAscii_regex::CRegexAscii_regex(CRegexAscii_regex *topregex) : IRegexAtom(topregex?topregex:this)
2401+CRegexXQuery_regex::CRegexXQuery_regex(CRegexXQuery_regex *topregex) : IRegexAtom(topregex?topregex:this)
2402 {
2403 matched_source = NULL;
2404 matched_len = 0;
2405+// backup_matched_source = NULL;
2406+// backup_matched_len = 0;
2407 flags = 128;//set to 0 after initialization
2408 }
2409
2410-CRegexAscii_regex::~CRegexAscii_regex()
2411+CRegexXQuery_regex::~CRegexXQuery_regex()
2412 {
2413- std::list<CRegexAscii_branch*>::iterator branch_it;
2414+ std::list<CRegexXQuery_branch*>::iterator branch_it;
2415
2416 for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++)
2417 {
2418 delete (*branch_it);
2419 }
2420 /*
2421- std::vector<CRegexAscii_regex*>::iterator subregex_it;
2422+ std::vector<CRegexXQuery_regex*>::iterator subregex_it;
2423 for(subregex_it = subregex.begin(); subregex_it != subregex.end(); subregex_it++)
2424 {
2425 delete (*subregex_it);
2426@@ -548,25 +1001,18 @@
2427 */
2428 }
2429
2430-bool CRegexAscii_regex::set_align_begin(bool align_begin)
2431-{
2432- bool prev_align = this->align_begin;
2433- this->align_begin = align_begin;
2434- return prev_align;
2435-}
2436-
2437-void CRegexAscii_regex::add_branch(CRegexAscii_branch *branch)
2438+void CRegexXQuery_regex::add_branch(CRegexXQuery_branch *branch)
2439 {
2440 branch_list.push_back(branch);
2441 }
2442
2443-bool CRegexAscii_regex::get_indexed_match(int index,
2444+bool CRegexXQuery_regex::get_indexed_match(int index,
2445 const char **matched_source,
2446 int *matched_len)
2447 {
2448 if(!index || index > (int)subregex.size())
2449 return false;
2450- CRegexAscii_regex *subr = subregex[index-1];
2451+ CRegexXQuery_regex *subr = subregex[index-1];
2452 *matched_source = subr->matched_source;
2453 if(!*matched_source)
2454 return false;
2455@@ -574,145 +1020,209 @@
2456 return true;
2457 }
2458
2459-unsigned int CRegexAscii_regex::get_indexed_regex_count()
2460+unsigned int CRegexXQuery_regex::get_indexed_regex_count()
2461 {
2462 return subregex.size();
2463 }
2464
2465-CRegexAscii_branch::CRegexAscii_branch(CRegexAscii_regex* regex) :
2466- IRegexMatcher(regex)
2467+CRegexXQuery_branch::CRegexXQuery_branch(CRegexXQuery_regex* regex)
2468+ //:
2469+ //IRegexMatcher(regex)
2470 {
2471 }
2472
2473-CRegexAscii_branch::~CRegexAscii_branch()
2474+CRegexXQuery_branch::~CRegexXQuery_branch()
2475 {
2476- std::list<CRegexAscii_piece*>::iterator piece_it;
2477+ std::list<RegexAscii_pieceinfo>::iterator piece_it;
2478
2479 for(piece_it = piece_list.begin(); piece_it != piece_list.end(); piece_it++)
2480 {
2481- delete (*piece_it);
2482+ delete (*piece_it).piece;
2483 }
2484 }
2485
2486-void CRegexAscii_branch::add_piece(CRegexAscii_piece *piece)
2487+void CRegexXQuery_branch::add_piece(CRegexXQuery_piece *piece)
2488 {
2489 piece_list.push_back(piece);
2490 }
2491
2492-CRegexAscii_piece::CRegexAscii_piece()
2493+CRegexXQuery_piece::CRegexXQuery_piece()
2494 {
2495+ atom = NULL;
2496+ regex_atom = NULL;
2497 }
2498
2499-CRegexAscii_piece::~CRegexAscii_piece()
2500+CRegexXQuery_piece::~CRegexXQuery_piece()
2501 {
2502 delete atom;
2503 }
2504
2505-void CRegexAscii_piece::set_atom(IRegexAtom *atom)
2506+void CRegexXQuery_piece::set_atom(IRegexAtom *atom)
2507 {
2508 this->atom = atom;
2509+ this->regex_atom = dynamic_cast<CRegexXQuery_regex*>(atom);
2510 }
2511
2512-void CRegexAscii_piece::set_quantifier_min_max(int min, int max, bool strict_max)
2513+void CRegexXQuery_piece::set_quantifier_min_max(int min, int max, bool strict_max)
2514 {
2515 this->min = min;
2516 this->max = max;
2517 this->strict_max = strict_max;
2518 }
2519-void CRegexAscii_piece::set_is_reluctant(bool is_reluctant)
2520+void CRegexXQuery_piece::set_is_reluctant(bool is_reluctant)
2521 {
2522 this->is_reluctant = is_reluctant;
2523 }
2524-void CRegexAscii_piece::get_quantifier(int *min, int *max, bool *strict_max)
2525+void CRegexXQuery_piece::get_quantifier(int *min, int *max, bool *strict_max)
2526 {
2527 *min = this->min;
2528 *max = this->max;
2529 *strict_max = this->strict_max;
2530 }
2531-bool CRegexAscii_piece::get_is_reluctant()
2532+bool CRegexXQuery_piece::get_is_reluctant()
2533 {
2534+ if(atom->regex_intern->flags & REGEX_ASCII_MINIMAL_MATCH)
2535+ return true;
2536 return is_reluctant;
2537 }
2538
2539
2540-CRegexAscii_chargroup::CRegexAscii_chargroup(CRegexAscii_regex* regex) :
2541+CRegexXQuery_charmatch::CRegexXQuery_charmatch(CRegexXQuery_regex* regex) :
2542+ IRegexAtom(regex)
2543+{
2544+}
2545+CRegexXQuery_multicharP::CRegexXQuery_multicharP(CRegexXQuery_regex* regex, char type, bool is_reverse) :
2546+ CRegexXQuery_charmatch(regex)
2547+{
2548+ this->multichar_type = type; this->is_reverse = is_reverse;
2549+}
2550+CRegexXQuery_multicharIs::CRegexXQuery_multicharIs(CRegexXQuery_regex* regex, int block_index, bool is_reverse) :
2551+ CRegexXQuery_charmatch(regex)
2552+{
2553+ this->block_index = block_index; this->is_reverse = is_reverse;
2554+}
2555+CRegexXQuery_multicharOther::CRegexXQuery_multicharOther(CRegexXQuery_regex* regex, char type) :
2556+ CRegexXQuery_charmatch(regex)
2557+{
2558+ this->multichar_type = type;
2559+}
2560+CRegexXQuery_char_ascii::CRegexXQuery_char_ascii(CRegexXQuery_regex* regex, char c) :
2561+ CRegexXQuery_charmatch(regex)
2562+{
2563+ this->c = c;
2564+}
2565+CRegexXQuery_char_ascii_i::CRegexXQuery_char_ascii_i(CRegexXQuery_regex* regex, char c) :
2566+ CRegexXQuery_char_ascii(regex, toupper(c))
2567+{
2568+}
2569+CRegexXQuery_char_range_ascii::CRegexXQuery_char_range_ascii(CRegexXQuery_regex* regex, char c1, char c2) :
2570+ CRegexXQuery_charmatch(regex)
2571+{
2572+ this->c1 = c1; this->c2 = c2;
2573+}
2574+CRegexXQuery_char_range_ascii_i::CRegexXQuery_char_range_ascii_i(CRegexXQuery_regex* regex, char c1, char c2) :
2575+ CRegexXQuery_char_range_ascii(regex, toupper(c1), toupper(c2))
2576+{
2577+}
2578+CRegexXQuery_char_unicode::CRegexXQuery_char_unicode(CRegexXQuery_regex* regex, const char *source, int len) :
2579+ CRegexXQuery_charmatch(regex)
2580+{
2581+ this->len = len;
2582+ memcpy(c, source, len);
2583+}
2584+CRegexXQuery_char_unicode_cp::CRegexXQuery_char_unicode_cp(CRegexXQuery_regex* regex, unicode::code_point c) :
2585+ CRegexXQuery_charmatch(regex)
2586+{
2587+ this->c = c;
2588+}
2589+CRegexXQuery_char_unicode_i::CRegexXQuery_char_unicode_i(CRegexXQuery_regex* regex, unicode::code_point c) :
2590+ CRegexXQuery_char_unicode_cp(regex, unicode::to_upper(c))
2591+{
2592+}
2593+CRegexXQuery_char_range_unicode::CRegexXQuery_char_range_unicode(CRegexXQuery_regex* regex, unicode::code_point c1, unicode::code_point c2) :
2594+ CRegexXQuery_charmatch(regex)
2595+{
2596+ this->c1 = c1; this->c2 = c2;
2597+}
2598+CRegexXQuery_char_range_unicode_i::CRegexXQuery_char_range_unicode_i(CRegexXQuery_regex* regex, unicode::code_point c1, unicode::code_point c2) :
2599+ CRegexXQuery_char_range_unicode(regex, unicode::to_upper(c1), unicode::to_upper(c2))
2600+{
2601+}
2602+CRegexXQuery_endline::CRegexXQuery_endline(CRegexXQuery_regex* regex) :
2603+ CRegexXQuery_charmatch(regex)
2604+{
2605+}
2606+
2607+unicode::code_point CRegexXQuery_char_unicode::get_c()
2608+{
2609+ const char *temp_c = (const char*)c;
2610+ return utf8::next_char(temp_c);
2611+}
2612+
2613+
2614+CRegexXQuery_chargroup::CRegexXQuery_chargroup(CRegexXQuery_regex* regex) :
2615 IRegexAtom(regex)
2616 {
2617 classsub = NULL;
2618 }
2619
2620-CRegexAscii_chargroup::~CRegexAscii_chargroup()
2621+CRegexXQuery_chargroup::~CRegexXQuery_chargroup()
2622 {
2623 delete classsub;
2624-}
2625-
2626-void CRegexAscii_chargroup::addMultiChar(char c)
2627-{
2628- chargroup_t cgt;
2629- cgt.flags = CHARGROUP_FLAGS_MULTICHAR;
2630- cgt.c1 = c;
2631- cgt.c2 = 0;
2632- chargroup_list.push_back(cgt);
2633-}
2634-
2635-void CRegexAscii_chargroup::addEndLine()
2636-{
2637- chargroup_t cgt;
2638- cgt.flags = CHARGROUP_FLAGS_ENDLINE;
2639- cgt.c1 = '$';
2640- cgt.c2 = 0;
2641- chargroup_list.push_back(cgt);
2642-}
2643-
2644-void CRegexAscii_chargroup::addCharRange(char c1, char c2)
2645-{
2646- chargroup_t cgt;
2647- cgt.flags = 0;
2648- cgt.c1 = c1;
2649- cgt.c2 = c2;
2650- chargroup_list.push_back(cgt);
2651-}
2652-
2653-void CRegexAscii_chargroup::addClassSub(CRegexAscii_chargroup* classsub)
2654+ std::list<CRegexXQuery_charmatch* >::iterator charmatch_it;
2655+ for(charmatch_it=chargroup_list.begin(); charmatch_it != chargroup_list.end(); charmatch_it++)
2656+ delete (*charmatch_it);
2657+}
2658+
2659+void CRegexXQuery_chargroup::addCharMatch(CRegexXQuery_charmatch *charmatch)
2660+{
2661+ chargroup_list.push_back(charmatch);
2662+}
2663+void CRegexXQuery_chargroup::addClassSub(CRegexXQuery_chargroup* classsub)
2664 {
2665 this->classsub = classsub;
2666 }
2667
2668-CRegexAscii_negchargroup::CRegexAscii_negchargroup(CRegexAscii_regex* regex) :
2669- CRegexAscii_chargroup(regex)
2670-{
2671-}
2672-
2673-CRegexAscii_negchargroup::~CRegexAscii_negchargroup()
2674-{
2675-}
2676-
2677-CRegexAscii_wildchar::CRegexAscii_wildchar(CRegexAscii_regex* regex) :
2678+CRegexXQuery_negchargroup::CRegexXQuery_negchargroup(CRegexXQuery_regex* regex) :
2679+ CRegexXQuery_chargroup(regex)
2680+{
2681+}
2682+
2683+CRegexXQuery_negchargroup::~CRegexXQuery_negchargroup()
2684+{
2685+}
2686+
2687+CRegexXQuery_wildchar::CRegexXQuery_wildchar(CRegexXQuery_regex* regex) :
2688 IRegexAtom(regex)
2689 {
2690 }
2691
2692-CRegexAscii_wildchar::~CRegexAscii_wildchar()
2693+CRegexXQuery_wildchar::~CRegexXQuery_wildchar()
2694 {
2695 }
2696
2697-CRegexAscii_backref::CRegexAscii_backref(CRegexAscii_regex* regex, unsigned int backref_) :
2698+CRegexXQuery_backref::CRegexXQuery_backref(CRegexXQuery_regex* regex, unsigned int backref_) :
2699 IRegexAtom(regex),
2700 backref(backref_)
2701 {
2702 }
2703
2704-CRegexAscii_backref::~CRegexAscii_backref()
2705-{
2706-}
2707-
2708-CRegexAscii_parser::CRegexAscii_parser()
2709+CRegexXQuery_backref::~CRegexXQuery_backref()
2710+{
2711+}
2712+
2713+CRegexXQuery_pinstart::CRegexXQuery_pinstart(CRegexXQuery_regex* regex):
2714+ IRegexAtom(regex)
2715+{
2716+}
2717+
2718+CRegexXQuery_parser::CRegexXQuery_parser()
2719 {
2720 current_regex = NULL;
2721 regex_depth = 0;
2722 }
2723
2724-CRegexAscii_parser::~CRegexAscii_parser()
2725+CRegexXQuery_parser::~CRegexXQuery_parser()
2726 {
2727 }
2728
2729@@ -720,9 +1230,68 @@
2730 //////////////////////////////////////////
2731 ////Matching the pattern on a string
2732 /////////////////////////////////////////
2733+static std::list<RegexAscii_pieceinfo> empty_pieces;//empty list of pieces
2734+/*
2735+std::list<RegexAscii_pieceinfo>::iterator
2736+IRegexAtom::choose_next_piece(const char *source, int *matched_len,
2737+ std::list<RegexAscii_pieceinfo>::iterator this_piece,
2738+ std::list<RegexAscii_pieceinfo>::iterator end_piece)
2739+{
2740+ //if this_piece is repetition, repeat until max, then go to next piece
2741+ int min, max;
2742+ bool strict_max;
2743+ while(this_piece != end_piece)
2744+ {
2745+ (*this_piece).piece->get_quantifier(&min, &max, &strict_max);
2746+ if(max <= ((*this_piece).nr_matches))//finished this piece
2747+ {
2748+ this_piece++;
2749+ }
2750+ else
2751+ break;
2752+ }
2753+ return this_piece;
2754+}
2755+*/
2756+
2757+bool IRegexAtom::match(const char *source, int *start_from_branch, int *matched_len,
2758+ std::list<RegexAscii_pieceinfo>::iterator this_piece,
2759+ std::list<RegexAscii_pieceinfo>::iterator end_piece)
2760+{
2761+ *start_from_branch = 0;
2762+ bool retmatch;
2763+ retmatch = match_internal(source, start_from_branch, matched_len);
2764+ if(!retmatch)
2765+ return false;
2766+
2767+ if(this_piece == end_piece)
2768+ return true;
2769+
2770+ (*this_piece).nr_matches++;
2771+ int min,max;
2772+ bool strict_max;
2773+ (*this_piece).piece->get_quantifier(&min, &max, &strict_max);
2774+ std::list<RegexAscii_pieceinfo>::iterator init_piece = this_piece;
2775+ if(((min == 1) && (max == 1)) || //the simple common case
2776+ ((*matched_len == 0) && ((*this_piece).nr_matches>=min)))//to avoid infinite loop
2777+ {
2778+ this_piece++;
2779+ if(this_piece == end_piece)
2780+ return true;
2781+ }
2782+ int matched_len2;
2783+ retmatch = (*this_piece).piece->match_piece(this_piece, end_piece, source + *matched_len, &matched_len2);
2784+ if(!retmatch)
2785+ {
2786+ (*init_piece).nr_matches--;
2787+ return false;
2788+ }
2789+ *matched_len += matched_len2;
2790+ return true;
2791+}
2792
2793 //try every position in source to match the pattern
2794-bool CRegexAscii_regex::match_anywhere(const char *source, unsigned int flags,
2795+bool CRegexXQuery_regex::match_anywhere(const char *source, unsigned int flags,
2796 int *match_pos, int *matched_len)
2797 {
2798 *match_pos = 0;
2799@@ -730,43 +1299,66 @@
2800 return match_from(source, flags, match_pos, matched_len);
2801 }
2802
2803-bool CRegexAscii_regex::match_from(const char *source, unsigned int flags,
2804+bool CRegexXQuery_regex::match_from(const char *source, unsigned int flags,
2805 int *match_pos, int *matched_len)
2806 {
2807 this->flags = flags;
2808+ this->source_start = source;
2809 reachedEnd = false;
2810
2811- std::vector<CRegexAscii_regex*>::iterator regex_it;
2812+ std::vector<CRegexXQuery_regex*>::iterator regex_it;
2813 for(regex_it = subregex.begin(); regex_it != subregex.end(); regex_it++)
2814 {
2815 (*regex_it)->matched_source = NULL;
2816 }
2817-// if(!source[0])
2818-// {
2819-// if(branch_list.empty())
2820-// return true;
2821-// else
2822-// return false;
2823-// }
2824-
2825- bool skip_first_match = false;
2826- if(*match_pos && align_begin)
2827- skip_first_match = true;
2828+
2829+ std::vector<std::pair<const char*, int> > saved_subregex;
2830+
2831+ if(*match_pos && (flags & REGEX_ASCII_WHOLE_MATCH))
2832+ return false;
2833+
2834 do
2835 {
2836- if(!skip_first_match)
2837- {
2838- if(match(source + *match_pos, matched_len))
2839- return true;
2840- }
2841- skip_first_match = false;
2842- if(align_begin)
2843+ int start_from_branch = 0;
2844+ int longest_match = -1;
2845+ while(1)
2846+ {
2847+ if(!match(source + *match_pos, &start_from_branch, matched_len, empty_pieces.begin(), empty_pieces.end()))
2848+ break;
2849+ if(longest_match < *matched_len)
2850+ {
2851+ longest_match = *matched_len;
2852+ if(start_from_branch && (flags & REGEX_ASCII_GET_LONGEST_BRANCH))
2853+ save_subregex_list(saved_subregex);
2854+ }
2855+ if(!start_from_branch || !(flags & REGEX_ASCII_GET_LONGEST_BRANCH))
2856+ break;
2857+ //else try the other branches to see which is longer
2858+ }
2859+ if(longest_match != -1)
2860+ {
2861+ *matched_len = longest_match;
2862+ if(saved_subregex.size())
2863+ load_subregex_list(saved_subregex);
2864+ if(flags & REGEX_ASCII_WHOLE_MATCH)
2865+ {
2866+ if(!source[*match_pos+*matched_len])
2867+ return true;
2868+ if((flags & REGEX_ASCII_MULTILINE) &&
2869+ ((source[*match_pos+*matched_len] == '\n') || (source[*match_pos+*matched_len] == '\r')))
2870+ return true;
2871+ return false;
2872+ }
2873+ return true;
2874+ }
2875+
2876+ if(flags & REGEX_ASCII_WHOLE_MATCH)
2877 {
2878 if(flags & REGEX_ASCII_MULTILINE)
2879 {
2880- //goto the next line
2881+ //go to next line
2882 while(source[*match_pos] && (source[*match_pos] != '\n') && (source[*match_pos] != '\r'))
2883- (*match_pos)++;
2884+ (*match_pos) += myutf8len(source);
2885 if(source[*match_pos] == '\n')
2886 {
2887 (*match_pos)++;
2888@@ -780,190 +1372,1039 @@
2889 (*match_pos)++;
2890 }
2891 if(!source[*match_pos])
2892- return false;
2893+ break;
2894 continue;
2895 }
2896- return false;
2897+ break;
2898 }
2899 if(!source[*match_pos])
2900 break;
2901- (*match_pos)++;
2902+ (*match_pos) += myutf8len(source);
2903 }
2904 while(source[*match_pos]);
2905+// if(!source[*match_pos])
2906+// {
2907+// reachedEnd = true;
2908+// }
2909 return false;
2910 }
2911
2912+void CRegexXQuery_regex::reset_match()
2913+{
2914+// this->backup_matched_source = this->matched_source;
2915+// this->backup_matched_len = this->matched_len;
2916+ this->matched_source = NULL;
2917+ this->matched_len = 0;
2918+ std::list<CRegexXQuery_branch*>::iterator branch_it;
2919+ for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++)
2920+ {
2921+ (*branch_it)->reset();
2922+ }
2923+}
2924+/*
2925+void CRegexXQuery_regex::restore_match()
2926+{
2927+ this->matched_source = this->backup_matched_source;
2928+ this->matched_len = this->backup_matched_len;
2929+ std::list<CRegexXQuery_branch*>::iterator branch_it;
2930+ for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++)
2931+ {
2932+ (*branch_it)->restore();
2933+ }
2934+}
2935+*/
2936 //match any of the branches
2937-bool CRegexAscii_regex::match(const char *source, int *matched_len)
2938+bool CRegexXQuery_regex::match(const char *source, int *start_from_branch, int *matched_len,
2939+ std::list<RegexAscii_pieceinfo>::iterator next_piece,
2940+ std::list<RegexAscii_pieceinfo>::iterator end_piece)
2941 {
2942 reachedEnd = false;
2943- std::list<CRegexAscii_branch*>::iterator branch_it;
2944-
2945- for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++)
2946- {
2947- if((*branch_it)->match(source, matched_len))
2948- {
2949- matched_source = source;
2950- this->matched_len = *matched_len;
2951+ if(!(flags & REGEX_ASCII_GROUPING_LEN_WHOLE_PIECE) ||
2952+ (this->matched_source == NULL) || ((this->matched_source + this->matched_len) != source))
2953+ this->matched_source = source;
2954+ *matched_len = 0;
2955+ std::list<CRegexXQuery_branch*>::iterator branch_it;
2956+
2957+ if(*start_from_branch == 0)
2958+ {
2959+ for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++)
2960+ {
2961+ (*branch_it)->reset();
2962+ }
2963+ }
2964+
2965+ branch_it = branch_list.begin();
2966+ if(*start_from_branch)
2967+ {
2968+ for(int i=0;i<*start_from_branch;i++)
2969+ branch_it++;
2970+ }
2971+ (*start_from_branch)++;
2972+ for(; branch_it != branch_list.end(); branch_it++,(*start_from_branch)++)
2973+ {
2974+ if((*branch_it)->match(source, matched_len, this, next_piece, end_piece))
2975+ {
2976+ //matched_source = source;
2977+ //this->matched_len = *matched_len;
2978 return true;
2979 }
2980 }
2981- matched_source = NULL;
2982- matched_len = 0;
2983+ *start_from_branch = 0;
2984+ if(this->matched_source == source)
2985+ this->matched_source = NULL;
2986+ *matched_len = 0;
2987 return false;
2988 }
2989
2990+void CRegexXQuery_regex::save_subregex_list(std::vector<std::pair<const char*, int> > &saved_subregex)
2991+{
2992+ saved_subregex.resize(0);
2993+ saved_subregex.reserve(subregex.size());
2994+ std::vector<CRegexXQuery_regex*>::iterator it;
2995+ for(it=subregex.begin(); it != subregex.end(); it++)
2996+ {
2997+ saved_subregex.push_back(std::pair<const char*, int>((*it)->matched_source, (*it)->matched_len));
2998+ }
2999+}
3000+
3001+void CRegexXQuery_regex::load_subregex_list(std::vector<std::pair<const char*, int> > &saved_subregex)
3002+{
3003+ std::vector<std::pair<const char*, int> >::iterator it;
3004+ std::vector<CRegexXQuery_regex*>::iterator subit;
3005+ for(it=saved_subregex.begin(), subit = subregex.begin(); it != saved_subregex.end(); it++, subit++)
3006+ {
3007+ (*subit)->matched_source = (*it).first;
3008+ (*subit)->matched_len = (*it).second;
3009+ }
3010+}
3011+
3012+void CRegexXQuery_branch::reset()
3013+{
3014+ std::list<RegexAscii_pieceinfo>::iterator piece_it;
3015+ for(piece_it = piece_list.begin(); piece_it != piece_list.end(); piece_it++)
3016+ {
3017+ (*piece_it).piece->atom->reset_match();
3018+ }
3019+}
3020+/*
3021+void CRegexXQuery_branch::restore()
3022+{
3023+ std::list<RegexAscii_pieceinfo>::iterator piece_it;
3024+ for(piece_it = piece_list.begin(); piece_it != piece_list.end(); piece_it++)
3025+ {
3026+ (*piece_it).piece->atom->restore_match();
3027+ }
3028+}
3029+*/
3030 //match all the pieces
3031-bool CRegexAscii_branch::match(const char *source, int *matched_len)
3032+bool CRegexXQuery_branch::match(const char *source, int *matched_len,
3033+ CRegexXQuery_regex* group_regex,
3034+ std::list<RegexAscii_pieceinfo>::iterator next_piece,
3035+ std::list<RegexAscii_pieceinfo>::iterator end_piece)
3036 {
3037- std::list<CRegexAscii_piece*>::iterator piece_it;
3038+ std::list<RegexAscii_pieceinfo>::iterator piece_it;
3039
3040 piece_it = piece_list.begin();
3041+ //if(piece_it == piece_list.end())
3042+ //if(!source[0])
3043+ // return true;
3044+ //else
3045+ // return false;
3046 if(piece_it == piece_list.end())
3047- if(source[0])
3048- return false;
3049+ {
3050+ piece_it = next_piece;
3051+ if(next_piece == end_piece)
3052+ {
3053+ group_regex->matched_len = 0;
3054+ return true;
3055+ }
3056+ }
3057+
3058+ std::list<RegexAscii_pieceinfo> temp_pieces(piece_list);
3059+ temp_pieces.push_back(group_regex);//this will be used to store the group match
3060+ temp_pieces.insert(temp_pieces.end(), next_piece, end_piece);
3061+
3062+ return (*piece_it).piece->match_piece(temp_pieces.begin(), temp_pieces.end(), source, matched_len);
3063+}
3064+
3065+bool CRegexXQuery_piece::match_piece(std::list<RegexAscii_pieceinfo>::iterator piece_it,
3066+ std::list<RegexAscii_pieceinfo>::iterator end_it,
3067+ const char *source, int *matched_len)
3068+{
3069+ if((*piece_it).nr_matches < 0)
3070+ {
3071+ //special case, store the group match
3072+ (*piece_it).group_regex->matched_len = source - (*piece_it).group_regex->matched_source;
3073+ piece_it++;
3074+ if(piece_it == end_it)
3075+ return true;
3076 else
3077- return true;
3078- if(!(*piece_it)->get_is_reluctant())
3079- return match_piece_iter_normal(piece_it, source, matched_len);
3080+ return (*piece_it).piece->match_piece(piece_it, end_it, source, matched_len);
3081+ }
3082+
3083+ if(!get_is_reluctant())
3084+ return match_piece_iter_normal(piece_it, end_it, source, matched_len);
3085 else
3086- return match_piece_iter_reluctant(piece_it, source, matched_len);
3087-}
3088-
3089-//match as less as possible
3090-bool CRegexAscii_branch::match_piece_iter_reluctant(
3091- std::list<CRegexAscii_piece*>::iterator piece_it,
3092+ return match_piece_iter_reluctant(piece_it, end_it, source, matched_len);
3093+}
3094+
3095+int CRegexXQuery_piece::choose_another_branch(std::vector<std::pair<int,int> > &match_lens)
3096+{
3097+ int i = match_lens.size()-1;
3098+ i--;
3099+ while((i >= 0) && (match_lens.at(i).second == 0))
3100+ i--;
3101+ if(i < 0)
3102+ return -1;//no more branches
3103+ match_lens.resize(i+1);
3104+ i++;
3105+ return i;
3106+}
3107+
3108+bool CRegexXQuery_piece::is_regex_atom()
3109+{
3110+ return regex_atom != NULL;
3111+}
3112+
2113+//match as little as possible (shortest string)
3114+bool CRegexXQuery_piece::match_piece_iter_reluctant(
3115+ std::list<RegexAscii_pieceinfo>::iterator piece_it,
3116+ std::list<RegexAscii_pieceinfo>::iterator end_it,
3117 const char *source, int *matched_len)
3118 {
3119 *matched_len = 0;
3120- if(piece_it == piece_list.end())
3121+ if(piece_it == end_it)
3122 return true;
3123
3124 int min, max;
3125 bool strict_max;
3126 //std::vector<int> match_lens;
3127- (*piece_it)->get_quantifier(&min, &max, &strict_max);
3128- if(strict_max && (max >= 0))
3129+ (*piece_it).piece->get_quantifier(&min, &max, &strict_max);
3130+
3131+ std::vector<std::pair<const char*, int> > saved_subregex;
3132+
3133+ if(is_regex_atom())
3134 {
3135- int timeslen;
3136- //check if the piece doesn't exceed the max match
3137- if((*piece_it)->match_piece_times(source, &timeslen, max+1, NULL))
3138- return false;///too many matches
3139+ //recursive
3140+ bool retmatch;
3141+ atom->regex_intern->save_subregex_list(saved_subregex);
3142+ if((*piece_it).nr_matches >= min)
3143+ {
3144+ //go to next piece
3145+ std::list<RegexAscii_pieceinfo>::iterator next_it = piece_it;
3146+ next_it++;
3147+ if(next_it == end_it)
3148+ return true;
3149+ retmatch = (*next_it).piece->match_piece(next_it, end_it, source, matched_len);
3150+ if(retmatch)
3151+ return true;
3152+ }
3153+ if(((max == -1) || ((*piece_it).nr_matches < max)) &&//try further with this piece
3154+ (((*piece_it).nr_matches < min) || ((*piece_it).nr_matches == 0) || ((*piece_it).piece->regex_atom->matched_len)))//if matched_len is zero, avoid infinite loop
3155+ {
3156+ int start_from_branch = 0;
3157+ int shortest_len = -1;
3158+ bool branch_saved = false;
3159+ //try all branches to get the shortest len
3160+ (*piece_it).nr_matches++;
3161+ while(atom->match(source, &start_from_branch, matched_len, piece_it, end_it))
3162+ {
3163+ if((shortest_len == -1) || (shortest_len > *matched_len))
3164+ {
3165+ shortest_len = *matched_len;
3166+ if(start_from_branch && (atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH))
3167+ {
3168+ atom->regex_intern->save_subregex_list(saved_subregex);
3169+ branch_saved = true;
3170+ }
3171+ }
3172+ if(!start_from_branch || !(atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH))
3173+ break;
3174+ }
3175+ if(shortest_len != -1)
3176+ {
3177+ *matched_len = shortest_len;
3178+ if(branch_saved)
3179+ atom->regex_intern->load_subregex_list(saved_subregex);
3180+ return true;
3181+ }
3182+ else
3183+ {
3184+ (*piece_it).nr_matches--;
3185+ atom->regex_intern->load_subregex_list(saved_subregex);
3186+ return false;
3187+ }
3188+ }
3189+ else
3190+ {
3191+ atom->regex_intern->load_subregex_list(saved_subregex);
3192+ return false;
3193+ }
3194 }
3195
3196- int i=min;
3197- std::list<CRegexAscii_piece*>::iterator next_it = piece_it;
3198+ int i=0;
3199+ int shortest_len = -1;
3200+ int otherpieces_shortest = -1;
3201+ int i_shortest = -1;
3202+ std::list<RegexAscii_pieceinfo>::iterator next_it = piece_it;
3203+ std::vector<std::pair<int,int> > match_lens;
3204 next_it++;
3205 int pieceslen = 0;
3206 while(1)
3207 {
3208- if((max > 0) && (i>max))
3209- break;
3210- int piecelen = 0;
3211- if((*piece_it)->match_piece_times(source+pieceslen, &piecelen, !pieceslen ? i : 1, NULL))
3212- {
3213- pieceslen += piecelen;
3214+ int piecelen = 0;
3215+ bool retmatch;
3216+ retmatch = match_piece_times(source, &piecelen, i < min ? min : i, &match_lens);
3217+ i = match_lens.size()-1;//number of matches
3218+ if(i<0)
3219+ i = 0;
3220+ if((i>=min))
3221+ {
3222+ pieceslen = piecelen;
3223+ if((shortest_len >= 0) && (shortest_len <= pieceslen))//this branch is longer
3224+ {//try another branch
3225+ i = choose_another_branch(match_lens);
3226+ if(i >= 0)
3227+ continue;//try another branch
3228+ else
3229+ break;
3230+ }
3231 int otherpieces = 0;
3232- if((next_it == piece_list.end()) ||
3233- ((*next_it)->get_is_reluctant() && match_piece_iter_reluctant(next_it, source+pieceslen, &otherpieces)) ||
3234- (!(*next_it)->get_is_reluctant() && match_piece_iter_normal(next_it, source+pieceslen, &otherpieces)))
3235- {
3236- *matched_len = pieceslen + otherpieces;
3237- return true;
3238- }
3239+ if((next_it == end_it) ||
3240+ (*next_it).piece->match_piece(next_it, end_it, source+pieceslen, &otherpieces)
3241+ )
3242+ {
3243+ if((i == pieceslen) || (match_lens.at(0).second == 0) ||//minimum achieved already, cannot go lower than that
3244+ !(atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH))
3245+ {
3246+ *matched_len = pieceslen + otherpieces;
3247+ return true;
3248+ }
3249+ if((shortest_len < 0) || (shortest_len > pieceslen))
3250+ {
3251+ shortest_len = pieceslen;
3252+ otherpieces_shortest = otherpieces;
3253+ i_shortest = i;
3254+ if(match_lens.at(0).second != 0)
3255+ atom->regex_intern->save_subregex_list(saved_subregex);
3256+ }
3257+ i = choose_another_branch(match_lens);
3258+ if(i >= 0)
3259+ continue;//try another branch
3260+ else
3261+ break;
3262+ }
3263+ else
3264+ {
3265+ //try further
3266+ if(retmatch)
3267+ {
3268+ i++;
3269+ if((max < 0) || (i<=max))
3270+ continue;
3271+ i--;
3272+ }
3273+ }
3274+ }
3275+
3276+ if(i==0)
3277+ {
3278+ break;
3279 }
3280 else
3281- break;
3282- i++;
3283+ {
3284+ i = choose_another_branch(match_lens);
3285+ if(i >= 0)
3286+ continue;//try another branch
3287+ else
3288+ break;
3289+ }
3290 }
3291
3292+ if(shortest_len >= 0)
3293+ {
3294+ if(strict_max && (max>=0) && (i_shortest > max))
3295+ return false;
3296+ *matched_len = shortest_len + otherpieces_shortest;
3297+ if(saved_subregex.size())
3298+ atom->regex_intern->load_subregex_list(saved_subregex);
3299+ return true;
3300+ }
3301 return false;
3302 }
3303
3304 //match as much as possible
3305-bool CRegexAscii_branch::match_piece_iter_normal(
3306- std::list<CRegexAscii_piece*>::iterator piece_it,
3307+bool CRegexXQuery_piece::match_piece_iter_normal(
3308+ std::list<RegexAscii_pieceinfo>::iterator piece_it,
3309+ std::list<RegexAscii_pieceinfo>::iterator end_it,
3310 const char *source, int *matched_len)
3311 {
3312 *matched_len = 0;
3313
3314 int min, max;
3315 bool strict_max;
3316- std::vector<int> match_lens;
3317- (*piece_it)->get_quantifier(&min, &max, &strict_max);
3318- int timeslen;
3319- if(strict_max && (max >= 0))
3320+ std::vector<std::pair<int,int> > match_lens;
3321+ (*piece_it).piece->get_quantifier(&min, &max, &strict_max);
3322+ int timeslen = 0;
3323+ std::vector<std::pair<const char*, int> > saved_subregex;
3324+
3325+ if(is_regex_atom())
3326 {
3327- //check if the piece doesn't exceed the max match
3328- //if((*piece_it)->match_piece_times(source, &timeslen, max+1, &match_lens))
3329- // return false;///too many matches
3330- (*piece_it)->match_piece_times(source, &timeslen, max, &match_lens);
3331+ //recursive
3332+ bool retmatch;
3333+ atom->regex_intern->save_subregex_list(saved_subregex);
3334+ if(((max == -1) || ((*piece_it).nr_matches < max)) && //try further with this piece
3335+ (((*piece_it).nr_matches < min) || ((*piece_it).nr_matches == 0) || ((*piece_it).piece->regex_atom->matched_len)))//if matched_len is zero, avoid infinite loop
3336+ {
3337+ int start_from_branch = 0;
3338+ int longest_len = -1;
3339+ bool branch_saved = false;
3340+ //try all branches to get the longest len
3341+ (*piece_it).nr_matches++;
3342+ while(atom->match(source, &start_from_branch, matched_len, piece_it, end_it))
3343+ {
3344+ if((longest_len < *matched_len))
3345+ {
3346+ longest_len = *matched_len;
3347+ if(start_from_branch && (atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH))
3348+ {
3349+ atom->regex_intern->save_subregex_list(saved_subregex);
3350+ branch_saved = true;
3351+ }
3352+ }
3353+ if(!start_from_branch || !(atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH))
3354+ break;
3355+ }
3356+ if(longest_len != -1)
3357+ {
3358+ *matched_len = longest_len;
3359+ if(branch_saved)
3360+ atom->regex_intern->load_subregex_list(saved_subregex);
3361+ return true;
3362+ }
3363+ else
3364+ {
3365+ atom->regex_intern->load_subregex_list(saved_subregex);
3366+ (*piece_it).nr_matches--;
3367+ }
3368+ }
3369+ if((*piece_it).nr_matches >= min)
3370+ {
3371+ //go to next piece
3372+ std::list<RegexAscii_pieceinfo>::iterator next_it = piece_it;
3373+ next_it++;
3374+ if(next_it == end_it)
3375+ return true;
3376+ retmatch = (*next_it).piece->match_piece(next_it, end_it, source, matched_len);
3377+ if(!retmatch)
3378+ atom->regex_intern->load_subregex_list(saved_subregex);
3379+ return retmatch;
3380+ }
3381+ else
3382+ {
3383+ // regex_atom->restore_match();
3384+ atom->regex_intern->load_subregex_list(saved_subregex);
3385+ return false;
3386+ }
3387 }
3388- else if(!strict_max && (max >= 0))
3389- (*piece_it)->match_piece_times(source, &timeslen, max, &match_lens);
3390- else
3391- (*piece_it)->match_piece_times(source, &timeslen, -1, &match_lens);
3392
3393- int i;
3394- std::list<CRegexAscii_piece*>::iterator next_it = piece_it;
3395+ int longest_len = -1;
3396+ int otherpieces_longest = -1;
3397+ int i_longest = -1;
3398+ int i = max;
3399+ std::list<RegexAscii_pieceinfo>::iterator next_it = piece_it;
3400 next_it++;
3401- if(next_it == piece_list.end())
3402+
3403+ bool retmatch;
3404+ while(1)
3405 {
3406- if((int)match_lens.size() > min)
3407- {
3408- *matched_len = timeslen;
3409- return true;
3410+ retmatch = match_piece_times(source, &timeslen, i, &match_lens);
3411+ i=match_lens.size()-1;//number of matches
3412+ if((i>=min))
3413+ {
3414+ if(timeslen < longest_len)
3415+ {//this branch is no use
3416+ i = choose_another_branch(match_lens);
3417+ if(i >= 0)
3418+ {
3419+ i = max;
3420+ continue;//try another branch
3421+ }
3422+ else
3423+ break;
3424+ }
3425+ //int piecelen = 0;
3426+ int otherpieces = 0;
3427+ if((next_it == end_it) ||
3428+ (*next_it).piece->match_piece(next_it, end_it, source+timeslen, &otherpieces)
3429+ )
3430+ {
3431+ if(timeslen > longest_len)
3432+ {
3433+ longest_len = timeslen;
3434+ otherpieces_longest = otherpieces;
3435+ i_longest = i;
3436+ if(!(atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH))
3437+ {
3438+ *matched_len = longest_len + otherpieces_longest;
3439+ return true;
3440+ }
3441+ else
3442+ {
3443+ if(match_lens.at(0).second)
3444+ atom->regex_intern->save_subregex_list(saved_subregex);
3445+ }
3446+ }
3447+ }
3448+ else
3449+ {
3450+ if(!match_lens.at(0).second)
3451+ {
3452+ match_lens.resize(match_lens.size()-1);
3453+ i--;
3454+ if(i >= 0)
3455+ continue;//try smaller
3456+ else
3457+ break;
3458+ }
3459+ else
3460+ {
3461+ i = choose_another_branch(match_lens);
3462+ if(i >= 0)
3463+ continue;//try another branch
3464+ else
3465+ break;
3466+ }
3467+ }
3468+ }
3469+ //now try another branch
3470+ i = choose_another_branch(match_lens);
3471+ if(i >= 0)
3472+ {
3473+ i = max;
3474+ continue;//try another branch
3475 }
3476 else
3477- return false;
3478- }
3479- for(i=match_lens.size()-1; i>=min; i--)
3480+ break;
3481+ }//end while
3482+
3483+ if(longest_len >= 0)
3484 {
3485- int piecelen = 0;
3486- int otherpieces = 0;
3487- if(((*next_it)->get_is_reluctant() && match_piece_iter_reluctant(next_it, source+match_lens[i]+piecelen, &otherpieces)) ||
3488- (!(*next_it)->get_is_reluctant() && match_piece_iter_normal(next_it, source+match_lens[i]+piecelen, &otherpieces)))
3489- {
3490- *matched_len = match_lens[i] + piecelen + otherpieces;
3491- return true;
3492- }
3493+ *matched_len = longest_len + otherpieces_longest;
3494+ if(saved_subregex.size())
3495+ atom->regex_intern->load_subregex_list(saved_subregex);
3496+ return true;
3497 }
3498
3499 return false;
3500 }
3501
3502-bool CRegexAscii_piece::match_piece_times(const char *source,
3503+bool CRegexXQuery_piece::match_piece_times(const char *source,
3504 int *piecelen,
3505 int times,
3506- std::vector<int> *match_lens)
3507+ std::vector<std::pair<int,int> > *match_lens)
3508 {
3509- *piecelen = 0;
3510- for(int i=0;(times < 0) || (i<times);i++)
3511- {
3512+ int i=0;
3513+ if(match_lens && match_lens->size())
3514+ {
3515+ i = match_lens->size()-1;
3516+ }
3517+ if(match_lens && match_lens->size())
3518+ *piecelen = match_lens->at(match_lens->size()-1).first;
3519+ else
3520+ *piecelen = 0;
3521+ if((times >= 0) && (i>=times))
3522+ return true;
3523+ for(;(times < 0) || (i<times);i++)
3524+ {
3525+ int atomlen;
3526+ int start_from_branch = 0;
3527+ if(match_lens && (i<(int)match_lens->size()))
3528+ start_from_branch = match_lens->at(i).second;
3529+ bool first_branch = (start_from_branch == 0);
3530+ if(!atom->match(source+*piecelen, &start_from_branch, &atomlen, empty_pieces.begin(), empty_pieces.end()))
3531+ {
3532+ if(match_lens)
3533+ {
3534+ if(i >= (int)match_lens->size())
3535+ match_lens->push_back(std::pair<int,int>(*piecelen, 0));
3536+ else
3537+ (*match_lens)[i] = std::pair<int,int>(*piecelen, 0);
3538+ }
3539+ return false;
3540+ }
3541 if(match_lens)
3542- match_lens->push_back(*piecelen);
3543- int atomlen;
3544- if(!atom->match(source+*piecelen, &atomlen))
3545- return false;
3546+ {
3547+ if(i >= (int)match_lens->size())
3548+ match_lens->push_back(std::pair<int,int>(*piecelen, start_from_branch));
3549+ else
3550+ (*match_lens)[i] = std::pair<int,int>(*piecelen, start_from_branch);
3551+ }
3552 *piecelen += atomlen;
3553 if(!atomlen && !source[*piecelen])
3554 {
3555- atom->regex_intern->reachedEnd = true;
3556+ // atom->regex_intern->set_reachedEnd(source);
3557+ break;
3558+ }
3559+ if(first_branch && (atomlen == 0))//avoid infinite loop
3560+ {
3561 break;
3562 }
3563 }
3564 if(match_lens)
3565- match_lens->push_back(*piecelen);
3566+ {
3567+ // if(i >= match_lens->size())
3568+ match_lens->push_back(std::pair<int,int>(*piecelen, 0));
3569+ // else
3570+ // (*match_lens)[i] = std::pair<int,int>(*piecelen, 0);
3571+ }
3572
3573 return true;
3574 }
3575
3576+bool CRegexXQuery_multicharP::match_internal(const char *source, int *start_from_branch, int *matched_len)
3577+{
3578+ if(!source[0])
3579+ {
3580+ regex_intern->set_reachedEnd(source);
3581+ return false;
3582+ }
3583+ bool found = false;
3584+ const char *temp_source = source;
3585+ unicode::code_point utf8c = utf8::next_char(temp_source);
3586+ switch(multichar_type)
3587+ {
3588+ case unicode::UNICODE_Ll + 50:
3589+ if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Ll) ||
3590+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Lm) ||
3591+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Lo) ||
3592+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Lt) ||
3593+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Lu))
3594+ {
3595+ if(!is_reverse)
3596+ found = true;
3597+ }
3598+ else
3599+ {
3600+ if(is_reverse)
3601+ found = true;
3602+ }
3603+ break;
3604+ case unicode::UNICODE_Mc + 50:
3605+ if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Mn) ||
3606+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Mc) ||
3607+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Me))
3608+ {
3609+ if(!is_reverse)
3610+ found = true;
3611+ }
3612+ else
3613+ {
3614+ if(is_reverse)
3615+ found = true;
3616+ }
3617+ break;
3618+ case unicode::UNICODE_Nd + 50:
3619+ if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Nd) ||
3620+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Nl) ||
3621+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_No))
3622+ {
3623+ if(!is_reverse)
3624+ found = true;
3625+ }
3626+ else
3627+ {
3628+ if(is_reverse)
3629+ found = true;
3630+ }
3631+ break;
3632+ case unicode::UNICODE_Pc + 50:
3633+ if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pc) ||
3634+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pd) ||
3635+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Ps) ||
3636+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pe) ||
3637+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pi) ||
3638+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pf) ||
3639+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Po))
3640+ {
3641+ if(!is_reverse)
3642+ found = true;
3643+ }
3644+ else
3645+ {
3646+ if(is_reverse)
3647+ found = true;
3648+ }
3649+ break;
3650+ case unicode::UNICODE_Zl + 50:
3651+ if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zs) ||
3652+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zl) ||
3653+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zp))
3654+ {
3655+ if(!is_reverse)
3656+ found = true;
3657+ }
3658+ else
3659+ {
3660+ if(is_reverse)
3661+ found = true;
3662+ }
3663+ break;
3664+ case unicode::UNICODE_Sc + 50:
3665+ if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Sm) ||
3666+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Sc) ||
3667+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Sk) ||
3668+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_So))
3669+ {
3670+ if(!is_reverse)
3671+ found = true;
3672+ }
3673+ else
3674+ {
3675+ if(is_reverse)
3676+ found = true;
3677+ }
3678+ break;
3679+ case unicode::UNICODE_Cc + 50:
3680+ if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Cc) ||
3681+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Cf) ||
3682+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Co))//ignore unicode::UNICODE_Cn
3683+ {
3684+ if(!is_reverse)
3685+ found = true;
3686+ }
3687+ else
3688+ {
3689+ if(is_reverse)
3690+ found = true;
3691+ }
3692+ break;
3693+ default:
3694+ if(unicode::check_codepoint_category(utf8c, (unicode::category)multichar_type))
3695+ {
3696+ if(!is_reverse)
3697+ found = true;
3698+ }
3699+ else
3700+ {
3701+ if(is_reverse)
3702+ found = true;
3703+ }
3704+ break;
3705+ }
3706+
3707+ if(found)
3708+ {
3709+ *matched_len = temp_source - source;
3710+ }
3711+ return found;
3712+}
3713+
3714+bool CRegexXQuery_multicharIs::match_internal(const char *source, int *start_from_branch, int *matched_len)
3715+{
3716+ if(!source[0])
3717+ {
3718+ regex_intern->set_reachedEnd(source);
3719+ return false;
3720+ }
3721+ bool found = false;
3722+ const char *temp_source = source;
3723+ unicode::code_point utf8c = utf8::next_char(temp_source);
3724+ const unicode::code_point *cp = block_escape[block_index].cp;
3725+ if((utf8c >= cp[0]) && (utf8c <= cp[1]))
3726+ {
3727+ if(!is_reverse)
3728+ found = true;
3729+ }
3730+ else if(block_escape[block_index].ext_cp)
3731+ {
3732+ cp = block_escape[block_index].ext_cp;
3733+ while(*cp)
3734+ {
3735+ if((utf8c >= cp[0]) && (utf8c <= cp[1]))
3736+ break;
3737+ cp += 2;
3738+ }
3739+ if(*cp)
3740+ {
3741+ if(!is_reverse)
3742+ found = true;
3743+ }
3744+ else
3745+ {
3746+ if(is_reverse)
3747+ found = true;
3748+ }
3749+ }
3750+ else
3751+ {
3752+ if(is_reverse)
3753+ found = true;
3754+ }
3755+ if(found)
3756+ {
3757+ *matched_len = temp_source - source;
3758+ }
3759+ return found;
3760+}
3761+
3762+bool CRegexXQuery_multicharOther::match_internal(const char *source, int *start_from_branch, int *matched_len)
3763+{
3764+ if(!source[0])
3765+ {
3766+ regex_intern->set_reachedEnd(source);
3767+ return false;
3768+ }
3769+ bool found = false;
3770+ bool value_true = true;
3771+ const char *temp_source = source;
3772+ unicode::code_point utf8c = utf8::next_char(temp_source);
3773+ switch(multichar_type)
3774+ {
3775+ case 'S':value_true = false;//[^\s]
3776+ case 's'://[#x20\t\n\r]
3777+ switch(utf8c)
3778+ {
3779+ case '\t':
3780+ case '\r':
3781+ case '\n':
3782+ case ' ':
3783+ found = true;
3784+ default:
3785+ break;
3786+ }
3787+ break;
3788+ case 'I':value_true = false;//[^\i]
3789+ case 'i'://the set of initial name characters, those matched by Letter | '_' | ':'
3790+ if((utf8c == '_') ||
3791+ (utf8c == ':') ||
3792+ XQCharType::isLetter(utf8c))
3793+ {
3794+ found = true;
3795+ }
3796+ break;
3797+ case 'C':value_true = false;//[^\c]
3798+ case 'c'://the set of name characters, those matched by NameChar
3799+ if(XQCharType::isNameChar(utf8c))
3800+ {
3801+ found = true;
3802+ }
3803+ break;
3804+ case 'D':value_true = false;//[^\d]
3805+ case 'd':
3806+ if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Nd))
3807+ found = true;
3808+ break;
3809+ case 'W':value_true = false;//[^\w]
3810+ case 'w':
3811+ found = !(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pc) ||
3812+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pd) ||
3813+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Ps) ||
3814+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pe) ||
3815+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pi) ||
3816+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pf) ||
3817+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Po) ||
3818+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zs) ||
3819+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zl) ||
3820+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zp) ||
3821+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Cc) ||
3822+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Cf) ||
3823+ unicode::check_codepoint_category(utf8c, unicode::UNICODE_Co));//ignore unicode::UNICODE_Cn
3824+ break;
3825+ default:
3826+ throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(source, ZED(REGEX_UNIMPLEMENTED)) );
3827+ }
3828+ if((found && value_true) || (!found && !value_true))
3829+ {
3830+ *matched_len = temp_source - source;
3831+ return true;
3832+ }
3833+ else
3834+ {
3835+ return false;
3836+ }
3837+}
3838+
3839+bool CRegexXQuery_char_ascii::match_internal(const char *source, int *start_from_branch, int *matched_len)
3840+{
3841+ if(!source[0])
3842+ {
3843+ regex_intern->set_reachedEnd(source);
3844+ return false;
3845+ }
3846+ if(source[0] == c)
3847+ {
3848+ *matched_len = 1;
3849+ return true;
3850+ }
3851+ else
3852+ return false;
3853+}
3854+
3855+bool CRegexXQuery_char_ascii_i::match_internal(const char *source, int *start_from_branch, int *matched_len)
3856+{
3857+ if(!source[0])
3858+ {
3859+ regex_intern->set_reachedEnd(source);
3860+ return false;
3861+ }
3862+ char sup = toupper(source[0]);
3863+ if(sup == c)
3864+ {
3865+ *matched_len = 1;
3866+ return true;
3867+ }
3868+ else
3869+ return false;
3870+}
3871+
3872+bool CRegexXQuery_char_range_ascii::match_internal(const char *source, int *start_from_branch, int *matched_len)
3873+{
3874+ if(!source[0])
3875+ {
3876+ regex_intern->set_reachedEnd(source);
3877+ return false;
3878+ }
3879+ if((source[0] >= c1) && (source[0] <= c2))
3880+ {
3881+ *matched_len = 1;
3882+ return true;
3883+ }
3884+ else
3885+ return false;
3886+}
3887+
3888+bool CRegexXQuery_char_range_ascii_i::match_internal(const char *source, int *start_from_branch, int *matched_len)
3889+{
3890+ if(!source[0])
3891+ {
3892+ regex_intern->set_reachedEnd(source);
3893+ return false;
3894+ }
3895+ char sup = toupper(source[0]);
3896+ if((sup >= c1) && (sup <= c2))
3897+ {
3898+ *matched_len = 1;
3899+ return true;
3900+ }
3901+ else
3902+ return false;
3903+}
3904+
3905+bool CRegexXQuery_char_unicode::match_internal(const char *source, int *start_from_branch, int *matched_len)
3906+{
3907+ if(!source[0])
3908+ {
3909+ regex_intern->set_reachedEnd(source);
3910+ return false;
3911+ }
3912+ if(!memcmp(source, c, len))
3913+ {
3914+ *matched_len = len;
3915+ return true;
3916+ }
3917+ else
3918+ return false;
3919+}
3920+
3921+bool CRegexXQuery_char_unicode_cp::match_internal(const char *source, int *start_from_branch, int *matched_len)
3922+{
3923+ if(!source[0])
3924+ {
3925+ regex_intern->set_reachedEnd(source);
3926+ return false;
3927+ }
3928+ const char *temp_source = source;
3929+ unicode::code_point utf8c = utf8::next_char(temp_source);
3930+ if(utf8c == c)
3931+ {
3932+ *matched_len = temp_source - source;
3933+ return true;
3934+ }
3935+ else
3936+ return false;
3937+}
3938+
3939+bool CRegexXQuery_char_unicode_i::match_internal(const char *source, int *start_from_branch, int *matched_len)
3940+{
3941+ if(!source[0])
3942+ {
3943+ regex_intern->set_reachedEnd(source);
3944+ return false;
3945+ }
3946+ const char *temp_source = source;
3947+ unicode::code_point sup = unicode::to_upper(utf8::next_char(temp_source));
3948+ if(sup == c)
3949+ {
3950+ *matched_len = temp_source - source;
3951+ return true;
3952+ }
3953+ else
3954+ return false;
3955+}
3956+
3957+bool CRegexXQuery_char_range_unicode::match_internal(const char *source, int *start_from_branch, int *matched_len)
3958+{
3959+ if(!source[0])
3960+ {
3961+ regex_intern->set_reachedEnd(source);
3962+ return false;
3963+ }
3964+ const char *temp_source = source;
3965+ unicode::code_point utf8c = utf8::next_char(temp_source);
3966+ if((utf8c >= c1) && (utf8c <= c2))
3967+ {
3968+ *matched_len = temp_source - source;
3969+ return true;
3970+ }
3971+ else
3972+ return false;
3973+}
3974+
3975+bool CRegexXQuery_char_range_unicode_i::match_internal(const char *source, int *start_from_branch, int *matched_len)
3976+{
3977+ if(!source[0])
3978+ {
3979+ regex_intern->set_reachedEnd(source);
3980+ return false;
3981+ }
3982+ const char *temp_source = source;
3983+ unicode::code_point sup = unicode::to_upper(utf8::next_char(temp_source));
3984+ if((sup >= c1) && (sup <= c2))
3985+ {
3986+ *matched_len = temp_source - source;
3987+ return true;
3988+ }
3989+ else
3990+ return false;
3991+}
3992+
3993+bool CRegexXQuery_endline::match_internal(const char *source, int *start_from_branch, int *matched_len)
3994+{
3995+ *matched_len = 0;
3996+ if(!source[0])
3997+ {
3998+ // regex_intern->reachedEnd = true;
3999+ return true;
4000+ }
4001+ if((source[0] == 0x0A) || ((source[0] == 0x0D) && (source[1] == 0x0A)))
4002+ {
4003+ if(regex_intern->get_flags() & REGEX_ASCII_MULTILINE)
4004+ {
4005+ // regex_intern->reachedEnd = true;
4006+ return true;
4007+ }
4008+ }
4009+ return false;
4010+}
4011+
4012+
4013 //match any of chargroups
4014-bool CRegexAscii_chargroup::match(const char *source, int *matched_len)
4015+bool CRegexXQuery_chargroup::match_internal(const char *source, int *start_from_branch, int *matched_len)
4016 {
4017 *matched_len = 0;
4018- std::list<chargroup_t>::iterator cgt_it;
4019-
4020+ std::list<CRegexXQuery_charmatch* >::iterator cgt_it;
4021+/*
4022 if(!source[0])
4023 {
4024 regex_intern->reachedEnd = true;
4025@@ -975,113 +2416,21 @@
4026 return false;
4027 }
4028
4029- if(source[0] == 0x0A)
4030+ if((source[0] == 0x0A) || ((source[0] == 0x0D) && (source[1] == 0x0A)))
4031 {
4032 if((regex_intern->flags & REGEX_ASCII_MULTILINE) &&
4033 (chargroup_list.size() == 1) && (chargroup_list.begin()->flags == CHARGROUP_FLAGS_ENDLINE))
4034 {
4035- *matched_len = 1;
4036+ // *matched_len = 1;
4037 return true;
4038 }
4039 }
4040-
4041+*/
4042+ //bool found = false;
4043 for(cgt_it = chargroup_list.begin(); cgt_it != chargroup_list.end(); cgt_it++)
4044 {
4045- if(cgt_it->flags == CHARGROUP_FLAGS_MULTICHAR)
4046- {
4047- switch(cgt_it->c1)
4048- {
4049- case 'p'://catEsc
4050- case 'P'://complEsc
4051- //ignore the prop for now
4052- throw XQUERY_EXCEPTION( err::FORX0002 );
4053- case 's'://[#x20\t\n\r]
4054- switch(source[0])
4055- {
4056- case '\t':
4057- case '\r':
4058- case '\n':
4059- case ' ':
4060- *matched_len = 1;
4061- return true;
4062- default:
4063- return false;
4064- }
4065- case 'S'://[^\s]
4066- switch(source[0])
4067- {
4068- case 0:
4069- regex_intern->reachedEnd = true;
4070- case '\t':
4071- case '\r':
4072- case '\n':
4073- case ' ':
4074- return false;
4075- default:
4076- *matched_len = 1;
4077- return true;
4078- }
4079- case 'i'://the set of initial name characters, those matched by Letter | '_' | ':'
4080- if((source[0] == '_') ||
4081- (source[0] == ':') ||
4082- XQCharType::isLetter(source[0]))
4083- {
4084- *matched_len = 1;
4085- return true;
4086- }
4087- return false;
4088- case 'I':
4089- if((source[0] == '_') ||
4090- (source[0] == ':') ||
4091- XQCharType::isLetter(source[0]))
4092- {
4093- return false;
4094- }
4095- *matched_len = 1;
4096- return true;
4097- case 'c'://the set of name characters, those matched by NameChar
4098- if(XQCharType::isNameChar(source[0]))
4099- {
4100- *matched_len = 1;
4101- return true;
4102- }
4103- return false;
4104- case 'C':
4105- if(XQCharType::isNameChar(source[0]))
4106- {
4107- return false;
4108- }
4109- *matched_len = 1;
4110- return true;
4111- case 'd':
4112- case 'D':
4113- case 'w':
4114- case 'W':
4115- default:
4116- throw XQUERY_EXCEPTION( err::FORX0002 );
4117- }
4118- return false;
4119- }
4120- else if(cgt_it->flags == CHARGROUP_FLAGS_ENDLINE)
4121- {
4122- return false;
4123- }
4124- else
4125- {
4126- if(regex_intern->flags & REGEX_ASCII_CASE_INSENSITIVE)
4127- {
4128- char sup = toupper(source[0]);
4129- if((sup >= toupper(cgt_it->c1)) &&
4130- (sup <= toupper(cgt_it->c2)))
4131- break;
4132- }
4133- else
4134- {
4135- if((source[0] >= cgt_it->c1) &&
4136- (source[0] <= cgt_it->c2))
4137- break;
4138- }
4139- }
4140+ if((*cgt_it)->match_internal(source, start_from_branch, matched_len))
4141+ break;
4142 }
4143 if(cgt_it == chargroup_list.end())
4144 return false;
4145@@ -1089,53 +2438,48 @@
4146 if(classsub)
4147 {
4148 int classsub_len;
4149- if(classsub->match(source, &classsub_len))
4150+ if(classsub->match_internal(source, NULL, &classsub_len))
4151 return false;
4152 }
4153
4154- *matched_len = 1;
4155+ //*matched_len = 1;
4156 return true;
4157 }
4158
4159-bool CRegexAscii_negchargroup::match(const char *source, int *matched_len)
4160+bool CRegexXQuery_negchargroup::match_internal(const char *source, int *start_from_branch, int *matched_len)
4161 {
4162 if(!source[0])
4163 {
4164- regex_intern->reachedEnd = true;
4165+ regex_intern->set_reachedEnd(source);
4166 return false;
4167 }
4168- if(!CRegexAscii_chargroup::match(source, matched_len))
4169+ if(!CRegexXQuery_chargroup::match_internal(source, start_from_branch, matched_len))
4170 {
4171- *matched_len = 1;
4172+ *matched_len = myutf8len(source);
4173 return true;
4174 }
4175 return false;
4176 }
4177
4178-bool CRegexAscii_wildchar::match(const char *source, int *matched_len)
4179+bool CRegexXQuery_wildchar::match_internal(const char *source, int *start_from_branch, int *matched_len)
4180 {
4181 *matched_len = 0;
4182- if(source[0])
4183- {
4184- if((regex_intern->flags & REGEX_ASCII_DOTALL) ||
4185- (source[0] != '\n') && (source[0] != '\r'))
4186- {
4187- *matched_len = 1;
4188- return true;
4189- }
4190- else
4191- return false;
4192+ if(!source[0])
4193+ {
4194+ regex_intern->set_reachedEnd(source);
4195+ return false;
4196+ }
4197+ if((regex_intern->flags & REGEX_ASCII_DOTALL) ||
4198+ ((source[0] != '\n') && (source[0] != '\r')))
4199+ {
4200+ *matched_len = myutf8len(source);
4201+ return true;
4202 }
4203 else
4204- {
4205- if(!source[0])
4206- regex_intern->reachedEnd = true;
4207- *matched_len = 0;
4208 return false;
4209- }
4210 }
4211
4212-bool CRegexAscii_backref::match(const char *source, int *matched_len)
4213+bool CRegexXQuery_backref::match_internal(const char *source, int *start_from_branch, int *matched_len)
4214 {
4215 const char *submatch = regex_intern->subregex.at(backref-1)->matched_source;
4216 if(!submatch)
4217@@ -1143,15 +2487,42 @@
4218 *matched_len = 0;
4219 return true;
4220 }
4221+ if(!source[0])
4222+ {
4223+ regex_intern->set_reachedEnd(source);
4224+ return false;
4225+ }
4226 *matched_len = regex_intern->subregex.at(backref-1)->matched_len;
4227- if(!strncmp(source, submatch, *matched_len))
4228- {
4229- return true;
4230- }
4231- *matched_len = 0;
4232- return false;
4233-}
4234-
4235- }//end namespace regex_ascii
4236+ if(regex_intern->flags & REGEX_ASCII_CASE_INSENSITIVE)
4237+ {
4238+ if(compare_unicode_ni(source, submatch, *matched_len))
4239+ {
4240+ return true;
4241+ }
4242+ }
4243+ else
4244+ {
4245+ if(!memcmp(source, submatch, *matched_len))
4246+ {
4247+ return true;
4248+ }
4249+ }
4250+ *matched_len = 0;
4251+ return false;
4252+}
4253+
4254+bool CRegexXQuery_pinstart::match_internal(const char *source, int *start_from_branch, int *matched_len)
4255+{
4256+ *matched_len = 0;
4257+ if(source == regex_intern->source_start)
4258+ return true;
4259+ if((regex_intern->flags & REGEX_ASCII_MULTILINE) &&
4260+ ((source[-1] == '\n') || (source[-1] == '\r')))
4261+ return true;
4262+
4263+ return false;
4264+}
4265+
4266+ }//end namespace regex_xquery
4267 }//end namespace zorba
4268 /* vim:set et sw=2 ts=2: */
4269
4270=== renamed file 'src/util/regex_ascii.h' => 'src/util/regex_xquery.h'
4271--- src/util/regex_ascii.h 2011-07-18 14:25:21 +0000
4272+++ src/util/regex_xquery.h 2012-01-18 18:33:36 +0000
4273@@ -21,103 +21,142 @@
4274 #include <vector>
4275
4276 #include <zorba/config.h>
4277+#include <zorba/internal/unique_ptr.h>
4278+#include "util/unicode_util.h"
4279
4280 namespace zorba {
4281- namespace regex_ascii{
4282+ namespace regex_xquery{
4283
4284 //matching flags
4285-#define REGEX_ASCII_CASE_INSENSITIVE 1
4286-#define REGEX_ASCII_DOTALL 2
4287-#define REGEX_ASCII_MULTILINE 4
4288-#define REGEX_ASCII_COMMENTS 8
4289-#define REGEX_ASCII_LITERAL 16
4290-
4291-class CRegexAscii_regex;
4292-
4293-class IRegexMatcher
4294+#define REGEX_ASCII_CASE_INSENSITIVE 1 //i
4295+#define REGEX_ASCII_DOTALL 2 //s
4296+#define REGEX_ASCII_MULTILINE 4 //m
4297+#define REGEX_ASCII_NO_WHITESPACE 8 //x
4298+#define REGEX_ASCII_LITERAL 16 //q
4299+
4300+#define REGEX_ASCII_GET_LONGEST_BRANCH 32 //try all branches and get the longest match (or shortest for reluctant pieces)
4301+#define REGEX_ASCII_MINIMAL_MATCH 64 //consider all pieces as reluctant
4302+#define REGEX_ASCII_WHOLE_MATCH 128 //match only all string, like having "^regex$"
4303+#define REGEX_ASCII_GROUPING_LEN_WHOLE_PIECE 256 //compute the len of a grouping as for the whole piece ( for example (a)+ when matching "aa" and referred as $1 will get string len 2 instead of last 1)
4304+
4305+class CRegexXQuery_regex;
4306+class CRegexXQuery_piece;
4307+
4308+struct RegexAscii_pieceinfo
4309 {
4310-public:
4311- CRegexAscii_regex *regex_intern;
4312-public:
4313- IRegexMatcher(CRegexAscii_regex* regex) : regex_intern(regex) {}
4314- virtual ~IRegexMatcher() {}
4315+ union
4316+ {
4317+ CRegexXQuery_piece* piece;
4318+ CRegexXQuery_regex* group_regex;
4319+ };
4320+ int nr_matches;
4321
4322- virtual bool match(const char *source, int *matched_len) = 0;
4323+ RegexAscii_pieceinfo(CRegexXQuery_piece* piece) {nr_matches=0;this->piece=piece;}
4324+ RegexAscii_pieceinfo(CRegexXQuery_regex* group_regex) {nr_matches=-1;this->group_regex=group_regex;}
4325 };
4326
4327-class IRegexAtom : public IRegexMatcher
4328+
4329+class IRegexAtom
4330 {
4331+protected:
4332+ friend class CRegexXQuery_piece;
4333+ CRegexXQuery_regex *regex_intern;
4334 public:
4335- IRegexAtom(CRegexAscii_regex* regex) : IRegexMatcher(regex) {}
4336+ IRegexAtom(CRegexXQuery_regex* regex) : regex_intern(regex) {}
4337 virtual ~IRegexAtom() {}
4338+
4339+ virtual bool match(const char *source, int *start_from_branch, int *matched_len,
4340+ std::list<RegexAscii_pieceinfo>::iterator next_piece,
4341+ std::list<RegexAscii_pieceinfo>::iterator end_piece);
4342+ virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len) = 0;
4343+ virtual void reset_match() {}
4344+// virtual void restore_match() {}
4345 };
4346
4347-class CRegexAscii_branch;
4348-class CRegexAscii_piece;
4349-class CRegexAscii_chargroup;
4350-class CRegexAscii_parser;
4351+class CRegexXQuery_branch;
4352+class CRegexXQuery_piece;
4353+class CRegexXQuery_chargroup;
4354+class CRegexXQuery_parser;
4355
4356-class CRegexAscii_regex : public IRegexAtom
4357+class CRegexXQuery_regex : public IRegexAtom
4358 {
4359- friend class CRegexAscii_parser;
4360- friend class CRegexAscii_branch;
4361- friend class CRegexAscii_piece;
4362- friend class CRegexAscii_chargroup;
4363- friend class CRegexAscii_negchargroup;
4364- friend class CRegexAscii_wildchar;
4365- friend class CRegexAscii_backref;
4366+ friend class CRegexXQuery_parser;
4367+ friend class CRegexXQuery_branch;
4368+ friend class CRegexXQuery_piece;
4369+ friend class CRegexXQuery_chargroup;
4370+ friend class CRegexXQuery_negchargroup;
4371+ friend class CRegexXQuery_wildchar;
4372+ friend class CRegexXQuery_backref;
4373+ friend class CRegexXQuery_pinstart;
4374 public:
4375- CRegexAscii_regex(CRegexAscii_regex *);
4376- virtual ~CRegexAscii_regex();
4377+ CRegexXQuery_regex(CRegexXQuery_regex *);
4378+ virtual ~CRegexXQuery_regex();
4379
4380 bool match_anywhere(const char *source, unsigned int flags, int *match_pos, int *matched_len);
4381 bool match_from(const char *source, unsigned int flags, int *match_pos, int *matched_len);
4382- virtual bool match(const char *source, int *matched_len);
4383
4384 //for replace $1, $2 ...
4385 bool get_indexed_match(int index, const char **matched_source, int *matched_len);
4386 unsigned int get_indexed_regex_count();
4387
4388 bool get_reachedEnd() {return reachedEnd;}
4389- bool set_align_begin(bool align_begin);
4390+ void set_reachedEnd(const char *source) {if(source > source_start) reachedEnd = true;}
4391+ unsigned int get_flags() {return flags;}
4392+public:
4393+ virtual bool match(const char *source, int *start_from_branch, int *matched_len,
4394+ std::list<RegexAscii_pieceinfo>::iterator next_piece,
4395+ std::list<RegexAscii_pieceinfo>::iterator end_piece);
4396+ virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len) {return false;}//not impl
4397+ virtual void reset_match();
4398+// virtual void restore_match();
4399 private:
4400- void add_branch(CRegexAscii_branch *branch);
4401+ void add_branch(CRegexXQuery_branch *branch);
4402+
4403+ void save_subregex_list(std::vector<std::pair<const char*, int> > &saved_subregex);
4404+ void load_subregex_list(std::vector<std::pair<const char*, int> > &saved_subregex);
4405 private:
4406 unsigned int flags;
4407- std::list<CRegexAscii_branch*> branch_list;
4408- bool align_begin;
4409+ std::list<CRegexXQuery_branch*> branch_list;
4410+
4411+ const char *source_start;
4412
4413 const char *matched_source;
4414 int matched_len;
4415- std::vector<CRegexAscii_regex*> subregex;//for grouping
4416+// const unicode::code_point *backup_matched_source;
4417+// int backup_matched_len;
4418+ std::vector<CRegexXQuery_regex*> subregex;//for grouping
4419
4420 bool reachedEnd;
4421 };
4422
4423-class CRegexAscii_branch : public IRegexMatcher
4424+class CRegexXQuery_branch
4425 {
4426- friend class CRegexAscii_parser;
4427+ friend class CRegexXQuery_parser;
4428 public:
4429- CRegexAscii_branch(CRegexAscii_regex* regex);
4430- ~CRegexAscii_branch();
4431+ CRegexXQuery_branch(CRegexXQuery_regex* regex);
4432+ ~CRegexXQuery_branch();
4433
4434- virtual bool match(const char *source, int *matched_len);
4435-private:
4436- std::list<CRegexAscii_piece*> piece_list;
4437-private:
4438- void add_piece(CRegexAscii_piece *piece);
4439+ bool match(const char *source, int *matched_len,
4440+ CRegexXQuery_regex* group_regex,
4441+ std::list<RegexAscii_pieceinfo>::iterator next_piece,
4442+ std::list<RegexAscii_pieceinfo>::iterator end_piece);
4443+ void reset();
4444+// void restore();
4445+private:
4446+ std::list<RegexAscii_pieceinfo> piece_list;
4447+private:
4448+ void add_piece(CRegexXQuery_piece *piece);
4449
4450- bool match_piece_iter_reluctant(std::list<CRegexAscii_piece*>::iterator piece_it,
4451- const char *source, int *matched_len);
4452- bool match_piece_iter_normal(std::list<CRegexAscii_piece*>::iterator piece_it,
4453- const char *source, int *matched_len);
4454 };
4455
4456-class CRegexAscii_piece //: public IRegexMatcher
4457+class CRegexXQuery_piece //: public IRegexMatcher
4458 {
4459- friend class CRegexAscii_parser;
4460-public:
4461+ friend class CRegexXQuery_parser;
4462+ friend class CRegexXQuery_branch;
4463+
4464 IRegexAtom *atom;
4465+ CRegexXQuery_regex *regex_atom;
4466+
4467 //quantifier
4468 bool strict_max;
4469 int min;
4470@@ -125,8 +164,8 @@
4471 bool is_reluctant;
4472
4473 public:
4474- CRegexAscii_piece();
4475- ~CRegexAscii_piece();
4476+ CRegexXQuery_piece();
4477+ ~CRegexXQuery_piece();
4478 public:
4479 void set_atom(IRegexAtom *atom);
4480 void set_quantifier_min_max(int min, int max, bool strict_max);
4481@@ -134,95 +173,294 @@
4482 void get_quantifier(int *min, int *max, bool *strict_max);
4483 bool get_is_reluctant();
4484 // bool match(const char *source, int *matched_len);
4485+ bool match_piece(std::list<RegexAscii_pieceinfo>::iterator next_piece,
4486+ std::list<RegexAscii_pieceinfo>::iterator end_piece,
4487+ const char *source, int *matched_len);
4488+protected:
4489 bool match_piece_times(const char *source,
4490 int *piecelen,
4491 int times,
4492- std::vector<int> *match_lens);
4493-};
4494-
4495-#define CHARGROUP_FLAGS_MULTICHAR 1
4496-#define CHARGROUP_FLAGS_ENDLINE 2
4497-
4498-class CRegexAscii_chargroup : public IRegexAtom
4499-{
4500- friend class CRegexAscii_parser;
4501-public:
4502- CRegexAscii_chargroup(CRegexAscii_regex* regex);
4503- virtual ~CRegexAscii_chargroup();
4504+ std::vector<std::pair<int,int> > *match_lens);
4505+ int choose_another_branch(std::vector<std::pair<int,int> > &match_lens);
4506+ bool match_piece_iter_reluctant(std::list<RegexAscii_pieceinfo>::iterator next_piece,
4507+ std::list<RegexAscii_pieceinfo>::iterator end_piece,
4508+ const char *source, int *matched_len);
4509+ bool match_piece_iter_normal(std::list<RegexAscii_pieceinfo>::iterator next_piece,
4510+ std::list<RegexAscii_pieceinfo>::iterator end_piece,
4511+ const char *source, int *matched_len);
4512+ bool is_regex_atom();
4513+};
4514+
4515+
4516+enum CHARGROUP_t
4517+{
4518+CHARGROUP_NO_MULTICHAR = 0,
4519+//CHARGROUP_FLAGS_CHAR_RANGE,
4520+CHARGROUP_FLAGS_MULTICHAR_p,
4521+CHARGROUP_FLAGS_MULTICHAR_Is,
4522+CHARGROUP_FLAGS_MULTICHAR_OTHER,
4523+CHARGROUP_FLAGS_ONECHAR_ASCII,
4524+CHARGROUP_FLAGS_ONECHAR_UNICODE
4525+//CHARGROUP_FLAGS_ENDLINE
4526+};
4527+
4528+
4529+class CRegexXQuery_charmatch : public IRegexAtom
4530+{
4531+ friend class CRegexXQuery_parser;
4532+protected:
4533+ //enum CHARGROUP_t type;
4534+public:
4535+ CRegexXQuery_charmatch(CRegexXQuery_regex* regex);//, enum CHARGROUP_t type);
4536+ virtual ~CRegexXQuery_charmatch() {}
4537+ virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len) = 0;
4538+ virtual unicode::code_point get_c() {return 0;}
4539+};
4540+
4541+class CRegexXQuery_multicharP : public CRegexXQuery_charmatch
4542+{
4543+ char multichar_type;
4544+ bool is_reverse;
4545+public:
4546+ CRegexXQuery_multicharP(CRegexXQuery_regex* regex, char type, bool is_reverse);
4547+ virtual ~CRegexXQuery_multicharP() {}
4548+
4549+ virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len);
4550+};
4551+
4552+class CRegexXQuery_multicharIs : public CRegexXQuery_charmatch
4553+{
4554+ int block_index;
4555+ bool is_reverse;
4556+public:
4557+ CRegexXQuery_multicharIs(CRegexXQuery_regex* regex, int block_index, bool is_reverse);
4558+ virtual ~CRegexXQuery_multicharIs() {}
4559+
4560+ virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len);
4561+};
4562+
4563+class CRegexXQuery_multicharOther : public CRegexXQuery_charmatch
4564+{
4565+ char multichar_type;
4566+public:
4567+ CRegexXQuery_multicharOther(CRegexXQuery_regex* regex, char type);
4568+ virtual ~CRegexXQuery_multicharOther() {}
4569+
4570+ virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len);
4571+};
4572+
4573+class CRegexXQuery_char_ascii : public CRegexXQuery_charmatch
4574+{
4575+ friend class CRegexXQuery_parser;
4576+protected:
4577+ char c;
4578+public:
4579+ CRegexXQuery_char_ascii(CRegexXQuery_regex* regex, char c);
4580+ virtual ~CRegexXQuery_char_ascii() {}
4581+
4582+ virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len);
4583+ virtual unicode::code_point get_c() {return c;}
4584+};
4585+
4586+class CRegexXQuery_char_ascii_i : public CRegexXQuery_char_ascii
4587+{
4588+public:
4589+ CRegexXQuery_char_ascii_i(CRegexXQuery_regex* regex, char c);
4590+ virtual ~CRegexXQuery_char_ascii_i() {}
4591+
4592+ virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len);
4593+ virtual unicode::code_point get_c() {return c;}
4594+};
4595+
4596+class CRegexXQuery_char_range_ascii : public CRegexXQuery_charmatch
4597+{
4598+protected:
4599+ char c1;
4600+ char c2;
4601+public:
4602+ CRegexXQuery_char_range_ascii(CRegexXQuery_regex* regex, char c1, char c2);
4603+ virtual ~CRegexXQuery_char_range_ascii() {}
4604+
4605+ virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len);
4606+};
4607+
4608+class CRegexXQuery_char_range_ascii_i : public CRegexXQuery_char_range_ascii
4609+{
4610+public:
4611+ CRegexXQuery_char_range_ascii_i(CRegexXQuery_regex* regex, char c1, char c2);
4612+ virtual ~CRegexXQuery_char_range_ascii_i() {}
4613+
4614+ virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len);
4615+};
4616+
4617+class CRegexXQuery_char_unicode : public CRegexXQuery_charmatch
4618+{
4619+ unsigned char c[6];
4620+ int len;
4621+public:
4622+ CRegexXQuery_char_unicode(CRegexXQuery_regex* regex, const char *c, int len);
4623+ virtual ~CRegexXQuery_char_unicode() {}
4624+
4625+ virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len);
4626+ virtual unicode::code_point get_c();
4627+};
4628+
4629+class CRegexXQuery_char_unicode_cp : public CRegexXQuery_charmatch
4630+{
4631+protected:
4632+ unicode::code_point c;
4633+public:
4634+ CRegexXQuery_char_unicode_cp(CRegexXQuery_regex* regex, unicode::code_point c);
4635+ virtual ~CRegexXQuery_char_unicode_cp() {}
4636+
4637+ virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len);
4638+ virtual unicode::code_point get_c() {return c;}
4639+};
4640+
4641+class CRegexXQuery_char_unicode_i : public CRegexXQuery_char_unicode_cp
4642+{
4643+public:
4644+ CRegexXQuery_char_unicode_i(CRegexXQuery_regex* regex, unicode::code_point c);
4645+ virtual ~CRegexXQuery_char_unicode_i() {}
4646+
4647+ virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len);
4648+ virtual unicode::code_point get_c() {return c;}
4649+};
4650+
4651+class CRegexXQuery_char_range_unicode : public CRegexXQuery_charmatch
4652+{
4653+protected:
4654+ unicode::code_point c1;
4655+ unicode::code_point c2;
4656+public:
4657+ CRegexXQuery_char_range_unicode(CRegexXQuery_regex* regex, unicode::code_point c1, unicode::code_point c2);
4658+ virtual ~CRegexXQuery_char_range_unicode() {}
4659+
4660+ virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len);
4661+};
4662+
4663+class CRegexXQuery_char_range_unicode_i : public CRegexXQuery_char_range_unicode
4664+{
4665+public:
4666+ CRegexXQuery_char_range_unicode_i(CRegexXQuery_regex* regex, unicode::code_point c1, unicode::code_point c2);
4667+ virtual ~CRegexXQuery_char_range_unicode_i() {}
4668+
4669+ virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len);
4670+};
4671+
4672+class CRegexXQuery_endline : public CRegexXQuery_charmatch
4673+{
4674+public:
4675+ CRegexXQuery_endline(CRegexXQuery_regex* regex);
4676+ virtual ~CRegexXQuery_endline() {}
4677+
4678+ virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len);
4679+};
4680+
4681+
4682+class CRegexXQuery_chargroup : public IRegexAtom
4683+{
4684+ friend class CRegexXQuery_parser;
4685+public:
4686+ CRegexXQuery_chargroup(CRegexXQuery_regex* regex);
4687+ virtual ~CRegexXQuery_chargroup();
4688 private:
4689- typedef struct
4690+/* typedef struct
4691 {
4692- unsigned char flags;
4693+ CHARGROUP_t flags;
4694 char c1;
4695 char c2;
4696 }chargroup_t;
4697- std::list<chargroup_t> chargroup_list;
4698- CRegexAscii_chargroup *classsub;
4699-public:
4700- void addMultiChar(char c);
4701- void addEndLine();
4702- void addCharRange(char c1, char c2);
4703- void addClassSub(CRegexAscii_chargroup* classsub);
4704-
4705- virtual bool match(const char *source, int *matched_len);
4706-};
4707-
4708-class CRegexAscii_negchargroup : public CRegexAscii_chargroup
4709-{
4710-public:
4711- CRegexAscii_negchargroup(CRegexAscii_regex* regex);
4712- virtual ~CRegexAscii_negchargroup();
4713-
4714- virtual bool match(const char *source, int *matched_len);
4715-};
4716-
4717-class CRegexAscii_wildchar : public IRegexAtom
4718-{
4719-public:
4720- CRegexAscii_wildchar(CRegexAscii_regex* regex);
4721- virtual ~CRegexAscii_wildchar();
4722-
4723- virtual bool match(const char *source, int *matched_len);
4724-};
4725-
4726-class CRegexAscii_backref : public IRegexAtom
4727-{
4728-public:
4729- CRegexAscii_backref(CRegexAscii_regex* regex, unsigned int backref);
4730- virtual ~CRegexAscii_backref();
4731-
4732- virtual bool match(const char *source, int *matched_len);
4733+*/
4734+ std::list<CRegexXQuery_charmatch* > chargroup_list;
4735+ CRegexXQuery_chargroup *classsub;
4736+public:
4737+ //void addMultiChar(char c, CHARGROUP_t multichar_type);
4738+ //void addEndLine();
4739+ //void addCharRange(char c1, char c2);
4740+ //void addOneChar(char c);
4741+ void addCharMatch(CRegexXQuery_charmatch *charmatch);
4742+ void addClassSub(CRegexXQuery_chargroup* classsub);
4743+
4744+ virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len);
4745+};
4746+
4747+class CRegexXQuery_negchargroup : public CRegexXQuery_chargroup
4748+{
4749+public:
4750+ CRegexXQuery_negchargroup(CRegexXQuery_regex* regex);
4751+ virtual ~CRegexXQuery_negchargroup();
4752+
4753+ virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len);
4754+};
4755+
4756+class CRegexXQuery_wildchar : public IRegexAtom
4757+{
4758+public:
4759+ CRegexXQuery_wildchar(CRegexXQuery_regex* regex);
4760+ virtual ~CRegexXQuery_wildchar();
4761+
4762+ virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len);
4763+};
4764+
4765+class CRegexXQuery_backref : public IRegexAtom
4766+{
4767+public:
4768+ CRegexXQuery_backref(CRegexXQuery_regex* regex, unsigned int backref);
4769+ virtual ~CRegexXQuery_backref();
4770+
4771+ virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len);
4772 private:
4773 unsigned int backref;
4774 };
4775
4776-class CRegexAscii_parser
4777-{
4778-public:
4779- CRegexAscii_parser();
4780- ~CRegexAscii_parser();
4781-
4782-public:
4783- CRegexAscii_regex* parse(const char *pattern, unsigned int flags);
4784+class CRegexXQuery_pinstart : public IRegexAtom
4785+{
4786+public:
4787+ CRegexXQuery_pinstart(CRegexXQuery_regex* regex);
4788+
4789+ virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len);
4790+};
4791+
4792+class CRegexXQuery_parser
4793+{
4794+public:
4795+ typedef struct
4796+ {
4797+ const unicode::code_point cp[2];//in pairs start, end
4798+ const unicode::code_point *ext_cp;
4799+ const char *group_name;
4800+ }block_escape_t;
4801+
4802+ CRegexXQuery_parser();
4803+ ~CRegexXQuery_parser();
4804+
4805+public:
4806+ CRegexXQuery_regex* parse(const char *pattern, unsigned int flags);
4807
4808 protected:
4809- CRegexAscii_regex* parse_regexp(const char *pattern, int *regex_len);
4810- CRegexAscii_branch* parse_branch(const char *pattern, int *branch_len);
4811- CRegexAscii_piece* parse_piece(const char *pattern, int *piece_len);
4812+ CRegexXQuery_regex* parse_regexp(const char *pattern, int *regex_len);
4813+ CRegexXQuery_branch* parse_branch(const char *pattern, int *branch_len);
4814+ CRegexXQuery_piece* parse_piece(const char *pattern, int *piece_len);
4815 char myishex(char c);
4816 bool myisdigit(char c);
4817- char readChar(const char *pattern, int *char_len, bool *is_multichar);
4818+ bool myisletterAZ(char c);
4819+ CRegexXQuery_charmatch* readChar(const char *pattern, int *char_len, CHARGROUP_t *multichar_type);
4820+ CRegexXQuery_charmatch *create_charmatch(unicode::code_point utf8c,
4821+ const char *pattern, int utf8len,
4822+ enum CHARGROUP_t *multichar_type);
4823 IRegexAtom* read_atom(const char *pattern, int *atom_len);
4824- CRegexAscii_chargroup* readchargroup(const char *pattern, int *chargroup_len);
4825- void read_quantifier(CRegexAscii_piece *piece, const char *pattern, int *quantif_len);
4826+ CRegexXQuery_chargroup* readchargroup(const char *pattern, int *chargroup_len);
4827+ void read_quantifier(CRegexXQuery_piece *piece, const char *pattern, int *quantif_len);
4828
4829 private:
4830- CRegexAscii_regex *current_regex;
4831+ CRegexXQuery_regex *current_regex;
4832 int regex_depth;
4833 unsigned int flags;
4834 };
4835
4836-}}//end namespace zorba::regex_ascii
4837+}
4838+}//end namespace zorba::regex_xquery
4839
4840 #endif
4841 /* vim:set et sw=2 ts=2: */
4842
4843=== modified file 'src/util/unicode_categories.cpp'
4844--- src/util/unicode_categories.cpp 2011-06-14 17:26:33 +0000
4845+++ src/util/unicode_categories.cpp 2012-01-18 18:33:36 +0000
4846@@ -65812,7 +65812,7 @@
4847 { 0x100000, 0x100000, UNICODE_Co},
4848 };
4849
4850-bool check_codepoint_category(code_point cp, UnicodeCategoriesEnum categ)
4851+bool check_codepoint_category(code_point cp, category categ)
4852 {
4853 if(cp < 0x10000)
4854 return codepoints_categories[cp] == categ;
4855@@ -65824,10 +65824,10 @@
4856 if(cp >= codepoints_categories2[i].cp1)
4857 return codepoints_categories2[i].category == categ;
4858 else
4859- return false;
4860+ return categ ? false : true;
4861 }
4862 }
4863- return false;
4864+ return categ ? false : true;
4865 }
4866
4867 /*
4868
4869=== modified file 'src/util/unicode_categories.h'
4870--- src/util/unicode_categories.h 2011-06-14 17:26:33 +0000
4871+++ src/util/unicode_categories.h 2012-01-18 18:33:36 +0000
4872@@ -22,46 +22,53 @@
4873 namespace zorba {
4874 namespace unicode {
4875
4876-//Unicode codepoint categories, as from http://www.fileformat.info/info/unicode/category/index.htm
4877+///////////////////////////////////////////////////////////////////////////////
4878
4879-enum UnicodeCategoriesEnum {
4880-UNICODE_Cc, //Other, Control
4881-UNICODE_Cf, //Other, Format
4882-UNICODE_Co, //Other, Private Use
4883-UNICODE_Cs, //Other, Surrogate
4884-UNICODE_Ll, //Letter, Lowercase
4885-UNICODE_Lm, //Letter, Modifier
4886-UNICODE_Lo, //Letter, Other
4887-UNICODE_Lt, //Letter, Titlecase
4888-UNICODE_Lu, //Letter, Uppercase
4889-UNICODE_Mc, //Mark, Spacing Combining
4890-UNICODE_Me, //Mark, Enclosing
4891-UNICODE_Mn, //Mark, Nonspacing
4892-UNICODE_Nd, //Number, Decimal Digit
4893-UNICODE_Nl, //Number, Letter
4894-UNICODE_No, //Number, Other
4895-UNICODE_Pc, //Punctuation, Connector
4896-UNICODE_Pd, //Punctuation, Dash
4897-UNICODE_Pe, //Punctuation, Close
4898-UNICODE_Pf, //Punctuation, Final quote (may behave like Ps or Pe depending on usage)
4899-UNICODE_Pi, //Punctuation, Initial quote (may behave like Ps or Pe depending on usage)
4900-UNICODE_Po, //Punctuation, Other
4901-UNICODE_Ps, //Punctuation, Open
4902-UNICODE_Sc, //Symbol, Currency
4903-UNICODE_Sk, //Symbol, Modifier
4904-UNICODE_Sm, //Symbol, Math
4905-UNICODE_So, //Symbol, Other
4906-UNICODE_Zl, //Separator, Line
4907-UNICODE_Zp, //Separator, Paragraph
4908-UNICODE_Zs //Separator, Space
4909+/**
4910+ * Unicode codepoint categories.
4911+ * See: http://www.fileformat.info/info/unicode/category/
4912+ */
4913+enum category {
4914+ UNICODE_Cn, // Not Assigned
4915+ UNICODE_Cc, // Other, Control
4916+ UNICODE_Cf, // Other, Format
4917+ UNICODE_Co, // Other, Private Use
4918+ UNICODE_Cs, // Other, Surrogate
4919+ UNICODE_Ll, // Letter, Lowercase
4920+ UNICODE_Lm, // Letter, Modifier
4921+ UNICODE_Lo, // Letter, Other
4922+ UNICODE_Lt, // Letter, Titlecase
4923+ UNICODE_Lu, // Letter, Uppercase
4924+ UNICODE_Mc, // Mark, Spacing Combining
4925+ UNICODE_Me, // Mark, Enclosing
4926+ UNICODE_Mn, // Mark, Nonspacing
4927+ UNICODE_Nd, // Number, Decimal Digit
4928+ UNICODE_Nl, // Number, Letter
4929+ UNICODE_No, // Number, Other
4930+ UNICODE_Pc, // Punctuation, Connector
4931+ UNICODE_Pd, // Punctuation, Dash
4932+ UNICODE_Pe, // Punctuation, Close
4933+ UNICODE_Pf, // Punctuation, Final quote (like Ps or Pe depending on usage)
4934+ UNICODE_Pi, // Punctuation, Initial quote (like Ps or Pe depending on usage)
4935+ UNICODE_Po, // Punctuation, Other
4936+ UNICODE_Ps, // Punctuation, Open
4937+ UNICODE_Sc, // Symbol, Currency
4938+ UNICODE_Sk, // Symbol, Modifier
4939+ UNICODE_Sm, // Symbol, Math
4940+ UNICODE_So, // Symbol, Other
4941+ UNICODE_Zl, // Separator, Line
4942+ UNICODE_Zp, // Separator, Paragraph
4943+ UNICODE_Zs // Separator, Space
4944 };
4945
4946 bool is_UnicodeNd(code_point cp, code_point *ret_zero);
4947
4948-bool check_codepoint_category(code_point cp, UnicodeCategoriesEnum categ);
4949-
4950-}
4951-}
4952-
4953-#endif
4954+bool check_codepoint_category(code_point cp, category categ);
4955+
4956+///////////////////////////////////////////////////////////////////////////////
4957+
4958+} // namespace unicode
4959+} // namespace zorba
4960+
4961+#endif /* ZORBA_UNICODE_CATEGORIES */
4962 /* vim:set et sw=2 ts=2: */
4963
4964=== modified file 'src/util/unicode_util.cpp'
4965--- src/util/unicode_util.cpp 2011-07-17 00:10:56 +0000
4966+++ src/util/unicode_util.cpp 2012-01-18 18:33:36 +0000
4967@@ -22,15 +22,19 @@
4968 #include <functional> /* for binary_function */
4969 #include <utility> /* for pair */
4970
4971-#include <unicode/normlzr.h>
4972-#include <unicode/ustring.h>
4973+#ifndef ZORBA_NO_ICU
4974+# include <unicode/normlzr.h>
4975+# include <unicode/ustring.h>
4976+#endif /* ZORBA_NO_ICU */
4977
4978 #include "cxx_util.h"
4979 #include "unicode_util.h"
4980 #include "utf8_util.h"
4981
4982 using namespace std;
4983+#ifndef ZORBA_NO_ICU
4984 U_NAMESPACE_USE
4985+#endif /* ZORBA_NO_ICU */
4986
4987 namespace zorba {
4988 namespace unicode {
4989@@ -2208,6 +2212,8 @@
4990 return to_case<upper>( c );
4991 }
4992
4993+#ifndef ZORBA_NO_ICU
4994+
4995 bool normalize( string const &in, normalization::type n, string *out ) {
4996 UErrorCode status = U_ZERO_ERROR;
4997 UNormalizationMode icu_mode;
4998@@ -2230,8 +2236,11 @@
4999 return U_SUCCESS( status ) == TRUE;
5000 }
The diff has been truncated for viewing.

Subscribers

People subscribed via source and target branches