Merge lp:~zorba-coders/zorba/no_unicode into lp:zorba
- no_unicode
- Merge into trunk
Status: | Merged |
---|---|
Approved by: | Rodolfo Ochoa |
Approved revision: | 10540 |
Merged at revision: | 10761 |
Proposed branch: | lp:~zorba-coders/zorba/no_unicode |
Merge into: | lp:zorba |
Diff against target: |
9007 lines (+3904/-1422) 269 files modified
CMakeConfiguration.txt (+5/-5) CMakeLists.txt (+6/-2) ChangeLog (+7/-0) KNOWN_ISSUES.txt (+1/-1) doc/cxx/examples/context.cpp (+4/-0) include/zorba/config.h.cmake (+3/-1) include/zorba/static_context.h (+4/-0) include/zorba/util/time.h (+1/-1) src/CMakeLists.txt (+4/-0) src/api/serialization/serializer.cpp (+36/-33) src/api/serialization/serializer.h (+2/-4) src/diagnostics/diagnostic_en.xml (+116/-27) src/diagnostics/pregenerated/dict_en.cpp (+98/-20) src/precompiled/stdafx.h (+74/-356) src/runtime/full_text/CMakeLists.txt (+3/-3) src/runtime/full_text/default_tokenizer.cpp (+4/-4) src/runtime/full_text/latin_tokenizer.cpp (+3/-2) src/runtime/full_text/latin_tokenizer.h (+9/-8) src/runtime/numerics/format_integer_impl.cpp (+1/-1) src/runtime/numerics/numerics_impl.cpp (+1/-1) src/runtime/strings/strings_impl.cpp (+58/-20) src/store/api/store.h (+1/-1) src/store/naive/simple_store.h (+7/-3) src/store/naive/store.h (+12/-11) src/system/globalenv.cpp (+7/-7) src/unit_tests/CMakeLists.txt (+2/-2) src/unit_tests/string.cpp (+8/-0) src/unit_tests/unit_test_list.h (+2/-2) src/unit_tests/unit_tests.cpp (+2/-2) src/util/CMakeLists.txt (+4/-4) src/util/icu_streambuf.h (+1/-0) src/util/passthru_streambuf.cpp (+2/-2) src/util/passthru_streambuf.h (+10/-2) src/util/regex.cpp (+96/-82) src/util/regex.h (+22/-34) src/util/regex_xquery.cpp (+1860/-489) src/util/regex_xquery.h (+359/-123) src/util/transcode_streambuf.h (+5/-5) src/util/unicode_categories.cpp (+3/-3) src/util/unicode_categories.h (+44/-37) src/util/unicode_util.cpp (+20/-2) src/util/unicode_util.h (+47/-15) src/util/utf8_util.cpp (+6/-6) src/util/utf8_util.h (+29/-13) src/util/utf8_util.tcc (+10/-2) src/zorbatypes/collation_manager.cpp (+17/-17) src/zorbatypes/collation_manager.h (+3/-3) src/zorbatypes/libicu.h (+0/-32) src/zorbatypes/transcoder.cpp (+8/-4) src/zorbatypes/transcoder.h (+9/-9) src/zorbautils/hashmap_itemh.h (+1/-1) src/zorbautils/string_util.cpp (+19/-18) src/zorbautils/string_util.h (+15/-1) 
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a1.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a10.xml.res (+242/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a11.xml.res (+6/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a2.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a3.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a5.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a6.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a7.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a8.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a9.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m1.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m10.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m11.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m12.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m13.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m14.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m15.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m16.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m17.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m18.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m19.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m2.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m20.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m21.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m22.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m23.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m24.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m25.xml.res (+1/-0) 
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m26.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m27.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m28.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m29.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m3.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m30.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m31.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m32.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m33.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m34.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m35.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m36.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m37.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m38.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m39.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m4.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m40.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m41.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m42.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m43.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m44.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m45.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m46.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m47.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m48.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m49.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m5.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m50.xml.res (+1/-0) 
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m51.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m52.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m53.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m6.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m7.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m8.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m9.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_prime1.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r1.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r10.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r11.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r12.xml.res (+5/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r2.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r3.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r4.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r5.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r6.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r9.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_t1.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_t4.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_t5.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/testdriver/bom_bug.xml.res (+1/-0) test/rbkt/Queries/CMakeLists.txt (+16/-1) test/rbkt/Queries/zorba/string/Regex/regex_a1.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_a10.xq (+11/-0) test/rbkt/Queries/zorba/string/Regex/regex_a11.xq (+9/-0) test/rbkt/Queries/zorba/string/Regex/regex_a2.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_a3.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_a5.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_a6.xq (+1/-0) 
test/rbkt/Queries/zorba/string/Regex/regex_a7.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_a8.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_a9.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err1.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err1.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err10.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err10.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err11.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err11.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err12.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err12.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err13.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err13.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err14.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err14.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err15.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err15.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err16.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err16.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err17.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err17.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err18.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err18.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err19.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err19.xq (+3/-0) test/rbkt/Queries/zorba/string/Regex/regex_err2.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err2.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err20.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err20.xq (+3/-0) test/rbkt/Queries/zorba/string/Regex/regex_err21.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err21.xq (+3/-0) test/rbkt/Queries/zorba/string/Regex/regex_err22.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err22.xq (+3/-0) 
test/rbkt/Queries/zorba/string/Regex/regex_err23.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err23.xq (+3/-0) test/rbkt/Queries/zorba/string/Regex/regex_err24.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err24.xq (+3/-0) test/rbkt/Queries/zorba/string/Regex/regex_err25.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err25.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err3.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err3.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err4.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err4.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err5.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err5.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err7.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err7.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err8.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err8.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err9.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err9.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m1.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m10.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m11.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m12.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m13.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m14.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m15.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m16.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m17.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m18.xq (+3/-0) test/rbkt/Queries/zorba/string/Regex/regex_m19.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m2.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m20.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m21.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m22.xq (+1/-0) 
test/rbkt/Queries/zorba/string/Regex/regex_m23.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m24.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m25.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m26.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m27.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m28.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m29.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m3.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m30.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m31.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m32.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m33.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m34.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m35.xq (+4/-0) test/rbkt/Queries/zorba/string/Regex/regex_m36.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m37.xq (+4/-0) test/rbkt/Queries/zorba/string/Regex/regex_m38.xq (+4/-0) test/rbkt/Queries/zorba/string/Regex/regex_m39.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m4.xq (+6/-0) test/rbkt/Queries/zorba/string/Regex/regex_m40.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m41.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m42.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m43.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m44.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m45.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m46.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m47.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m48.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m49.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m5.xq (+6/-0) test/rbkt/Queries/zorba/string/Regex/regex_m50.xq (+2/-0) test/rbkt/Queries/zorba/string/Regex/regex_m51.xq (+2/-0) test/rbkt/Queries/zorba/string/Regex/regex_m52.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m53.xq (+1/-0) 
test/rbkt/Queries/zorba/string/Regex/regex_m6.xq (+6/-0) test/rbkt/Queries/zorba/string/Regex/regex_m7.xq (+6/-0) test/rbkt/Queries/zorba/string/Regex/regex_m8.xq (+7/-0) test/rbkt/Queries/zorba/string/Regex/regex_m9.xq (+7/-0) test/rbkt/Queries/zorba/string/Regex/regex_prime1.xq (+17/-0) test/rbkt/Queries/zorba/string/Regex/regex_r1.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_r10.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_r11.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_r12.xq (+7/-0) test/rbkt/Queries/zorba/string/Regex/regex_r2.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_r3.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_r4.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_r5.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_r6.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_r7_err.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_r7_err.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_r8_err.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_r8_err.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_r9.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_t1.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_t2.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_t3_err.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_t3_err.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_t4.xq (+2/-0) test/rbkt/Queries/zorba/string/Regex/regex_t5.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/zorba.html (+242/-0) test/rbkt/Queries/zorba/string/Regex/zorba2.html (+5/-0) test/rbkt/Queries/zorba/testdriver/bom_bug.xq (+1/-0) test/unit/static_context.cpp (+2/-0) test/update/CMakeLists.txt (+9/-0) |
To merge this branch: | bzr merge lp:~zorba-coders/zorba/no_unicode |
Related bugs: |
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
Rodolfo Ochoa | Approve | ||
Markos Zaharioudakis | Approve | ||
Review via email: mp+101588@code.launchpad.net |
This proposal supersedes a proposal from 2012-04-07.
Commit message
"No Unicode" is now "No ICU."
Added a q-flag fix for an undiscovered bug.
Description of the change
"No Unicode" is now "No ICU."
Added a q-flag fix for an undiscovered bug.
Matthias Brantner (matthias-brantner) wrote : Posted in a previous version of this proposal | # |
Matthias Brantner (matthias-brantner) : Posted in a previous version of this proposal | # |
Zorba Build Bot (zorba-buildbot) wrote : Posted in a previous version of this proposal | # |
There are additional revisions which have not been approved in review. Please seek review and approval of these new revisions.
Matthias Brantner (matthias-brantner) wrote : Posted in a previous version of this proposal | # |
The test suite doesn't run clean on my system (Linux) without ICU. This prevents us from adding the build to the remote queue. For example, the following tests fail without ICU (some of them also seem to fail with ICU):
1294 - test/rbkt/
1548 - test/rbkt/
1560 - test/rbkt/
1574 - test/rbkt/
1581 - test/rbkt/
1587 - test/rbkt/
1600 - test/rbkt/
1605 - test/rbkt/
1612 - test/rbkt/
1635 - test/rbkt/
1637 - test/rbkt/
1643 - test/rbkt/
1789 - test/rbkt/
2345 - test/unit/
2534 - test/update/
2544 - doc/cxx/
Please make sure the test suite runs clean.
Paul J. Lucas (paul-lucas) wrote : Posted in a previous version of this proposal | # |
Try it now.
Daniel Turcanu (danielturcanu) wrote : Posted in a previous version of this proposal | # |
Before committing this branch, the branch lp:~danielturcanu/zorba/my_conv_module should be merged.
Chris Hillery (ceejatec) wrote : Posted in a previous version of this proposal | # |
FWIW, I've skimmed the change for CMake-related changes, and they all look fine (mostly quite trivial).
Zorba Build Bot (zorba-buildbot) wrote : Posted in a previous version of this proposal | # |
Attempt to merge into lp:zorba failed due to conflicts:
text conflict in ChangeLog
Zorba Build Bot (zorba-buildbot) wrote : Posted in a previous version of this proposal | # |
Validation queue starting for merge proposal.
Log at: http://
Zorba Build Bot (zorba-buildbot) wrote : Posted in a previous version of this proposal | # |
The attempt to merge lp:~zorba-coders/zorba/no_unicode into lp:zorba failed. Below is the output from the failed tests.
CMake Error at /home/ceej/
Validation queue job no_unicode-
final status was:
6 tests did not succeed - changes not commited.
Error in read script: /home/ceej/
Zorba Build Bot (zorba-buildbot) wrote : Posted in a previous version of this proposal | # |
Validation queue starting for merge proposal.
Log at: http://
Zorba Build Bot (zorba-buildbot) wrote : Posted in a previous version of this proposal | # |
The attempt to merge lp:~zorba-coders/zorba/no_unicode into lp:zorba failed. Below is the output from the failed tests.
CMake Error at /home/ceej/
Validation queue job no_unicode-
final status was:
6 tests did not succeed - changes not commited.
Error in read script: /home/ceej/
Zorba Build Bot (zorba-buildbot) wrote : Posted in a previous version of this proposal | # |
Validation queue starting for merge proposal.
Log at: http://
Zorba Build Bot (zorba-buildbot) wrote : Posted in a previous version of this proposal | # |
The attempt to merge lp:~zorba-coders/zorba/no_unicode into lp:zorba failed. Below is the output from the failed tests.
CMake Error at /home/ceej/
Validation queue job no_unicode-
final status was:
6 tests did not succeed - changes not commited.
Error in read script: /home/ceej/
Zorba Build Bot (zorba-buildbot) wrote : Posted in a previous version of this proposal | # |
There are additional revisions which have not been approved in review. Please seek review and approval of these new revisions.
Markos Zaharioudakis (markos-za) : | # |
Zorba Build Bot (zorba-buildbot) wrote : | # |
Validation queue starting for merge proposal.
Log at: http://
Zorba Build Bot (zorba-buildbot) wrote : | # |
The attempt to merge lp:~zorba-coders/zorba/no_unicode into lp:zorba failed. Below is the output from the failed tests.
CMake Error at /home/ceej/
Validation queue job no_unicode-
final status was:
No tests were run - build or configure step must have failed.
Not commiting changes.
Error in read script: /home/ceej/
Zorba Build Bot (zorba-buildbot) wrote : | # |
Validation queue starting for merge proposal.
Log at: http://
Zorba Build Bot (zorba-buildbot) wrote : | # |
The attempt to merge lp:~zorba-coders/zorba/no_unicode into lp:zorba failed. Below is the output from the failed tests.
CMake Error at /home/ceej/
Validation queue job no_unicode-
final status was:
No tests were run - build or configure step must have failed.
Not commiting changes.
Error in read script: /home/ceej/
Zorba Build Bot (zorba-buildbot) wrote : | # |
Attempt to merge into lp:zorba failed due to conflicts:
text conflict in src/zorbautils/
Zorba Build Bot (zorba-buildbot) wrote : | # |
Validation queue starting for merge proposal.
Log at: http://
Zorba Build Bot (zorba-buildbot) wrote : | # |
The attempt to merge lp:~zorba-coders/zorba/no_unicode into lp:zorba failed. Below is the output from the failed tests.
CMake Error at /home/ceej/
Validation queue job no_unicode-
final status was:
6 tests did not succeed - changes not commited.
Error in read script: /home/ceej/
Zorba Build Bot (zorba-buildbot) wrote : | # |
Validation queue starting for merge proposal.
Log at: http://
Zorba Build Bot (zorba-buildbot) wrote : | # |
Validation queue job no_unicode-
All tests succeeded!
Zorba Build Bot (zorba-buildbot) wrote : | # |
Voting does not meet specified criteria. Required: Approve > 1, Disapprove < 1, Needs Fixing < 1, Pending < 1. Got: 2 Approve, 1 Pending.
Rodolfo Ochoa (rodolfo-ochoa) : | # |
Zorba Build Bot (zorba-buildbot) wrote : | # |
Validation queue starting for merge proposal.
Log at: http://
Zorba Build Bot (zorba-buildbot) wrote : | # |
Validation queue job no_unicode-
All tests succeeded!
Preview Diff
1 | === modified file 'CMakeConfiguration.txt' |
2 | --- CMakeConfiguration.txt 2012-03-28 05:19:57 +0000 |
3 | +++ CMakeConfiguration.txt 2012-04-13 19:45:38 +0000 |
4 | @@ -135,14 +135,14 @@ |
5 | SET (ZORBA_DEBUG_STRING ${ZORBA_DEBUG_STRING} CACHE BOOL "debug strings") |
6 | MESSAGE (STATUS "ZORBA_DEBUG_STRING: " ${ZORBA_DEBUG_STRING}) |
7 | |
8 | -SET(ZORBA_NO_UNICODE OFF CACHE BOOL "disable ICU") |
9 | -MESSAGE(STATUS "ZORBA_NO_UNICODE: " ${ZORBA_NO_UNICODE}) |
10 | +SET(ZORBA_NO_ICU OFF CACHE BOOL "disable ICU") |
11 | +MESSAGE(STATUS "ZORBA_NO_ICU: " ${ZORBA_NO_ICU}) |
12 | |
13 | -IF (ZORBA_NO_UNICODE) |
14 | +IF (ZORBA_NO_ICU) |
15 | SET (no_full_text ON) |
16 | -ELSE (ZORBA_NO_UNICODE) |
17 | +ELSE (ZORBA_NO_ICU) |
18 | SET (no_full_text OFF) |
19 | -ENDIF (ZORBA_NO_UNICODE) |
20 | +ENDIF (ZORBA_NO_ICU) |
21 | SET (ZORBA_NO_FULL_TEXT ${no_full_text} CACHE BOOL "disable XQuery Full-Text support") |
22 | MESSAGE(STATUS "ZORBA_NO_FULL_TEXT: " ${ZORBA_NO_FULL_TEXT}) |
23 | |
24 | |
25 | === modified file 'CMakeLists.txt' |
26 | --- CMakeLists.txt 2012-03-28 05:19:57 +0000 |
27 | +++ CMakeLists.txt 2012-04-13 19:45:38 +0000 |
28 | @@ -123,10 +123,14 @@ |
29 | CHECK_TYPE_SIZE("int64_t" ZORBA_HAVE_INT64_T) |
30 | |
31 | CHECK_CXX_SOURCE_COMPILES ("#include <type_traits>\nint main() { std::enable_if<true,int> x; }" ZORBA_CXX_ENABLE_IF) |
32 | -CHECK_CXX_SOURCE_COMPILES ("int main() { int *p = nullptr; }" ZORBA_CXX_NULLPTR) |
33 | -CHECK_CXX_SOURCE_COMPILES ("int main() { static_assert(1,\"\"); }" ZORBA_CXX_STATIC_ASSERT) |
34 | +SET(CMAKE_EXTRA_INCLUDE_FILES wchar.h) |
35 | +CHECK_TYPE_SIZE("wchar_t" ZORBA_SIZEOF_WCHAR_T) |
36 | +SET(CMAKE_EXTRA_INCLUDE_FILES) |
37 | CHECK_CXX_SOURCE_COMPILES ("#include <memory>\nint main() { std::unique_ptr<int> p; }" ZORBA_CXX_UNIQUE_PTR) |
38 | |
39 | +CHECK_CXX_SOURCE_COMPILES("int main() { int *p = nullptr; }" ZORBA_CXX_NULLPTR) |
40 | +CHECK_CXX_SOURCE_COMPILES("int main() { static_assert(1,\"\"); }" ZORBA_CXX_STATIC_ASSERT) |
41 | + |
42 | ################################################################################ |
43 | # Various cmake macros |
44 | |
45 | |
46 | === modified file 'ChangeLog' |
47 | --- ChangeLog 2012-04-13 11:34:54 +0000 |
48 | +++ ChangeLog 2012-04-13 19:45:38 +0000 |
49 | @@ -4,6 +4,7 @@ |
50 | |
51 | New Features: |
52 | * Extended API for Python, Java, PHP and Ruby. |
53 | + * Added support for NO_ICU (to not use ICU for unicode processing) |
54 | |
55 | Optimization: |
56 | |
57 | @@ -154,7 +155,9 @@ |
58 | * Fixed bug when parsing a document with a base-uri attribute. |
59 | * Fixed bug #863320 (Sentence is incorrectly incremented when token characters end without sentence terminator) |
60 | * Fixed bug #863730 (static delete-node* functions don't raise ZDDY0012) |
61 | + * Implemented the probe-index-range-value for general indexes |
62 | * Removed ZSTR0005 and ZSTR0006 error codes |
63 | + * Fixed bug #867662 ("nullptr" warning) |
64 | * Fixed bug #868258 (Assertion failure with two delete collection) |
65 | * Fixed bug #871623 and #871629 (assertion failures with insertions in dynamic collections) |
66 | * Fixed bug #867262 (allow reuse of iterator over ExtFuncArgItemSequence) |
67 | @@ -163,6 +166,8 @@ |
68 | * New node-reference module. References can be obtained for any node, and |
69 | different nodes cannot have the same identifier. |
70 | * Fixed bug #872697 (segmentation fault with validation of NMTOKENS) |
71 | + * General index cannot be declared as unique if the type of its key is |
72 | + xs:anyAtomicType or xs:untypedAtomic. |
73 | * Added undo for node revalidation |
74 | * Optimization for count(collection()) expressions |
75 | * Fixed bug #872796 (validate-in-place can interfere with other update primitives) |
76 | @@ -181,6 +186,8 @@ |
77 | * Fixed bug #855715 (Invalid escaped characters in regex not caught) |
78 | * Fixed bug #862089 (Split binary/xq install directories for modules) by |
79 | splitting "module path" into separate URI and Library paths |
80 | + * New node-position module. This module allows to obtain a representation of a node position, which |
81 | + can be used to assess structural relationships with other nodes. |
82 | * Fixed bug #872502 (validation of the JSON module xqdoc fails) |
83 | * Fixed bug #897619 (testdriver_mt can not run the XQueryX tests) |
84 | * Fixed bug #867107 (xqdoc dependency to zorba is wrong) |
85 | |
86 | === modified file 'KNOWN_ISSUES.txt' |
87 | --- KNOWN_ISSUES.txt 2012-03-28 05:19:57 +0000 |
88 | +++ KNOWN_ISSUES.txt 2012-04-13 19:45:38 +0000 |
89 | @@ -37,7 +37,7 @@ |
90 | * The serializer currently doesn't implement character maps as specified |
91 | (http://www.w3.org/TR/xslt-xquery-serialization/#character-maps) |
92 | |
93 | -* In the 2.0 release, setting the CMake variables ZORBA_NO_UNICODE to |
94 | +* In the 2.0 release, setting the CMake variables ZORBA_NO_ICU to |
95 | ON is not supported. |
96 | |
97 | * The PHP language binding is not supported on Mac OS X. For details, |
98 | |
99 | === modified file 'doc/cxx/examples/context.cpp' |
100 | --- doc/cxx/examples/context.cpp 2012-03-28 05:19:57 +0000 |
101 | +++ doc/cxx/examples/context.cpp 2012-04-13 19:45:38 +0000 |
102 | @@ -149,7 +149,11 @@ |
103 | outStream2 << lQuery << std::endl; |
104 | std::cout << outStream2.str() << std::endl; |
105 | |
106 | +#ifndef ZORBA_NO_ICU |
107 | if (outStream2.str() != "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\nBook 1.1\n") |
108 | +#else |
109 | + if (outStream2.str() != "<?xml version=\"1.0\"?>\nBook 1.1\n") |
110 | +#endif /* ZORBA_NO_ICU */ |
111 | { |
112 | std::cerr << "Test 4 failed with a wrong result : " << std::endl |
113 | << outStream2.str() << std::endl; |
114 | |
115 | === modified file 'include/zorba/config.h.cmake' |
116 | --- include/zorba/config.h.cmake 2012-03-28 05:19:57 +0000 |
117 | +++ include/zorba/config.h.cmake 2012-04-13 19:45:38 +0000 |
118 | @@ -96,6 +96,8 @@ |
119 | typedef __int64 int64_t; |
120 | #endif /* ZORBA_HAVE_INT64_T */ |
121 | |
122 | +#cmakedefine ZORBA_SIZEOF_WCHAR_T @ZORBA_SIZEOF_WCHAR_T@ |
123 | + |
124 | // Compiler |
125 | #cmakedefine CLANG |
126 | #cmakedefine MSVC |
127 | @@ -148,7 +150,7 @@ |
128 | |
129 | // Zorba features |
130 | #cmakedefine ZORBA_NO_FULL_TEXT |
131 | -#cmakedefine ZORBA_NO_UNICODE |
132 | +#cmakedefine ZORBA_NO_ICU |
133 | #cmakedefine ZORBA_NO_XMLSCHEMA |
134 | #cmakedefine ZORBA_NUMERIC_OPTIMIZATION |
135 | #cmakedefine ZORBA_VERIFY_PEER_SSL_CERTIFICATE |
136 | |
137 | === modified file 'include/zorba/static_context.h' |
138 | --- include/zorba/static_context.h 2012-04-13 09:11:32 +0000 |
139 | +++ include/zorba/static_context.h 2012-04-13 19:45:38 +0000 |
140 | @@ -26,9 +26,13 @@ |
141 | #include <zorba/function.h> |
142 | #include <zorba/annotation.h> |
143 | #include <zorba/smart_ptr.h> |
144 | +#include <zorba/smart_ptr.h> |
145 | #ifndef ZORBA_NO_FULL_TEXT |
146 | #include <zorba/thesaurus.h> |
147 | #endif /* ZORBA_NO_FULL_TEXT */ |
148 | +#include <zorba/zorba.h> |
149 | +#include <zorba/store_manager.h> |
150 | +#include <zorba/zorba_exception.h> |
151 | |
152 | namespace zorba { |
153 | |
154 | |
155 | === modified file 'include/zorba/util/time.h' |
156 | --- include/zorba/util/time.h 2012-03-28 05:19:57 +0000 |
157 | +++ include/zorba/util/time.h 2012-04-13 19:45:38 +0000 |
158 | @@ -178,7 +178,7 @@ |
159 | |
160 | inline long get_walltime_in_millis(const walltime& t) |
161 | { |
162 | - return t.time * 1000 + t.millitm; |
163 | + return (long)(t.time * 1000 + t.millitm); |
164 | } |
165 | |
166 | #else /* not Windows, and no clock_gettime() */ |
167 | |
168 | === modified file 'src/CMakeLists.txt' |
169 | --- src/CMakeLists.txt 2012-03-28 05:19:57 +0000 |
170 | +++ src/CMakeLists.txt 2012-04-13 19:45:38 +0000 |
171 | @@ -59,7 +59,10 @@ |
172 | # |
173 | # Next, add the files to be compiled into the library |
174 | # |
175 | + |
176 | +MESSAGE(STATUS "PRECOMPILED HEADERS: " ${ZORBA_PRECOMPILED_HEADERS}) |
177 | SET(ZORBA_PRECOMPILED_HEADERS OFF CACHE BOOL "Activate Zorba precompiled headers.") |
178 | +MESSAGE(STATUS "PRECOMPILED HEADERS: " ${ZORBA_PRECOMPILED_HEADERS}) |
179 | |
180 | SET(ZORBA_SRCS) |
181 | ADD_SRC_SUBFOLDER(ZORBA_SRCS api API_SRCS) |
182 | @@ -97,6 +100,7 @@ |
183 | ENDIF(ZORBA_WITH_DEBUGGER) |
184 | ADD_SRC_SUBFOLDER(ZORBA_SRCS unit_tests UNIT_TEST_SRCS) |
185 | |
186 | +MESSAGE(STATUS "PRECOMPILED HEADERS: " ${ZORBA_PRECOMPILED_HEADERS}) |
187 | IF(ZORBA_PRECOMPILED_HEADERS) |
188 | ADD_SRC_SUBFOLDER(ZORBA_SRCS precompiled ZORBAMISC_SRCS) |
189 | INCLUDE_DIRECTORIES("${CMAKE_SOURCE_DIR}/src/precompiled") |
190 | |
191 | === modified file 'src/api/serialization/serializer.cpp' |
192 | --- src/api/serialization/serializer.cpp 2012-03-28 05:19:57 +0000 |
193 | +++ src/api/serialization/serializer.cpp 2012-04-13 19:45:38 +0000 |
194 | @@ -180,7 +180,6 @@ |
195 | for (; chars < chars_end; chars++ ) |
196 | { |
197 | |
198 | -#ifndef ZORBA_NO_UNICODE |
199 | // the input string is UTF-8 |
200 | int char_length = utf8::char_length(*chars); |
201 | if (char_length == 0) |
202 | @@ -217,7 +216,6 @@ |
203 | |
204 | continue; |
205 | } |
206 | -#endif//ZORBA_NO_UNICODE |
207 | |
208 | // raise an error iff (1) the serialization format is XML 1.0 and (2) the given character is an invalid XML 1.0 character |
209 | if (ser && ser->method == PARAMETER_VALUE_XML && |
210 | @@ -332,14 +330,12 @@ |
211 | { |
212 | tr << (char)0xEF << (char)0xBB << (char)0xBF; |
213 | } |
214 | -#ifndef ZORBA_NO_UNICODE |
215 | else if (ser->encoding == PARAMETER_VALUE_UTF_16) |
216 | { |
217 | // Little-endian |
218 | tr.verbatim((char)0xFF); |
219 | tr.verbatim((char)0xFE); |
220 | } |
221 | -#endif |
222 | } |
223 | } |
224 | |
225 | @@ -862,13 +858,17 @@ |
226 | emitter::emit_declaration(); |
227 | |
228 | if (ser->omit_xml_declaration == PARAMETER_VALUE_NO) { |
229 | - tr << "<?xml version=\"" << ser->version << "\" encoding=\""; |
230 | - if (ser->encoding == PARAMETER_VALUE_UTF_8) { |
231 | - tr << "UTF-8"; |
232 | -#ifndef ZORBA_NO_UNICODE |
233 | - } else if (ser->encoding == PARAMETER_VALUE_UTF_16) { |
234 | - tr << "UTF-16"; |
235 | -#endif |
236 | + tr << "<?xml version=\"" << ser->version; |
237 | + switch (ser->encoding) { |
238 | + case PARAMETER_VALUE_UTF_8: |
239 | + case PARAMETER_VALUE_UTF_16: |
240 | + tr << "\" encoding=\""; |
241 | + switch (ser->encoding) { |
242 | + case PARAMETER_VALUE_UTF_8 : tr << "UTF-8" ; break; |
243 | + case PARAMETER_VALUE_UTF_16: tr << "UTF-16"; break; |
244 | + default : ZORBA_ASSERT(false); |
245 | + } |
246 | + break; |
247 | } |
248 | tr << "\""; |
249 | |
250 | @@ -1174,14 +1174,18 @@ |
251 | } |
252 | |
253 | tr << "<meta http-equiv=\"content-type\" content=\"" |
254 | - << ser->media_type << "; charset="; |
255 | - |
256 | - if (ser->encoding == PARAMETER_VALUE_UTF_8) |
257 | - tr << "UTF-8"; |
258 | -#ifndef ZORBA_NO_UNICODE |
259 | - else if (ser->encoding == PARAMETER_VALUE_UTF_16) |
260 | - tr << "UTF-16"; |
261 | -#endif |
262 | + << ser->media_type; |
263 | + switch (ser->encoding) { |
264 | + case PARAMETER_VALUE_UTF_8: |
265 | + case PARAMETER_VALUE_UTF_16: |
266 | + tr << "\" charset=\""; |
267 | + switch (ser->encoding) { |
268 | + case PARAMETER_VALUE_UTF_8 : tr << "UTF-8" ; break; |
269 | + case PARAMETER_VALUE_UTF_16: tr << "UTF-16"; break; |
270 | + default : ZORBA_ASSERT(false); |
271 | + } |
272 | + break; |
273 | + } |
274 | tr << "\""; |
275 | // closed_parent_tag = 1; |
276 | } |
277 | @@ -1371,14 +1375,18 @@ |
278 | } |
279 | |
280 | tr << "<meta http-equiv=\"content-type\" content=\"" |
281 | - << ser->media_type << "; charset="; |
282 | - |
283 | - if (ser->encoding == PARAMETER_VALUE_UTF_8) |
284 | - tr << "UTF-8"; |
285 | -#ifndef ZORBA_NO_UNICODE |
286 | - else if (ser->encoding == PARAMETER_VALUE_UTF_16) |
287 | - tr << "UTF-16"; |
288 | -#endif |
289 | + << ser->media_type; |
290 | + switch (ser->encoding) { |
291 | + case PARAMETER_VALUE_UTF_8: |
292 | + case PARAMETER_VALUE_UTF_16: |
293 | + tr << "\" charset=\""; |
294 | + switch (ser->encoding) { |
295 | + case PARAMETER_VALUE_UTF_8 : tr << "UTF-8" ; break; |
296 | + case PARAMETER_VALUE_UTF_16: tr << "UTF-16"; break; |
297 | + default : ZORBA_ASSERT(false); |
298 | + } |
299 | + break; |
300 | + } |
301 | tr << "\"/"; |
302 | //closed_parent_tag = 1; |
303 | } |
304 | @@ -2098,10 +2106,8 @@ |
305 | { |
306 | if (!strcmp(aValue, "UTF-8")) |
307 | encoding = PARAMETER_VALUE_UTF_8; |
308 | -#ifndef ZORBA_NO_UNICODE |
309 | else if (!strcmp(aValue, "UTF-16")) |
310 | encoding = PARAMETER_VALUE_UTF_16; |
311 | -#endif |
312 | else |
313 | throw XQUERY_EXCEPTION( |
314 | err::SEPM0016, ERROR_PARAMS( aValue, aName, ZED( GoodValuesAreUTF8 ) ) |
315 | @@ -2210,16 +2216,13 @@ |
316 | { |
317 | tr = new transcoder(os, false); |
318 | } |
319 | -#ifndef ZORBA_NO_UNICODE |
320 | else if (encoding == PARAMETER_VALUE_UTF_16) |
321 | { |
322 | tr = new transcoder(os, true); |
323 | } |
324 | -#endif |
325 | else |
326 | { |
327 | - ZORBA_ASSERT(0); |
328 | - return false; |
329 | + ZORBA_ASSERT(false); |
330 | } |
331 | |
332 | if (method == PARAMETER_VALUE_XML) |
333 | |
334 | === modified file 'src/api/serialization/serializer.h' |
335 | --- src/api/serialization/serializer.h 2012-03-28 05:19:57 +0000 |
336 | +++ src/api/serialization/serializer.h 2012-04-13 19:45:38 +0000 |
337 | @@ -70,10 +70,8 @@ |
338 | PARAMETER_VALUE_TEXT, |
339 | PARAMETER_VALUE_BINARY, |
340 | |
341 | - PARAMETER_VALUE_UTF_8 |
342 | -#ifndef ZORBA_NO_UNICODE |
343 | - ,PARAMETER_VALUE_UTF_16 |
344 | -#endif |
345 | + PARAMETER_VALUE_UTF_8, |
346 | + PARAMETER_VALUE_UTF_16 |
347 | } PARAMETER_VALUE_TYPE; |
348 | |
349 | protected: |
350 | |
351 | === modified file 'src/diagnostics/diagnostic_en.xml' |
352 | --- src/diagnostics/diagnostic_en.xml 2012-04-10 13:10:22 +0000 |
353 | +++ src/diagnostics/diagnostic_en.xml 2012-04-13 19:45:38 +0000 |
354 | @@ -2517,11 +2517,11 @@ |
355 | <value>attribute node</value> |
356 | </entry> |
357 | |
358 | - <entry key="BackRef0Illegal"> |
359 | + <entry key="BackRef0Illegal" if="!defined(ZORBA_NO_ICU)"> |
360 | <value>"0": illegal backreference</value> |
361 | </entry> |
362 | |
363 | - <entry key="BackRefIllegalInCharClass"> |
364 | + <entry key="BackRefIllegalInCharClass" if="!defined(ZORBA_NO_ICU)"> |
365 | <value>backreference illegal in character class</value> |
366 | </entry> |
367 | |
368 | @@ -2569,7 +2569,7 @@ |
369 | <value>invalid library module</value> |
370 | </entry> |
371 | |
372 | - <entry key="BadRegexEscape_3"> |
373 | + <entry key="BadRegexEscape_3" if="!defined(ZORBA_NO_ICU)"> |
374 | <value>"$3": illegal escape character</value> |
375 | </entry> |
376 | |
377 | @@ -3029,7 +3029,7 @@ |
378 | <value>nodeid component too big for encoding</value> |
379 | </entry> |
380 | |
381 | - <entry key="NonClosedBackRef_3"> |
382 | + <entry key="NonClosedBackRef_3" if="!defined(ZORBA_NO_ICU)"> |
383 | <value>'$$3': non-closed backreference</value> |
384 | </entry> |
385 | |
386 | @@ -3041,7 +3041,7 @@ |
387 | <value>non-localhost authority</value> |
388 | </entry> |
389 | |
390 | - <entry key="NonexistentBackRef_3"> |
391 | + <entry key="NonexistentBackRef_3" if="!defined(ZORBA_NO_ICU)"> |
392 | <value>'$$3': non-existent backreference</value> |
393 | </entry> |
394 | |
395 | @@ -3193,94 +3193,183 @@ |
396 | <value>item type is not a subtype of "$3"</value> |
397 | </entry> |
398 | |
399 | - <entry key="U_REGEX_BAD_ESCAPE_SEQUENCE" if="!defined(ZORBA_NO_UNICODE)"> |
400 | + <entry key="U_REGEX_BAD_ESCAPE_SEQUENCE" if="!defined(ZORBA_NO_ICU)"> |
401 | <value>unrecognized backslash escape sequence</value> |
402 | </entry> |
403 | |
404 | - <entry key="U_REGEX_BAD_INTERVAL" if="!defined(ZORBA_NO_UNICODE)"> |
405 | + <entry key="U_REGEX_BAD_INTERVAL" if="!defined(ZORBA_NO_ICU)"> |
406 | <value>error in {min,max} interval</value> |
407 | </entry> |
408 | |
409 | - <entry key="U_REGEX_INTERNAL_ERROR" if="!defined(ZORBA_NO_UNICODE)"> |
410 | + <entry key="U_REGEX_INTERNAL_ERROR" if="!defined(ZORBA_NO_ICU)"> |
411 | <value>an internal ICU error (bug) was detected</value> |
412 | </entry> |
413 | |
414 | - <entry key="U_REGEX_INVALID_BACK_REF" if="!defined(ZORBA_NO_UNICODE)"> |
415 | + <entry key="U_REGEX_INVALID_BACK_REF" if="!defined(ZORBA_NO_ICU)"> |
416 | <value>backreference to a non-existent capture group</value> |
417 | </entry> |
418 | |
419 | - <entry key="U_REGEX_INVALID_FLAG" if="!defined(ZORBA_NO_UNICODE)"> |
420 | + <entry key="U_REGEX_INVALID_FLAG" if="!defined(ZORBA_NO_ICU)"> |
421 | <value>invalid value for match mode flags</value> |
422 | </entry> |
423 | |
424 | - <entry key="U_REGEX_INVALID_RANGE" if="!defined(ZORBA_NO_UNICODE)"> |
425 | + <entry key="U_REGEX_INVALID_RANGE" if="!defined(ZORBA_NO_ICU)"> |
426 | <value>in character range [x-y], x is greater than y</value> |
427 | </entry> |
428 | |
429 | - <entry key="U_REGEX_INVALID_STATE" if="!defined(ZORBA_NO_UNICODE)"> |
430 | + <entry key="U_REGEX_INVALID_STATE" if="!defined(ZORBA_NO_ICU)"> |
431 | <value>RegexMatcher in invalid state for requested operation</value> |
432 | </entry> |
433 | |
434 | - <entry key="U_REGEX_LOOK_BEHIND_LIMIT" if="!defined(ZORBA_NO_UNICODE)"> |
435 | + <entry key="U_REGEX_LOOK_BEHIND_LIMIT" if="!defined(ZORBA_NO_ICU)"> |
436 | <value>look-behind pattern matches must have a bounded maximum length</value> |
437 | </entry> |
438 | |
439 | - <entry key="U_REGEX_MAX_LT_MIN" if="!defined(ZORBA_NO_UNICODE)"> |
440 | + <entry key="U_REGEX_MAX_LT_MIN" if="!defined(ZORBA_NO_ICU)"> |
441 | <value>in {min,max}, max is less than min</value> |
442 | </entry> |
443 | |
444 | - <entry key="U_REGEX_MISMATCHED_PAREN" if="!defined(ZORBA_NO_UNICODE)"> |
445 | + <entry key="U_REGEX_MISMATCHED_PAREN" if="!defined(ZORBA_NO_ICU)"> |
446 | <value>incorrectly nested parentheses</value> |
447 | </entry> |
448 | |
449 | - <entry key="U_REGEX_MISSING_CLOSE_BRACKET" if="!defined(ZORBA_NO_UNICODE)"> |
450 | + <entry key="U_REGEX_MISSING_CLOSE_BRACKET" if="!defined(ZORBA_NO_ICU)"> |
451 | <value>missing ']'</value> |
452 | </entry> |
453 | |
454 | - <entry key="U_REGEX_NUMBER_TOO_BIG" if="!defined(ZORBA_NO_UNICODE)"> |
455 | + <entry key="U_REGEX_NUMBER_TOO_BIG" if="!defined(ZORBA_NO_ICU)"> |
456 | <value>decimal number is too large</value> |
457 | </entry> |
458 | |
459 | - <entry key="U_REGEX_OCTAL_TOO_BIG" if="!defined(ZORBA_NO_UNICODE)"> |
460 | + <entry key="U_REGEX_OCTAL_TOO_BIG" if="!defined(ZORBA_NO_ICU)"> |
461 | <value>octal character constants must be <= 0377</value> |
462 | </entry> |
463 | |
464 | - <entry key="U_REGEX_PROPERTY_SYNTAX" if="!defined(ZORBA_NO_UNICODE)"> |
465 | + <entry key="U_REGEX_PROPERTY_SYNTAX" if="!defined(ZORBA_NO_ICU)"> |
466 | <value>incorrect Unicode property</value> |
467 | </entry> |
468 | |
469 | - <entry key="U_REGEX_RULE_SYNTAX" if="!defined(ZORBA_NO_UNICODE)"> |
470 | + <entry key="U_REGEX_RULE_SYNTAX" if="!defined(ZORBA_NO_ICU)"> |
471 | <value>syntax error</value> |
472 | </entry> |
473 | |
474 | - <entry key="U_REGEX_SET_CONTAINS_STRING" if="!defined(ZORBA_NO_UNICODE)"> |
475 | + <entry key="U_REGEX_SET_CONTAINS_STRING" if="!defined(ZORBA_NO_ICU)"> |
476 | <value>can not have UnicodeSets containing strings</value> |
477 | </entry> |
478 | |
479 | - <entry key="U_REGEX_STACK_OVERFLOW" if="!defined(ZORBA_NO_UNICODE)"> |
480 | + <entry key="U_REGEX_STACK_OVERFLOW" if="!defined(ZORBA_NO_ICU)"> |
481 | <value>backtrack stack overflow</value> |
482 | </entry> |
483 | |
484 | - <entry key="U_REGEX_STOPPED_BY_CALLER" if="!defined(ZORBA_NO_UNICODE)"> |
485 | + <entry key="U_REGEX_STOPPED_BY_CALLER" if="!defined(ZORBA_NO_ICU)"> |
486 | <value>matching operation aborted by user callback fn</value> |
487 | </entry> |
488 | |
489 | - <entry key="U_REGEX_TIME_OUT" if="!defined(ZORBA_NO_UNICODE)"> |
490 | + <entry key="U_REGEX_TIME_OUT" if="!defined(ZORBA_NO_ICU)"> |
491 | <value>maximum allowed match time exceeded</value> |
492 | </entry> |
493 | |
494 | - <entry key="U_REGEX_UNIMPLEMENTED" if="!defined(ZORBA_NO_UNICODE)"> |
495 | - <value>use of regular expression feature that is not yet implemented</value> |
496 | + <entry key="U_REGEX_UNIMPLEMENTED" if="!defined(ZORBA_NO_ICU)"> |
497 | + <value>use of regular expression feature that is not yet implemented</value> |
498 | + </entry> |
499 | + |
500 | + <!-- Regex ASCII error messages --> |
501 | + <entry key="REGEX_UNIMPLEMENTED" if="defined(ZORBA_NO_ICU)"> |
502 | + <value>use of regular expression feature that is not yet implemented</value> |
503 | + </entry> |
504 | + |
505 | + <entry key="REGEX_MISMATCHED_PAREN" if="defined(ZORBA_NO_ICU)"> |
506 | + <value>incorrectly nested parentheses</value> |
507 | + </entry> |
508 | + |
509 | + <entry key="REGEX_BROKEN_P_CONSTRUCT" if="defined(ZORBA_NO_ICU)"> |
510 | + <value>broken \\p construct</value> |
511 | + </entry> |
512 | + |
513 | + <entry key="REGEX_UNKNOWN_PL_CONSTRUCT" if="defined(ZORBA_NO_ICU)"> |
514 | + <value>unknown \\p{L?} category; supported categories: L, Lu, Ll, Lt, Lm, Lo</value> |
515 | + </entry> |
516 | + |
517 | + <entry key="REGEX_UNKNOWN_PM_CONSTRUCT" if="defined(ZORBA_NO_ICU)"> |
518 | + <value>unknown \\p{M?} category; supported categories: M, Mn, Mc, Me</value> |
519 | + </entry> |
520 | + |
521 | + <entry key="REGEX_UNKNOWN_PN_CONSTRUCT" if="defined(ZORBA_NO_ICU)"> |
522 | + <value>unknown \\p{N?} category; supported categories: N, Nd, Nl, No</value> |
523 | + </entry> |
524 | + |
525 | + <entry key="REGEX_UNKNOWN_PP_CONSTRUCT" if="defined(ZORBA_NO_ICU)"> |
526 | + <value>unknown \\p{P?} category; supported categories: P, Pc, Pd, Ps, Pe, Pi, Pf, Po</value> |
527 | + </entry> |
528 | + |
529 | + <entry key="REGEX_UNKNOWN_PZ_CONSTRUCT" if="defined(ZORBA_NO_ICU)"> |
530 | + <value>unknown \\p{Z?} category; supported categories: Z, Zs, Zl, Zp</value> |
531 | + </entry> |
532 | + |
533 | + <entry key="REGEX_UNKNOWN_PS_CONSTRUCT" if="defined(ZORBA_NO_ICU)"> |
534 | + <value>unknown \\p{S?} category; supported categories: S, Sm, Sc, Sk, So</value> |
535 | + </entry> |
536 | + |
537 | + <entry key="REGEX_UNKNOWN_PC_CONSTRUCT" if="defined(ZORBA_NO_ICU)"> |
538 | + <value>unknown \\p{C?} category; supported categories: C, Cc, Cf, Co, Cn(for not assigned)</value> |
539 | + </entry> |
540 | + |
541 | + <entry key="REGEX_BROKEN_PIs_CONSTRUCT" if="defined(ZORBA_NO_ICU)"> |
542 | + <value>broken \\p{Is} construct; valid characters are [a-zA-Z0-9-]</value> |
543 | + </entry> |
544 | + |
545 | + <entry key="REGEX_UNKNOWN_PIs_CONSTRUCT" if="defined(ZORBA_NO_ICU)"> |
546 | + <value>unknown \\p{Is} category block; see supported block escapes here: http://www.w3.org/TR/xmlschema-2/#charcter-classes</value> |
547 | + </entry> |
548 | + |
549 | + <entry key="REGEX_INVALID_UNICODE_CODEPOINT_u" if="defined(ZORBA_NO_ICU)"> |
550 | + <value>invalid unicode hex, should be in form \\uXXXX or \\UXXXXXXXX</value> |
551 | + </entry> |
552 | + |
553 | + <entry key="REGEX_UNKNOWN_ESC_CHAR" if="defined(ZORBA_NO_ICU)"> |
554 | + <value>unknown \\? escape char; supported escapes are: \\[nrt\\|.?*+(){}[]-^$] for char escapes, \\[pP] for categories and \\[sSiIcCdDwW] for multichar groups</value> |
555 | + </entry> |
556 | + |
557 | + <entry key="REGEX_INVALID_BACK_REF" if="defined(ZORBA_NO_ICU)"> |
558 | + <value>\\$3 backreference to a non-existent capture group ($4 groups so far)</value> |
559 | + </entry> |
560 | + |
561 | + <entry key="REGEX_INVALID_ATOM_CHAR" if="defined(ZORBA_NO_ICU)"> |
562 | + <value>'$3': invalid character for an atom; forbidden characters are: [{}?*+|^]</value> |
563 | + </entry> |
564 | + |
565 | + <entry key="REGEX_INVALID_SUBCLASS" if="defined(ZORBA_NO_ICU)"> |
566 | + <value>malformed class subtraction</value> |
567 | + </entry> |
568 | + |
569 | + <entry key="REGEX_INVALID_USE_OF_SUBCLASS" if="defined(ZORBA_NO_ICU)"> |
570 | + <value>improper use of class subtraction: it must be the last construct in a class group [xxx-[yyy]]</value> |
571 | + </entry> |
572 | + |
573 | + <entry key="REGEX_MULTICHAR_IN_CHAR_RANGE" if="defined(ZORBA_NO_ICU)"> |
574 | + <value>multichars or char categories cannot be part of a char range</value> |
575 | + </entry> |
576 | + |
577 | + <entry key="REGEX_MISSING_CLOSE_BRACKET" if="defined(ZORBA_NO_ICU)"> |
578 | + <value>missing ']' in character group</value> |
579 | + </entry> |
580 | + |
581 | + <entry key="REGEX_MAX_LT_MIN" if="defined(ZORBA_NO_ICU)"> |
582 | + <value>in {min,max}, max is less than min</value> |
583 | </entry> |
584 | |
585 | <entry key="UnaryArithOp"> |
586 | <value>unary arithmetic operator</value> |
587 | </entry> |
588 | |
589 | - <entry key="UnbalancedChar_3"> |
590 | + <entry key="UnbalancedChar_3" if="!defined(ZORBA_NO_ICU)"> |
591 | <value>missing '$3'</value> |
592 | </entry> |
593 | |
594 | + <entry key="UnescapedChar_3" if="!defined(ZORBA_NO_ICU)"> |
595 | + <value>character '$3' must be escaped here</value> |
596 | + </entry> |
597 | + |
598 | <entry key="UnexpectedElement"> |
599 | <value>unexpected element</value> |
600 | </entry> |
601 | |
602 | === modified file 'src/diagnostics/pregenerated/dict_en.cpp' |
603 | --- src/diagnostics/pregenerated/dict_en.cpp 2012-04-10 13:10:22 +0000 |
604 | +++ src/diagnostics/pregenerated/dict_en.cpp 2012-04-13 19:45:38 +0000 |
605 | @@ -437,8 +437,12 @@ |
606 | { "~AtomizationOfGroupByMakesMoreThanOneItem", "atomization of groupby variable produces more than one item" }, |
607 | { "~AttributeName", "attribute name" }, |
608 | { "~AttributeNode", "attribute node" }, |
609 | +#if !defined(ZORBA_NO_ICU) |
610 | { "~BackRef0Illegal", "\"0\": illegal backreference" }, |
611 | +#endif |
612 | +#if !defined(ZORBA_NO_ICU) |
613 | { "~BackRefIllegalInCharClass", "backreference illegal in character class" }, |
614 | +#endif |
615 | { "~BadAnyURI", "invalid xs:anyURI" }, |
616 | { "~BadArgTypeForFn_2o34o", "${\"2\": }invalid argument type for function $3()${: 4}" }, |
617 | { "~BadCharAfter_34", "'$3': illegal character after '$4'" }, |
618 | @@ -451,7 +455,9 @@ |
619 | { "~BadIterator", "invalid iterator" }, |
620 | { "~BadLibraryModule", "invalid library module" }, |
621 | { "~BadPath", "invalid path" }, |
622 | +#if !defined(ZORBA_NO_ICU) |
623 | { "~BadRegexEscape_3", "\"$3\": illegal escape character" }, |
624 | +#endif |
625 | { "~BadStreamState", "bad I/O stream state" }, |
626 | { "~BadTokenInBraces_3", "\"$3\": illegal token within { }" }, |
627 | { "~BadTraceStream", "trace stream not retrievable using SerializationCallback" }, |
628 | @@ -567,10 +573,14 @@ |
629 | { "~NoUntypedKeyNodeValue_2", "node with untyped key value found during probe on index \"$2\"" }, |
630 | { "~NodeIDNeedsBytes_2", "nodeid requires more than $2 bytes" }, |
631 | { "~NodeIDTooBig", "nodeid component too big for encoding" }, |
632 | +#if !defined(ZORBA_NO_ICU) |
633 | { "~NonClosedBackRef_3", "'$$3': non-closed backreference" }, |
634 | +#endif |
635 | { "~NonFileThesaurusURI", "non-file thesaurus URI" }, |
636 | { "~NonLocalhostAuthority", "non-localhost authority" }, |
637 | +#if !defined(ZORBA_NO_ICU) |
638 | { "~NonexistentBackRef_3", "'$$3': non-existent backreference" }, |
639 | +#endif |
640 | { "~NotAllowedForTypeName", "not allowed for typeName (use xsd:untyped instead)" }, |
641 | { "~NotAmongInScopeSchemaTypes", "not among in-scope schema types" }, |
642 | { "~NotDefInDynamicCtx", "not defined in dynamic context" }, |
643 | @@ -589,6 +599,69 @@ |
644 | { "~ParserNoCreateTree", "XML tree creation failed" }, |
645 | { "~PromotionImpossible", "promotion not possible" }, |
646 | { "~QuotedColon_23", "\"$2\": $3" }, |
647 | +#if defined(ZORBA_NO_ICU) |
648 | + { "~REGEX_BROKEN_PIs_CONSTRUCT", "broken \\p{Is} construct; valid characters are [a-zA-Z0-9-]" }, |
649 | +#endif |
650 | +#if defined(ZORBA_NO_ICU) |
651 | + { "~REGEX_BROKEN_P_CONSTRUCT", "broken \\p construct" }, |
652 | +#endif |
653 | +#if defined(ZORBA_NO_ICU) |
654 | + { "~REGEX_INVALID_ATOM_CHAR", "'$3': invalid character for an atom; forbidden characters are: [{}?*+|^]" }, |
655 | +#endif |
656 | +#if defined(ZORBA_NO_ICU) |
657 | + { "~REGEX_INVALID_BACK_REF", "\\$3 backreference to a non-existent capture group ($4 groups so far)" }, |
658 | +#endif |
659 | +#if defined(ZORBA_NO_ICU) |
660 | + { "~REGEX_INVALID_SUBCLASS", "malformed class subtraction" }, |
661 | +#endif |
662 | +#if defined(ZORBA_NO_ICU) |
663 | + { "~REGEX_INVALID_UNICODE_CODEPOINT_u", "invalid unicode hex, should be in form \\uXXXX or \\UXXXXXXXX" }, |
664 | +#endif |
665 | +#if defined(ZORBA_NO_ICU) |
666 | + { "~REGEX_INVALID_USE_OF_SUBCLASS", "improper use of class subtraction: it must be the last construct in a class group [xxx-[yyy]]" }, |
667 | +#endif |
668 | +#if defined(ZORBA_NO_ICU) |
669 | + { "~REGEX_MAX_LT_MIN", "in {min,max}, max is less than min" }, |
670 | +#endif |
671 | +#if defined(ZORBA_NO_ICU) |
672 | + { "~REGEX_MISMATCHED_PAREN", "incorrectly nested parentheses" }, |
673 | +#endif |
674 | +#if defined(ZORBA_NO_ICU) |
675 | + { "~REGEX_MISSING_CLOSE_BRACKET", "missing ']' in character group" }, |
676 | +#endif |
677 | +#if defined(ZORBA_NO_ICU) |
678 | + { "~REGEX_MULTICHAR_IN_CHAR_RANGE", "multichars or char categories cannot be part of a char range" }, |
679 | +#endif |
680 | +#if defined(ZORBA_NO_ICU) |
681 | + { "~REGEX_UNIMPLEMENTED", "use of regular expression feature that is not yet implemented" }, |
682 | +#endif |
683 | +#if defined(ZORBA_NO_ICU) |
684 | + { "~REGEX_UNKNOWN_ESC_CHAR", "unknown \\? escape char; supported escapes are: \\[nrt\\|.?*+(){}[]-^$] for char escapes, \\[pP] for categories and \\[sSiIcCdDwW] for multichar groups" }, |
685 | +#endif |
686 | +#if defined(ZORBA_NO_ICU) |
687 | + { "~REGEX_UNKNOWN_PC_CONSTRUCT", "unknown \\p{C?} category; supported categories: C, Cc, Cf, Co, Cn(for not assigned)" }, |
688 | +#endif |
689 | +#if defined(ZORBA_NO_ICU) |
690 | + { "~REGEX_UNKNOWN_PIs_CONSTRUCT", "unknown \\p{Is} category block; see supported block escapes here: http://www.w3.org/TR/xmlschema-2/#charcter-classes" }, |
691 | +#endif |
692 | +#if defined(ZORBA_NO_ICU) |
693 | + { "~REGEX_UNKNOWN_PL_CONSTRUCT", "unknown \\p{L?} category; supported categories: L, Lu, Ll, Lt, Lm, Lo" }, |
694 | +#endif |
695 | +#if defined(ZORBA_NO_ICU) |
696 | + { "~REGEX_UNKNOWN_PM_CONSTRUCT", "unknown \\p{M?} category; supported categories: M, Mn, Mc, Me" }, |
697 | +#endif |
698 | +#if defined(ZORBA_NO_ICU) |
699 | + { "~REGEX_UNKNOWN_PN_CONSTRUCT", "unknown \\p{N?} category; supported categories: N, Nd, Nl, No" }, |
700 | +#endif |
701 | +#if defined(ZORBA_NO_ICU) |
702 | + { "~REGEX_UNKNOWN_PP_CONSTRUCT", "unknown \\p{P?} category; supported categories: P, Pc, Pd, Ps, Pe, Pi, Pf, Po" }, |
703 | +#endif |
704 | +#if defined(ZORBA_NO_ICU) |
705 | + { "~REGEX_UNKNOWN_PS_CONSTRUCT", "unknown \\p{S?} category; supported categories: S, Sm, Sc, Sk, So" }, |
706 | +#endif |
707 | +#if defined(ZORBA_NO_ICU) |
708 | + { "~REGEX_UNKNOWN_PZ_CONSTRUCT", "unknown \\p{Z?} category; supported categories: Z, Zs, Zl, Zp" }, |
709 | +#endif |
710 | { "~SEPM0009_Not10", "the version parameter has a value other than \"1.0\" and the doctype-system parameter is specified" }, |
711 | { "~SEPM0009_NotOmit", "the standalone attribute has a value other than \"omit\"" }, |
712 | { "~SchemaAttributeName", "schema-attribute name" }, |
713 | @@ -610,68 +683,73 @@ |
714 | { "~TwoDecimalFormatsSameName_2", "\"$2\": two decimal formats with this name" }, |
715 | { "~TwoDefaultDecimalFormats", "two default decimal formats" }, |
716 | { "~TypeIsNotSubtype", "item type is not a subtype of \"$3\"" }, |
717 | -#if !defined(ZORBA_NO_UNICODE) |
718 | +#if !defined(ZORBA_NO_ICU) |
719 | { "~U_REGEX_BAD_ESCAPE_SEQUENCE", "unrecognized backslash escape sequence" }, |
720 | #endif |
721 | -#if !defined(ZORBA_NO_UNICODE) |
722 | +#if !defined(ZORBA_NO_ICU) |
723 | { "~U_REGEX_BAD_INTERVAL", "error in {min,max} interval" }, |
724 | #endif |
725 | -#if !defined(ZORBA_NO_UNICODE) |
726 | +#if !defined(ZORBA_NO_ICU) |
727 | { "~U_REGEX_INTERNAL_ERROR", "an internal ICU error (bug) was detected" }, |
728 | #endif |
729 | -#if !defined(ZORBA_NO_UNICODE) |
730 | +#if !defined(ZORBA_NO_ICU) |
731 | { "~U_REGEX_INVALID_BACK_REF", "backreference to a non-existent capture group" }, |
732 | #endif |
733 | -#if !defined(ZORBA_NO_UNICODE) |
734 | +#if !defined(ZORBA_NO_ICU) |
735 | { "~U_REGEX_INVALID_FLAG", "invalid value for match mode flags" }, |
736 | #endif |
737 | -#if !defined(ZORBA_NO_UNICODE) |
738 | +#if !defined(ZORBA_NO_ICU) |
739 | { "~U_REGEX_INVALID_RANGE", "in character range [x-y], x is greater than y" }, |
740 | #endif |
741 | -#if !defined(ZORBA_NO_UNICODE) |
742 | +#if !defined(ZORBA_NO_ICU) |
743 | { "~U_REGEX_INVALID_STATE", "RegexMatcher in invalid state for requested operation" }, |
744 | #endif |
745 | -#if !defined(ZORBA_NO_UNICODE) |
746 | +#if !defined(ZORBA_NO_ICU) |
747 | { "~U_REGEX_LOOK_BEHIND_LIMIT", "look-behind pattern matches must have a bounded maximum length" }, |
748 | #endif |
749 | -#if !defined(ZORBA_NO_UNICODE) |
750 | +#if !defined(ZORBA_NO_ICU) |
751 | { "~U_REGEX_MAX_LT_MIN", "in {min,max}, max is less than min" }, |
752 | #endif |
753 | -#if !defined(ZORBA_NO_UNICODE) |
754 | +#if !defined(ZORBA_NO_ICU) |
755 | { "~U_REGEX_MISMATCHED_PAREN", "incorrectly nested parentheses" }, |
756 | #endif |
757 | -#if !defined(ZORBA_NO_UNICODE) |
758 | +#if !defined(ZORBA_NO_ICU) |
759 | { "~U_REGEX_MISSING_CLOSE_BRACKET", "missing ']'" }, |
760 | #endif |
761 | -#if !defined(ZORBA_NO_UNICODE) |
762 | +#if !defined(ZORBA_NO_ICU) |
763 | { "~U_REGEX_NUMBER_TOO_BIG", "decimal number is too large" }, |
764 | #endif |
765 | -#if !defined(ZORBA_NO_UNICODE) |
766 | +#if !defined(ZORBA_NO_ICU) |
767 | { "~U_REGEX_OCTAL_TOO_BIG", "octal character constants must be <= 0377" }, |
768 | #endif |
769 | -#if !defined(ZORBA_NO_UNICODE) |
770 | +#if !defined(ZORBA_NO_ICU) |
771 | { "~U_REGEX_PROPERTY_SYNTAX", "incorrect Unicode property" }, |
772 | #endif |
773 | -#if !defined(ZORBA_NO_UNICODE) |
774 | +#if !defined(ZORBA_NO_ICU) |
775 | { "~U_REGEX_RULE_SYNTAX", "syntax error" }, |
776 | #endif |
777 | -#if !defined(ZORBA_NO_UNICODE) |
778 | +#if !defined(ZORBA_NO_ICU) |
779 | { "~U_REGEX_SET_CONTAINS_STRING", "can not have UnicodeSets containing strings" }, |
780 | #endif |
781 | -#if !defined(ZORBA_NO_UNICODE) |
782 | +#if !defined(ZORBA_NO_ICU) |
783 | { "~U_REGEX_STACK_OVERFLOW", "backtrack stack overflow" }, |
784 | #endif |
785 | -#if !defined(ZORBA_NO_UNICODE) |
786 | +#if !defined(ZORBA_NO_ICU) |
787 | { "~U_REGEX_STOPPED_BY_CALLER", "matching operation aborted by user callback fn" }, |
788 | #endif |
789 | -#if !defined(ZORBA_NO_UNICODE) |
790 | +#if !defined(ZORBA_NO_ICU) |
791 | { "~U_REGEX_TIME_OUT", "maximum allowed match time exceeded" }, |
792 | #endif |
793 | -#if !defined(ZORBA_NO_UNICODE) |
794 | +#if !defined(ZORBA_NO_ICU) |
795 | { "~U_REGEX_UNIMPLEMENTED", "use of regular expression feature that is not yet implemented" }, |
796 | #endif |
797 | { "~UnaryArithOp", "unary arithmetic operator" }, |
798 | +#if !defined(ZORBA_NO_ICU) |
799 | { "~UnbalancedChar_3", "missing '$3'" }, |
800 | +#endif |
801 | +#if !defined(ZORBA_NO_ICU) |
802 | + { "~UnescapedChar_3", "character '$3' must be escaped here" }, |
803 | +#endif |
804 | { "~UnexpectedElement", "unexpected element" }, |
805 | { "~VarValMustBeSingleItem_2", "\"$2\": variable value must be single item" }, |
806 | { "~Variable", "variable" }, |
807 | |
808 | === modified file 'src/precompiled/stdafx.h' |
809 | --- src/precompiled/stdafx.h 2012-03-28 05:19:57 +0000 |
810 | +++ src/precompiled/stdafx.h 2012-04-13 19:45:38 +0000 |
811 | @@ -15,363 +15,81 @@ |
812 | |
813 | */ |
814 | |
815 | -#if defined STDAFX |
816 | -#include <iostream> |
817 | -#include <stdexcept> |
818 | -#include <cassert> |
819 | -#include <cstring> |
820 | -#include <memory> |
821 | - |
822 | -#include <sstream> |
823 | -#include <xfwrap> |
824 | -#include <xfwrap1> |
825 | -#include <istream> |
826 | -#include <cstdio> |
827 | -#include <xxshared> |
828 | -#include <crtdefs.h> |
829 | -#include <map> |
830 | -#include <set> |
831 | -//#include <poppack.h> |
832 | -//#include <xxtype_traits> |
833 | -//#include <xxcallwrap> |
834 | - |
835 | -// #include <xxcallpmf> |
836 | -// //#include <xxbind0> |
837 | -// //#include <xxbind1> |
838 | -// //#include <xxresult> |
839 | -// #include <zorba/audit.h> |
840 | -// #include "api/auditimpl.h" |
841 | -// #include <zorba/audit.h> |
842 | - |
843 | - //#include "unicode/unistr.h" |
844 | - #include "runtime/sequences/sequences.h" |
845 | - #include "diagnostics/xquery_diagnostics.h" |
846 | - #include "xercesc/util/xercesdefs.hpp" |
847 | - #include "runtime/collections/collections.h" |
848 | - #include "unicode/utypes.h" |
849 | - #include "zorba/config.h" |
850 | - #include "store/api/store.h" |
851 | - #include "zorba/zorba.h" |
852 | - #include "zorba/api_shared_types.h" |
853 | - #include "compiler/parsetree/parsenodes.h" |
854 | - #include "compiler/parser/parse_constants.h" |
855 | - //#include "compiler/api/compilercb.h" |
856 | - #include "zorbautils/checked_vector.h" |
857 | - #include "compiler/parser/xquery_driver.h" |
858 | - #include "util/sorter.h" |
859 | - #include "compiler/xqueryx/xqueryx_to_xquery.h" |
860 | -// #include "compiler/xqueryx/xqueryx_xslt.h" |
861 | -//#include "compiler/parser/xquery_scanner.h" |
862 | -//#include "compiler/parsetree/parsenode_base.h" |
863 | -//#include "compiler/parsetree/parsenode_visitor.h" |
864 | -// #include "runtime/core/flwor_iterator.h" |
865 | -// #include "context/static_context.h" |
866 | -// #include "zorbautils/fatal.h" |
867 | -// #include "runtime/base/unarybase.h" |
868 | -// #include "compiler/expression/expr_consts.h" |
869 | -// #include "api/iterator_singleton.h" |
870 | -// #include "runtime/visitors/printer_visitor_api.h" |
871 | -// //#include "compiler/parsetree/parsenode_print_dot_visitor.h" |
872 | -// //#include "compiler/parsetree/parsenode_print_dot_visitor.h" |
873 | -// //#include "runtime/visitors/planiter_visitor_impl_code.h" |
874 | -// //#include "runtime/visitors/planiter_visitor_impl_include.h" |
875 | -// //#include "runtime/visitors/printer_visitor_impl.h" |
876 | -// //#include "runtime/core/path.h" |
877 | -// #include "compiler/expression/ft_expr.h" |
878 | -// #include "compiler/expression/ftnode.h" |
879 | -// #include "compiler/parser/query_loc.h" |
880 | +#ifdef STDAFX |
881 | + |
882 | + #include <fstream> |
883 | + #include <iostream> |
884 | + #include <stdexcept> |
885 | + #include <cassert> |
886 | + #include <cstring> |
887 | + #include <memory> |
888 | + |
889 | + #include <sstream> |
890 | + #include <xfwrap> |
891 | + #include <xfwrap1> |
892 | + #include <istream> |
893 | + #include <cstdio> |
894 | + #include <xxshared> |
895 | + #include <crtdefs.h> |
896 | + #include <map> |
897 | + #include <set> |
898 | + |
899 | + #include "runtime/sequences/sequences.h" |
900 | + #include "diagnostics/xquery_diagnostics.h" |
901 | + #include "xercesc/util/xercesdefs.hpp" |
902 | + #include "runtime/collections/collections.h" |
903 | + #include "unicode/utypes.h" |
904 | + #include "zorba/config.h" |
905 | + #include "store/api/store.h" |
906 | + #include "zorba/zorba.h" |
907 | + #include "zorba/api_shared_types.h" |
908 | + #include "compiler/parsetree/parsenodes.h" |
909 | + #include "compiler/parser/parse_constants.h" |
910 | + #include "zorbautils/checked_vector.h" |
911 | + #include "compiler/parser/xquery_driver.h" |
912 | + #include "util/sorter.h" |
913 | + #include "compiler/xqueryx/xqueryx_to_xquery.h" |
914 | + #include <zorba/store_manager.h> |
915 | + #include <zorba/xquery.h> |
916 | + #include <zorba/xquery_exception.h> |
917 | #include "util/cxx_util.h" |
918 | -// #include "util/indent.h" |
919 | -// #include "util/stl_util.h" |
920 | -// #include "diagnostics/xquery_diagnostics.h" |
921 | -// #include "zorbatypes/numconversions.h" |
922 | + #include "diagnostics/assert.h" |
923 | + #include "zorbatypes/mapm/m_apm_lc.h" |
924 | + #include "zorbatypes/datetime/parse.h" |
925 | + #include "zorbatypes/chartype.h" |
926 | + #include "zorbatypes/collation_manager.h" |
927 | + #include "zorbatypes/ft_token.h" |
928 | + #include "zorbatypes/m_apm.h" |
929 | + #include "zorbatypes/rclock.h" |
930 | + #include "zorbatypes/schema_types.h" |
931 | + #include "zorbatypes/timezone.h" |
932 | + #include "zorbatypes/transcoder.h" |
933 | + #include "zorbatypes/URI.h" |
934 | + #include "zorbatypes/xerces_xmlcharray.h" |
935 | + #include "zorbatypes/zorbatypes_decl.h" |
936 | + #include "zorbatypes/zstring.h" |
937 | + #include "zorbautils/condition.h" |
938 | + #include "zorbautils/hashfun.h" |
939 | + #include "zorbautils/hashmap.h" |
940 | + #include "zorbautils/hashmap_itemp.h" |
941 | + #include "zorbautils/hashmap_str_obj.h" |
942 | + #include "zorbautils/hashmap_zstring.h" |
943 | + #include "zorbautils/hashset.h" |
944 | + #include "zorbautils/hashset_itemh.h" |
945 | + #include "zorbautils/latch.h" |
946 | + #include "zorbautils/locale.h" |
947 | + #include "zorbautils/lock.h" |
948 | + #include "zorbautils/mutex.h" |
949 | + #include "zorbautils/runnable.h" |
950 | + #include "zorbautils/SAXParser.h" |
951 | + #include "zorbautils/stack.h" |
952 | + #include "zorbautils/string_util.h" |
953 | + #include "unit_tests/unit_test_list.h" |
954 | + #include "zorba/diagnostic_handler.h" |
955 | + #include "zorba/xquery_warning.h" |
956 | + #include "runtime/full_text/ftcontains_visitor.h" |
957 | + #include "store/api/ft_token_iterator.h" |
958 | + #include "store/naive/ft_token_store.h" |
959 | |
960 | -// #include "api/serialization/serializable.h" |
961 | -// #include "api/serialization/serializer.h" |
962 | -// #include "api/collectionimpl.h" |
963 | -// #include "api/dynamiccontextimpl.h" |
964 | -// #include "api/fileimpl.h" |
965 | -// #include "api/functionimpl.h" |
966 | -// #include "api/invoke_item_sequence.h" |
967 | -// #include "api/itemfactoryimpl.h" |
968 | -// #include "api/resultiteratorchainer.h" |
969 | -// #include "api/resultiteratorimpl.h" |
970 | -// #include "api/sax2impl.h" |
971 | -// #include "api/serializerimpl.h" |
972 | -// #include "api/staticcontextimpl.h" |
973 | -// #include "api/storeiteratorimpl.h" |
974 | -// #include "api/unmarshaller.h" |
975 | -// #include "api/uri_resolver_wrappers.h" |
976 | -// #include "api/vectoriterator.h" |
977 | -// #include "api/xmldatamanagerimpl.h" |
978 | -// //#include "api/xqueryimpl.h" |
979 | -// #include "api/zorbaimpl.h" |
980 | -// #include "capi/cdynamic_context.h" |
981 | -// #include "capi/cexpression.h" |
982 | -// #include "capi/cexternal_function.h" |
983 | -// #include "capi/cimplementation.h" |
984 | -// #include "capi/csequence.h" |
985 | -// #include "capi/cstatic_context.h" |
986 | -// #include "capi/error.h" |
987 | -// #include "capi/external_module.h" |
988 | -// #include "capi/single_item_sequence.h" |
989 | -// #include "capi/user_item_sequence.h" |
990 | -// #include "compiler/parser/flexlexer.h" |
991 | -// #include "compiler/parser/ft_types.h" |
992 | -// #include "compiler/parser/symbol_table.h" |
993 | -// #include "compiler/parser/xqdoc_comment.h" |
994 | -// #include "compiler/parsetree/parsenode_print_xml_visitor.h" |
995 | -// #include "compiler/parsetree/parsenode_print_xqdoc_visitor.h" |
996 | -// #include "compiler/parsetree/parsenode_print_xquery_visitor.h" |
997 | -// #include "compiler/parsetree/parsenode_xqdoc_visitor.h" |
998 | -// #include "compiler/translator/prolog_graph.h" |
999 | -// #include "compiler/translator/translator.h" |
1000 | -// #include "compiler/codegen/plan_visitor.h" |
1001 | -// #include "compiler/expression/abstract_expr_visitor.h" |
1002 | -// #include "compiler/expression/expr.h" |
1003 | -// #include "compiler/expression/expr_annotations.h" |
1004 | -// #include "compiler/expression/expr_base.h" |
1005 | -// #include "compiler/expression/expr_classes.h" |
1006 | -// #include "compiler/expression/expr_iter.h" |
1007 | -// #include "compiler/expression/expr_utils.h" |
1008 | -// #include "compiler/expression/expr_visitor.h" |
1009 | -// #include "compiler/expression/flwor_expr.h" |
1010 | -// //#include "compiler/expression/fo_expr.h" |
1011 | -// #include "compiler/expression/ftnode_classes.h" |
1012 | -// #include "compiler/expression/ftnode_visitor.h" |
1013 | -// #include "compiler/expression/function_item_expr.h" |
1014 | -// #include "compiler/expression/path_expr.h" |
1015 | -// #include "compiler/expression/script_exprs.h" |
1016 | -// #include "compiler/expression/update_exprs.h" |
1017 | -// #include "compiler/expression/var_expr.h" |
1018 | -// #include "compiler/rewriter/framework/rewriter.h" |
1019 | -// #include "compiler/rewriter/framework/rewriter_context.h" |
1020 | -// #include "compiler/rewriter/framework/rule_driver.h" |
1021 | -// #include "compiler/rewriter/framework/sequential_rewriter.h" |
1022 | -// #include "compiler/rewriter/rewriters/common_rewriter.h" |
1023 | -// #include "compiler/rewriter/rewriters/default_optimizer.h" |
1024 | -// #include "compiler/rewriter/rewriters/phase1_rewriter.h" |
1025 | -// #include "compiler/rewriter/rules/ruleset.h" |
1026 | -// #include "compiler/rewriter/rules/rule_base.h" |
1027 | -// #include "compiler/rewriter/rules/type_rules.h" |
1028 | -// #include "compiler/rewriter/tools/dataflow_annotations.h" |
1029 | -// #include "compiler/rewriter/tools/expr_tools.h" |
1030 | -// #include "compiler/rewriter/tools/udf_graph.h" |
1031 | -// #include "compiler/xqddf/collection_decl.h" |
1032 | -// #include "compiler/xqddf/value_ic.h" |
1033 | -// #include "compiler/xqddf/value_index.h" |
1034 | -// #include "compiler/semantic_annotations/annotations.h" |
1035 | -// #include "compiler/semantic_annotations/annotation_holder.h" |
1036 | -// #include "compiler/semantic_annotations/annotation_keys.h" |
1037 | -// #include "compiler/api/compiler_api.h" |
1038 | -// #include "compiler/api/compiler_api_impl.h" |
1039 | -// #include "system/globalenv.h" |
1040 | -// #include "system/properties.h" |
1041 | -// #include "system/zorba_properties.h" |
1042 | -// #include "context/decimal_format.h" |
1043 | -// #include "context/default_uri_mappers.h" |
1044 | -// #include "context/default_url_resolvers.h" |
1045 | -// #include "context/dynamic_context.h" |
1046 | -// #include "context/dynamic_loader.h" |
1047 | -// #include "context/internal_uri_resolvers.h" |
1048 | -// //#include "context/namespace_context.h" |
1049 | -// #include "context/root_static_context.h" |
1050 | -// #include "context/sctx_map_iterator.h" |
1051 | -// #include "context/standard_uri_resolvers.h" |
1052 | -// #include "context/static_context_consts.h" |
1053 | -// #include "context/stemmer_wrappers.h" |
1054 | -// #include "context/uri_resolver.h" |
1055 | -// #include "context/uri_resolver_wrapper.h" |
1056 | -#include "diagnostics/assert.h" |
1057 | -// #include "diagnostics/diagnostic.h" |
1058 | -// #include "diagnostics/dict.h" |
1059 | -// #include "diagnostics/dict_impl.h" |
1060 | -// #include "diagnostics/StackWalker.h" |
1061 | -// #include "diagnostics/user_error.h" |
1062 | -// #include "diagnostics/user_exception.h" |
1063 | -// #include "diagnostics/xquery_exception.h" |
1064 | -// #include "diagnostics/xquery_stack_trace.h" |
1065 | -// #include "diagnostics/xquery_warning.h" |
1066 | -// #include "diagnostics/zorba_exception.h" |
1067 | -// //#include "functions/annotation.h" |
1068 | -// #include "functions/external_function.h" |
1069 | -// #include "functions/function.h" |
1070 | -// #include "functions/function_consts.h" |
1071 | -// #include "functions/function_impl.h" |
1072 | -// #include "functions/func_accessors_impl.h" |
1073 | -// #include "functions/func_apply.h" |
1074 | -// #include "functions/func_arithmetic.h" |
1075 | -// #include "functions/func_booleans_impl.h" |
1076 | -// #include "functions/func_durations_dates_times_impl.h" |
1077 | -// #include "functions/func_enclosed.h" |
1078 | -// #include "functions/func_eval.h" |
1079 | -// #include "functions/func_hoist.h" |
1080 | -// #include "functions/func_index_ddl.h" |
1081 | -// #include "functions/func_node_sort_distinct.h" |
1082 | -// #include "functions/func_numerics_impl.h" |
1083 | -// #include "functions/func_reflection.h" |
1084 | -// #include "functions/func_sequences_impl.h" |
1085 | -// #include "functions/func_var_decl.h" |
1086 | -// #include "functions/library.h" |
1087 | -// #include "functions/signature.h" |
1088 | -// #include "functions/udf.h" |
1089 | -// #include "runtime/full_text/thesauri/decode_base128.h" |
1090 | -// #include "runtime/full_text/thesauri/encoded_list.h" |
1091 | -// #include "runtime/full_text/thesauri/iso2788.h" |
1092 | -// #include "runtime/full_text/thesauri/wn_db_segment.h" |
1093 | -// #include "runtime/full_text/thesauri/wn_synset.h" |
1094 | -// #include "runtime/full_text/thesauri/wn_thesaurus.h" |
1095 | -// #include "runtime/full_text/thesauri/wn_types.h" |
1096 | -// #include "runtime/full_text/thesauri/xqftts_relationship.h" |
1097 | -// #include "runtime/full_text/thesauri/xqftts_thesaurus.h" |
1098 | -// #include "runtime/full_text/ft_match.h" |
1099 | -// #include "runtime/full_text/ft_query_item.h" |
1100 | -// #include "runtime/full_text/ft_single_token_iterator.h" |
1101 | -// #include "runtime/full_text/ft_stop_words_set.h" |
1102 | -// #include "runtime/full_text/ft_thesaurus.h" |
1103 | -// #include "runtime/full_text/ft_token_matcher.h" |
1104 | -// #include "runtime/full_text/ft_token_seq_iterator.h" |
1105 | -// #include "runtime/full_text/ft_token_span.h" |
1106 | -// #include "runtime/full_text/ft_wildcard.h" |
1107 | -// #include "runtime/full_text/full_text.h" |
1108 | -// #include "runtime/full_text/apply.h" |
1109 | -// #include "runtime/full_text/ft_util.h" |
1110 | -// #include "runtime/collections/collections_base.h" |
1111 | -// #include "runtime/core/apply_updates.h" |
1112 | -// #include "runtime/core/arithmetic_impl.h" |
1113 | -// #include "runtime/core/constructors.h" |
1114 | -// #include "runtime/core/fncall_iterator.h" |
1115 | -// #include "runtime/core/internal_operators.h" |
1116 | -// #include "runtime/core/item_iterator.h" |
1117 | -// #include "runtime/core/nodeid_iterators.h" |
1118 | -// #include "runtime/core/path_iterators.h" |
1119 | -// #include "runtime/core/sequencetypes.h" |
1120 | -// #include "runtime/core/trycatch.h" |
1121 | -// #include "runtime/core/var_iterators.h" |
1122 | -// #include "runtime/numerics/NumericsImpl.h" |
1123 | -// #include "runtime/booleans/BooleanImpl.h" |
1124 | -// #include "runtime/base/binarybase.h" |
1125 | -// #include "runtime/base/narybase.h" |
1126 | -// #include "runtime/base/noarybase.h" |
1127 | -// #include "runtime/base/plan_iterator.h" |
1128 | -// #include "runtime/sequences/SequencesImpl.h" |
1129 | -// #include "runtime/visitors/iterprinter.h" |
1130 | -// #include "runtime/misc/materialize.h" |
1131 | -// #include "runtime/scripting/scripting.h" |
1132 | -// #include "types/schema/EventSchemaValidator.h" |
1133 | -// #include "types/schema/LoadSchemaErrorHandler.h" |
1134 | -// #include "types/schema/PrintSchema.h" |
1135 | -// #include "types/schema/revalidateUtils.h" |
1136 | -// #include "types/schema/schema.h" |
1137 | -// #include "types/schema/SchemaValidatorFilter.h" |
1138 | -// #include "types/schema/StrX.h" |
1139 | -// #include "types/schema/validate.h" |
1140 | -// #include "types/schema/ValidationEventHandler.h" |
1141 | -// #include "types/schema/xercesIncludes.h" |
1142 | -// #include "types/schema/XercesParseUtils.h" |
1143 | -// #include "types/schema/XercSchemaValidator.h" |
1144 | -// #include "types/casting.h" |
1145 | -// #include "types/collation.h" |
1146 | -// #include "types/node_test.h" |
1147 | -// #include "types/root_typemanager.h" |
1148 | -// #include "types/typeconstants.h" |
1149 | -// #include "types/typeimpl.h" |
1150 | -// #include "types/typemanager.h" |
1151 | -// #include "types/typemanagerimpl.h" |
1152 | -// #include "types/typeops.h" |
1153 | -// #include "util/fx/fxarray.h" |
1154 | -// #include "util/fx/fxcharheap.h" |
1155 | -// #include "util/ascii_util.h" |
1156 | -// #include "util/atomic_int.h" |
1157 | -// #include "util/auto_vector.h" |
1158 | -// #include "util/curl_util.h" |
1159 | -// #include "util/dir.h" |
1160 | -// #include "util/dynamic_bitset.h" |
1161 | -// #include "util/empty.h" |
1162 | -// #include "util/error_util.h" |
1163 | -// #include "util/fs_util.h" |
1164 | -// #include "util/hashmap.h" |
1165 | -// //#include "util/hashmap32.h" |
1166 | -// #include "util/less.h" |
1167 | -// #include "util/mmap_file.h" |
1168 | -// #include "util/nonatomic_int.h" |
1169 | -// #include "util/omanip.h" |
1170 | -// #include "util/oseparator.h" |
1171 | -// #include "util/regex.h" |
1172 | -// #include "util/singleton.h" |
1173 | -// #include "util/string_util.h" |
1174 | -// #include "util/threads.h" |
1175 | -// #include "util/tokenbuf.h" |
1176 | -// #include "util/tracer.h" |
1177 | -// #include "util/triple.h" |
1178 | -// #include "util/unicode_categories.h" |
1179 | -// #include "util/unicode_util.h" |
1180 | -// #include "util/uri_util.h" |
1181 | -// #include "util/utf8_string.h" |
1182 | -// #include "util/utf8_util.h" |
1183 | -// #include "util/utf8_util_base.h" |
1184 | -// #include "util/void_int.h" |
1185 | -// #include "util/xml_util.h" |
1186 | -// #include "zorbamisc/config/platform.h" |
1187 | -// //#include "zorbaserialization/archiver.h" |
1188 | -// #include "zorbaserialization/base64impl.h" |
1189 | -// #include "zorbaserialization/bin_archiver.h" |
1190 | -// //#include "zorbaserialization/class_serializer.h" |
1191 | -// #include "zorbaserialization/mem_archiver.h" |
1192 | -// #include "zorbaserialization/serialization_engine.h" |
1193 | -// #include "zorbaserialization/template_serializer.h" |
1194 | -// #include "zorbaserialization/xml_archiver.h" |
1195 | -// #include "zorbaserialization/zorba_class_serializer.h" |
1196 | - #include "zorbatypes/mapm/m_apm_lc.h" |
1197 | - #include "zorbatypes/datetime/parse.h" |
1198 | - //#include "zorbatypes/binary.h" |
1199 | - #include "zorbatypes/chartype.h" |
1200 | - #include "zorbatypes/collation_manager.h" |
1201 | - //#include "zorbatypes/datetime.h" |
1202 | - //#include "zorbatypes/decimal.h" |
1203 | - //#include "zorbatypes/duration.h" |
1204 | - //#include "zorbatypes/floatimpl.h" |
1205 | - #include "zorbatypes/ft_token.h" |
1206 | - //#include "zorbatypes/integer.h" |
1207 | - #include "zorbatypes/libicu.h" |
1208 | - #include "zorbatypes/m_apm.h" |
1209 | - //#include "zorbatypes/rchandle.h" |
1210 | - #include "zorbatypes/rclock.h" |
1211 | - //#include "zorbatypes/regex_ascii.h" |
1212 | - #include "zorbatypes/schema_types.h" |
1213 | - #include "zorbatypes/timezone.h" |
1214 | - #include "zorbatypes/transcoder.h" |
1215 | - #include "zorbatypes/URI.h" |
1216 | - #include "zorbatypes/xerces_xmlcharray.h" |
1217 | - #include "zorbatypes/zorbatypes_decl.h" |
1218 | - #include "zorbatypes/zstring.h" |
1219 | - //#include "zorbautils/stemmer/sb_stemmer.h" |
1220 | - #include "zorbautils/condition.h" |
1221 | - #include "zorbautils/hashfun.h" |
1222 | - #include "zorbautils/hashmap.h" |
1223 | - #include "zorbautils/hashmap_itemp.h" |
1224 | - #include "zorbautils/hashmap_str_obj.h" |
1225 | - #include "zorbautils/hashmap_zstring.h" |
1226 | - #include "zorbautils/hashset.h" |
1227 | - #include "zorbautils/hashset_itemh.h" |
1228 | - //#include "zorbautils/icu_tokenizer.h" |
1229 | - #include "zorbautils/latch.h" |
1230 | - #include "zorbautils/locale.h" |
1231 | - #include "zorbautils/lock.h" |
1232 | - #include "zorbautils/mutex.h" |
1233 | - #include "zorbautils/runnable.h" |
1234 | - #include "zorbautils/SAXParser.h" |
1235 | - #include "zorbautils/stack.h" |
1236 | -// #include "zorbautils/stemmer.h" |
1237 | - #include "zorbautils/string_util.h" |
1238 | - //#include "zorbautils/synchronous_logger.h" |
1239 | - //#include "zorbautils/tokenizer.h" |
1240 | - #include "unit_tests/unit_test_list.h" |
1241 | - #include "zorba/diagnostic_handler.h" |
1242 | - #include "zorba/xquery_warning.h" |
1243 | - #include "runtime/full_text/ftcontains_visitor.h" |
1244 | - #include "store/naive/naive_ft_token_iterator.h" |
1245 | - #include "store/api/ft_token_iterator.h" |
1246 | - #include "store/naive/ft_token_store.h" |
1247 | #endif |
1248 | /* vim:set et sw=2 ts=2: */ |
1249 | |
1250 | === modified file 'src/runtime/full_text/CMakeLists.txt' |
1251 | --- src/runtime/full_text/CMakeLists.txt 2012-03-28 05:19:57 +0000 |
1252 | +++ src/runtime/full_text/CMakeLists.txt 2012-04-13 19:45:38 +0000 |
1253 | @@ -42,11 +42,11 @@ |
1254 | default_tokenizer.cpp |
1255 | ) |
1256 | |
1257 | -IF (ZORBA_NO_UNICODE) |
1258 | +IF (ZORBA_NO_ICU) |
1259 | LIST(APPEND FULLTEXT_SRCS latin_tokenizer.cpp) |
1260 | -ELSE (ZORBA_NO_UNICODE) |
1261 | +ELSE (ZORBA_NO_ICU) |
1262 | LIST(APPEND FULLTEXT_SRCS icu_tokenizer.cpp) |
1263 | -ENDIF (ZORBA_NO_UNICODE) |
1264 | +ENDIF (ZORBA_NO_ICU) |
1265 | |
1266 | ADD_SRC_SUBFOLDER(FULLTEXT_SRCS stemmer LIBSTEMMER_SRCS) |
1267 | |
1268 | |
1269 | === modified file 'src/runtime/full_text/default_tokenizer.cpp' |
1270 | --- src/runtime/full_text/default_tokenizer.cpp 2012-03-28 05:19:57 +0000 |
1271 | +++ src/runtime/full_text/default_tokenizer.cpp 2012-04-13 19:45:38 +0000 |
1272 | @@ -19,22 +19,22 @@ |
1273 | #include <zorba/config.h> |
1274 | |
1275 | #include "default_tokenizer.h" |
1276 | -#ifdef ZORBA_NO_UNICODE |
1277 | +#ifdef ZORBA_NO_ICU |
1278 | # include "latin_tokenizer.h" |
1279 | #else |
1280 | # include "icu_tokenizer.h" |
1281 | -#endif /* ZORBA_NO_UNICODE */ |
1282 | +#endif /* ZORBA_NO_ICU */ |
1283 | |
1284 | namespace zorba { |
1285 | |
1286 | /////////////////////////////////////////////////////////////////////////////// |
1287 | |
1288 | TokenizerProvider const& default_tokenizer_provider() { |
1289 | -#ifdef ZORBA_NO_UNICODE |
1290 | +#ifdef ZORBA_NO_ICU |
1291 | static LatinTokenizerProvider const instance; |
1292 | #else |
1293 | static ICU_TokenizerProvider const instance; |
1294 | -#endif /* ZORBA_NO_UNICODE */ |
1295 | +#endif /* ZORBA_NO_ICU */ |
1296 | return instance; |
1297 | }; |
1298 | |
1299 | |
1300 | === modified file 'src/runtime/full_text/latin_tokenizer.cpp' |
1301 | --- src/runtime/full_text/latin_tokenizer.cpp 2012-03-28 05:19:57 +0000 |
1302 | +++ src/runtime/full_text/latin_tokenizer.cpp 2012-04-13 19:45:38 +0000 |
1303 | @@ -18,8 +18,9 @@ |
1304 | #include <functional> |
1305 | |
1306 | #include <zorba/diagnostic_list.h> |
1307 | -#include <zorba/xquery_exception.h> |
1308 | -#include <zorba/zorba.h> |
1309 | + |
1310 | +#include "diagnostics/dict.h" |
1311 | +#include "diagnostics/xquery_exception.h" |
1312 | |
1313 | #include "latin_tokenizer.h" |
1314 | |
1315 | |
1316 | === modified file 'src/runtime/full_text/latin_tokenizer.h' |
1317 | --- src/runtime/full_text/latin_tokenizer.h 2012-03-28 05:19:57 +0000 |
1318 | +++ src/runtime/full_text/latin_tokenizer.h 2012-04-13 19:45:38 +0000 |
1319 | @@ -14,12 +14,12 @@ |
1320 | * limitations under the License. |
1321 | */ |
1322 | |
1323 | -#ifndef ZORBA_WESTERN_TOKENIZER_H |
1324 | -#define ZORBA_WESTERN_TOKENIZER_H |
1325 | +#ifndef ZORBA_LATIN_TOKENIZER_H |
1326 | +#define ZORBA_LATIN_TOKENIZER_H |
1327 | |
1328 | #include <zorba/config.h> |
1329 | |
1330 | -#ifdef ZORBA_NO_FULL_TEXT |
1331 | +#ifdef ZORBA_NO_ICU |
1332 | |
1333 | #include <zorba/tokenizer.h> |
1334 | #include "zorbatypes/zstring.h" |
1335 | @@ -38,8 +38,8 @@ |
1336 | |
1337 | // inherited |
1338 | void destroy() const; |
1339 | - void tokenize( char const*, size_type, iso639_1::type, bool, Callback&, |
1340 | - void* ); |
1341 | + void tokenize( char const*, size_type, locale::iso639_1::type, bool, |
1342 | + Callback&, void* ); |
1343 | |
1344 | private: |
1345 | typedef zstring string_type; |
1346 | @@ -64,13 +64,14 @@ |
1347 | class LatinTokenizerProvider : public TokenizerProvider { |
1348 | public: |
1349 | // inherited |
1350 | - Tokenizer::ptr getTokenizer( iso639_1::type, Tokenizer::Numbers& ) const; |
1351 | + Tokenizer::ptr getTokenizer( locale::iso639_1::type, |
1352 | + Tokenizer::Numbers& ) const; |
1353 | }; |
1354 | |
1355 | /////////////////////////////////////////////////////////////////////////////// |
1356 | |
1357 | } // namespace zorba |
1358 | |
1359 | -#endif /* ZORBA_NO_FULL_TEXT */ |
1360 | -#endif /* ZORBA_WESTERN_TOKENIZER_H */ |
1361 | +#endif /* ZORBA_NO_ICU */ |
1362 | +#endif /* ZORBA_LATIN_TOKENIZER_H */ |
1363 | /* vim:set et sw=2 ts=2: */ |
1364 | |
1365 | === modified file 'src/runtime/numerics/format_integer_impl.cpp' |
1366 | --- src/runtime/numerics/format_integer_impl.cpp 2012-03-28 05:19:57 +0000 |
1367 | +++ src/runtime/numerics/format_integer_impl.cpp 2012-04-13 19:45:38 +0000 |
1368 | @@ -881,7 +881,7 @@ |
1369 | utf8_result += (*valueit); |
1370 | } |
1371 | else |
1372 | - utf8_result += (0x2080 + *valueit - '0'); |
1373 | + utf8_result += (unicode::code_point)(0x2080 + *valueit - '0'); |
1374 | } |
1375 | } |
1376 | else if((c0 == 0x2460) || //CIRCLED DIGIT ONE (1-20) |
1377 | |
1378 | === modified file 'src/runtime/numerics/numerics_impl.cpp' |
1379 | --- src/runtime/numerics/numerics_impl.cpp 2012-03-28 05:19:57 +0000 |
1380 | +++ src/runtime/numerics/numerics_impl.cpp 2012-04-13 19:45:38 +0000 |
1381 | @@ -462,7 +462,7 @@ |
1382 | minus( "-" ) |
1383 | { |
1384 | utf8_string<zstring> u_per_mille( per_mille ); |
1385 | - u_per_mille = 0x2030; |
1386 | + u_per_mille = (unicode::code_point)0x2030; |
1387 | } |
1388 | |
1389 | void readFormat(const DecimalFormat_t& df_t) |
1390 | |
1391 | === modified file 'src/runtime/strings/strings_impl.cpp' |
1392 | --- src/runtime/strings/strings_impl.cpp 2012-03-28 05:19:57 +0000 |
1393 | +++ src/runtime/strings/strings_impl.cpp 2012-04-13 19:45:38 +0000 |
1394 | @@ -810,7 +810,9 @@ |
1395 | zstring normForm; |
1396 | zstring resStr; |
1397 | unicode::normalization::type normType; |
1398 | +#ifndef ZORBA_NO_ICU |
1399 | bool success; |
1400 | +#endif /* ZORBA_NO_ICU */ |
1401 | |
1402 | PlanIteratorState* state; |
1403 | DEFAULT_STACK_INIT(PlanIteratorState, state, planState); |
1404 | @@ -860,10 +862,10 @@ |
1405 | } |
1406 | |
1407 | item0->getStringValue2(resStr); |
1408 | -#ifndef ZORBA_NO_UNICODE |
1409 | +#ifndef ZORBA_NO_ICU |
1410 | success = utf8::normalize(resStr, normType, &resStr); |
1411 | ZORBA_ASSERT(success); |
1412 | -#endif//#ifndef ZORBA_NO_UNICODE |
1413 | +#endif//#ifndef ZORBA_NO_ICU |
1414 | STACK_PUSH(GENV_ITEMFACTORY->createString(result, resStr), state ); |
1415 | } |
1416 | else |
1417 | @@ -992,7 +994,7 @@ |
1418 | trans_map[ *map_i ] = *trans_i; |
1419 | |
1420 | for ( ; map_i != map_end; ++map_i ) |
1421 | - trans_map[ *map_i ] = ~0; |
1422 | + trans_map[ *map_i ] = static_cast<unicode::code_point>( ~0 ); |
1423 | } |
1424 | |
1425 | utf8_string<zstring> u_result_string( result_string ); |
1426 | @@ -1007,7 +1009,7 @@ |
1427 | cp_map_type::const_iterator const found_i = trans_map.find( cp ); |
1428 | if ( found_i != trans_map.end() ) { |
1429 | cp = found_i->second; |
1430 | - if ( cp == ~0 ) |
1431 | + if ( cp == static_cast<unicode::code_point>( ~0 ) ) |
1432 | continue; |
1433 | } |
1434 | u_result_string += cp; |
1435 | @@ -1795,16 +1797,33 @@ |
1436 | int &utf8start, |
1437 | unsigned int &bytestart, |
1438 | int utf8end, |
1439 | + unsigned int byteend, |
1440 | zstring &out) |
1441 | { |
1442 | +#ifndef ZORBA_NO_ICU |
1443 | utf8::size_type clen; |
1444 | - while(utf8start < utf8end) |
1445 | - { |
1446 | - clen = utf8::char_length(*sin); |
1447 | - out.append(sin, clen); |
1448 | - utf8start++; |
1449 | - bytestart += clen; |
1450 | - sin += clen; |
1451 | + if(utf8end) |
1452 | + { |
1453 | + while(utf8start < utf8end) |
1454 | + { |
1455 | + clen = utf8::char_length(*sin); |
1456 | + if(clen == 0) |
1457 | + clen = 1; |
1458 | + out.append(sin, clen); |
1459 | + utf8start++; |
1460 | + bytestart += clen; |
1461 | + sin += clen; |
1462 | + } |
1463 | + } |
1464 | + else |
1465 | +#endif |
1466 | + { |
1467 | + if(!utf8end) |
1468 | + utf8end = byteend; |
1469 | + out.append(sin, utf8end-bytestart); |
1470 | + sin += utf8end-bytestart; |
1471 | + utf8start = utf8end; |
1472 | + bytestart = utf8end; |
1473 | } |
1474 | } |
1475 | |
1476 | @@ -1812,6 +1831,7 @@ |
1477 | int &match_end1, |
1478 | unsigned int &match_end1_bytes, |
1479 | int match_start2, |
1480 | + unsigned int match_start2_bytes, |
1481 | const char *&strin) |
1482 | { |
1483 | store::Item_t non_match_elem; |
1484 | @@ -1833,7 +1853,7 @@ |
1485 | // utf8_it++; |
1486 | // match_end1++; |
1487 | //} |
1488 | - copyUtf8Chars(strin, match_end1, match_end1_bytes, match_start2, non_match_str); |
1489 | + copyUtf8Chars(strin, match_end1, match_end1_bytes, match_start2, match_start2_bytes, non_match_str); |
1490 | store::Item_t non_match_text_item; |
1491 | GENV_ITEMFACTORY->createTextNode(non_match_text_item, non_match_elem, non_match_str); |
1492 | } |
1493 | @@ -1864,19 +1884,31 @@ |
1494 | i--; |
1495 | break; |
1496 | } |
1497 | +#ifndef ZORBA_NO_ICU |
1498 | match_startg = rx.get_match_start(i+1); |
1499 | if((match_startg < 0) && (gparent < 0)) |
1500 | continue; |
1501 | +#else |
1502 | + int temp_endg; |
1503 | + match_startg = -1; |
1504 | + temp_endg = -1; |
1505 | + if(!rx.get_match_start_end_bytes(i+1, &match_startg, &temp_endg) && (gparent < 0)) |
1506 | + continue; |
1507 | +#endif |
1508 | if(match_endgood < match_startg) |
1509 | { |
1510 | //add non-group match text |
1511 | zstring non_group_str; |
1512 | |
1513 | - copyUtf8Chars(sin, match_endgood, match_end1_bytes, match_startg, non_group_str); |
1514 | + copyUtf8Chars(sin, match_endgood, match_end1_bytes, match_startg, 0, non_group_str); |
1515 | store::Item_t non_group_text_item; |
1516 | GENV_ITEMFACTORY->createTextNode(non_group_text_item, parent.getp(), non_group_str); |
1517 | } |
1518 | +#ifndef ZORBA_NO_ICU |
1519 | match_endg = rx.get_match_end(i+1); |
1520 | +#else |
1521 | + match_endg = temp_endg; |
1522 | +#endif |
1523 | //add group match text |
1524 | GENV_ITEMFACTORY->createQName(group_element_name, |
1525 | static_context::W3C_FN_NS, "fn", "group"); |
1526 | @@ -1907,7 +1939,7 @@ |
1527 | } |
1528 | zstring group_str; |
1529 | |
1530 | - copyUtf8Chars(sin, match_startg, match_end1_bytes, match_endg, group_str); |
1531 | + copyUtf8Chars(sin, match_startg, match_end1_bytes, match_endg, 0, group_str); |
1532 | store::Item_t group_text_item; |
1533 | GENV_ITEMFACTORY->createTextNode(group_text_item, group_elem.getp(), group_str); |
1534 | } |
1535 | @@ -1916,7 +1948,7 @@ |
1536 | { |
1537 | zstring non_group_str; |
1538 | |
1539 | - copyUtf8Chars(sin, match_endgood, match_end1_bytes, match_end2, non_group_str); |
1540 | + copyUtf8Chars(sin, match_endgood, match_end1_bytes, match_end2, 0, non_group_str); |
1541 | store::Item_t non_group_text_item; |
1542 | GENV_ITEMFACTORY->createTextNode(non_group_text_item, parent, non_group_str); |
1543 | } |
1544 | @@ -2144,8 +2176,14 @@ |
1545 | reachedEnd = false; |
1546 | while(rx.find_next_match(&reachedEnd)) |
1547 | { |
1548 | - int match_start2 = rx.get_match_start(); |
1549 | - int match_end2 = rx.get_match_end(); |
1550 | + int match_start2; |
1551 | + int match_end2; |
1552 | +#ifndef ZORBA_NO_ICU |
1553 | + match_start2 = rx.get_match_start(); |
1554 | + match_end2 = rx.get_match_end(); |
1555 | +#else |
1556 | + rx.get_match_start_end_bytes(0, &match_start2, &match_end2); |
1557 | +#endif |
1558 | ZORBA_ASSERT(match_start2 >= 0); |
1559 | |
1560 | if(is_input_stream && reachedEnd && !instream->eof()) |
1561 | @@ -2157,7 +2195,7 @@ |
1562 | //construct the fn:non-match |
1563 | if(match_start2 > match_end1) |
1564 | { |
1565 | - addNonMatchElement(result, match_end1, match_end1_bytes, match_start2, instr); |
1566 | + addNonMatchElement(result, match_end1, match_end1_bytes, match_start2, 0, instr); |
1567 | } |
1568 | |
1569 | //construct the fn:match |
1570 | @@ -2165,7 +2203,7 @@ |
1571 | match_end1 = match_end2; |
1572 | } |
1573 | |
1574 | - if(is_input_stream && reachedEnd && !instream->eof()) |
1575 | + if(is_input_stream && !instream->eof()) |
1576 | { |
1577 | //load some more data, maybe the match will be different |
1578 | if(match_end1_bytes) |
1579 | @@ -2213,7 +2251,7 @@ |
1580 | else |
1581 | { |
1582 | if(match_end1_bytes < streambuf_read) |
1583 | - addNonMatchElement(result, match_end1, match_end1_bytes, streambuf_read, instr); |
1584 | + addNonMatchElement(result, match_end1, match_end1_bytes, 0, streambuf_read, instr); |
1585 | if(is_input_stream && instream->eof()) |
1586 | reachedEnd = true; |
1587 | } |
1588 | |
1589 | === modified file 'src/store/api/store.h' |
1590 | --- src/store/api/store.h 2012-04-10 20:59:34 +0000 |
1591 | +++ src/store/api/store.h 2012-04-13 19:45:38 +0000 |
1592 | @@ -16,7 +16,7 @@ |
1593 | #ifndef ZORBA_STORE_STORE_H |
1594 | #define ZORBA_STORE_STORE_H |
1595 | |
1596 | -#include <zorba/config.h> |
1597 | +#include "zorba/config.h" |
1598 | #include "zorbatypes/schema_types.h" |
1599 | |
1600 | #include "store/api/shared_types.h" |
1601 | |
1602 | === modified file 'src/store/naive/simple_store.h' |
1603 | --- src/store/naive/simple_store.h 2012-03-28 23:58:23 +0000 |
1604 | +++ src/store/naive/simple_store.h 2012-04-13 19:45:38 +0000 |
1605 | @@ -16,7 +16,11 @@ |
1606 | #ifndef ZORBA_SIMPLE_STORE |
1607 | #define ZORBA_SIMPLE_STORE |
1608 | |
1609 | -#include "store.h" |
1610 | +#include "store/naive/store.h" |
1611 | + |
1612 | +#include "store/naive/node_factory.h" |
1613 | +#include "store/naive/pul_primitive_factory.h" |
1614 | +#include "store/naive/tree_id_generator.h" |
1615 | |
1616 | namespace zorba { |
1617 | namespace simplestore { |
1618 | @@ -72,7 +76,7 @@ |
1619 | |
1620 | NodeFactory* createNodeFactory() const; |
1621 | |
1622 | - void destroyNodeFactory(NodeFactory*) const; |
1623 | + void destroyNodeFactory(zorba::simplestore::NodeFactory*) const; |
1624 | |
1625 | store::ItemFactory* createItemFactory() const; |
1626 | |
1627 | @@ -84,7 +88,7 @@ |
1628 | |
1629 | PULPrimitiveFactory* createPULFactory() const; |
1630 | |
1631 | - void destroyPULFactory(PULPrimitiveFactory*) const; |
1632 | + void destroyPULFactory(zorba::simplestore::PULPrimitiveFactory*) const; |
1633 | |
1634 | CollectionSet* createCollectionSet() const; |
1635 | |
1636 | |
1637 | === modified file 'src/store/naive/store.h' |
1638 | --- src/store/naive/store.h 2012-03-28 22:09:36 +0000 |
1639 | +++ src/store/naive/store.h 2012-04-13 19:45:38 +0000 |
1640 | @@ -16,10 +16,18 @@ |
1641 | #ifndef ZORBA_SIMPLESTORE_STORE_H |
1642 | #define ZORBA_SIMPLESTORE_STORE_H |
1643 | |
1644 | +#include "store/api/store.h" |
1645 | + |
1646 | #include "shared_types.h" |
1647 | #include "store_defs.h" |
1648 | #include "hashmap_nodep.h" |
1649 | #include "tree_id.h" |
1650 | +#include "store/util/hashmap_stringbuf.h" |
1651 | +#include "zorbautils/mutex.h" |
1652 | +#include "zorbautils/lock.h" |
1653 | +#include "zorbautils/hashmap.h" |
1654 | +#include "zorbautils/hashmap_itemp.h" |
1655 | +#include "zorbautils/hashmap_zstring_nonserializable.h" |
1656 | |
1657 | #if (defined (WIN32) || defined (WINCE)) |
1658 | #include "node_items.h" |
1659 | @@ -28,14 +36,7 @@ |
1660 | #include "store/api/ic.h" |
1661 | #endif |
1662 | |
1663 | -#include "store/api/store.h" |
1664 | - |
1665 | -#include "store/util/hashmap_stringbuf.h" |
1666 | - |
1667 | -#include "zorbautils/mutex.h" |
1668 | -#include "zorbautils/lock.h" |
1669 | -#include "zorbautils/hashmap_itemp.h" |
1670 | -#include "zorbautils/hashmap_zstring_nonserializable.h" |
1671 | +using namespace zorba; |
1672 | |
1673 | namespace zorba |
1674 | { |
1675 | @@ -63,9 +64,9 @@ |
1676 | class TreeIdGeneratorFactory; |
1677 | class TreeIdGenerator; |
1678 | |
1679 | -typedef zorba::HashMapZString<XmlNode_t> DocumentSet; |
1680 | -typedef ItemPointerHashMap<store::Index_t> IndexSet; |
1681 | -typedef ItemPointerHashMap<store::IC_t> ICSet; |
1682 | +typedef HashMapZString<XmlNode_t> DocumentSet; |
1683 | +typedef zorba::ItemPointerHashMap<store::Index_t> IndexSet; |
1684 | +typedef zorba::ItemPointerHashMap<store::IC_t> ICSet; |
1685 | |
1686 | |
1687 | |
1688 | |
1689 | === modified file 'src/system/globalenv.cpp' |
1690 | --- src/system/globalenv.cpp 2012-04-12 09:21:02 +0000 |
1691 | +++ src/system/globalenv.cpp 2012-04-13 19:45:38 +0000 |
1692 | @@ -17,11 +17,11 @@ |
1693 | |
1694 | #include "common/common.h" |
1695 | |
1696 | -#ifndef ZORBA_NO_UNICODE |
1697 | +#ifndef ZORBA_NO_ICU |
1698 | # include <unicode/uclean.h> |
1699 | # include <unicode/utypes.h> |
1700 | # include <unicode/udata.h> |
1701 | -#endif /* ZORBA_NO_UNICODE */ |
1702 | +#endif /* ZORBA_NO_ICU */ |
1703 | |
1704 | #ifdef ZORBA_WITH_BIG_INTEGER |
1705 | # include "zorbatypes/m_apm.h" |
1706 | @@ -206,7 +206,7 @@ |
1707 | // from one thread only |
1708 | // see http://www.icu-project.org/userguide/design.html#Init_and_Termination |
1709 | // and http://www.icu-project.org/apiref/icu4c/uclean_8h.html |
1710 | -#ifndef ZORBA_NO_UNICODE |
1711 | +#ifndef ZORBA_NO_ICU |
1712 | # if defined U_STATIC_IMPLEMENTATION && (defined WIN32 || defined WINCE) |
1713 | { |
1714 | TCHAR self_path[1024]; |
1715 | @@ -236,13 +236,13 @@ |
1716 | udata_setCommonData(icu_appdata, &data_err); |
1717 | ZORBA_ASSERT(data_err == U_ZERO_ERROR); |
1718 | |
1719 | - // u_setDataDirectory(self_path); |
1720 | + // u_setDataDirectory(self_path); |
1721 | } |
1722 | # endif |
1723 | UErrorCode lICUInitStatus = U_ZERO_ERROR; |
1724 | u_init(&lICUInitStatus); |
1725 | ZORBA_ASSERT(lICUInitStatus == U_ZERO_ERROR); |
1726 | -#endif//ifndef ZORBA_NO_UNICODE |
1727 | +#endif /* ZORBA_NO_ICU */ |
1728 | } |
1729 | |
1730 | |
1731 | @@ -254,12 +254,12 @@ |
1732 | // releases statically initialized memory and prevents |
1733 | // valgrind from reporting those problems at the end |
1734 | // see http://www.icu-project.org/apiref/icu4c/uclean_8h.html#93f27d0ddc7c196a1da864763f2d8920 |
1735 | -#ifndef ZORBA_NO_UNICODE |
1736 | +#ifndef ZORBA_NO_ICU |
1737 | u_cleanup(); |
1738 | # if defined U_STATIC_IMPLEMENTATION && (defined WIN32 || defined WINCE) |
1739 | delete[] icu_appdata; |
1740 | # endif |
1741 | -#endif//ifndef ZORBA_NO_UNICODE |
1742 | +#endif /* ZORBA_NO_ICU */ |
1743 | } |
1744 | |
1745 | |
1746 | |
1747 | === modified file 'src/unit_tests/CMakeLists.txt' |
1748 | --- src/unit_tests/CMakeLists.txt 2012-03-28 05:19:57 +0000 |
1749 | +++ src/unit_tests/CMakeLists.txt 2012-04-13 19:45:38 +0000 |
1750 | @@ -29,9 +29,9 @@ |
1751 | tokenizer.cpp) |
1752 | ENDIF (NOT ZORBA_NO_FULL_TEXT) |
1753 | |
1754 | -IF (NOT ZORBA_NO_UNICODE) |
1755 | +IF (NOT ZORBA_NO_ICU) |
1756 | LIST (APPEND UNIT_TEST_SRCS |
1757 | test_icu_streambuf.cpp) |
1758 | -ENDIF (NOT ZORBA_NO_UNICODE) |
1759 | +ENDIF (NOT ZORBA_NO_ICU) |
1760 | |
1761 | # vim:set et sw=2 tw=2: |
1762 | |
1763 | === modified file 'src/unit_tests/string.cpp' |
1764 | --- src/unit_tests/string.cpp 2012-03-28 05:19:57 +0000 |
1765 | +++ src/unit_tests/string.cpp 2012-04-13 19:45:38 +0000 |
1766 | @@ -569,6 +569,7 @@ |
1767 | ASSERT_TRUE( t == s ); |
1768 | } |
1769 | |
1770 | +#ifndef ZORBA_NO_ICU |
1771 | template<class StringType> |
1772 | static void test_to_string_from_wchar_t() { |
1773 | wchar_t const w[] = L"hello"; |
1774 | @@ -578,6 +579,7 @@ |
1775 | for ( string::size_type i = 0; i < s.length(); ++i ) |
1776 | ASSERT_TRUE( s[i] == w[i] ); |
1777 | } |
1778 | +#endif /* ZORBA_NO_ICU */ |
1779 | |
1780 | template<class StringType> |
1781 | static void test_to_upper() { |
1782 | @@ -605,6 +607,7 @@ |
1783 | } |
1784 | } |
1785 | |
1786 | +#ifndef ZORBA_NO_ICU |
1787 | static void test_to_wchar_t() { |
1788 | string const s = "hello"; |
1789 | wchar_t *w; |
1790 | @@ -616,6 +619,7 @@ |
1791 | ASSERT_TRUE( w[i] == s[i] ); |
1792 | delete[] w; |
1793 | } |
1794 | +#endif /* ZORBA_NO_ICU */ |
1795 | |
1796 | static void test_trim_start() { |
1797 | char const *s; |
1798 | @@ -873,16 +877,20 @@ |
1799 | test_to_string_from_utf8<zstring>(); |
1800 | test_to_string_from_utf8<zstring_p>(); |
1801 | |
1802 | +#ifndef ZORBA_NO_ICU |
1803 | test_to_string_from_wchar_t<string>(); |
1804 | test_to_string_from_wchar_t<zstring>(); |
1805 | test_to_string_from_wchar_t<zstring_p>(); |
1806 | +#endif /* ZORBA_NO_ICU */ |
1807 | |
1808 | test_to_upper<string>(); |
1809 | test_to_upper<zstring>(); |
1810 | test_to_upper<zstring_p>(); |
1811 | test_to_upper<String>(); |
1812 | |
1813 | +#ifndef ZORBA_NO_ICU |
1814 | test_to_wchar_t(); |
1815 | +#endif /* ZORBA_NO_ICU */ |
1816 | |
1817 | test_trim_start(); |
1818 | test_trim_end(); |
1819 | |
1820 | === modified file 'src/unit_tests/unit_test_list.h' |
1821 | --- src/unit_tests/unit_test_list.h 2012-03-28 05:19:57 +0000 |
1822 | +++ src/unit_tests/unit_test_list.h 2012-04-13 19:45:38 +0000 |
1823 | @@ -36,9 +36,9 @@ |
1824 | /** |
1825 | * ADD NEW UNIT TESTS HERE |
1826 | */ |
1827 | -#ifndef ZORBA_NO_UNICODE |
1828 | +#ifndef ZORBA_NO_ICU |
1829 | int test_icu_streambuf( int, char*[] ); |
1830 | -#endif /* ZORBA_NO_UNICODE */ |
1831 | +#endif /* ZORBA_NO_ICU */ |
1832 | int json_parser( int, char*[] ); |
1833 | |
1834 | void initializeTestList(); |
1835 | |
1836 | === modified file 'src/unit_tests/unit_tests.cpp' |
1837 | --- src/unit_tests/unit_tests.cpp 2012-03-28 05:19:57 +0000 |
1838 | +++ src/unit_tests/unit_tests.cpp 2012-04-13 19:45:38 +0000 |
1839 | @@ -39,9 +39,9 @@ |
1840 | void initializeTestList() { |
1841 | libunittests["string"] = test_string; |
1842 | libunittests["uri"] = runUriTest; |
1843 | -#ifndef ZORBA_NO_UNICODE |
1844 | +#ifndef ZORBA_NO_ICU |
1845 | libunittests["icu_streambuf"] = test_icu_streambuf; |
1846 | -#endif /* ZORBA_NO_UNICODE */ |
1847 | +#endif /* ZORBA_NO_ICU */ |
1848 | libunittests["json_parser"] = json_parser; |
1849 | libunittests["unique_ptr"] = test_unique_ptr; |
1850 | #ifndef ZORBA_NO_FULL_TEXT |
1851 | |
1852 | === modified file 'src/util/CMakeLists.txt' |
1853 | --- src/util/CMakeLists.txt 2012-03-28 05:19:57 +0000 |
1854 | +++ src/util/CMakeLists.txt 2012-04-13 19:45:38 +0000 |
1855 | @@ -40,14 +40,14 @@ |
1856 | LIST(APPEND UTIL_SRCS mmap_file.cpp) |
1857 | ENDIF(ZORBA_WITH_FILE_ACCESS) |
1858 | |
1859 | -IF(ZORBA_NO_UNICODE) |
1860 | +IF(ZORBA_NO_ICU) |
1861 | LIST(APPEND UTIL_SRCS |
1862 | - regex_ascii.cpp |
1863 | + regex_xquery.cpp |
1864 | passthru_streambuf.cpp) |
1865 | -ELSE(ZORBA_NO_UNICODE) |
1866 | +ELSE(ZORBA_NO_ICU) |
1867 | LIST(APPEND UTIL_SRCS |
1868 | icu_streambuf.cpp) |
1869 | -ENDIF(ZORBA_NO_UNICODE) |
1870 | +ENDIF(ZORBA_NO_ICU) |
1871 | |
1872 | HEADER_GROUP_SUBFOLDER(UTIL_SRCS fx) |
1873 | HEADER_GROUP_SUBFOLDER(UTIL_SRCS win32) |
1874 | |
1875 | === modified file 'src/util/icu_streambuf.h' |
1876 | --- src/util/icu_streambuf.h 2012-02-04 01:26:18 +0000 |
1877 | +++ src/util/icu_streambuf.h 2012-04-13 19:45:38 +0000 |
1878 | @@ -17,6 +17,7 @@ |
1879 | #ifndef ZORBA_ICU_STREAMBUF_H |
1880 | #define ZORBA_ICU_STREAMBUF_H |
1881 | |
1882 | +#include <unicode/ucnv.h> |
1883 | #include <zorba/transcode_stream.h> |
1884 | |
1885 | #include "util/utf8_util.h" |
1886 | |
1887 | === modified file 'src/util/passthru_streambuf.cpp' |
1888 | --- src/util/passthru_streambuf.cpp 2012-02-04 01:26:18 +0000 |
1889 | +++ src/util/passthru_streambuf.cpp 2012-04-13 19:45:38 +0000 |
1890 | @@ -14,8 +14,8 @@ |
1891 | * limitations under the License. |
1892 | */ |
1893 | |
1894 | +#include "stdafx.h" |
1895 | #include "passthru_streambuf.h" |
1896 | - |
1897 | using namespace std; |
1898 | |
1899 | namespace zorba { |
1900 | @@ -47,7 +47,7 @@ |
1901 | } |
1902 | |
1903 | bool passthru_streambuf::is_supported( char const *cc_charset ) { |
1904 | - return !is_necessary( charset ); |
1905 | + return !is_necessary( cc_charset ); |
1906 | } |
1907 | |
1908 | passthru_streambuf::pos_type |
1909 | |
1910 | === modified file 'src/util/passthru_streambuf.h' |
1911 | --- src/util/passthru_streambuf.h 2012-02-02 18:37:24 +0000 |
1912 | +++ src/util/passthru_streambuf.h 2012-04-13 19:45:38 +0000 |
1913 | @@ -17,8 +17,9 @@ |
1914 | #ifndef ZORBA_PASSTHRU_STREAMBUF_H |
1915 | #define ZORBA_PASSTHRU_STREAMBUF_H |
1916 | |
1917 | -#include <zorba/transcode_streambuf.h> |
1918 | - |
1919 | +#include <zorba/transcode_stream.h> |
1920 | +#include "zorbatypes/zstring.h" |
1921 | +#include "util/ascii_util.h" |
1922 | namespace zorba { |
1923 | |
1924 | /////////////////////////////////////////////////////////////////////////////// |
1925 | @@ -48,6 +49,13 @@ |
1926 | * @return \c true only if the character encoding is supported. |
1927 | */ |
1928 | static bool is_supported( char const *charset ); |
1929 | + static bool is_necessary( char const *cc_charset ); |
1930 | + |
1931 | + typedef std::streambuf::char_type char_type; |
1932 | + typedef std::streambuf::int_type int_type; |
1933 | + typedef std::streambuf::off_type off_type; |
1934 | + typedef std::streambuf::pos_type pos_type; |
1935 | + typedef std::streambuf::traits_type traits_type; |
1936 | |
1937 | protected: |
1938 | void imbue( std::locale const& ); |
1939 | |
1940 | === modified file 'src/util/regex.cpp' |
1941 | --- src/util/regex.cpp 2012-03-28 05:19:57 +0000 |
1942 | +++ src/util/regex.cpp 2012-04-13 19:45:38 +0000 |
1943 | @@ -15,8 +15,6 @@ |
1944 | */ |
1945 | #include "stdafx.h" |
1946 | |
1947 | -#include "regex.h" |
1948 | - |
1949 | #include <cstring> |
1950 | #include <vector> |
1951 | |
1952 | @@ -28,13 +26,13 @@ |
1953 | |
1954 | #include "ascii_util.h" |
1955 | #include "cxx_util.h" |
1956 | +#include "regex.h" |
1957 | #include "stl_util.h" |
1958 | |
1959 | #define INVALID_RE_EXCEPTION(...) \ |
1960 | XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS( __VA_ARGS__ ) ) |
1961 | |
1962 | - |
1963 | -#ifndef ZORBA_NO_UNICODE |
1964 | +#ifndef ZORBA_NO_ICU |
1965 | # include <unicode/uversion.h> |
1966 | U_NAMESPACE_USE |
1967 | |
1968 | @@ -103,6 +101,7 @@ |
1969 | |
1970 | bool got_backslash = false; |
1971 | bool in_char_class = false; // within [...] |
1972 | + bool is_first_char = true; // to check ^ placement |
1973 | |
1974 | bool in_backref = false; // '\'[1-9][0-9]* |
1975 | unsigned backref_no = 0; // 1-based |
1976 | @@ -231,6 +230,8 @@ |
1977 | ++open_cap_subs; |
1978 | cap_sub.push_back( true ); |
1979 | cur_cap_sub = cap_sub.size(); |
1980 | + is_first_char = true; |
1981 | + goto append; |
1982 | } |
1983 | break; |
1984 | case ')': |
1985 | @@ -245,8 +246,10 @@ |
1986 | case '[': |
1987 | if ( q_flag ) |
1988 | *icu_re += '\\'; |
1989 | - else |
1990 | + else { |
1991 | in_char_class = true; |
1992 | + goto append; |
1993 | + } |
1994 | break; |
1995 | case ']': |
1996 | if ( q_flag ) |
1997 | @@ -254,6 +257,19 @@ |
1998 | else |
1999 | in_char_class = false; |
2000 | break; |
2001 | + case '^': |
2002 | + if ( q_flag ) |
2003 | + *icu_re += '\\'; |
2004 | + else if ( !is_first_char && !in_char_class ) |
2005 | + throw INVALID_RE_EXCEPTION( xq_re, ZED( UnescapedChar_3 ), *xq_c ); |
2006 | + break; |
2007 | + case '|': |
2008 | + if ( q_flag ) |
2009 | + *icu_re += '\\'; |
2010 | + else { |
2011 | + is_first_char = true; |
2012 | + goto append; |
2013 | + } |
2014 | default: |
2015 | if ( x_flag && ascii::is_space( *xq_c ) ) { |
2016 | if ( !in_char_class ) |
2017 | @@ -265,37 +281,42 @@ |
2018 | // |
2019 | *icu_re += '\\'; |
2020 | } |
2021 | - } |
2022 | - } |
2023 | + } // switch |
2024 | + } // else |
2025 | + is_first_char = false; |
2026 | +append: |
2027 | *icu_re += *xq_c; |
2028 | } // FOR_EACH |
2029 | |
2030 | - if ( i_flag ) { |
2031 | - // |
2032 | - // XQuery 3.0 F&O 5.6.1.1: All other constructs are unaffected by the "i" |
2033 | - // flag. For example, "\p{Lu}" continues to match upper-case letters only. |
2034 | - // |
2035 | - // However, ICU lower-cases everything for the 'i' flag; hence we have to |
2036 | - // turn off the 'i' flag for just the \p{Lu}. |
2037 | - // |
2038 | - // Note that the "6" and "12" below are correct since "\\" represents a |
2039 | - // single '\'. |
2040 | - // |
2041 | - ascii::replace_all( *icu_re, "\\p{Lu}", 6, "(?-i:\\p{Lu})", 12 ); |
2042 | - } |
2043 | + if ( !q_flag ) { |
2044 | + if ( i_flag ) { |
2045 | + // |
2046 | + // XQuery 3.0 F&O 5.6.1.1: All other constructs are unaffected by the "i" |
2047 | + // flag. For example, "\p{Lu}" continues to match upper-case letters |
2048 | + // only. |
2049 | + // |
2050 | + // However, ICU lower-cases everything for the 'i' flag; hence we have to |
2051 | + // turn off the 'i' flag for just the \p{Lu}. |
2052 | + // |
2053 | + // Note that the "6" and "12" below are correct since "\\" represents a |
2054 | + // single '\'. |
2055 | + // |
2056 | + ascii::replace_all( *icu_re, "\\p{Lu}", 6, "(?-i:\\p{Lu})", 12 ); |
2057 | + } |
2058 | |
2059 | - // |
2060 | - // XML Schema Part 2 F.1.1: [Unicode Database] groups code points into a |
2061 | - // number of blocks such as Basic Latin (i.e., ASCII), Latin-1 Supplement, |
2062 | - // Hangul Jamo, CJK Compatibility, etc. The set containing all characters |
2063 | - // that have block name X (with all white space stripped out), can be |
2064 | - // identified with a block escape \p{IsX}. |
2065 | - // |
2066 | - // However, ICU uses \p{InX} rather than \p{IsX}. |
2067 | - // |
2068 | - // Note that the "5" below is correct since "\\" represents a single '\'. |
2069 | - // |
2070 | - ascii::replace_all( *icu_re, "\\p{Is", 5, "\\p{In", 5 ); |
2071 | + // |
2072 | + // XML Schema Part 2 F.1.1: [Unicode Database] groups code points into a |
2073 | + // number of blocks such as Basic Latin (i.e., ASCII), Latin-1 Supplement, |
2074 | + // Hangul Jamo, CJK Compatibility, etc. The set containing all characters |
2075 | + // that have block name X (with all white space stripped out), can be |
2076 | + // identified with a block escape \p{IsX}. |
2077 | + // |
2078 | + // However, ICU uses \p{InX} rather than \p{IsX}. |
2079 | + // |
2080 | + // Note that the "5" below is correct since "\\" represents a single '\'. |
2081 | + // |
2082 | + ascii::replace_all( *icu_re, "\\p{Is", 5, "\\p{In", 5 ); |
2083 | + } // q_flag |
2084 | } |
2085 | |
2086 | /////////////////////////////////////////////////////////////////////////////// |
2087 | @@ -442,11 +463,11 @@ |
2088 | } |
2089 | |
2090 | } // namespace unicode |
2091 | - |
2092 | -}//namespace zorba |
2093 | - |
2094 | - |
2095 | -#else /* ZORBA_NO_UNICODE */ |
2096 | +} // namespace zorba |
2097 | + |
2098 | +/////////////////////////////////////////////////////////////////////////////// |
2099 | + |
2100 | +#else /* ZORBA_NO_ICU */ |
2101 | |
2102 | #include "zorbatypes/zstring.h" |
2103 | |
2104 | @@ -470,7 +491,7 @@ |
2105 | case 'i': flags |= REGEX_ASCII_CASE_INSENSITIVE; break; |
2106 | case 's': flags |= REGEX_ASCII_DOTALL; break; |
2107 | case 'm': flags |= REGEX_ASCII_MULTILINE; break; |
2108 | - case 'x': flags |= REGEX_ASCII_COMMENTS; break; |
2109 | + case 'x': flags |= REGEX_ASCII_NO_WHITESPACE; break; |
2110 | case 'q': flags |= REGEX_ASCII_LITERAL; break; |
2111 | default: |
2112 | throw XQUERY_EXCEPTION( err::FORX0001, ERROR_PARAMS( *p ) ); |
2113 | @@ -483,6 +504,7 @@ |
2114 | void regex::compile( char const *pattern, char const *flags) |
2115 | { |
2116 | parsed_flags = parse_regex_flags(flags); |
2117 | + regex_xquery::CRegexXQuery_parser regex_parser; |
2118 | regex_matcher = regex_parser.parse(pattern, parsed_flags); |
2119 | if(!regex_matcher) |
2120 | throw INVALID_RE_EXCEPTION(pattern); |
2121 | @@ -517,6 +539,8 @@ |
2122 | bool regex::next_token( char const *s, size_type *pos, zstring *token, |
2123 | bool *matched) |
2124 | { |
2125 | + if(!s[*pos]) |
2126 | + return false; |
2127 | bool retval; |
2128 | int match_pos; |
2129 | int matched_len; |
2130 | @@ -528,14 +552,8 @@ |
2131 | token->assign(s+*pos, match_pos); |
2132 | *pos += match_pos + matched_len; |
2133 | if(matched) |
2134 | - if(match_pos) |
2135 | - *matched = true; |
2136 | - else |
2137 | - *matched = false; |
2138 | - if(match_pos) |
2139 | - return true; |
2140 | - else |
2141 | - return false; |
2142 | + *matched = true; |
2143 | + return true; |
2144 | } |
2145 | else |
2146 | { |
2147 | @@ -544,7 +562,7 @@ |
2148 | *pos += strlen(s+*pos); |
2149 | if(matched) |
2150 | *matched = false; |
2151 | - return s[*pos] != 0; |
2152 | + return true; |
2153 | } |
2154 | } |
2155 | |
2156 | @@ -554,13 +572,9 @@ |
2157 | int matched_pos; |
2158 | int matched_len; |
2159 | |
2160 | - bool prev_align = regex_matcher->set_align_begin(true); |
2161 | - retval = regex_matcher->match_from(s, parsed_flags, &matched_pos, &matched_len); |
2162 | - regex_matcher->set_align_begin(prev_align); |
2163 | + retval = regex_matcher->match_anywhere(s, parsed_flags|REGEX_ASCII_WHOLE_MATCH, &matched_pos, &matched_len); |
2164 | if(!retval) |
2165 | return false; |
2166 | - if(matched_len != strlen(s)) |
2167 | - return false; |
2168 | return true; |
2169 | } |
2170 | |
2171 | @@ -587,14 +601,19 @@ |
2172 | //look for dollars |
2173 | if(*temprepl == '\\') |
2174 | { |
2175 | - temprepl++; |
2176 | - if(!*temprepl || (*temprepl != '\\') || (*temprepl != '$'))//Invalid replacement string. |
2177 | - throw XQUERY_EXCEPTION( err::FORX0004, ERROR_PARAMS( replacement ) ); |
2178 | + if(!(parsed_flags & REGEX_ASCII_LITERAL)) |
2179 | + { |
2180 | + temprepl++; |
2181 | + if(!*temprepl) |
2182 | + temprepl--; |
2183 | + else if((*temprepl != '\\') && (*temprepl != '$'))//Invalid replacement string. |
2184 | + throw XQUERY_EXCEPTION( err::FORX0004, ERROR_PARAMS( replacement ) ); |
2185 | + } |
2186 | result->append(1, *temprepl); |
2187 | temprepl++; |
2188 | continue; |
2189 | } |
2190 | - if(*temprepl == '$') |
2191 | + if((*temprepl == '$') && !(parsed_flags & REGEX_ASCII_LITERAL)) |
2192 | { |
2193 | temprepl++; |
2194 | index = 0; |
2195 | @@ -648,7 +667,7 @@ |
2196 | if(retval) |
2197 | { |
2198 | m_match_pos += m_pos; |
2199 | - m_pos = m_match_pos = m_matched_len; |
2200 | + m_pos = m_match_pos + m_matched_len; |
2201 | } |
2202 | else |
2203 | { |
2204 | @@ -666,35 +685,30 @@ |
2205 | return (int)regex_matcher->get_indexed_regex_count(); |
2206 | } |
2207 | |
2208 | -int regex::get_match_start( int groupId ) |
2209 | -{ |
2210 | - if(groupId == 0) |
2211 | - return m_match_pos; |
2212 | - if(groupId > (int)regex_matcher->get_indexed_regex_count()) |
2213 | - return -1; |
2214 | - const char *submatched_source; |
2215 | - int submatched_len; |
2216 | - if(!regex_matcher->get_indexed_match(groupId, &submatched_source, &submatched_len)) |
2217 | - return -1; |
2218 | - return submatched_source - s_in_.c_str(); |
2219 | -} |
2220 | - |
2221 | -int regex::get_match_end( int groupId ) |
2222 | -{ |
2223 | - if(groupId == 0) |
2224 | - return m_match_pos + m_matched_len; |
2225 | - if(groupId > (int)regex_matcher->get_indexed_regex_count()) |
2226 | - return -1; |
2227 | - const char *submatched_source; |
2228 | - int submatched_len; |
2229 | - if(!regex_matcher->get_indexed_match(groupId, &submatched_source, &submatched_len)) |
2230 | - return -1; |
2231 | - return submatched_source - s_in_.c_str() + submatched_len; |
2232 | +bool regex::get_match_start_end_bytes( int groupId, int *start, int *end ) |
2233 | +{ |
2234 | + *start = -1; |
2235 | + *end = -1; |
2236 | + if(groupId == 0) |
2237 | + { |
2238 | + *start = m_match_pos; |
2239 | + *end = m_match_pos + m_matched_len; |
2240 | + return true; |
2241 | + } |
2242 | + if(groupId > (int)regex_matcher->get_indexed_regex_count()) |
2243 | + return false; |
2244 | + const char *submatched_source; |
2245 | + int submatched_len; |
2246 | + if(!regex_matcher->get_indexed_match(groupId, &submatched_source, &submatched_len)) |
2247 | + return false; |
2248 | + *start = submatched_source - s_in_.c_str(); |
2249 | + *end = *start + submatched_len; |
2250 | + return true; |
2251 | } |
2252 | |
2253 | } // namespace unicode |
2254 | } // namespace zorba |
2255 | -#endif /* ZORBA_NO_UNICODE */ |
2256 | +#endif /* ZORBA_NO_ICU */ |
2257 | |
2258 | /////////////////////////////////////////////////////////////////////////////// |
2259 | |
2260 | |
2261 | === modified file 'src/util/regex.h' |
2262 | --- src/util/regex.h 2012-03-28 05:19:57 +0000 |
2263 | +++ src/util/regex.h 2012-04-13 19:45:38 +0000 |
2264 | @@ -17,15 +17,13 @@ |
2265 | #ifndef ZORBA_REGEX_H |
2266 | #define ZORBA_REGEX_H |
2267 | |
2268 | -#ifndef ZORBA_NO_UNICODE |
2269 | -#include <unicode/regex.h> |
2270 | -#endif |
2271 | - |
2272 | #include "cxx_util.h" |
2273 | #include "unicode_util.h" |
2274 | #include "zorbatypes/zstring.h" |
2275 | |
2276 | -#ifndef ZORBA_NO_UNICODE |
2277 | +#ifndef ZORBA_NO_ICU |
2278 | + |
2279 | +#include <unicode/regex.h> |
2280 | |
2281 | namespace zorba { |
2282 | |
2283 | @@ -496,15 +494,17 @@ |
2284 | } // namespace unicode |
2285 | } // namespace zorba |
2286 | |
2287 | -#else ///ZORBA_NO_UNICODE (ascii part:) |
2288 | - |
2289 | -#include "util/regex_ascii.h" |
2290 | +/////////////////////////////////////////////////////////////////////////////// |
2291 | + |
2292 | +#else /* ZORBA_NO_ICU */ |
2293 | + |
2294 | +#include "util/regex_xquery.h" |
2295 | #include <string> |
2296 | |
2297 | namespace zorba{ |
2298 | /** |
2299 | * Converts an XQuery regular expression to the form used by the regular |
2300 | - * expression library Zorba is using (here regex_ascii). |
2301 | + * expression library Zorba is using (here regex_xquery). |
2302 | * |
2303 | * @param xq_re The XQuery regular expression. |
2304 | * @param lib_re A pointer to the resuling library regular expression. |
2305 | @@ -525,7 +525,7 @@ |
2306 | /** |
2307 | * Constructs a %regex. |
2308 | */ |
2309 | - regex() : regex_matcher( NULL ) { } |
2310 | + regex() : regex_matcher( nullptr ) { } |
2311 | |
2312 | /** |
2313 | * Destroys a %regex. |
2314 | @@ -835,31 +835,21 @@ |
2315 | |
2316 | /** |
2317 | * Get the start position of the matched group. |
2318 | - * If groupId is zero, then the start position of the whole match is returned. |
2319 | - * If groupId is non-zero, then the start position of that group is returned. |
2320 | - * If that group has not been matched, -1 is returned. |
2321 | + * If groupId is zero, then the start and end position of the whole match is returned. |
2322 | + * If groupId is non-zero, then the start and end position of that group is returned. |
2323 | + * If that group has not been matched, false is returned. |
2324 | * |
2325 | * @param groupId the id of the group, either zero for the entire regex, |
2326 | * or [1 .. group_count] for that specific group |
2327 | - * @return the start position, zero based, or -1 if that group didn't match |
2328 | + * @param start to return start position in bytes |
2329 | + * @param end to return end position in bytes |
2330 | + * @return true if that group exists and has been matched |
2331 | */ |
2332 | - int get_match_start( int groupId = 0 ); |
2333 | + bool get_match_start_end_bytes( int groupId, int *start, int *end ); |
2334 | |
2335 | - /** |
2336 | - * Get the end position of the matched group. |
2337 | - * If groupId is zero, then the end position of the whole match is returned. |
2338 | - * If groupId is non-zero, then the end position of that group is returned. |
2339 | - * If that group has not been matched, -1 is returned. |
2340 | - * |
2341 | - * @param groupId the id of the group, either zero for the entire regex, |
2342 | - * or [1 .. group_count] for that specific group |
2343 | - * @return the end position, zero based, or -1 if that group didn't match |
2344 | - */ |
2345 | - int get_match_end( int groupId = 0 ); |
2346 | |
2347 | private: |
2348 | - regex_ascii::CRegexAscii_parser regex_parser; |
2349 | - regex_ascii::CRegexAscii_regex *regex_matcher; |
2350 | + regex_xquery::CRegexXQuery_regex *regex_matcher; |
2351 | uint32_t parsed_flags; |
2352 | |
2353 | zstring s_in_; |
2354 | @@ -873,15 +863,13 @@ |
2355 | regex( regex const& ); |
2356 | regex& operator=( regex const& ); |
2357 | }; |
2358 | + |
2359 | +/////////////////////////////////////////////////////////////////////////////// |
2360 | + |
2361 | } // namespace unicode |
2362 | } // namespace zorba |
2363 | |
2364 | -#endif /* ZORBA_NO_UNICODE */ |
2365 | - |
2366 | - |
2367 | -/////////////////////////////////////////////////////////////////////////////// |
2368 | - |
2369 | - |
2370 | +#endif /* ZORBA_NO_ICU */ |
2371 | #endif /* ZORBA_REGEX_H */ |
2372 | /* |
2373 | * Local variables: |
2374 | |
2375 | === renamed file 'src/util/regex_ascii.cpp' => 'src/util/regex_xquery.cpp' |
2376 | --- src/util/regex_ascii.cpp 2012-03-28 05:19:57 +0000 |
2377 | +++ src/util/regex_xquery.cpp 2012-04-13 19:45:38 +0000 |
2378 | @@ -1,4 +1,4 @@ |
2379 | -a/* |
2380 | +/* |
2381 | * Copyright 2006-2008 The FLWOR Foundation. |
2382 | * |
2383 | * Licensed under the Apache License, Version 2.0 (the "License"); |
2384 | @@ -18,12 +18,15 @@ |
2385 | |
2386 | #include "diagnostics/xquery_diagnostics.h" |
2387 | |
2388 | -#include "regex_ascii.h" |
2389 | +#include "regex_xquery.h" |
2390 | #include <string.h> |
2391 | #include "zorbatypes/chartype.h" |
2392 | +#include "util/unicode_categories.h" |
2393 | +#include "util/ascii_util.h" |
2394 | +#include "util/utf8_string.h" |
2395 | |
2396 | namespace zorba { |
2397 | - namespace regex_ascii{ |
2398 | + namespace regex_xquery{ |
2399 | //ascii regular expression matching |
2400 | |
2401 | /*http://www.w3.org/TR/xmlschema-2/#regexs |
2402 | @@ -62,96 +65,138 @@ |
2403 | + http://www.w3.org/TR/xquery-operators/#regex-syntax (not implemented) |
2404 | */ |
2405 | |
2406 | + |
2407 | +static bool compare_ascii_i(const char *str1, const char *str2) |
2408 | +{ |
2409 | + while(*str1 && *str2) |
2410 | + { |
2411 | + if(ascii::to_lower(*str1) != ascii::to_lower(*str2)) |
2412 | + return false; |
2413 | + str1++; |
2414 | + str2++; |
2415 | + } |
2416 | + if(*str1 || *str2) |
2417 | + return false; |
2418 | + return true; |
2419 | +} |
2420 | + |
2421 | +static bool compare_unicode_ni(const char *str1, const char *str2, int len) |
2422 | +{ |
2423 | + while(len > 0) |
2424 | + { |
2425 | + const char *temp_str1 = str1; |
2426 | + const char *temp_str2 = str2; |
2427 | + unicode::code_point cp1 = unicode::to_upper(utf8::next_char(temp_str1)); |
2428 | + unicode::code_point cp2 = unicode::to_upper(utf8::next_char(temp_str2)); |
2429 | + if(cp1 != cp2) |
2430 | + return false; |
2431 | + len -= temp_str1-str1; |
2432 | + str1 = temp_str1; |
2433 | + str2 = temp_str2; |
2434 | + } |
2435 | + return true; |
2436 | +} |
2437 | +static utf8::size_type myutf8len(const char *source) |
2438 | +{ |
2439 | + utf8::size_type len = utf8::char_length(*source); |
2440 | + if(!len) |
2441 | + return 1; |
2442 | + else |
2443 | + return len; |
2444 | +} |
2445 | //////////////////////////////////// |
2446 | ////Regular expression parsing and building of the tree |
2447 | //////////////////////////////////// |
2448 | |
2449 | -CRegexAscii_regex* CRegexAscii_parser::parse(const char *pattern, unsigned int flags) |
2450 | +CRegexXQuery_regex* CRegexXQuery_parser::parse(const char *pattern, unsigned int flags) |
2451 | { |
2452 | this->flags = flags; |
2453 | - bool align_begin = false; |
2454 | |
2455 | - if(!(flags & REGEX_ASCII_LITERAL) && (pattern[0] == '^')) |
2456 | - align_begin = true; |
2457 | - |
2458 | int regex_len; |
2459 | - CRegexAscii_regex* regex = parse_regexp(pattern + (align_begin?1:0), ®ex_len); |
2460 | + CRegexXQuery_regex* regex = parse_regexp(pattern, ®ex_len); |
2461 | |
2462 | - if(regex) |
2463 | - regex->set_align_begin(align_begin); |
2464 | - |
2465 | return regex; |
2466 | } |
2467 | |
2468 | //until '\0' or ')' |
2469 | -CRegexAscii_regex* CRegexAscii_parser::parse_regexp(const char *pattern, |
2470 | +CRegexXQuery_regex* CRegexXQuery_parser::parse_regexp(const char *pattern, |
2471 | int *regex_len) |
2472 | { |
2473 | *regex_len = 0; |
2474 | int branch_len; |
2475 | regex_depth++; |
2476 | - CRegexAscii_regex *regex = new CRegexAscii_regex(current_regex); |
2477 | + std::auto_ptr<CRegexXQuery_regex> regex(new CRegexXQuery_regex(current_regex)); |
2478 | if(!current_regex) |
2479 | - current_regex = regex; |
2480 | + current_regex = regex.get(); |
2481 | if(regex_depth >= 2) |
2482 | { |
2483 | //mark this as group if it does not start with ?: |
2484 | if(pattern[0] != '?' || pattern[1] != ':') |
2485 | - current_regex->subregex.push_back(regex); |
2486 | + current_regex->subregex.push_back(regex.get()); |
2487 | else |
2488 | *regex_len = 2; |
2489 | } |
2490 | - CRegexAscii_branch *branch; |
2491 | + CRegexXQuery_branch *branch; |
2492 | + bool must_read_another_branch = true; |
2493 | while(pattern[*regex_len] && (pattern[*regex_len] != ')')) |
2494 | { |
2495 | branch = parse_branch(pattern+*regex_len, &branch_len); |
2496 | if(!branch) |
2497 | { |
2498 | regex_depth--; |
2499 | - delete regex; |
2500 | return NULL; |
2501 | } |
2502 | regex->add_branch(branch); |
2503 | *regex_len += branch_len; |
2504 | + if(pattern[*regex_len] == '|') |
2505 | + (*regex_len)++; |
2506 | + else |
2507 | + must_read_another_branch = false; |
2508 | } |
2509 | - if((current_regex == regex) && (pattern[*regex_len] == ')')) |
2510 | + if((current_regex == regex.get()) && (pattern[*regex_len] == ')')) |
2511 | { |
2512 | - throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(U_REGEX_MISMATCHED_PAREN)) ); |
2513 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MISMATCHED_PAREN)) ); |
2514 | } |
2515 | if(pattern[*regex_len]) |
2516 | (*regex_len)++; |
2517 | + if(must_read_another_branch) |
2518 | + regex->add_branch(new CRegexXQuery_branch(current_regex));//add empty branch |
2519 | regex->flags = 0;//finished initialization |
2520 | regex_depth--; |
2521 | - return regex; |
2522 | + return regex.release(); |
2523 | } |
2524 | |
2525 | -CRegexAscii_branch* CRegexAscii_parser::parse_branch(const char *pattern, int *branch_len) |
2526 | +CRegexXQuery_branch* CRegexXQuery_parser::parse_branch(const char *pattern, int *branch_len) |
2527 | { |
2528 | int piece_len; |
2529 | |
2530 | - CRegexAscii_branch *branch = new CRegexAscii_branch(current_regex); |
2531 | - CRegexAscii_piece *piece; |
2532 | + std::auto_ptr<CRegexXQuery_branch> branch(new CRegexXQuery_branch(current_regex)); |
2533 | + CRegexXQuery_piece *piece; |
2534 | *branch_len = 0; |
2535 | while(pattern[*branch_len] && (pattern[*branch_len] != '|') && (pattern[*branch_len] != ')')) |
2536 | { |
2537 | piece = parse_piece(pattern+*branch_len, &piece_len); |
2538 | if(!piece) |
2539 | { |
2540 | - delete branch; |
2541 | return NULL; |
2542 | } |
2543 | + if(branch->piece_list.size() && dynamic_cast<CRegexXQuery_pinstart*>(piece->atom)) |
2544 | + { |
2545 | + //found ^ that is not at the beginning of branch |
2546 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_ATOM_CHAR), '^') ); |
2547 | + } |
2548 | branch->add_piece(piece); |
2549 | *branch_len += piece_len; |
2550 | } |
2551 | - if(pattern[*branch_len] == '|') |
2552 | - (*branch_len)++; |
2553 | - return branch; |
2554 | + //if(pattern[*branch_len] == '|') |
2555 | + // (*branch_len)++; |
2556 | + return branch.release(); |
2557 | } |
2558 | |
2559 | //piece = atom + quantifier |
2560 | -CRegexAscii_piece* CRegexAscii_parser::parse_piece(const char *pattern, int *piece_len) |
2561 | +CRegexXQuery_piece* CRegexXQuery_parser::parse_piece(const char *pattern, int *piece_len) |
2562 | { |
2563 | - CRegexAscii_piece *piece = new CRegexAscii_piece; |
2564 | + std::auto_ptr<CRegexXQuery_piece> piece(new CRegexXQuery_piece); |
2565 | IRegexAtom *atom; |
2566 | *piece_len = 0; |
2567 | |
2568 | @@ -160,19 +205,18 @@ |
2569 | atom = read_atom(pattern, &atom_len); |
2570 | if(!atom) |
2571 | { |
2572 | - delete piece; |
2573 | return NULL; |
2574 | } |
2575 | piece->set_atom(atom); |
2576 | if(!(flags & REGEX_ASCII_LITERAL)) |
2577 | - read_quantifier(piece, pattern+atom_len, &quantif_len); |
2578 | + read_quantifier(piece.get(), pattern+atom_len, &quantif_len); |
2579 | |
2580 | *piece_len += atom_len + quantif_len; |
2581 | |
2582 | - return piece; |
2583 | + return piece.release(); |
2584 | } |
2585 | |
2586 | -char CRegexAscii_parser::myishex(char c) |
2587 | +char CRegexXQuery_parser::myishex(char c) |
2588 | { |
2589 | if((c >= '0') && (c <= '9')) |
2590 | return c-'0'+1; |
2591 | @@ -183,26 +227,125 @@ |
2592 | return 0;//not a hex |
2593 | } |
2594 | |
2595 | -bool CRegexAscii_parser::myisdigit(char c) |
2596 | -{ |
2597 | - return (c >= '0') || (c <= '9'); |
2598 | -} |
2599 | - |
2600 | -char CRegexAscii_parser::readChar(const char *pattern, int *char_len, bool *is_multichar) |
2601 | +bool CRegexXQuery_parser::myisdigit(char c) |
2602 | +{ |
2603 | + return (c >= '0') && (c <= '9'); |
2604 | +} |
2605 | + |
2606 | +bool CRegexXQuery_parser::myisletterAZ(char c) |
2607 | +{ |
2608 | + return ((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')); |
2609 | +} |
2610 | + |
2611 | +static const unicode::code_point specials_extcp[] = {0xFFF0, 0xFFFD, 0}; |
2612 | + |
2613 | +static CRegexXQuery_parser::block_escape_t block_escape[] = |
2614 | +{ |
2615 | +{{0x0000, 0x007F}, NULL, "BasicLatin"}, |
2616 | +{{0x0080, 0x00FF}, NULL, "Latin-1Supplement"}, |
2617 | +{{0x0100, 0x017F}, NULL, "LatinExtended-A"}, |
2618 | +{{0x0180, 0x024F}, NULL, "LatinExtended-B"}, |
2619 | +{{0x0250, 0x02AF}, NULL, "IPAExtensions"}, |
2620 | +{{0x02B0, 0x02FF}, NULL, "SpacingModifierLetters"}, |
2621 | +{{0x0300, 0x036F}, NULL, "CombiningDiacriticalMarks"}, |
2622 | +{{0x0370, 0x03FF}, NULL, "Greek"}, |
2623 | +{{0x0400, 0x04FF}, NULL, "Cyrillic"}, |
2624 | +{{0x0530, 0x058F}, NULL, "Armenian"}, |
2625 | +{{0x0590, 0x05FF}, NULL, "Hebrew"}, |
2626 | +{{0x0600, 0x06FF}, NULL, "Arabic"}, |
2627 | +{{0x0700, 0x074F}, NULL, "Syriac"}, |
2628 | +{{0x0780, 0x07BF}, NULL, "Thaana"}, |
2629 | +{{0x0900, 0x097F}, NULL, "Devanagari"}, |
2630 | +{{0x0980, 0x09FF}, NULL, "Bengali"}, |
2631 | +{{0x0A00, 0x0A7F}, NULL, "Gurmukhi"}, |
2632 | +{{0x0A80, 0x0AFF}, NULL, "Gujarati"}, |
2633 | +{{0x0B00, 0x0B7F}, NULL, "Oriya"}, |
2634 | +{{0x0B80, 0x0BFF}, NULL, "Tamil"}, |
2635 | +{{0x0C00, 0x0C7F}, NULL, "Telugu"}, |
2636 | +{{0x0C80, 0x0CFF}, NULL, "Kannada"}, |
2637 | +{{0x0D00, 0x0D7F}, NULL, "Malayalam"}, |
2638 | +{{0x0D80, 0x0DFF}, NULL, "Sinhala"}, |
2639 | +{{0x0E00, 0x0E7F}, NULL, "Thai"}, |
2640 | +{{0x0E80, 0x0EFF}, NULL, "Lao"}, |
2641 | +{{0x0F00, 0x0FFF}, NULL, "Tibetan"}, |
2642 | +{{0x1000, 0x109F}, NULL, "Myanmar"}, |
2643 | +{{0x10A0, 0x10FF}, NULL, "Georgian"}, |
2644 | +{{0x1100, 0x11FF}, NULL, "HangulJamo"}, |
2645 | +{{0x1200, 0x137F}, NULL, "Ethiopic"}, |
2646 | +{{0x13A0, 0x13FF}, NULL, "Cherokee"}, |
2647 | +{{0x1400, 0x167F}, NULL, "UnifiedCanadianAboriginalSyllabics"}, |
2648 | +{{0x1680, 0x169F}, NULL, "Ogham"}, |
2649 | +{{0x16A0, 0x16FF}, NULL, "Runic"}, |
2650 | +{{0x1780, 0x17FF}, NULL, "Khmer"}, |
2651 | +{{0x1800, 0x18AF}, NULL, "Mongolian"}, |
2652 | +{{0x1E00, 0x1EFF}, NULL, "LatinExtendedAdditional"}, |
2653 | +{{0x1F00, 0x1FFF}, NULL, "GreekExtended"}, |
2654 | +{{0x2000, 0x206F}, NULL, "GeneralPunctuation"}, |
2655 | +{{0x2070, 0x209F}, NULL, "SuperscriptsandSubscripts"}, |
2656 | +{{0x20A0, 0x20CF}, NULL, "CurrencySymbols"}, |
2657 | +{{0x20D0, 0x20FF}, NULL, "CombiningMarksforSymbols"}, |
2658 | +{{0x2100, 0x214F}, NULL, "LetterlikeSymbols"}, |
2659 | +{{0x2150, 0x218F}, NULL, "NumberForms"}, |
2660 | +{{0x2190, 0x21FF}, NULL, "Arrows"}, |
2661 | +{{0x2200, 0x22FF}, NULL, "MathematicalOperators"}, |
2662 | +{{0x2300, 0x23FF}, NULL, "MiscellaneousTechnical"}, |
2663 | +{{0x2400, 0x243F}, NULL, "ControlPictures"}, |
2664 | +{{0x2440, 0x245F}, NULL, "OpticalCharacterRecognition"}, |
2665 | +{{0x2460, 0x24FF}, NULL, "EnclosedAlphanumerics"}, |
2666 | +{{0x2500, 0x257F}, NULL, "BoxDrawing"}, |
2667 | +{{0x2580, 0x259F}, NULL, "BlockElements"}, |
2668 | +{{0x25A0, 0x25FF}, NULL, "GeometricShapes"}, |
2669 | +{{0x2600, 0x26FF}, NULL, "MiscellaneousSymbols"}, |
2670 | +{{0x2700, 0x27BF}, NULL, "Dingbats"}, |
2671 | +{{0x2800, 0x28FF}, NULL, "BraillePatterns"}, |
2672 | +{{0x2E80, 0x2EFF}, NULL, "CJKRadicalsSupplement"}, |
2673 | +{{0x2F00, 0x2FDF}, NULL, "KangxiRadicals"}, |
2674 | +{{0x2FF0, 0x2FFF}, NULL, "IdeographicDescriptionCharacters"}, |
2675 | +{{0x3000, 0x303F}, NULL, "CJKSymbolsandPunctuation"}, |
2676 | +{{0x3040, 0x309F}, NULL, "Hiragana"}, |
2677 | +{{0x30A0, 0x30FF}, NULL, "Katakana"}, |
2678 | +{{0x3100, 0x312F}, NULL, "Bopomofo"}, |
2679 | +{{0x3130, 0x318F}, NULL, "HangulCompatibilityJamo"}, |
2680 | +{{0x3190, 0x319F}, NULL, "Kanbun"}, |
2681 | +{{0x31A0, 0x31BF}, NULL, "BopomofoExtended"}, |
2682 | +{{0x3200, 0x32FF}, NULL, "EnclosedCJKLettersandMonths"}, |
2683 | +{{0x3300, 0x33FF}, NULL, "CJKCompatibility"}, |
2684 | +{{0x3400, 0x4DB5}, NULL, "CJKUnifiedIdeographsExtensionA"}, |
2685 | +{{0x4E00, 0x9FFF}, NULL, "CJKUnifiedIdeographs"}, |
2686 | +{{0xA000, 0xA48F}, NULL, "YiSyllables"}, |
2687 | +{{0xA490, 0xA4CF}, NULL, "YiRadicals"}, |
2688 | +{{0xAC00, 0xD7A3}, NULL, "HangulSyllables"}, |
2689 | +{{0xE000, 0xF8FF}, NULL, "PrivateUse"}, |
2690 | +{{0xF900, 0xFAFF}, NULL, "CJKCompatibilityIdeographs"}, |
2691 | +{{0xFB00, 0xFB4F}, NULL, "AlphabeticPresentationForms"}, |
2692 | +{{0xFB50, 0xFDFF}, NULL, "ArabicPresentationForms-A"}, |
2693 | +{{0xFE20, 0xFE2F}, NULL, "CombiningHalfMarks"}, |
2694 | +{{0xFE30, 0xFE4F}, NULL, "CJKCompatibilityForms"}, |
2695 | +{{0xFE50, 0xFE6F}, NULL, "SmallFormVariants"}, |
2696 | +{{0xFE70, 0xFEFE}, NULL, "ArabicPresentationForms-B"}, |
2697 | +{{0xFEFF, 0xFEFF}, specials_extcp, "Specials"}, |
2698 | +{{0xFF00, 0xFFEF}, NULL, "HalfwidthandFullwidthForms"} |
2699 | +}; |
2700 | + |
2701 | +CRegexXQuery_charmatch* CRegexXQuery_parser::readChar(const char *pattern, |
2702 | + int *char_len, |
2703 | + enum CHARGROUP_t *multichar_type) |
2704 | { |
2705 | char c = 0; |
2706 | *char_len = 0; |
2707 | - *is_multichar = false; |
2708 | + *multichar_type = CHARGROUP_NO_MULTICHAR; |
2709 | switch(pattern[*char_len]) |
2710 | { |
2711 | case '\\': |
2712 | - { (*char_len)++; |
2713 | + { |
2714 | + (*char_len)++; |
2715 | switch(pattern[*char_len]) |
2716 | { |
2717 | - case 'n': c = '\n';break; |
2718 | - case 'r': c = '\r';break; |
2719 | - case 't': c = '\t';break; |
2720 | + case 'n': c = '\n';(*char_len)++;return new CRegexXQuery_char_ascii(current_regex, c); |
2721 | + case 'r': c = '\r';(*char_len)++;return new CRegexXQuery_char_ascii(current_regex, c); |
2722 | + case 't': c = '\t';(*char_len)++;return new CRegexXQuery_char_ascii(current_regex, c); |
2723 | case '\\': |
2724 | + case '/'://+ |
2725 | case '|': |
2726 | case '.': |
2727 | case '?': |
2728 | @@ -216,19 +359,205 @@ |
2729 | case '['://#x5B |
2730 | case ']'://#x5D |
2731 | case '^'://#x5E |
2732 | + case '$'://+ |
2733 | c = pattern[*char_len]; |
2734 | - break; |
2735 | + (*char_len)++; |
2736 | + *multichar_type = CHARGROUP_FLAGS_ONECHAR_ASCII; |
2737 | + return new CRegexXQuery_char_ascii(current_regex, c); |
2738 | case 'p'://catEsc |
2739 | case 'P'://complEsc |
2740 | + { |
2741 | //ignore the prop for now |
2742 | - c = pattern[*char_len]; |
2743 | - *is_multichar = true; |
2744 | - if(pattern[*char_len+1] == '{') |
2745 | - { |
2746 | - while(pattern[*char_len] != '}') |
2747 | + *multichar_type = CHARGROUP_FLAGS_MULTICHAR_p;//(CHARGROUP_t)((pattern[*char_len] == 'P') ? 128 : 0); |
2748 | + bool is_reverse = (pattern[*char_len] == 'P'); |
2749 | + c = 0; |
2750 | + if(pattern[(*char_len)+1] != '{') |
2751 | + { |
2752 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_P_CONSTRUCT)) ); |
2753 | + } |
2754 | + (*char_len) += 2; |
2755 | + switch(pattern[*char_len]) |
2756 | + {//IsCategory |
2757 | + case 'L': |
2758 | + { |
2759 | + switch(pattern[(*char_len)+1]) |
2760 | + { |
2761 | + case '}': |
2762 | + c = unicode::UNICODE_Ll + 50;break; |
2763 | + case 'u': |
2764 | + c = unicode::UNICODE_Lu; (*char_len)++;break; |
2765 | + case 'l': |
2766 | + c = unicode::UNICODE_Ll; (*char_len)++;break; |
2767 | + case 't': |
2768 | + c = unicode::UNICODE_Lt; (*char_len)++;break; |
2769 | + case 'm': |
2770 | + c = unicode::UNICODE_Lm; (*char_len)++;break; |
2771 | + case 'o': |
2772 | + c = unicode::UNICODE_Lo; (*char_len)++;break; |
2773 | + default: |
2774 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PL_CONSTRUCT)) ); |
2775 | + } |
2776 | + }break; |
2777 | + case 'M': |
2778 | + { |
2779 | + switch(pattern[(*char_len)+1]) |
2780 | + { |
2781 | + case '}': |
2782 | + c = unicode::UNICODE_Mc + 50;break; |
2783 | + case 'n': |
2784 | + c = unicode::UNICODE_Mn; (*char_len)++;break; |
2785 | + case 'c': |
2786 | + c = unicode::UNICODE_Mc; (*char_len)++;break; |
2787 | + case 'e': |
2788 | + c = unicode::UNICODE_Me; (*char_len)++;break; |
2789 | + default: |
2790 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PM_CONSTRUCT)) ); |
2791 | + } |
2792 | + }break; |
2793 | + case 'N': |
2794 | + { |
2795 | + switch(pattern[(*char_len)+1]) |
2796 | + { |
2797 | + case '}': |
2798 | + c = unicode::UNICODE_Nd + 50;break; |
2799 | + case 'd': |
2800 | + c = unicode::UNICODE_Nd; (*char_len)++;break; |
2801 | + case 'l': |
2802 | + c = unicode::UNICODE_Nl; (*char_len)++;break; |
2803 | + case 'o': |
2804 | + c = unicode::UNICODE_No; (*char_len)++;break; |
2805 | + default: |
2806 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PN_CONSTRUCT)) ); |
2807 | + } |
2808 | + }break; |
2809 | + case 'P': |
2810 | + { |
2811 | + switch(pattern[(*char_len)+1]) |
2812 | + { |
2813 | + case '}': |
2814 | + c = unicode::UNICODE_Pc + 50;break; |
2815 | + case 'c': |
2816 | + c = unicode::UNICODE_Pc; (*char_len)++;break; |
2817 | + case 'd': |
2818 | + c = unicode::UNICODE_Pd; (*char_len)++;break; |
2819 | + case 's': |
2820 | + c = unicode::UNICODE_Ps; (*char_len)++;break; |
2821 | + case 'e': |
2822 | + c = unicode::UNICODE_Pe; (*char_len)++;break; |
2823 | + case 'i': |
2824 | + c = unicode::UNICODE_Pi; (*char_len)++;break; |
2825 | + case 'f': |
2826 | + c = unicode::UNICODE_Pf; (*char_len)++;break; |
2827 | + case 'o': |
2828 | + c = unicode::UNICODE_Po; (*char_len)++;break; |
2829 | + default: |
2830 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PP_CONSTRUCT)) ); |
2831 | + } |
2832 | + }break; |
2833 | + case 'Z': |
2834 | + { |
2835 | + switch(pattern[(*char_len)+1]) |
2836 | + { |
2837 | + case '}': |
2838 | + c = unicode::UNICODE_Zl + 50;break; |
2839 | + case 's': |
2840 | + c = unicode::UNICODE_Zs; (*char_len)++;break; |
2841 | + case 'l': |
2842 | + c = unicode::UNICODE_Zl; (*char_len)++;break; |
2843 | + case 'p': |
2844 | + c = unicode::UNICODE_Zp; (*char_len)++;break; |
2845 | + default: |
2846 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PZ_CONSTRUCT)) ); |
2847 | + } |
2848 | + }break; |
2849 | + case 'S': |
2850 | + { |
2851 | + switch(pattern[(*char_len)+1]) |
2852 | + { |
2853 | + case '}': |
2854 | + c = unicode::UNICODE_Sc + 50;break; |
2855 | + case 'm': |
2856 | + c = unicode::UNICODE_Sm; (*char_len)++;break; |
2857 | + case 'c': |
2858 | + c = unicode::UNICODE_Sc; (*char_len)++;break; |
2859 | + case 'k': |
2860 | + c = unicode::UNICODE_Sk; (*char_len)++;break; |
2861 | + case 'o': |
2862 | + c = unicode::UNICODE_So; (*char_len)++;break; |
2863 | + default: |
2864 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PS_CONSTRUCT)) ); |
2865 | + } |
2866 | + }break; |
2867 | + case 'C': |
2868 | + { |
2869 | + switch(pattern[(*char_len)+1]) |
2870 | + { |
2871 | + case '}': |
2872 | + c = unicode::UNICODE_Cc + 50;break; |
2873 | + case 'c': |
2874 | + c = unicode::UNICODE_Cc; (*char_len)++;break; |
2875 | + case 'f': |
2876 | + c = unicode::UNICODE_Cf; (*char_len)++;break; |
2877 | + case 'o': |
2878 | + c = unicode::UNICODE_Co; (*char_len)++;break; |
2879 | + case 'n': |
2880 | + c = unicode::UNICODE_Cn; (*char_len)++;break; |
2881 | + default: |
2882 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PC_CONSTRUCT)) ); |
2883 | + } |
2884 | + }break; |
2885 | + }//end switch |
2886 | + if(c) |
2887 | + { |
2888 | + if(pattern[(*char_len) + 1] != '}') |
2889 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_P_CONSTRUCT)) ); |
2890 | + (*char_len)++; |
2891 | + (*char_len)++; |
2892 | + return new CRegexXQuery_multicharP(current_regex, c, is_reverse); |
2893 | + } |
2894 | + if(pattern[*char_len] == 'I') |
2895 | + { |
2896 | + if(pattern[(*char_len)+1] == 's')//IsBlock |
2897 | + { |
2898 | + *multichar_type = CHARGROUP_FLAGS_MULTICHAR_Is; |
2899 | + (*char_len) += 2; |
2900 | + zstring block_name; |
2901 | + char tempc = pattern[(*char_len)]; |
2902 | + while(tempc && (tempc != '}')) |
2903 | + { |
2904 | + if(!myisletterAZ(tempc) && !myisdigit(tempc) && (tempc != '-')) |
2905 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_PIs_CONSTRUCT)) ); |
2906 | + block_name.append(1, tempc); |
2907 | + (*char_len)++; |
2908 | + tempc = pattern[(*char_len)]; |
2909 | + } |
2910 | + if(!tempc) |
2911 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_PIs_CONSTRUCT)) ); |
2912 | + //search for the block name |
2913 | + int i; |
2914 | + int nr_blocks = sizeof(block_escape)/sizeof(CRegexXQuery_parser::block_escape_t); |
2915 | + for(i=0;i<nr_blocks;i++) |
2916 | + { |
2917 | + if(compare_ascii_i(block_name.c_str(), block_escape[i].group_name)) |
2918 | + { |
2919 | + c = i; |
2920 | + break; |
2921 | + } |
2922 | + } |
2923 | + if(i==nr_blocks) |
2924 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PIs_CONSTRUCT)) ); |
2925 | (*char_len)++; |
2926 | - } |
2927 | - break; |
2928 | + return new CRegexXQuery_multicharIs(current_regex, i, is_reverse); |
2929 | + } |
2930 | + else |
2931 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_PIs_CONSTRUCT)) ); |
2932 | + } |
2933 | + else |
2934 | + { |
2935 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_P_CONSTRUCT)) ); |
2936 | + } |
2937 | + break;//unreachable |
2938 | + }//end case 'p' |
2939 | //multiCharEsc |
2940 | case 's': |
2941 | case 'S': |
2942 | @@ -240,40 +569,104 @@ |
2943 | case 'D': |
2944 | case 'w': |
2945 | case 'W': |
2946 | - *is_multichar = true; |
2947 | + *multichar_type = CHARGROUP_FLAGS_MULTICHAR_OTHER; |
2948 | c = pattern[*char_len]; |
2949 | - break; |
2950 | - } |
2951 | - break; |
2952 | - } |
2953 | - case '#':///might be #xXX |
2954 | - { |
2955 | - if((pattern[*char_len+1] == 'x') && |
2956 | - myishex(pattern[*char_len+2]) && myishex(pattern[*char_len+3])) |
2957 | - { |
2958 | - c = (myishex(pattern[*char_len+2])-1)<<4 | (myishex(pattern[*char_len+3])-1); |
2959 | - *char_len += 3; |
2960 | - break; |
2961 | - } |
2962 | - } |
2963 | + (*char_len)++; |
2964 | + return new CRegexXQuery_multicharOther(current_regex, c); |
2965 | + case 'u'://unicode codepoint \uXXXX |
2966 | + { |
2967 | + unicode::code_point utf8c = 0; |
2968 | + (*char_len)++; |
2969 | + for(int i=0;i<4;i++) |
2970 | + { |
2971 | + char hex = myishex(pattern[*char_len]); |
2972 | + if(!hex) |
2973 | + { |
2974 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_UNICODE_CODEPOINT_u)) ); |
2975 | + } |
2976 | + utf8c <<= 4; |
2977 | + utf8c |= (hex-1) & 0x0f; |
2978 | + (*char_len)++; |
2979 | + } |
2980 | + return create_charmatch(utf8c, NULL, 0, multichar_type); |
2981 | + } |
2982 | + case 'U'://unicode codepoint \UXXXXXXXX |
2983 | + { |
2984 | + unicode::code_point utf8c = 0; |
2985 | + (*char_len)++; |
2986 | + for(int i=0;i<8;i++) |
2987 | + { |
2988 | + char hex = myishex(pattern[*char_len]); |
2989 | + if(!hex) |
2990 | + { |
2991 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_UNICODE_CODEPOINT_u)) ); |
2992 | + } |
2993 | + utf8c <<= 4; |
2994 | + utf8c |= (hex-1) & 0x0f; |
2995 | + (*char_len)++; |
2996 | + } |
2997 | + return create_charmatch(utf8c, NULL, 0, multichar_type); |
2998 | + } |
2999 | + default: |
3000 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_ESC_CHAR)) ); |
3001 | + } |
3002 | + assert(false); |
3003 | + break;//unreachable |
3004 | + }//end case '\' |
3005 | default: |
3006 | - c = pattern[*char_len]; |
3007 | - break; |
3008 | - } |
3009 | - |
3010 | - (*char_len)++; |
3011 | - return c; |
3012 | -} |
3013 | - |
3014 | - |
3015 | - |
3016 | -IRegexAtom* CRegexAscii_parser::read_atom(const char *pattern, int *atom_len) |
3017 | + { |
3018 | + const char *temp_pattern = pattern; |
3019 | + unicode::code_point utf8c = utf8::next_char(temp_pattern); |
3020 | + (*char_len) = temp_pattern - pattern; |
3021 | + return create_charmatch(utf8c, pattern, *char_len, multichar_type); |
3022 | + } |
3023 | + } |
3024 | + return NULL; |
3025 | +} |
3026 | + |
3027 | +CRegexXQuery_charmatch *CRegexXQuery_parser::create_charmatch(unicode::code_point utf8c, |
3028 | + const char *pattern, int utf8len, |
3029 | + enum CHARGROUP_t *multichar_type) |
3030 | +{ |
3031 | + if(utf8c <= 0x7F) |
3032 | + { |
3033 | + *multichar_type = CHARGROUP_FLAGS_ONECHAR_ASCII; |
3034 | + if(flags & REGEX_ASCII_CASE_INSENSITIVE) |
3035 | + return new CRegexXQuery_char_ascii_i(current_regex, (char)utf8c); |
3036 | + else |
3037 | + return new CRegexXQuery_char_ascii(current_regex, (char)utf8c); |
3038 | + } |
3039 | + else |
3040 | + { |
3041 | + *multichar_type = CHARGROUP_FLAGS_ONECHAR_UNICODE; |
3042 | + if(flags & REGEX_ASCII_CASE_INSENSITIVE) |
3043 | + return new CRegexXQuery_char_unicode_i(current_regex, utf8c); |
3044 | + else |
3045 | + { |
3046 | + if(pattern) |
3047 | + return new CRegexXQuery_char_unicode(current_regex, pattern, utf8len); |
3048 | + else |
3049 | + return new CRegexXQuery_char_unicode_cp(current_regex, utf8c); |
3050 | + } |
3051 | + } |
3052 | +} |
3053 | + |
3054 | +IRegexAtom* CRegexXQuery_parser::read_atom(const char *pattern, int *atom_len) |
3055 | { |
3056 | *atom_len = 0; |
3057 | - char c; |
3058 | - bool is_end_line = false; |
3059 | - c = pattern[*atom_len]; |
3060 | - if((!(flags & REGEX_ASCII_LITERAL)) && (c == '\\')) |
3061 | + if(flags & REGEX_ASCII_LITERAL) |
3062 | + { |
3063 | + unicode::code_point utf8c; |
3064 | + //bool is_end_line = false; |
3065 | + const char *temp_pattern = pattern; |
3066 | + utf8c = utf8::next_char(temp_pattern); |
3067 | + *atom_len = temp_pattern - pattern; |
3068 | + enum CHARGROUP_t multichar_type; |
3069 | + return create_charmatch(utf8c, pattern, *atom_len, &multichar_type); |
3070 | + } |
3071 | + |
3072 | + char c = *pattern; |
3073 | + if(c == '\\') |
3074 | { |
3075 | //check for back reference |
3076 | if(myisdigit(pattern[(*atom_len)+1])) |
3077 | @@ -281,13 +674,13 @@ |
3078 | (*atom_len)++; |
3079 | if(pattern[*atom_len] == '0') |
3080 | { |
3081 | - throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(U_REGEX_INVALID_BACK_REF)) ); |
3082 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_BACK_REF), 0, current_regex->subregex.size()) ); |
3083 | } |
3084 | unsigned int backref = pattern[*atom_len] - '0'; |
3085 | if((backref > current_regex->subregex.size()) || |
3086 | (current_regex->subregex.at(backref-1)->flags != 0)) |
3087 | { |
3088 | - throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(U_REGEX_INVALID_BACK_REF)) ); |
3089 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_BACK_REF), backref, current_regex->subregex.size()) ); |
3090 | } |
3091 | while(current_regex->subregex.size() >= backref*10) |
3092 | { |
3093 | @@ -303,70 +696,86 @@ |
3094 | break; |
3095 | } |
3096 | } |
3097 | - return new CRegexAscii_backref(current_regex, backref); |
3098 | + (*atom_len)++; |
3099 | + return new CRegexXQuery_backref(current_regex, backref); |
3100 | } |
3101 | } |
3102 | + if(c == '^') |
3103 | + { |
3104 | + (*atom_len)++; |
3105 | + return new CRegexXQuery_pinstart(current_regex); |
3106 | + } |
3107 | + if((c == '}') || (c == '{') || (c == '?') || (c == '*') || (c == '+') || (c == '|')) |
3108 | + { |
3109 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_ATOM_CHAR), c) ); |
3110 | + } |
3111 | switch(c) |
3112 | { |
3113 | case '[': |
3114 | { |
3115 | - if(!(flags & REGEX_ASCII_LITERAL)) |
3116 | - { |
3117 | - (*atom_len)++; |
3118 | - CRegexAscii_chargroup *chargroup = NULL; |
3119 | - int chargroup_len; |
3120 | - chargroup = readchargroup(pattern+*atom_len, &chargroup_len); |
3121 | - *atom_len += chargroup_len; |
3122 | - return chargroup; |
3123 | - } |
3124 | + (*atom_len)++; |
3125 | + CRegexXQuery_chargroup *chargroup = NULL; |
3126 | + int chargroup_len; |
3127 | + chargroup = readchargroup(pattern+*atom_len, &chargroup_len); |
3128 | + *atom_len += chargroup_len; |
3129 | + return chargroup; |
3130 | } |
3131 | case '.'://WildCharEsc |
3132 | { |
3133 | - if(!(flags & REGEX_ASCII_LITERAL)) |
3134 | - { |
3135 | - CRegexAscii_wildchar *wildchar = new CRegexAscii_wildchar(current_regex); |
3136 | - (*atom_len)++; |
3137 | - return wildchar; |
3138 | - } |
3139 | + (*atom_len)++; |
3140 | + return new CRegexXQuery_wildchar(current_regex); |
3141 | } |
3142 | case '('://begin an embedded reg exp |
3143 | { |
3144 | - if(!(flags & REGEX_ASCII_LITERAL)) |
3145 | - { |
3146 | - (*atom_len)++; |
3147 | - CRegexAscii_regex *emb_regex = NULL; |
3148 | - int regex_len; |
3149 | - emb_regex = parse_regexp(pattern + *atom_len, ®ex_len); |
3150 | - *atom_len += regex_len; |
3151 | - return emb_regex; |
3152 | - } |
3153 | + (*atom_len)++; |
3154 | + CRegexXQuery_regex *emb_regex = NULL; |
3155 | + int regex_len; |
3156 | + emb_regex = parse_regexp(pattern + *atom_len, ®ex_len); |
3157 | + *atom_len += regex_len; |
3158 | + return emb_regex; |
3159 | } |
3160 | case '$'://end line |
3161 | - if(!(flags & REGEX_ASCII_LITERAL)) |
3162 | - { |
3163 | - is_end_line = true; |
3164 | - } |
3165 | + //is_end_line = true; |
3166 | + (*atom_len)++; |
3167 | + return new CRegexXQuery_endline(current_regex); |
3168 | default: |
3169 | { |
3170 | - char c; |
3171 | + //char c; |
3172 | + CRegexXQuery_charmatch *charmatch = NULL; |
3173 | int c_len; |
3174 | - bool is_multichar = false; |
3175 | - if(!(flags & REGEX_ASCII_LITERAL)) |
3176 | - c = readChar(pattern+*atom_len, &c_len, &is_multichar); |
3177 | - else |
3178 | + CHARGROUP_t multichar_type = CHARGROUP_NO_MULTICHAR; |
3179 | + *atom_len = 0; |
3180 | + while(pattern[*atom_len]) |
3181 | { |
3182 | - c = pattern[*atom_len]; |
3183 | - c_len = 1; |
3184 | + charmatch = readChar(pattern+*atom_len, &c_len, &multichar_type); |
3185 | + *atom_len += c_len; |
3186 | + if((flags & REGEX_ASCII_NO_WHITESPACE) && (multichar_type == CHARGROUP_FLAGS_ONECHAR_ASCII)) |
3187 | + { |
3188 | + char c = (char)charmatch->get_c(); |
3189 | + if((c == ' ') || (c == '\t') || (c == '\r') || (c == '\n')) |
3190 | + { |
3191 | + //ignore this whitespace |
3192 | + delete charmatch; |
3193 | + continue; |
3194 | + } |
3195 | + else |
3196 | + break; |
3197 | + } |
3198 | + else |
3199 | + break; |
3200 | } |
3201 | - CRegexAscii_chargroup *chargroup = new CRegexAscii_chargroup(current_regex); |
3202 | - if(is_multichar) |
3203 | - chargroup->addMultiChar(c); |
3204 | + /* |
3205 | + std::auto_ptr<CRegexXQuery_chargroup> chargroup(new CRegexXQuery_chargroup(current_regex)); |
3206 | + if(multichar_type) |
3207 | + chargroup->addMultiChar(c, multichar_type); |
3208 | else if(is_end_line) |
3209 | chargroup->addEndLine(); |
3210 | else |
3211 | - chargroup->addCharRange(c, c); |
3212 | + chargroup->addOneChar(c); |
3213 | *atom_len += c_len; |
3214 | - return chargroup; |
3215 | + return chargroup.release(); |
3216 | + */ |
3217 | + return charmatch; |
3218 | } |
3219 | } |
3220 | } |
3221 | @@ -374,81 +783,119 @@ |
3222 | //read until ']' |
3223 | //posCharGroup ::= ( charRange | charClassEsc )+ |
3224 | //charRange ::= seRange | XmlCharIncDash |
3225 | -CRegexAscii_chargroup* CRegexAscii_parser::readchargroup(const char *pattern, int *chargroup_len) |
3226 | +CRegexXQuery_chargroup* CRegexXQuery_parser::readchargroup(const char *pattern, int *chargroup_len) |
3227 | { |
3228 | - CRegexAscii_chargroup *chargroup = NULL; |
3229 | + std::auto_ptr<CRegexXQuery_chargroup> chargroup; |
3230 | *chargroup_len = 0; |
3231 | if(pattern[*chargroup_len] == '^')//negative group |
3232 | { |
3233 | (*chargroup_len)++; |
3234 | - chargroup = new CRegexAscii_negchargroup(current_regex); |
3235 | + chargroup.reset(new CRegexXQuery_negchargroup(current_regex)); |
3236 | } |
3237 | else |
3238 | - chargroup = new CRegexAscii_chargroup(current_regex); |
3239 | + chargroup.reset(new CRegexXQuery_chargroup(current_regex)); |
3240 | while(pattern[*chargroup_len] && (pattern[*chargroup_len]!=']')) |
3241 | { |
3242 | - char c1, c2; |
3243 | - bool is_multichar; |
3244 | + //char c1, c2; |
3245 | + CHARGROUP_t multichar_type = CHARGROUP_NO_MULTICHAR; |
3246 | int c1_len; |
3247 | - c1 = pattern[*chargroup_len]; |
3248 | - c2 = pattern[*chargroup_len+1]; |
3249 | - if((c1 == '-') && (c2 == '['))//charClassSub |
3250 | + if((pattern[*chargroup_len] == '-') && (pattern[(*chargroup_len)+1] == '['))//charClassSub |
3251 | { |
3252 | int classsub_len; |
3253 | - CRegexAscii_chargroup *classsub = readchargroup(pattern + *chargroup_len+1 + 1, &classsub_len); |
3254 | + CRegexXQuery_chargroup *classsub = readchargroup(pattern + (*chargroup_len)+1 + 1, &classsub_len); |
3255 | if(!classsub) |
3256 | { |
3257 | - delete chargroup; |
3258 | - return NULL; |
3259 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_SUBCLASS)) ); |
3260 | } |
3261 | chargroup->addClassSub(classsub); |
3262 | *chargroup_len += 2 + classsub_len + 1; |
3263 | if(pattern[*chargroup_len-1] != ']') |
3264 | { |
3265 | - delete chargroup; |
3266 | - return NULL; |
3267 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_USE_OF_SUBCLASS)) ); |
3268 | } |
3269 | - return chargroup; |
3270 | + return chargroup.release(); |
3271 | } |
3272 | |
3273 | - c1 = readChar(pattern+*chargroup_len, &c1_len, &is_multichar); |
3274 | - if(is_multichar)//first char is multichar |
3275 | + std::unique_ptr<CRegexXQuery_charmatch> charmatch(readChar(pattern+*chargroup_len, &c1_len, &multichar_type)); |
3276 | + if((multichar_type == CHARGROUP_FLAGS_MULTICHAR_p) || |
3277 | + (multichar_type == CHARGROUP_FLAGS_MULTICHAR_Is) || |
3278 | + (multichar_type == CHARGROUP_FLAGS_MULTICHAR_OTHER))//first char is multichar |
3279 | { |
3280 | - chargroup->addMultiChar(c1); |
3281 | + if((pattern[*chargroup_len+c1_len] == '-') &&///should not be a range |
3282 | + (pattern[*chargroup_len+c1_len+1] != ']')) |
3283 | + { |
3284 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MULTICHAR_IN_CHAR_RANGE)) ); |
3285 | + } |
3286 | + //chargroup->addMultiChar(c1, multichar_type); |
3287 | + chargroup->addCharMatch(charmatch.release()); |
3288 | *chargroup_len += c1_len; |
3289 | continue; |
3290 | } |
3291 | - if(pattern[*chargroup_len+c1_len] == '-')///might be a range |
3292 | + (*chargroup_len) += c1_len; |
3293 | + if(pattern[*chargroup_len] == '-')///might be a range |
3294 | { |
3295 | - if(pattern[*chargroup_len+c1_len+1] == ']')//no range, just the last char is '-' |
3296 | + if(pattern[(*chargroup_len)+1] == ']')//no range, just the last char is '-' |
3297 | { |
3298 | - chargroup->addCharRange(c1, c1); |
3299 | - chargroup->addCharRange('-', '-'); |
3300 | - *chargroup_len += c1_len + 1; |
3301 | + //chargroup->addOneChar(c1); |
3302 | + //chargroup->addOneChar('-'); |
3303 | + chargroup->addCharMatch(charmatch.release()); |
3304 | + chargroup->addCharMatch(new CRegexXQuery_char_ascii(current_regex, '-')); |
3305 | + (*chargroup_len)++; |
3306 | continue; |
3307 | } |
3308 | - else |
3309 | + else if(pattern[(*chargroup_len)+1] != '[') |
3310 | { |
3311 | //it is a range |
3312 | - char c3; |
3313 | - int c3_len; |
3314 | - c3 = readChar(pattern+*chargroup_len+c1_len+1, &c3_len, &is_multichar); |
3315 | - if(is_multichar) |
3316 | - return NULL;//error |
3317 | - chargroup->addCharRange(c1, c3); |
3318 | - *chargroup_len += c1_len + 1 + c3_len; |
3319 | + (*chargroup_len)++; |
3320 | + std::unique_ptr<CRegexXQuery_charmatch> charmatch2; |
3321 | + CHARGROUP_t multichar_type2 = CHARGROUP_NO_MULTICHAR; |
3322 | + int c2_len; |
3323 | + charmatch2.reset(readChar(pattern+(*chargroup_len), &c2_len, &multichar_type2)); |
3324 | + if((multichar_type2 != CHARGROUP_FLAGS_ONECHAR_ASCII) && |
3325 | + (multichar_type2 != CHARGROUP_FLAGS_ONECHAR_ASCII))//second char in range is multichar |
3326 | + { |
3327 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MULTICHAR_IN_CHAR_RANGE)) ); |
3328 | + } |
3329 | + //chargroup->addCharRange(c1, c3); |
3330 | + if((multichar_type == CHARGROUP_FLAGS_ONECHAR_ASCII) && (multichar_type2 == CHARGROUP_FLAGS_ONECHAR_ASCII)) |
3331 | + { |
3332 | + if(flags & REGEX_ASCII_CASE_INSENSITIVE) |
3333 | + chargroup->addCharMatch(new CRegexXQuery_char_range_ascii_i(current_regex, |
3334 | + (char)charmatch->get_c(), |
3335 | + (char)charmatch2->get_c())); |
3336 | + else |
3337 | + chargroup->addCharMatch(new CRegexXQuery_char_range_ascii(current_regex, |
3338 | + (char)charmatch->get_c(), |
3339 | + (char)charmatch2->get_c())); |
3340 | + } |
3341 | + else |
3342 | + { |
3343 | + if(flags & REGEX_ASCII_CASE_INSENSITIVE) |
3344 | + chargroup->addCharMatch(new CRegexXQuery_char_range_unicode_i(current_regex, |
3345 | + charmatch->get_c(), |
3346 | + charmatch2->get_c())); |
3347 | + else |
3348 | + chargroup->addCharMatch(new CRegexXQuery_char_range_unicode(current_regex, |
3349 | + charmatch->get_c(), |
3350 | + charmatch2->get_c())); |
3351 | + } |
3352 | + *chargroup_len += c2_len; |
3353 | continue; |
3354 | } |
3355 | } |
3356 | - chargroup->addCharRange(c1, c1); |
3357 | - *chargroup_len += c1_len; |
3358 | + //chargroup->addOneChar(c1); |
3359 | + chargroup->addCharMatch(charmatch.release()); |
3360 | } |
3361 | if(pattern[*chargroup_len]) |
3362 | (*chargroup_len)++; |
3363 | - return chargroup; |
3364 | + else |
3365 | + { |
3366 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MISSING_CLOSE_BRACKET)) ); |
3367 | + } |
3368 | + return chargroup.release(); |
3369 | } |
3370 | |
3371 | -void CRegexAscii_parser::read_quantifier(CRegexAscii_piece *piece, |
3372 | +void CRegexXQuery_parser::read_quantifier(CRegexXQuery_piece *piece, |
3373 | const char *pattern, int *quantif_len) |
3374 | { |
3375 | *quantif_len = 0; |
3376 | @@ -496,6 +943,10 @@ |
3377 | max = max*10 + pattern[*quantif_len] - '0'; |
3378 | (*quantif_len)++; |
3379 | } |
3380 | + if(max < min) |
3381 | + { |
3382 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MAX_LT_MIN)) ); |
3383 | + } |
3384 | piece->set_quantifier_min_max(min, max, true); |
3385 | } |
3386 | while(pattern[*quantif_len] && (pattern[*quantif_len] != '}')) |
3387 | @@ -524,23 +975,25 @@ |
3388 | ///Constructors and destructors and internal functions |
3389 | //////////////////////////// |
3390 | |
3391 | -CRegexAscii_regex::CRegexAscii_regex(CRegexAscii_regex *topregex) : IRegexAtom(topregex?topregex:this) |
3392 | +CRegexXQuery_regex::CRegexXQuery_regex(CRegexXQuery_regex *topregex) : IRegexAtom(topregex?topregex:this) |
3393 | { |
3394 | matched_source = NULL; |
3395 | matched_len = 0; |
3396 | +// backup_matched_source = NULL; |
3397 | +// backup_matched_len = 0; |
3398 | flags = 128;//set to 0 after initialization |
3399 | } |
3400 | |
3401 | -CRegexAscii_regex::~CRegexAscii_regex() |
3402 | +CRegexXQuery_regex::~CRegexXQuery_regex() |
3403 | { |
3404 | - std::list<CRegexAscii_branch*>::iterator branch_it; |
3405 | + std::list<CRegexXQuery_branch*>::iterator branch_it; |
3406 | |
3407 | for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++) |
3408 | { |
3409 | delete (*branch_it); |
3410 | } |
3411 | /* |
3412 | - std::vector<CRegexAscii_regex*>::iterator subregex_it; |
3413 | + std::vector<CRegexXQuery_regex*>::iterator subregex_it; |
3414 | for(subregex_it = subregex.begin(); subregex_it != subregex.end(); subregex_it++) |
3415 | { |
3416 | delete (*subregex_it); |
3417 | @@ -548,25 +1001,18 @@ |
3418 | */ |
3419 | } |
3420 | |
3421 | -bool CRegexAscii_regex::set_align_begin(bool align_begin) |
3422 | -{ |
3423 | - bool prev_align = this->align_begin; |
3424 | - this->align_begin = align_begin; |
3425 | - return prev_align; |
3426 | -} |
3427 | - |
3428 | -void CRegexAscii_regex::add_branch(CRegexAscii_branch *branch) |
3429 | +void CRegexXQuery_regex::add_branch(CRegexXQuery_branch *branch) |
3430 | { |
3431 | branch_list.push_back(branch); |
3432 | } |
3433 | |
3434 | -bool CRegexAscii_regex::get_indexed_match(int index, |
3435 | +bool CRegexXQuery_regex::get_indexed_match(int index, |
3436 | const char **matched_source, |
3437 | int *matched_len) |
3438 | { |
3439 | if(!index || index > (int)subregex.size()) |
3440 | return false; |
3441 | - CRegexAscii_regex *subr = subregex[index-1]; |
3442 | + CRegexXQuery_regex *subr = subregex[index-1]; |
3443 | *matched_source = subr->matched_source; |
3444 | if(!*matched_source) |
3445 | return false; |
3446 | @@ -574,145 +1020,209 @@ |
3447 | return true; |
3448 | } |
3449 | |
3450 | -unsigned int CRegexAscii_regex::get_indexed_regex_count() |
3451 | +unsigned int CRegexXQuery_regex::get_indexed_regex_count() |
3452 | { |
3453 | return subregex.size(); |
3454 | } |
3455 | |
3456 | -CRegexAscii_branch::CRegexAscii_branch(CRegexAscii_regex* regex) : |
3457 | - IRegexMatcher(regex) |
3458 | +CRegexXQuery_branch::CRegexXQuery_branch(CRegexXQuery_regex* regex) |
3459 | + //: |
3460 | + //IRegexMatcher(regex) |
3461 | { |
3462 | } |
3463 | |
3464 | -CRegexAscii_branch::~CRegexAscii_branch() |
3465 | +CRegexXQuery_branch::~CRegexXQuery_branch() |
3466 | { |
3467 | - std::list<CRegexAscii_piece*>::iterator piece_it; |
3468 | + std::list<RegexAscii_pieceinfo>::iterator piece_it; |
3469 | |
3470 | for(piece_it = piece_list.begin(); piece_it != piece_list.end(); piece_it++) |
3471 | { |
3472 | - delete (*piece_it); |
3473 | + delete (*piece_it).piece; |
3474 | } |
3475 | } |
3476 | |
3477 | -void CRegexAscii_branch::add_piece(CRegexAscii_piece *piece) |
3478 | +void CRegexXQuery_branch::add_piece(CRegexXQuery_piece *piece) |
3479 | { |
3480 | piece_list.push_back(piece); |
3481 | } |
3482 | |
3483 | -CRegexAscii_piece::CRegexAscii_piece() |
3484 | +CRegexXQuery_piece::CRegexXQuery_piece() |
3485 | { |
3486 | + atom = NULL; |
3487 | + regex_atom = NULL; |
3488 | } |
3489 | |
3490 | -CRegexAscii_piece::~CRegexAscii_piece() |
3491 | +CRegexXQuery_piece::~CRegexXQuery_piece() |
3492 | { |
3493 | delete atom; |
3494 | } |
3495 | |
3496 | -void CRegexAscii_piece::set_atom(IRegexAtom *atom) |
3497 | +void CRegexXQuery_piece::set_atom(IRegexAtom *atom) |
3498 | { |
3499 | this->atom = atom; |
3500 | + this->regex_atom = dynamic_cast<CRegexXQuery_regex*>(atom); |
3501 | } |
3502 | |
3503 | -void CRegexAscii_piece::set_quantifier_min_max(int min, int max, bool strict_max) |
3504 | +void CRegexXQuery_piece::set_quantifier_min_max(int min, int max, bool strict_max) |
3505 | { |
3506 | this->min = min; |
3507 | this->max = max; |
3508 | this->strict_max = strict_max; |
3509 | } |
3510 | -void CRegexAscii_piece::set_is_reluctant(bool is_reluctant) |
3511 | +void CRegexXQuery_piece::set_is_reluctant(bool is_reluctant) |
3512 | { |
3513 | this->is_reluctant = is_reluctant; |
3514 | } |
3515 | -void CRegexAscii_piece::get_quantifier(int *min, int *max, bool *strict_max) |
3516 | +void CRegexXQuery_piece::get_quantifier(int *min, int *max, bool *strict_max) |
3517 | { |
3518 | *min = this->min; |
3519 | *max = this->max; |
3520 | *strict_max = this->strict_max; |
3521 | } |
3522 | -bool CRegexAscii_piece::get_is_reluctant() |
3523 | +bool CRegexXQuery_piece::get_is_reluctant() |
3524 | { |
3525 | + if(atom->regex_intern->flags & REGEX_ASCII_MINIMAL_MATCH) |
3526 | + return true; |
3527 | return is_reluctant; |
3528 | } |
3529 | |
3530 | |
3531 | -CRegexAscii_chargroup::CRegexAscii_chargroup(CRegexAscii_regex* regex) : |
3532 | +CRegexXQuery_charmatch::CRegexXQuery_charmatch(CRegexXQuery_regex* regex) : |
3533 | + IRegexAtom(regex) |
3534 | +{ |
3535 | +} |
3536 | +CRegexXQuery_multicharP::CRegexXQuery_multicharP(CRegexXQuery_regex* regex, char type, bool is_reverse) : |
3537 | + CRegexXQuery_charmatch(regex) |
3538 | +{ |
3539 | + this->multichar_type = type; this->is_reverse = is_reverse; |
3540 | +} |
3541 | +CRegexXQuery_multicharIs::CRegexXQuery_multicharIs(CRegexXQuery_regex* regex, int block_index, bool is_reverse) : |
3542 | + CRegexXQuery_charmatch(regex) |
3543 | +{ |
3544 | + this->block_index = block_index; this->is_reverse = is_reverse; |
3545 | +} |
3546 | +CRegexXQuery_multicharOther::CRegexXQuery_multicharOther(CRegexXQuery_regex* regex, char type) : |
3547 | + CRegexXQuery_charmatch(regex) |
3548 | +{ |
3549 | + this->multichar_type = type; |
3550 | +} |
3551 | +CRegexXQuery_char_ascii::CRegexXQuery_char_ascii(CRegexXQuery_regex* regex, char c) : |
3552 | + CRegexXQuery_charmatch(regex) |
3553 | +{ |
3554 | + this->c = c; |
3555 | +} |
3556 | +CRegexXQuery_char_ascii_i::CRegexXQuery_char_ascii_i(CRegexXQuery_regex* regex, char c) : |
3557 | + CRegexXQuery_char_ascii(regex, toupper(c)) |
3558 | +{ |
3559 | +} |
3560 | +CRegexXQuery_char_range_ascii::CRegexXQuery_char_range_ascii(CRegexXQuery_regex* regex, char c1, char c2) : |
3561 | + CRegexXQuery_charmatch(regex) |
3562 | +{ |
3563 | + this->c1 = c1; this->c2 = c2; |
3564 | +} |
3565 | +CRegexXQuery_char_range_ascii_i::CRegexXQuery_char_range_ascii_i(CRegexXQuery_regex* regex, char c1, char c2) : |
3566 | + CRegexXQuery_char_range_ascii(regex, toupper(c1), toupper(c2)) |
3567 | +{ |
3568 | +} |
3569 | +CRegexXQuery_char_unicode::CRegexXQuery_char_unicode(CRegexXQuery_regex* regex, const char *source, int len) : |
3570 | + CRegexXQuery_charmatch(regex) |
3571 | +{ |
3572 | + this->len = len; |
3573 | + memcpy(c, source, len); |
3574 | +} |
3575 | +CRegexXQuery_char_unicode_cp::CRegexXQuery_char_unicode_cp(CRegexXQuery_regex* regex, unicode::code_point c) : |
3576 | + CRegexXQuery_charmatch(regex) |
3577 | +{ |
3578 | + this->c = c; |
3579 | +} |
3580 | +CRegexXQuery_char_unicode_i::CRegexXQuery_char_unicode_i(CRegexXQuery_regex* regex, unicode::code_point c) : |
3581 | + CRegexXQuery_char_unicode_cp(regex, unicode::to_upper(c)) |
3582 | +{ |
3583 | +} |
3584 | +CRegexXQuery_char_range_unicode::CRegexXQuery_char_range_unicode(CRegexXQuery_regex* regex, unicode::code_point c1, unicode::code_point c2) : |
3585 | + CRegexXQuery_charmatch(regex) |
3586 | +{ |
3587 | + this->c1 = c1; this->c2 = c2; |
3588 | +} |
3589 | +CRegexXQuery_char_range_unicode_i::CRegexXQuery_char_range_unicode_i(CRegexXQuery_regex* regex, unicode::code_point c1, unicode::code_point c2) : |
3590 | + CRegexXQuery_char_range_unicode(regex, unicode::to_upper(c1), unicode::to_upper(c2)) |
3591 | +{ |
3592 | +} |
3593 | +CRegexXQuery_endline::CRegexXQuery_endline(CRegexXQuery_regex* regex) : |
3594 | + CRegexXQuery_charmatch(regex) |
3595 | +{ |
3596 | +} |
3597 | + |
3598 | +unicode::code_point CRegexXQuery_char_unicode::get_c() |
3599 | +{ |
3600 | + const char *temp_c = (const char*)c; |
3601 | + return utf8::next_char(temp_c); |
3602 | +} |
3603 | + |
3604 | + |
3605 | +CRegexXQuery_chargroup::CRegexXQuery_chargroup(CRegexXQuery_regex* regex) : |
3606 | IRegexAtom(regex) |
3607 | { |
3608 | classsub = NULL; |
3609 | } |
3610 | |
3611 | -CRegexAscii_chargroup::~CRegexAscii_chargroup() |
3612 | +CRegexXQuery_chargroup::~CRegexXQuery_chargroup() |
3613 | { |
3614 | delete classsub; |
3615 | -} |
3616 | - |
3617 | -void CRegexAscii_chargroup::addMultiChar(char c) |
3618 | -{ |
3619 | - chargroup_t cgt; |
3620 | - cgt.flags = CHARGROUP_FLAGS_MULTICHAR; |
3621 | - cgt.c1 = c; |
3622 | - cgt.c2 = 0; |
3623 | - chargroup_list.push_back(cgt); |
3624 | -} |
3625 | - |
3626 | -void CRegexAscii_chargroup::addEndLine() |
3627 | -{ |
3628 | - chargroup_t cgt; |
3629 | - cgt.flags = CHARGROUP_FLAGS_ENDLINE; |
3630 | - cgt.c1 = '$'; |
3631 | - cgt.c2 = 0; |
3632 | - chargroup_list.push_back(cgt); |
3633 | -} |
3634 | - |
3635 | -void CRegexAscii_chargroup::addCharRange(char c1, char c2) |
3636 | -{ |
3637 | - chargroup_t cgt; |
3638 | - cgt.flags = 0; |
3639 | - cgt.c1 = c1; |
3640 | - cgt.c2 = c2; |
3641 | - chargroup_list.push_back(cgt); |
3642 | -} |
3643 | - |
3644 | -void CRegexAscii_chargroup::addClassSub(CRegexAscii_chargroup* classsub) |
3645 | + std::list<CRegexXQuery_charmatch* >::iterator charmatch_it; |
3646 | + for(charmatch_it=chargroup_list.begin(); charmatch_it != chargroup_list.end(); charmatch_it++) |
3647 | + delete (*charmatch_it); |
3648 | +} |
3649 | + |
3650 | +void CRegexXQuery_chargroup::addCharMatch(CRegexXQuery_charmatch *charmatch) |
3651 | +{ |
3652 | + chargroup_list.push_back(charmatch); |
3653 | +} |
3654 | +void CRegexXQuery_chargroup::addClassSub(CRegexXQuery_chargroup* classsub) |
3655 | { |
3656 | this->classsub = classsub; |
3657 | } |
3658 | |
3659 | -CRegexAscii_negchargroup::CRegexAscii_negchargroup(CRegexAscii_regex* regex) : |
3660 | - CRegexAscii_chargroup(regex) |
3661 | -{ |
3662 | -} |
3663 | - |
3664 | -CRegexAscii_negchargroup::~CRegexAscii_negchargroup() |
3665 | -{ |
3666 | -} |
3667 | - |
3668 | -CRegexAscii_wildchar::CRegexAscii_wildchar(CRegexAscii_regex* regex) : |
3669 | +CRegexXQuery_negchargroup::CRegexXQuery_negchargroup(CRegexXQuery_regex* regex) : |
3670 | + CRegexXQuery_chargroup(regex) |
3671 | +{ |
3672 | +} |
3673 | + |
3674 | +CRegexXQuery_negchargroup::~CRegexXQuery_negchargroup() |
3675 | +{ |
3676 | +} |
3677 | + |
3678 | +CRegexXQuery_wildchar::CRegexXQuery_wildchar(CRegexXQuery_regex* regex) : |
3679 | IRegexAtom(regex) |
3680 | { |
3681 | } |
3682 | |
3683 | -CRegexAscii_wildchar::~CRegexAscii_wildchar() |
3684 | +CRegexXQuery_wildchar::~CRegexXQuery_wildchar() |
3685 | { |
3686 | } |
3687 | |
3688 | -CRegexAscii_backref::CRegexAscii_backref(CRegexAscii_regex* regex, unsigned int backref_) : |
3689 | +CRegexXQuery_backref::CRegexXQuery_backref(CRegexXQuery_regex* regex, unsigned int backref_) : |
3690 | IRegexAtom(regex), |
3691 | backref(backref_) |
3692 | { |
3693 | } |
3694 | |
3695 | -CRegexAscii_backref::~CRegexAscii_backref() |
3696 | -{ |
3697 | -} |
3698 | - |
3699 | -CRegexAscii_parser::CRegexAscii_parser() |
3700 | +CRegexXQuery_backref::~CRegexXQuery_backref() |
3701 | +{ |
3702 | +} |
3703 | + |
3704 | +CRegexXQuery_pinstart::CRegexXQuery_pinstart(CRegexXQuery_regex* regex): |
3705 | + IRegexAtom(regex) |
3706 | +{ |
3707 | +} |
3708 | + |
3709 | +CRegexXQuery_parser::CRegexXQuery_parser() |
3710 | { |
3711 | current_regex = NULL; |
3712 | regex_depth = 0; |
3713 | } |
3714 | |
3715 | -CRegexAscii_parser::~CRegexAscii_parser() |
3716 | +CRegexXQuery_parser::~CRegexXQuery_parser() |
3717 | { |
3718 | } |
3719 | |
3720 | @@ -720,9 +1230,68 @@ |
3721 | ////////////////////////////////////////// |
3722 | ////Matching the pattern on a string |
3723 | ///////////////////////////////////////// |
3724 | +static std::list<RegexAscii_pieceinfo> empty_pieces;//empty list of pieces |
3725 | +/* |
3726 | +std::list<RegexAscii_pieceinfo>::iterator |
3727 | +IRegexAtom::choose_next_piece(const char *source, int *matched_len, |
3728 | + std::list<RegexAscii_pieceinfo>::iterator this_piece, |
3729 | + std::list<RegexAscii_pieceinfo>::iterator end_piece) |
3730 | +{ |
3731 | + //if this_piece is repetition, repeat until max, then go to next piece |
3732 | + int min, max; |
3733 | + bool strict_max; |
3734 | + while(this_piece != end_piece) |
3735 | + { |
3736 | + (*this_piece).piece->get_quantifier(&min, &max, &strict_max); |
3737 | + if(max <= ((*this_piece).nr_matches))//finished this piece |
3738 | + { |
3739 | + this_piece++; |
3740 | + } |
3741 | + else |
3742 | + break; |
3743 | + } |
3744 | + return this_piece; |
3745 | +} |
3746 | +*/ |
3747 | + |
3748 | +bool IRegexAtom::match(const char *source, int *start_from_branch, int *matched_len, |
3749 | + std::list<RegexAscii_pieceinfo>::iterator this_piece, |
3750 | + std::list<RegexAscii_pieceinfo>::iterator end_piece) |
3751 | +{ |
3752 | + *start_from_branch = 0; |
3753 | + bool retmatch; |
3754 | + retmatch = match_internal(source, start_from_branch, matched_len); |
3755 | + if(!retmatch) |
3756 | + return false; |
3757 | + |
3758 | + if(this_piece == end_piece) |
3759 | + return true; |
3760 | + |
3761 | + (*this_piece).nr_matches++; |
3762 | + int min,max; |
3763 | + bool strict_max; |
3764 | + (*this_piece).piece->get_quantifier(&min, &max, &strict_max); |
3765 | + std::list<RegexAscii_pieceinfo>::iterator init_piece = this_piece; |
3766 | + if(((min == 1) && (max == 1)) || //the simple common case |
3767 | + ((*matched_len == 0) && ((*this_piece).nr_matches>=min)))//to avoid infinite loop |
3768 | + { |
3769 | + this_piece++; |
3770 | + if(this_piece == end_piece) |
3771 | + return true; |
3772 | + } |
3773 | + int matched_len2; |
3774 | + retmatch = (*this_piece).piece->match_piece(this_piece, end_piece, source + *matched_len, &matched_len2); |
3775 | + if(!retmatch) |
3776 | + { |
3777 | + (*init_piece).nr_matches--; |
3778 | + return false; |
3779 | + } |
3780 | + *matched_len += matched_len2; |
3781 | + return true; |
3782 | +} |
3783 | |
3784 | //try every position in source to match the pattern |
3785 | -bool CRegexAscii_regex::match_anywhere(const char *source, unsigned int flags, |
3786 | +bool CRegexXQuery_regex::match_anywhere(const char *source, unsigned int flags, |
3787 | int *match_pos, int *matched_len) |
3788 | { |
3789 | *match_pos = 0; |
3790 | @@ -730,43 +1299,66 @@ |
3791 | return match_from(source, flags, match_pos, matched_len); |
3792 | } |
3793 | |
3794 | -bool CRegexAscii_regex::match_from(const char *source, unsigned int flags, |
3795 | +bool CRegexXQuery_regex::match_from(const char *source, unsigned int flags, |
3796 | int *match_pos, int *matched_len) |
3797 | { |
3798 | this->flags = flags; |
3799 | + this->source_start = source; |
3800 | reachedEnd = false; |
3801 | |
3802 | - std::vector<CRegexAscii_regex*>::iterator regex_it; |
3803 | + std::vector<CRegexXQuery_regex*>::iterator regex_it; |
3804 | for(regex_it = subregex.begin(); regex_it != subregex.end(); regex_it++) |
3805 | { |
3806 | (*regex_it)->matched_source = NULL; |
3807 | } |
3808 | -// if(!source[0]) |
3809 | -// { |
3810 | -// if(branch_list.empty()) |
3811 | -// return true; |
3812 | -// else |
3813 | -// return false; |
3814 | -// } |
3815 | - |
3816 | - bool skip_first_match = false; |
3817 | - if(*match_pos && align_begin) |
3818 | - skip_first_match = true; |
3819 | + |
3820 | + std::vector<std::pair<const char*, int> > saved_subregex; |
3821 | + |
3822 | + if(*match_pos && (flags & REGEX_ASCII_WHOLE_MATCH)) |
3823 | + return false; |
3824 | + |
3825 | do |
3826 | { |
3827 | - if(!skip_first_match) |
3828 | - { |
3829 | - if(match(source + *match_pos, matched_len)) |
3830 | - return true; |
3831 | - } |
3832 | - skip_first_match = false; |
3833 | - if(align_begin) |
3834 | + int start_from_branch = 0; |
3835 | + int longest_match = -1; |
3836 | + while(1) |
3837 | + { |
3838 | + if(!match(source + *match_pos, &start_from_branch, matched_len, empty_pieces.begin(), empty_pieces.end())) |
3839 | + break; |
3840 | + if(longest_match < *matched_len) |
3841 | + { |
3842 | + longest_match = *matched_len; |
3843 | + if(start_from_branch && (flags & REGEX_ASCII_GET_LONGEST_BRANCH)) |
3844 | + save_subregex_list(saved_subregex); |
3845 | + } |
3846 | + if(!start_from_branch || !(flags & REGEX_ASCII_GET_LONGEST_BRANCH)) |
3847 | + break; |
3848 | + //else try the other branches to see which is longer |
3849 | + } |
3850 | + if(longest_match != -1) |
3851 | + { |
3852 | + *matched_len = longest_match; |
3853 | + if(saved_subregex.size()) |
3854 | + load_subregex_list(saved_subregex); |
3855 | + if(flags & REGEX_ASCII_WHOLE_MATCH) |
3856 | + { |
3857 | + if(!source[*match_pos+*matched_len]) |
3858 | + return true; |
3859 | + if((flags & REGEX_ASCII_MULTILINE) && |
3860 | + ((source[*match_pos+*matched_len] == '\n') || (source[*match_pos+*matched_len] == '\r'))) |
3861 | + return true; |
3862 | + return false; |
3863 | + } |
3864 | + return true; |
3865 | + } |
3866 | + |
3867 | + if(flags & REGEX_ASCII_WHOLE_MATCH) |
3868 | { |
3869 | if(flags & REGEX_ASCII_MULTILINE) |
3870 | { |
3871 | - //goto the next line |
3872 | + //go to next line |
3873 | while(source[*match_pos] && (source[*match_pos] != '\n') && (source[*match_pos] != '\r')) |
3874 | - (*match_pos)++; |
3875 | + (*match_pos) += myutf8len(source); |
3876 | if(source[*match_pos] == '\n') |
3877 | { |
3878 | (*match_pos)++; |
3879 | @@ -780,190 +1372,1039 @@ |
3880 | (*match_pos)++; |
3881 | } |
3882 | if(!source[*match_pos]) |
3883 | - return false; |
3884 | + break; |
3885 | continue; |
3886 | } |
3887 | - return false; |
3888 | + break; |
3889 | } |
3890 | if(!source[*match_pos]) |
3891 | break; |
3892 | - (*match_pos)++; |
3893 | + (*match_pos) += myutf8len(source); |
3894 | } |
3895 | while(source[*match_pos]); |
3896 | +// if(!source[*match_pos]) |
3897 | +// { |
3898 | +// reachedEnd = true; |
3899 | +// } |
3900 | return false; |
3901 | } |
3902 | |
3903 | +void CRegexXQuery_regex::reset_match() |
3904 | +{ |
3905 | +// this->backup_matched_source = this->matched_source; |
3906 | +// this->backup_matched_len = this->matched_len; |
3907 | + this->matched_source = NULL; |
3908 | + this->matched_len = 0; |
3909 | + std::list<CRegexXQuery_branch*>::iterator branch_it; |
3910 | + for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++) |
3911 | + { |
3912 | + (*branch_it)->reset(); |
3913 | + } |
3914 | +} |
3915 | +/* |
3916 | +void CRegexXQuery_regex::restore_match() |
3917 | +{ |
3918 | + this->matched_source = this->backup_matched_source; |
3919 | + this->matched_len = this->backup_matched_len; |
3920 | + std::list<CRegexXQuery_branch*>::iterator branch_it; |
3921 | + for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++) |
3922 | + { |
3923 | + (*branch_it)->restore(); |
3924 | + } |
3925 | +} |
3926 | +*/ |
3927 | //match any of the branches |
3928 | -bool CRegexAscii_regex::match(const char *source, int *matched_len) |
3929 | +bool CRegexXQuery_regex::match(const char *source, int *start_from_branch, int *matched_len, |
3930 | + std::list<RegexAscii_pieceinfo>::iterator next_piece, |
3931 | + std::list<RegexAscii_pieceinfo>::iterator end_piece) |
3932 | { |
3933 | reachedEnd = false; |
3934 | - std::list<CRegexAscii_branch*>::iterator branch_it; |
3935 | - |
3936 | - for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++) |
3937 | - { |
3938 | - if((*branch_it)->match(source, matched_len)) |
3939 | - { |
3940 | - matched_source = source; |
3941 | - this->matched_len = *matched_len; |
3942 | + if(!(flags & REGEX_ASCII_GROUPING_LEN_WHOLE_PIECE) || |
3943 | + (this->matched_source == NULL) || ((this->matched_source + this->matched_len) != source)) |
3944 | + this->matched_source = source; |
3945 | + *matched_len = 0; |
3946 | + std::list<CRegexXQuery_branch*>::iterator branch_it; |
3947 | + |
3948 | + if(*start_from_branch == 0) |
3949 | + { |
3950 | + for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++) |
3951 | + { |
3952 | + (*branch_it)->reset(); |
3953 | + } |
3954 | + } |
3955 | + |
3956 | + branch_it = branch_list.begin(); |
3957 | + if(*start_from_branch) |
3958 | + { |
3959 | + for(int i=0;i<*start_from_branch;i++) |
3960 | + branch_it++; |
3961 | + } |
3962 | + (*start_from_branch)++; |
3963 | + for(; branch_it != branch_list.end(); branch_it++,(*start_from_branch)++) |
3964 | + { |
3965 | + if((*branch_it)->match(source, matched_len, this, next_piece, end_piece)) |
3966 | + { |
3967 | + //matched_source = source; |
3968 | + //this->matched_len = *matched_len; |
3969 | return true; |
3970 | } |
3971 | } |
3972 | - matched_source = NULL; |
3973 | - matched_len = 0; |
3974 | + *start_from_branch = 0; |
3975 | + if(this->matched_source == source) |
3976 | + this->matched_source = NULL; |
3977 | + *matched_len = 0; |
3978 | return false; |
3979 | } |
3980 | |
3981 | +void CRegexXQuery_regex::save_subregex_list(std::vector<std::pair<const char*, int> > &saved_subregex) |
3982 | +{ |
3983 | + saved_subregex.resize(0); |
3984 | + saved_subregex.reserve(subregex.size()); |
3985 | + std::vector<CRegexXQuery_regex*>::iterator it; |
3986 | + for(it=subregex.begin(); it != subregex.end(); it++) |
3987 | + { |
3988 | + saved_subregex.push_back(std::pair<const char*, int>((*it)->matched_source, (*it)->matched_len)); |
3989 | + } |
3990 | +} |
3991 | + |
3992 | +void CRegexXQuery_regex::load_subregex_list(std::vector<std::pair<const char*, int> > &saved_subregex) |
3993 | +{ |
3994 | + std::vector<std::pair<const char*, int> >::iterator it; |
3995 | + std::vector<CRegexXQuery_regex*>::iterator subit; |
3996 | + for(it=saved_subregex.begin(), subit = subregex.begin(); it != saved_subregex.end(); it++, subit++) |
3997 | + { |
3998 | + (*subit)->matched_source = (*it).first; |
3999 | + (*subit)->matched_len = (*it).second; |
4000 | + } |
4001 | +} |
4002 | + |
4003 | +void CRegexXQuery_branch::reset() |
4004 | +{ |
4005 | + std::list<RegexAscii_pieceinfo>::iterator piece_it; |
4006 | + for(piece_it = piece_list.begin(); piece_it != piece_list.end(); piece_it++) |
4007 | + { |
4008 | + (*piece_it).piece->atom->reset_match(); |
4009 | + } |
4010 | +} |
4011 | +/* |
4012 | +void CRegexXQuery_branch::restore() |
4013 | +{ |
4014 | + std::list<RegexAscii_pieceinfo>::iterator piece_it; |
4015 | + for(piece_it = piece_list.begin(); piece_it != piece_list.end(); piece_it++) |
4016 | + { |
4017 | + (*piece_it).piece->atom->restore_match(); |
4018 | + } |
4019 | +} |
4020 | +*/ |
4021 | //match all the pieces |
4022 | -bool CRegexAscii_branch::match(const char *source, int *matched_len) |
4023 | +bool CRegexXQuery_branch::match(const char *source, int *matched_len, |
4024 | + CRegexXQuery_regex* group_regex, |
4025 | + std::list<RegexAscii_pieceinfo>::iterator next_piece, |
4026 | + std::list<RegexAscii_pieceinfo>::iterator end_piece) |
4027 | { |
4028 | - std::list<CRegexAscii_piece*>::iterator piece_it; |
4029 | + std::list<RegexAscii_pieceinfo>::iterator piece_it; |
4030 | |
4031 | piece_it = piece_list.begin(); |
4032 | + //if(piece_it == piece_list.end()) |
4033 | + //if(!source[0]) |
4034 | + // return true; |
4035 | + //else |
4036 | + // return false; |
4037 | if(piece_it == piece_list.end()) |
4038 | - if(source[0]) |
4039 | - return false; |
4040 | + { |
4041 | + piece_it = next_piece; |
4042 | + if(next_piece == end_piece) |
4043 | + { |
4044 | + group_regex->matched_len = 0; |
4045 | + return true; |
4046 | + } |
4047 | + } |
4048 | + |
4049 | + std::list<RegexAscii_pieceinfo> temp_pieces(piece_list); |
4050 | + temp_pieces.push_back(group_regex);//this will be used to store the group match |
4051 | + temp_pieces.insert(temp_pieces.end(), next_piece, end_piece); |
4052 | + |
4053 | + return (*piece_it).piece->match_piece(temp_pieces.begin(), temp_pieces.end(), source, matched_len); |
4054 | +} |
4055 | + |
4056 | +bool CRegexXQuery_piece::match_piece(std::list<RegexAscii_pieceinfo>::iterator piece_it, |
4057 | + std::list<RegexAscii_pieceinfo>::iterator end_it, |
4058 | + const char *source, int *matched_len) |
4059 | +{ |
4060 | + if((*piece_it).nr_matches < 0) |
4061 | + { |
4062 | + //special case, store the group match |
4063 | + (*piece_it).group_regex->matched_len = source - (*piece_it).group_regex->matched_source; |
4064 | + piece_it++; |
4065 | + if(piece_it == end_it) |
4066 | + return true; |
4067 | else |
4068 | - return true; |
4069 | - if(!(*piece_it)->get_is_reluctant()) |
4070 | - return match_piece_iter_normal(piece_it, source, matched_len); |
4071 | + return (*piece_it).piece->match_piece(piece_it, end_it, source, matched_len); |
4072 | + } |
4073 | + |
4074 | + if(!get_is_reluctant()) |
4075 | + return match_piece_iter_normal(piece_it, end_it, source, matched_len); |
4076 | else |
4077 | - return match_piece_iter_reluctant(piece_it, source, matched_len); |
4078 | -} |
4079 | - |
4080 | -//match as less as possible |
4081 | -bool CRegexAscii_branch::match_piece_iter_reluctant( |
4082 | - std::list<CRegexAscii_piece*>::iterator piece_it, |
4083 | + return match_piece_iter_reluctant(piece_it, end_it, source, matched_len); |
4084 | +} |
4085 | + |
4086 | +int CRegexXQuery_piece::choose_another_branch(std::vector<std::pair<int,int> > &match_lens) |
4087 | +{ |
4088 | + int i = match_lens.size()-1; |
4089 | + i--; |
4090 | + while((i >= 0) && (match_lens.at(i).second == 0)) |
4091 | + i--; |
4092 | + if(i < 0) |
4093 | + return -1;//no more branches |
4094 | + match_lens.resize(i+1); |
4095 | + i++; |
4096 | + return i; |
4097 | +} |
4098 | + |
4099 | +bool CRegexXQuery_piece::is_regex_atom() |
4100 | +{ |
4101 | + return regex_atom != NULL; |
4102 | +} |
4103 | + |
4104 | +//match as little as possible (shortest string) |
4105 | +bool CRegexXQuery_piece::match_piece_iter_reluctant( |
4106 | + std::list<RegexAscii_pieceinfo>::iterator piece_it, |
4107 | + std::list<RegexAscii_pieceinfo>::iterator end_it, |
4108 | const char *source, int *matched_len) |
4109 | { |
4110 | *matched_len = 0; |
4111 | - if(piece_it == piece_list.end()) |
4112 | + if(piece_it == end_it) |
4113 | return true; |
4114 | |
4115 | int min, max; |
4116 | bool strict_max; |
4117 | //std::vector<int> match_lens; |
4118 | - (*piece_it)->get_quantifier(&min, &max, &strict_max); |
4119 | - if(strict_max && (max >= 0)) |
4120 | + (*piece_it).piece->get_quantifier(&min, &max, &strict_max); |
4121 | + |
4122 | + std::vector<std::pair<const char*, int> > saved_subregex; |
4123 | + |
4124 | + if(is_regex_atom()) |
4125 | { |
4126 | - int timeslen; |
4127 | - //check if the piece doesn't exceed the max match |
4128 | - if((*piece_it)->match_piece_times(source, ×len, max+1, NULL)) |
4129 | - return false;///too many matches |
4130 | + //recursive |
4131 | + bool retmatch; |
4132 | + atom->regex_intern->save_subregex_list(saved_subregex); |
4133 | + if((*piece_it).nr_matches >= min) |
4134 | + { |
4135 | + //go to next piece |
4136 | + std::list<RegexAscii_pieceinfo>::iterator next_it = piece_it; |
4137 | + next_it++; |
4138 | + if(next_it == end_it) |
4139 | + return true; |
4140 | + retmatch = (*next_it).piece->match_piece(next_it, end_it, source, matched_len); |
4141 | + if(retmatch) |
4142 | + return true; |
4143 | + } |
4144 | + if(((max == -1) || ((*piece_it).nr_matches < max)) &&//try further with this piece |
4145 | + (((*piece_it).nr_matches < min) || ((*piece_it).nr_matches == 0) || ((*piece_it).piece->regex_atom->matched_len)))//if matched_len is zero, avoid infinite loop |
4146 | + { |
4147 | + int start_from_branch = 0; |
4148 | + int shortest_len = -1; |
4149 | + bool branch_saved = false; |
4150 | + //try all branches to get the shortest len |
4151 | + (*piece_it).nr_matches++; |
4152 | + while(atom->match(source, &start_from_branch, matched_len, piece_it, end_it)) |
4153 | + { |
4154 | + if((shortest_len == -1) || (shortest_len > *matched_len)) |
4155 | + { |
4156 | + shortest_len = *matched_len; |
4157 | + if(start_from_branch && (atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH)) |
4158 | + { |
4159 | + atom->regex_intern->save_subregex_list(saved_subregex); |
4160 | + branch_saved = true; |
4161 | + } |
4162 | + } |
4163 | + if(!start_from_branch || !(atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH)) |
4164 | + break; |
4165 | + } |
4166 | + if(shortest_len != -1) |
4167 | + { |
4168 | + *matched_len = shortest_len; |
4169 | + if(branch_saved) |
4170 | + atom->regex_intern->load_subregex_list(saved_subregex); |
4171 | + return true; |
4172 | + } |
4173 | + else |
4174 | + { |
4175 | + (*piece_it).nr_matches--; |
4176 | + atom->regex_intern->load_subregex_list(saved_subregex); |
4177 | + return false; |
4178 | + } |
4179 | + } |
4180 | + else |
4181 | + { |
4182 | + atom->regex_intern->load_subregex_list(saved_subregex); |
4183 | + return false; |
4184 | + } |
4185 | } |
4186 | |
4187 | - int i=min; |
4188 | - std::list<CRegexAscii_piece*>::iterator next_it = piece_it; |
4189 | + int i=0; |
4190 | + int shortest_len = -1; |
4191 | + int otherpieces_shortest = -1; |
4192 | + int i_shortest = -1; |
4193 | + std::list<RegexAscii_pieceinfo>::iterator next_it = piece_it; |
4194 | + std::vector<std::pair<int,int> > match_lens; |
4195 | next_it++; |
4196 | int pieceslen = 0; |
4197 | while(1) |
4198 | { |
4199 | - if((max > 0) && (i>max)) |
4200 | - break; |
4201 | - int piecelen = 0; |
4202 | - if((*piece_it)->match_piece_times(source+pieceslen, &piecelen, !pieceslen ? i : 1, NULL)) |
4203 | - { |
4204 | - pieceslen += piecelen; |
4205 | + int piecelen = 0; |
4206 | + bool retmatch; |
4207 | + retmatch = match_piece_times(source, &piecelen, i < min ? min : i, &match_lens); |
4208 | + i = match_lens.size()-1;//number of matches |
4209 | + if(i<0) |
4210 | + i = 0; |
4211 | + if((i>=min)) |
4212 | + { |
4213 | + pieceslen = piecelen; |
4214 | + if((shortest_len >= 0) && (shortest_len <= pieceslen))//this branch is longer |
4215 | + {//try another branch |
4216 | + i = choose_another_branch(match_lens); |
4217 | + if(i >= 0) |
4218 | + continue;//try another branch |
4219 | + else |
4220 | + break; |
4221 | + } |
4222 | int otherpieces = 0; |
4223 | - if((next_it == piece_list.end()) || |
4224 | - ((*next_it)->get_is_reluctant() && match_piece_iter_reluctant(next_it, source+pieceslen, &otherpieces)) || |
4225 | - (!(*next_it)->get_is_reluctant() && match_piece_iter_normal(next_it, source+pieceslen, &otherpieces))) |
4226 | - { |
4227 | - *matched_len = pieceslen + otherpieces; |
4228 | - return true; |
4229 | - } |
4230 | + if((next_it == end_it) || |
4231 | + (*next_it).piece->match_piece(next_it, end_it, source+pieceslen, &otherpieces) |
4232 | + ) |
4233 | + { |
4234 | + if((i == pieceslen) || (match_lens.at(0).second == 0) ||//minimum achieved already, cannot go lower than that |
4235 | + !(atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH)) |
4236 | + { |
4237 | + *matched_len = pieceslen + otherpieces; |
4238 | + return true; |
4239 | + } |
4240 | + if((shortest_len < 0) || (shortest_len > pieceslen)) |
4241 | + { |
4242 | + shortest_len = pieceslen; |
4243 | + otherpieces_shortest = otherpieces; |
4244 | + i_shortest = i; |
4245 | + if(match_lens.at(0).second != 0) |
4246 | + atom->regex_intern->save_subregex_list(saved_subregex); |
4247 | + } |
4248 | + i = choose_another_branch(match_lens); |
4249 | + if(i >= 0) |
4250 | + continue;//try another branch |
4251 | + else |
4252 | + break; |
4253 | + } |
4254 | + else |
4255 | + { |
4256 | + //try further |
4257 | + if(retmatch) |
4258 | + { |
4259 | + i++; |
4260 | + if((max < 0) || (i<=max)) |
4261 | + continue; |
4262 | + i--; |
4263 | + } |
4264 | + } |
4265 | + } |
4266 | + |
4267 | + if(i==0) |
4268 | + { |
4269 | + break; |
4270 | } |
4271 | else |
4272 | - break; |
4273 | - i++; |
4274 | + { |
4275 | + i = choose_another_branch(match_lens); |
4276 | + if(i >= 0) |
4277 | + continue;//try another branch |
4278 | + else |
4279 | + break; |
4280 | + } |
4281 | } |
4282 | |
4283 | + if(shortest_len >= 0) |
4284 | + { |
4285 | + if(strict_max && (max>=0) && (i_shortest > max)) |
4286 | + return false; |
4287 | + *matched_len = shortest_len + otherpieces_shortest; |
4288 | + if(saved_subregex.size()) |
4289 | + atom->regex_intern->load_subregex_list(saved_subregex); |
4290 | + return true; |
4291 | + } |
4292 | return false; |
4293 | } |
4294 | |
// [review] Diff hunk for match_piece_iter_normal (the greedy variant, see the comment on the next row).
// [review] The '+' rows move the function from CRegexAscii_branch to CRegexXQuery_piece, add an end_it
// [review] parameter marking the end of the piece list, and change match_lens from vector<int> to
// [review] vector<pair<int,int> >.
// [review] Contract visible in the new version: *matched_len is reset to 0 on entry; on success the
// [review] function returns true with *matched_len filled in, otherwise it returns false.
4295 | //match as much as possible |
4296 | -bool CRegexAscii_branch::match_piece_iter_normal( |
4297 | - std::list<CRegexAscii_piece*>::iterator piece_it, |
4298 | +bool CRegexXQuery_piece::match_piece_iter_normal( |
4299 | + std::list<RegexAscii_pieceinfo>::iterator piece_it, |
4300 | + std::list<RegexAscii_pieceinfo>::iterator end_it, |
4301 | const char *source, int *matched_len) |
4302 | { |
4303 | *matched_len = 0; |
4304 | |
4305 | int min, max; |
4306 | bool strict_max; |
4307 | - std::vector<int> match_lens; |
4308 | - (*piece_it)->get_quantifier(&min, &max, &strict_max); |
4309 | - int timeslen; |
4310 | - if(strict_max && (max >= 0)) |
4311 | + std::vector<std::pair<int,int> > match_lens; |
4312 | + (*piece_it).piece->get_quantifier(&min, &max, &strict_max); |
4313 | + int timeslen = 0; |
4314 | + std::vector<std::pair<const char*, int> > saved_subregex; |
4315 | + |
// [review] New path for pieces whose atom is itself a regex: the sub-regex list is saved up front
// [review] and reloaded with load_subregex_list on the failure paths below.
4316 | + if(is_regex_atom()) |
4317 | { |
4318 | - //check if the piece doesn't exceed the max match |
4319 | - //if((*piece_it)->match_piece_times(source, ×len, max+1, &match_lens)) |
4320 | - // return false;///too many matches |
4321 | - (*piece_it)->match_piece_times(source, ×len, max, &match_lens); |
4322 | + //recursive |
4323 | + bool retmatch; |
4324 | + atom->regex_intern->save_subregex_list(saved_subregex); |
// [review] Another repetition is attempted only while nr_matches is below max (max == -1 is accepted
// [review] as no upper bound) and either min is not reached yet, nothing matched yet, or the atom
// [review] reported a non-zero matched_len.
4325 | + if(((max == -1) || ((*piece_it).nr_matches < max)) && //try further with this piece |
4326 | + (((*piece_it).nr_matches < min) || ((*piece_it).nr_matches == 0) || ((*piece_it).piece->regex_atom->matched_len)))//if matched_len is zero, avoid infinite loop |
4327 | + { |
4328 | + int start_from_branch = 0; |
4329 | + int longest_len = -1; |
4330 | + bool branch_saved = false; |
4331 | + //try all branches to get the longest len |
// [review] nr_matches is incremented before the attempt and decremented again further down when no
// [review] branch matched (longest_len stays -1).
4332 | + (*piece_it).nr_matches++; |
4333 | + while(atom->match(source, &start_from_branch, matched_len, piece_it, end_it)) |
4334 | + { |
4335 | + if((longest_len < *matched_len)) |
4336 | + { |
4337 | + longest_len = *matched_len; |
4338 | + if(start_from_branch && (atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH)) |
4339 | + { |
4340 | + atom->regex_intern->save_subregex_list(saved_subregex); |
4341 | + branch_saved = true; |
4342 | + } |
4343 | + } |
// [review] Without REGEX_ASCII_GET_LONGEST_BRANCH, or when start_from_branch comes back as 0, the
// [review] first successful match ends the loop.
4344 | + if(!start_from_branch || !(atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH)) |
4345 | + break; |
4346 | + } |
4347 | + if(longest_len != -1) |
4348 | + { |
4349 | + *matched_len = longest_len; |
4350 | + if(branch_saved) |
4351 | + atom->regex_intern->load_subregex_list(saved_subregex); |
4352 | + return true; |
4353 | + } |
4354 | + else |
4355 | + { |
4356 | + atom->regex_intern->load_subregex_list(saved_subregex); |
4357 | + (*piece_it).nr_matches--; |
4358 | + } |
4359 | + } |
// [review] Minimum repetition count reached: succeed if this was the last piece, otherwise delegate
// [review] to the next piece and reload the saved sub-regex list if that fails.
4360 | + if((*piece_it).nr_matches >= min) |
4361 | + { |
4362 | + //go to next piece |
4363 | + std::list<RegexAscii_pieceinfo>::iterator next_it = piece_it; |
4364 | + next_it++; |
4365 | + if(next_it == end_it) |
4366 | + return true; |
4367 | + retmatch = (*next_it).piece->match_piece(next_it, end_it, source, matched_len); |
4368 | + if(!retmatch) |
4369 | + atom->regex_intern->load_subregex_list(saved_subregex); |
4370 | + return retmatch; |
4371 | + } |
4372 | + else |
4373 | + { |
4374 | + // regex_atom->restore_match(); |
4375 | + atom->regex_intern->load_subregex_list(saved_subregex); |
4376 | + return false; |
4377 | + } |
4378 | } |
4379 | - else if(!strict_max && (max >= 0)) |
4380 | - (*piece_it)->match_piece_times(source, ×len, max, &match_lens); |
4381 | - else |
4382 | - (*piece_it)->match_piece_times(source, ×len, -1, &match_lens); |
4383 | |
4384 | - int i; |
4385 | - std::list<CRegexAscii_piece*>::iterator next_it = piece_it; |
// [review] Path for pieces that are not regex atoms: the old single pass over match_lens is replaced
// [review] by a loop that retries with fewer repetitions or with another branch
// [review] (choose_another_branch) and remembers the longest successful combination.
4386 | + int longest_len = -1; |
4387 | + int otherpieces_longest = -1; |
4388 | + int i_longest = -1; |
4389 | + int i = max; |
4390 | + std::list<RegexAscii_pieceinfo>::iterator next_it = piece_it; |
4391 | next_it++; |
4392 | - if(next_it == piece_list.end()) |
4393 | + |
4394 | + bool retmatch; |
4395 | + while(1) |
4396 | { |
4397 | - if((int)match_lens.size() > min) |
4398 | - { |
4399 | - *matched_len = timeslen; |
4400 | - return true; |
// [review] NOTE(review): the second argument on the next row is presumably "&timeslen"; the page
// [review] rendering appears to have decoded "&times" as an HTML entity (same on the '-' rows above).
// [review] NOTE(review): retmatch is assigned here but not read afterwards on this path, and
// [review] i_longest is only ever written in the visible code - candidates for removal.
4401 | + retmatch = match_piece_times(source, ×len, i, &match_lens); |
4402 | + i=match_lens.size()-1;//number of matches |
4403 | + if((i>=min)) |
4404 | + { |
4405 | + if(timeslen < longest_len) |
4406 | + {//this branch is no use |
4407 | + i = choose_another_branch(match_lens); |
4408 | + if(i >= 0) |
4409 | + { |
4410 | + i = max; |
4411 | + continue;//try another branch |
4412 | + } |
4413 | + else |
4414 | + break; |
4415 | + } |
4416 | + //int piecelen = 0; |
4417 | + int otherpieces = 0; |
4418 | + if((next_it == end_it) || |
4419 | + (*next_it).piece->match_piece(next_it, end_it, source+timeslen, &otherpieces) |
4420 | + ) |
4421 | + { |
4422 | + if(timeslen > longest_len) |
4423 | + { |
4424 | + longest_len = timeslen; |
4425 | + otherpieces_longest = otherpieces; |
4426 | + i_longest = i; |
// [review] Without REGEX_ASCII_GET_LONGEST_BRANCH the first combination that lets the remaining
// [review] pieces match is returned immediately.
4427 | + if(!(atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH)) |
4428 | + { |
4429 | + *matched_len = longest_len + otherpieces_longest; |
4430 | + return true; |
4431 | + } |
4432 | + else |
4433 | + { |
4434 | + if(match_lens.at(0).second) |
4435 | + atom->regex_intern->save_subregex_list(saved_subregex); |
4436 | + } |
4437 | + } |
4438 | + } |
4439 | + else |
4440 | + { |
// [review] The remaining pieces did not match. If the first entry carries no branch index
// [review] (second == 0) the last entry is dropped and a smaller repetition count is tried;
// [review] otherwise another branch is selected.
4441 | + if(!match_lens.at(0).second) |
4442 | + { |
4443 | + match_lens.resize(match_lens.size()-1); |
4444 | + i--; |
4445 | + if(i >= 0) |
4446 | + continue;//try smaller |
4447 | + else |
4448 | + break; |
4449 | + } |
4450 | + else |
4451 | + { |
4452 | + i = choose_another_branch(match_lens); |
4453 | + if(i >= 0) |
4454 | + continue;//try another branch |
4455 | + else |
4456 | + break; |
4457 | + } |
4458 | + } |
4459 | + } |
4460 | + //now try another branch |
4461 | + i = choose_another_branch(match_lens); |
4462 | + if(i >= 0) |
4463 | + { |
4464 | + i = max; |
4465 | + continue;//try another branch |
4466 | } |
4467 | else |
4468 | - return false; |
4469 | - } |
4470 | - for(i=match_lens.size()-1; i>=min; i--) |
4471 | + break; |
4472 | + }//end while |
4473 | + |
// [review] After the loop: report the longest combination found, reloading the sub-regex list that
// [review] was saved for it (if any).
4474 | + if(longest_len >= 0) |
4475 | { |
4476 | - int piecelen = 0; |
4477 | - int otherpieces = 0; |
4478 | - if(((*next_it)->get_is_reluctant() && match_piece_iter_reluctant(next_it, source+match_lens[i]+piecelen, &otherpieces)) || |
4479 | - (!(*next_it)->get_is_reluctant() && match_piece_iter_normal(next_it, source+match_lens[i]+piecelen, &otherpieces))) |
4480 | - { |
4481 | - *matched_len = match_lens[i] + piecelen + otherpieces; |
4482 | - return true; |
4483 | - } |
4484 | + *matched_len = longest_len + otherpieces_longest; |
4485 | + if(saved_subregex.size()) |
4486 | + atom->regex_intern->load_subregex_list(saved_subregex); |
4487 | + return true; |
4488 | } |
4489 | |
4490 | return false; |
4491 | } |
4492 | |
// [review] Diff hunk for match_piece_times.
// [review] New version: applies atom->match repeatedly, at most `times` times (a negative `times`
// [review] puts no limit on the loop), and accumulates the matched length in *piecelen.
// [review] If match_lens already holds entries, the call resumes at index size()-1 and at the offset
// [review] stored in the last entry instead of starting from zero.
// [review] Each match_lens entry is now a pair: (offset before that repetition, start_from_branch
// [review] value left by atom->match); a failed repetition is recorded as (*piecelen, 0).
// [review] Returns false as soon as one repetition fails, true otherwise.
4493 | -bool CRegexAscii_piece::match_piece_times(const char *source, |
4494 | +bool CRegexXQuery_piece::match_piece_times(const char *source, |
4495 | int *piecelen, |
4496 | int times, |
4497 | - std::vector<int> *match_lens) |
4498 | + std::vector<std::pair<int,int> > *match_lens) |
4499 | { |
4500 | - *piecelen = 0; |
4501 | - for(int i=0;(times < 0) || (i<times);i++) |
4502 | - { |
4503 | + int i=0; |
4504 | + if(match_lens && match_lens->size()) |
4505 | + { |
4506 | + i = match_lens->size()-1; |
4507 | + } |
4508 | + if(match_lens && match_lens->size()) |
4509 | + *piecelen = match_lens->at(match_lens->size()-1).first; |
4510 | + else |
4511 | + *piecelen = 0; |
// [review] Already at or past the requested repetition count: nothing to do.
4512 | + if((times >= 0) && (i>=times)) |
4513 | + return true; |
4514 | + for(;(times < 0) || (i<times);i++) |
4515 | + { |
4516 | + int atomlen; |
4517 | + int start_from_branch = 0; |
// [review] When an entry for repetition i exists, its stored branch index is handed back to
// [review] atom->match as the starting branch.
4518 | + if(match_lens && (i<(int)match_lens->size())) |
4519 | + start_from_branch = match_lens->at(i).second; |
4520 | + bool first_branch = (start_from_branch == 0); |
4521 | + if(!atom->match(source+*piecelen, &start_from_branch, &atomlen, empty_pieces.begin(), empty_pieces.end())) |
4522 | + { |
4523 | + if(match_lens) |
4524 | + { |
4525 | + if(i >= (int)match_lens->size()) |
4526 | + match_lens->push_back(std::pair<int,int>(*piecelen, 0)); |
4527 | + else |
4528 | + (*match_lens)[i] = std::pair<int,int>(*piecelen, 0); |
4529 | + } |
4530 | + return false; |
4531 | + } |
4532 | if(match_lens) |
4533 | - match_lens->push_back(*piecelen); |
4534 | - int atomlen; |
4535 | - if(!atom->match(source+*piecelen, &atomlen)) |
4536 | - return false; |
4537 | + { |
4538 | + if(i >= (int)match_lens->size()) |
4539 | + match_lens->push_back(std::pair<int,int>(*piecelen, start_from_branch)); |
4540 | + else |
4541 | + (*match_lens)[i] = std::pair<int,int>(*piecelen, start_from_branch); |
4542 | + } |
4543 | *piecelen += atomlen; |
// [review] An empty match at the end of the input stops the repetition; the old version also set
// [review] reachedEnd here, the new one leaves that call commented out.
4544 | if(!atomlen && !source[*piecelen]) |
4545 | { |
4546 | - atom->regex_intern->reachedEnd = true; |
4547 | + // atom->regex_intern->set_reachedEnd(source); |
4548 | + break; |
4549 | + } |
4550 | + if(first_branch && (atomlen == 0))//avoid infinite loop |
4551 | + { |
4552 | break; |
4553 | } |
4554 | } |
// [review] The closing entry (total length so far, branch 0) is always appended with push_back;
// [review] the index-based alternative is left commented out.
4555 | if(match_lens) |
4556 | - match_lens->push_back(*piecelen); |
4557 | + { |
4558 | + // if(i >= match_lens->size()) |
4559 | + match_lens->push_back(std::pair<int,int>(*piecelen, 0)); |
4560 | + // else |
4561 | + // (*match_lens)[i] = std::pair<int,int>(*piecelen, 0); |
4562 | + } |
4563 | |
4564 | return true; |
4565 | } |
4566 | |
// [review] New function: matches one code point against a Unicode category test.
// [review] At end of input it calls regex_intern->set_reachedEnd(source) and fails.
// [review] Case labels of the form UNICODE_xx + 50 test the code point against several categories
// [review] sharing the same first letter; the default case tests the single category stored in
// [review] multichar_type. is_reverse inverts the outcome in every case.
// [review] *matched_len is written only on success (temp_source - source after utf8::next_char,
// [review] presumably the byte length of the decoded character). start_from_branch is not used.
// [review] NOTE(review): the identical is_reverse if/else is repeated in all eight cases; computing
// [review] one bool per case and applying is_reverse once after the switch would shorten this.
4567 | +bool CRegexXQuery_multicharP::match_internal(const char *source, int *start_from_branch, int *matched_len) |
4568 | +{ |
4569 | + if(!source[0]) |
4570 | + { |
4571 | + regex_intern->set_reachedEnd(source); |
4572 | + return false; |
4573 | + } |
4574 | + bool found = false; |
4575 | + const char *temp_source = source; |
4576 | + unicode::code_point utf8c = utf8::next_char(temp_source); |
4577 | + switch(multichar_type) |
4578 | + { |
// [review] Letter group: Ll, Lm, Lo, Lt, Lu.
4579 | + case unicode::UNICODE_Ll + 50: |
4580 | + if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Ll) || |
4581 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Lm) || |
4582 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Lo) || |
4583 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Lt) || |
4584 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Lu)) |
4585 | + { |
4586 | + if(!is_reverse) |
4587 | + found = true; |
4588 | + } |
4589 | + else |
4590 | + { |
4591 | + if(is_reverse) |
4592 | + found = true; |
4593 | + } |
4594 | + break; |
// [review] Mark group: Mn, Mc, Me.
4595 | + case unicode::UNICODE_Mc + 50: |
4596 | + if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Mn) || |
4597 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Mc) || |
4598 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Me)) |
4599 | + { |
4600 | + if(!is_reverse) |
4601 | + found = true; |
4602 | + } |
4603 | + else |
4604 | + { |
4605 | + if(is_reverse) |
4606 | + found = true; |
4607 | + } |
4608 | + break; |
// [review] Number group: Nd, Nl, No.
4609 | + case unicode::UNICODE_Nd + 50: |
4610 | + if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Nd) || |
4611 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Nl) || |
4612 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_No)) |
4613 | + { |
4614 | + if(!is_reverse) |
4615 | + found = true; |
4616 | + } |
4617 | + else |
4618 | + { |
4619 | + if(is_reverse) |
4620 | + found = true; |
4621 | + } |
4622 | + break; |
// [review] Punctuation group: Pc, Pd, Ps, Pe, Pi, Pf, Po.
4623 | + case unicode::UNICODE_Pc + 50: |
4624 | + if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pc) || |
4625 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pd) || |
4626 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Ps) || |
4627 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pe) || |
4628 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pi) || |
4629 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pf) || |
4630 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Po)) |
4631 | + { |
4632 | + if(!is_reverse) |
4633 | + found = true; |
4634 | + } |
4635 | + else |
4636 | + { |
4637 | + if(is_reverse) |
4638 | + found = true; |
4639 | + } |
4640 | + break; |
// [review] Separator group: Zs, Zl, Zp.
4641 | + case unicode::UNICODE_Zl + 50: |
4642 | + if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zs) || |
4643 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zl) || |
4644 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zp)) |
4645 | + { |
4646 | + if(!is_reverse) |
4647 | + found = true; |
4648 | + } |
4649 | + else |
4650 | + { |
4651 | + if(is_reverse) |
4652 | + found = true; |
4653 | + } |
4654 | + break; |
// [review] Symbol group: Sm, Sc, Sk, So.
4655 | + case unicode::UNICODE_Sc + 50: |
4656 | + if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Sm) || |
4657 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Sc) || |
4658 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Sk) || |
4659 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_So)) |
4660 | + { |
4661 | + if(!is_reverse) |
4662 | + found = true; |
4663 | + } |
4664 | + else |
4665 | + { |
4666 | + if(is_reverse) |
4667 | + found = true; |
4668 | + } |
4669 | + break; |
// [review] Other group: Cc, Cf, Co (Cn is deliberately left out, see the inline comment).
4670 | + case unicode::UNICODE_Cc + 50: |
4671 | + if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Cc) || |
4672 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Cf) || |
4673 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Co))//ignore unicode::UNICODE_Cn |
4674 | + { |
4675 | + if(!is_reverse) |
4676 | + found = true; |
4677 | + } |
4678 | + else |
4679 | + { |
4680 | + if(is_reverse) |
4681 | + found = true; |
4682 | + } |
4683 | + break; |
// [review] Any other value is treated as one specific category.
4684 | + default: |
4685 | + if(unicode::check_codepoint_category(utf8c, (unicode::category)multichar_type)) |
4686 | + { |
4687 | + if(!is_reverse) |
4688 | + found = true; |
4689 | + } |
4690 | + else |
4691 | + { |
4692 | + if(is_reverse) |
4693 | + found = true; |
4694 | + } |
4695 | + break; |
4696 | + } |
4697 | + |
4698 | + if(found) |
4699 | + { |
4700 | + *matched_len = temp_source - source; |
4701 | + } |
4702 | + return found; |
4703 | +} |
4704 | + |
// [review] New function: matches one code point against the block selected by block_index.
// [review] block_escape[block_index].cp is read as an inclusive range cp[0]..cp[1]; ext_cp, when
// [review] set, is walked as further inclusive (low, high) pairs until a zero entry is reached.
// [review] is_reverse inverts the outcome. At end of input set_reachedEnd is called and the match
// [review] fails. *matched_len is written only on success; start_from_branch is not used.
4705 | +bool CRegexXQuery_multicharIs::match_internal(const char *source, int *start_from_branch, int *matched_len) |
4706 | +{ |
4707 | + if(!source[0]) |
4708 | + { |
4709 | + regex_intern->set_reachedEnd(source); |
4710 | + return false; |
4711 | + } |
4712 | + bool found = false; |
4713 | + const char *temp_source = source; |
4714 | + unicode::code_point utf8c = utf8::next_char(temp_source); |
4715 | + const unicode::code_point *cp = block_escape[block_index].cp; |
// [review] Inside the primary range: a hit unless reversed (the extension list is not consulted).
4716 | + if((utf8c >= cp[0]) && (utf8c <= cp[1])) |
4717 | + { |
4718 | + if(!is_reverse) |
4719 | + found = true; |
4720 | + } |
4721 | + else if(block_escape[block_index].ext_cp) |
4722 | + { |
4723 | + cp = block_escape[block_index].ext_cp; |
4724 | + while(*cp) |
4725 | + { |
4726 | + if((utf8c >= cp[0]) && (utf8c <= cp[1])) |
4727 | + break; |
4728 | + cp += 2; |
4729 | + } |
// [review] *cp is non-zero here only when the loop stopped on a matching pair.
4730 | + if(*cp) |
4731 | + { |
4732 | + if(!is_reverse) |
4733 | + found = true; |
4734 | + } |
4735 | + else |
4736 | + { |
4737 | + if(is_reverse) |
4738 | + found = true; |
4739 | + } |
4740 | + } |
4741 | + else |
4742 | + { |
4743 | + if(is_reverse) |
4744 | + found = true; |
4745 | + } |
4746 | + if(found) |
4747 | + { |
4748 | + *matched_len = temp_source - source; |
4749 | + } |
4750 | + return found; |
4751 | +} |
4752 | + |
// [review] New function: matches one code point against the escapes s, i, c, d, w and their
// [review] upper-case complements S, I, C, D, W selected by multichar_type.
// [review] Each upper-case label clears value_true and then falls through into the lower-case
// [review] label on purpose, so the same membership test is evaluated and negated at the end.
// [review] Any other multichar_type throws FORX0002 with REGEX_UNIMPLEMENTED.
// [review] At end of input set_reachedEnd is called and the match fails. *matched_len is written
// [review] only on success; start_from_branch is not used.
4753 | +bool CRegexXQuery_multicharOther::match_internal(const char *source, int *start_from_branch, int *matched_len) |
4754 | +{ |
4755 | + if(!source[0]) |
4756 | + { |
4757 | + regex_intern->set_reachedEnd(source); |
4758 | + return false; |
4759 | + } |
4760 | + bool found = false; |
4761 | + bool value_true = true; |
4762 | + const char *temp_source = source; |
4763 | + unicode::code_point utf8c = utf8::next_char(temp_source); |
4764 | + switch(multichar_type) |
4765 | + { |
4766 | + case 'S':value_true = false;//[^\s] |
4767 | + case 's'://[#x20\t\n\r] |
4768 | + switch(utf8c) |
4769 | + { |
4770 | + case '\t': |
4771 | + case '\r': |
4772 | + case '\n': |
4773 | + case ' ': |
4774 | + found = true; |
4775 | + default: |
4776 | + break; |
4777 | + } |
4778 | + break; |
4779 | + case 'I':value_true = false;//[^\i] |
4780 | + case 'i'://the set of initial name characters, those matched by Letter | '_' | ':' |
4781 | + if((utf8c == '_') || |
4782 | + (utf8c == ':') || |
4783 | + XQCharType::isLetter(utf8c)) |
4784 | + { |
4785 | + found = true; |
4786 | + } |
4787 | + break; |
4788 | + case 'C':value_true = false;//[^\c] |
4789 | + case 'c'://the set of name characters, those matched by NameChar |
4790 | + if(XQCharType::isNameChar(utf8c)) |
4791 | + { |
4792 | + found = true; |
4793 | + } |
4794 | + break; |
4795 | + case 'D':value_true = false;//[^\d] |
4796 | + case 'd': |
4797 | + if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Nd)) |
4798 | + found = true; |
4799 | + break; |
4800 | + case 'W':value_true = false;//[^\w] |
// [review] w is computed as: not in any of the listed punctuation (P*), separator (Z*) or
// [review] other (Cc, Cf, Co) categories.
4801 | + case 'w': |
4802 | + found = !(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pc) || |
4803 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pd) || |
4804 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Ps) || |
4805 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pe) || |
4806 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pi) || |
4807 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pf) || |
4808 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Po) || |
4809 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zs) || |
4810 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zl) || |
4811 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zp) || |
4812 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Cc) || |
4813 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Cf) || |
4814 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Co));//ignore unicode::UNICODE_Cn |
4815 | + break; |
4816 | + default: |
4817 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(source, ZED(REGEX_UNIMPLEMENTED)) ); |
4818 | + } |
// [review] Succeeds when the membership result agrees with value_true (found == value_true).
4819 | + if((found && value_true) || (!found && !value_true)) |
4820 | + { |
4821 | + *matched_len = temp_source - source; |
4822 | + return true; |
4823 | + } |
4824 | + else |
4825 | + { |
4826 | + return false; |
4827 | + } |
4828 | +} |
4829 | + |
// [review] New function: matches exactly one byte equal to the member c and reports
// [review] *matched_len = 1. At end of input set_reachedEnd is called and the match fails.
// [review] *matched_len is left untouched on failure; start_from_branch is not used.
4830 | +bool CRegexXQuery_char_ascii::match_internal(const char *source, int *start_from_branch, int *matched_len) |
4831 | +{ |
4832 | + if(!source[0]) |
4833 | + { |
4834 | + regex_intern->set_reachedEnd(source); |
4835 | + return false; |
4836 | + } |
4837 | + if(source[0] == c) |
4838 | + { |
4839 | + *matched_len = 1; |
4840 | + return true; |
4841 | + } |
4842 | + else |
4843 | + return false; |
4844 | +} |
4845 | + |
// [review] New function: case-insensitive single-byte match. Only the input byte is upper-cased
// [review] before the comparison with c - presumably c is stored upper-cased by the constructor
// [review] (not visible in this hunk; TODO confirm).
// [review] At end of input set_reachedEnd is called and the match fails. start_from_branch is not used.
// [review] NOTE(review): source[0] is a plain char handed to toupper(). Where char is signed, a
// [review] byte >= 0x80 becomes a negative argument, which toupper() does not define (other than
// [review] EOF). Consider toupper((unsigned char)source[0]).
4846 | +bool CRegexXQuery_char_ascii_i::match_internal(const char *source, int *start_from_branch, int *matched_len) |
4847 | +{ |
4848 | + if(!source[0]) |
4849 | + { |
4850 | + regex_intern->set_reachedEnd(source); |
4851 | + return false; |
4852 | + } |
4853 | + char sup = toupper(source[0]); |
4854 | + if(sup == c) |
4855 | + { |
4856 | + *matched_len = 1; |
4857 | + return true; |
4858 | + } |
4859 | + else |
4860 | + return false; |
4861 | +} |
4862 | + |
// [review] New function: matches exactly one byte lying in the inclusive range c1..c2 and reports
// [review] *matched_len = 1. At end of input set_reachedEnd is called and the match fails.
// [review] *matched_len is left untouched on failure; start_from_branch is not used.
4863 | +bool CRegexXQuery_char_range_ascii::match_internal(const char *source, int *start_from_branch, int *matched_len) |
4864 | +{ |
4865 | + if(!source[0]) |
4866 | + { |
4867 | + regex_intern->set_reachedEnd(source); |
4868 | + return false; |
4869 | + } |
4870 | + if((source[0] >= c1) && (source[0] <= c2)) |
4871 | + { |
4872 | + *matched_len = 1; |
4873 | + return true; |
4874 | + } |
4875 | + else |
4876 | + return false; |
4877 | +} |
4878 | + |
// [review] New function: case-insensitive single-byte range match. The input byte is upper-cased
// [review] and then tested against the inclusive range c1..c2 - presumably the bounds are stored
// [review] upper-cased (not visible in this hunk; TODO confirm).
// [review] At end of input set_reachedEnd is called and the match fails. start_from_branch is not used.
// [review] NOTE(review): as with the single-character variant, toupper() receives a plain char;
// [review] a cast to unsigned char would avoid negative arguments for bytes >= 0x80.
4879 | +bool CRegexXQuery_char_range_ascii_i::match_internal(const char *source, int *start_from_branch, int *matched_len) |
4880 | +{ |
4881 | + if(!source[0]) |
4882 | + { |
4883 | + regex_intern->set_reachedEnd(source); |
4884 | + return false; |
4885 | + } |
4886 | + char sup = toupper(source[0]); |
4887 | + if((sup >= c1) && (sup <= c2)) |
4888 | + { |
4889 | + *matched_len = 1; |
4890 | + return true; |
4891 | + } |
4892 | + else |
4893 | + return false; |
4894 | +} |
4895 | + |
// [review] New function: matches the byte sequence c of length len at the current position and
// [review] reports *matched_len = len. At end of input set_reachedEnd is called and the match
// [review] fails. start_from_branch is not used.
// [review] NOTE(review): only source[0] is checked before memcmp() is asked to compare len bytes.
// [review] If fewer than len bytes remain before the terminator, memcmp is allowed to read past it;
// [review] strncmp(source, c, len) would stop at the terminator. Presumably source is
// [review] NUL-terminated - confirm against the callers.
4896 | +bool CRegexXQuery_char_unicode::match_internal(const char *source, int *start_from_branch, int *matched_len) |
4897 | +{ |
4898 | + if(!source[0]) |
4899 | + { |
4900 | + regex_intern->set_reachedEnd(source); |
4901 | + return false; |
4902 | + } |
4903 | + if(!memcmp(source, c, len)) |
4904 | + { |
4905 | + *matched_len = len; |
4906 | + return true; |
4907 | + } |
4908 | + else |
4909 | + return false; |
4910 | +} |
4911 | + |
// [review] New function: decodes one code point with utf8::next_char and matches it against the
// [review] member c. On success *matched_len is temp_source - source (presumably the number of
// [review] bytes next_char consumed). At end of input set_reachedEnd is called and the match fails.
// [review] start_from_branch is not used.
4912 | +bool CRegexXQuery_char_unicode_cp::match_internal(const char *source, int *start_from_branch, int *matched_len) |
4913 | +{ |
4914 | + if(!source[0]) |
4915 | + { |
4916 | + regex_intern->set_reachedEnd(source); |
4917 | + return false; |
4918 | + } |
4919 | + const char *temp_source = source; |
4920 | + unicode::code_point utf8c = utf8::next_char(temp_source); |
4921 | + if(utf8c == c) |
4922 | + { |
4923 | + *matched_len = temp_source - source; |
4924 | + return true; |
4925 | + } |
4926 | + else |
4927 | + return false; |
4928 | +} |
4929 | + |
// [review] New function: case-insensitive code point match. The decoded code point is passed
// [review] through unicode::to_upper and compared with c - presumably c is stored upper-cased
// [review] (not visible in this hunk; TODO confirm).
// [review] On success *matched_len is temp_source - source. At end of input set_reachedEnd is
// [review] called and the match fails. start_from_branch is not used.
4930 | +bool CRegexXQuery_char_unicode_i::match_internal(const char *source, int *start_from_branch, int *matched_len) |
4931 | +{ |
4932 | + if(!source[0]) |
4933 | + { |
4934 | + regex_intern->set_reachedEnd(source); |
4935 | + return false; |
4936 | + } |
4937 | + const char *temp_source = source; |
4938 | + unicode::code_point sup = unicode::to_upper(utf8::next_char(temp_source)); |
4939 | + if(sup == c) |
4940 | + { |
4941 | + *matched_len = temp_source - source; |
4942 | + return true; |
4943 | + } |
4944 | + else |
4945 | + return false; |
4946 | +} |
4947 | + |
// [review] New function: decodes one code point with utf8::next_char and matches it against the
// [review] inclusive range c1..c2. On success *matched_len is temp_source - source. At end of
// [review] input set_reachedEnd is called and the match fails. start_from_branch is not used.
4948 | +bool CRegexXQuery_char_range_unicode::match_internal(const char *source, int *start_from_branch, int *matched_len) |
4949 | +{ |
4950 | + if(!source[0]) |
4951 | + { |
4952 | + regex_intern->set_reachedEnd(source); |
4953 | + return false; |
4954 | + } |
4955 | + const char *temp_source = source; |
4956 | + unicode::code_point utf8c = utf8::next_char(temp_source); |
4957 | + if((utf8c >= c1) && (utf8c <= c2)) |
4958 | + { |
4959 | + *matched_len = temp_source - source; |
4960 | + return true; |
4961 | + } |
4962 | + else |
4963 | + return false; |
4964 | +} |
4965 | + |
// [review] New function: case-insensitive code point range match. The decoded code point is
// [review] passed through unicode::to_upper and tested against the inclusive range c1..c2 -
// [review] presumably the bounds are stored upper-cased (not visible in this hunk; TODO confirm).
// [review] On success *matched_len is temp_source - source. At end of input set_reachedEnd is
// [review] called and the match fails. start_from_branch is not used.
4966 | +bool CRegexXQuery_char_range_unicode_i::match_internal(const char *source, int *start_from_branch, int *matched_len) |
4967 | +{ |
4968 | + if(!source[0]) |
4969 | + { |
4970 | + regex_intern->set_reachedEnd(source); |
4971 | + return false; |
4972 | + } |
4973 | + const char *temp_source = source; |
4974 | + unicode::code_point sup = unicode::to_upper(utf8::next_char(temp_source)); |
4975 | + if((sup >= c1) && (sup <= c2)) |
4976 | + { |
4977 | + *matched_len = temp_source - source; |
4978 | + return true; |
4979 | + } |
4980 | + else |
4981 | + return false; |
4982 | +} |
4983 | + |
4984 | +bool CRegexXQuery_endline::match_internal(const char *source, int *start_from_branch, int *matched_len) |
4985 | +{ |
4986 | + *matched_len = 0; |
4987 | + if(!source[0]) |
4988 | + { |
4989 | + // regex_intern->reachedEnd = true; |
4990 | + return true; |
4991 | + } |
4992 | + if((source[0] == 0x0A) || ((source[0] == 0x0D) && (source[1] == 0x0A))) |
4993 | + { |
4994 | + if(regex_intern->get_flags() & REGEX_ASCII_MULTILINE) |
4995 | + { |
4996 | + // regex_intern->reachedEnd = true; |
4997 | + return true; |
4998 | + } |
4999 | + } |
5000 | + return false; |
Compiling with ZORBA_NO_ICU=ON fails on Linux:
[ 1%] Building CXX object src/CMakeFiles/zorba_simplestore.dir/api/zorba_string.cpp.o
In file included from /home/mbrantner/zorba/sandbox/src/util/regex.h:501:0,
from /home/mbrantner/zorba/sandbox/src/api/zorba_string.cpp:23:
/home/mbrantner/zorba/sandbox/src/util/regex_xquery.h:209:3: error: a class-key must be used when declaring a friend
/home/mbrantner/zorba/sandbox/src/util/regex_xquery.h:209:3: error: friend declaration does not name a class or function
/home/mbrantner/zorba/sandbox/src/util/regex_xquery.h:253:3: error: a class-key must be used when declaring a friend
/home/mbrantner/zorba/sandbox/src/util/regex_xquery.h:253:3: error: friend declaration does not name a class or function
make[2]: *** [src/CMakeFiles/zorba_simplestore.dir/api/zorba_string.cpp.o] Error