Merge lp:~zorba-coders/zorba/no_unicode into lp:zorba
- no_unicode
- Merge into trunk
Status: | Superseded |
---|---|
Proposed branch: | lp:~zorba-coders/zorba/no_unicode |
Merge into: | lp:zorba |
Diff against target: |
8036 lines (+3707/-998) 255 files modified
CMakeConfiguration.txt (+5/-5) CMakeLists.txt (+6/-2) ChangeLog (+10/-0) KNOWN_ISSUES.txt (+1/-1) doc/cxx/examples/context.cpp (+4/-0) include/zorba/config.h.cmake (+3/-1) include/zorba/util/time.h (+1/-1) src/api/serialization/serializer.cpp (+36/-33) src/api/serialization/serializer.h (+2/-4) src/diagnostics/diagnostic_en.xml (+108/-22) src/diagnostics/pregenerated/dict_en.cpp (+83/-20) src/runtime/full_text/CMakeLists.txt (+3/-3) src/runtime/full_text/default_tokenizer.cpp (+4/-4) src/runtime/full_text/latin_tokenizer.cpp (+3/-2) src/runtime/full_text/latin_tokenizer.h (+9/-8) src/runtime/numerics/format_integer_impl.cpp (+1/-1) src/runtime/numerics/numerics_impl.cpp (+1/-1) src/runtime/strings/strings_impl.cpp (+58/-20) src/system/globalenv.cpp (+7/-7) src/util/CMakeLists.txt (+3/-3) src/util/regex.cpp (+44/-52) src/util/regex.h (+22/-34) src/util/regex_xquery.cpp (+1860/-489) src/util/regex_xquery.h (+360/-122) src/util/unicode_categories.cpp (+3/-3) src/util/unicode_categories.h (+44/-37) src/util/unicode_util.cpp (+20/-2) src/util/unicode_util.h (+46/-15) src/util/utf8_util.cpp (+6/-6) src/util/utf8_util.h (+29/-13) src/util/utf8_util.tcc (+10/-2) src/zorbatypes/collation_manager.cpp (+17/-17) src/zorbatypes/collation_manager.h (+3/-3) src/zorbatypes/libicu.h (+0/-32) src/zorbatypes/transcoder.cpp (+8/-4) src/zorbatypes/transcoder.h (+9/-9) src/zorbautils/string_util.cpp (+19/-18) src/zorbautils/string_util.h (+15/-1) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a1.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a10.xml.res (+242/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a11.xml.res (+6/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a2.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a3.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a5.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a6.xml.res (+1/-0) 
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a7.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a8.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_a9.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m1.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m10.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m11.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m12.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m13.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m14.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m15.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m16.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m17.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m18.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m19.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m2.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m20.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m21.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m22.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m23.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m24.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m25.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m26.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m27.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m28.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m29.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m3.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m30.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m31.xml.res (+1/-0) 
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m32.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m33.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m34.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m35.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m36.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m37.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m38.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m39.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m4.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m40.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m41.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m42.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m43.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m44.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m45.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m46.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m47.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m48.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m49.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m5.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m50.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m51.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m52.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m53.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m6.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m7.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m8.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_m9.xml.res (+1/-0) 
test/rbkt/ExpQueryResults/zorba/string/Regex/regex_prime1.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r1.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r10.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r11.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r12.xml.res (+5/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r2.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r3.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r4.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r5.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r6.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_r9.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_t1.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_t4.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/string/Regex/regex_t5.xml.res (+1/-0) test/rbkt/ExpQueryResults/zorba/testdriver/bom_bug.xml.res (+1/-0) test/rbkt/Queries/CMakeLists.txt (+17/-0) test/rbkt/Queries/zorba/string/Regex/regex_a1.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_a10.xq (+11/-0) test/rbkt/Queries/zorba/string/Regex/regex_a11.xq (+9/-0) test/rbkt/Queries/zorba/string/Regex/regex_a2.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_a3.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_a5.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_a6.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_a7.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_a8.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_a9.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err1.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err1.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err10.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err10.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err11.spec (+1/-0) 
test/rbkt/Queries/zorba/string/Regex/regex_err11.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err12.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err12.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err13.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err13.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err14.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err14.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err15.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err15.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err16.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err16.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err17.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err17.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err18.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err18.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err19.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err19.xq (+3/-0) test/rbkt/Queries/zorba/string/Regex/regex_err2.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err2.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err20.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err20.xq (+3/-0) test/rbkt/Queries/zorba/string/Regex/regex_err21.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err21.xq (+3/-0) test/rbkt/Queries/zorba/string/Regex/regex_err22.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err22.xq (+3/-0) test/rbkt/Queries/zorba/string/Regex/regex_err23.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err23.xq (+3/-0) test/rbkt/Queries/zorba/string/Regex/regex_err24.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err24.xq (+3/-0) test/rbkt/Queries/zorba/string/Regex/regex_err25.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err25.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err3.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err3.xq 
(+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err4.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err4.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err5.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err5.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err7.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err7.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err8.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err8.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err9.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_err9.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m1.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m10.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m11.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m12.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m13.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m14.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m15.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m16.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m17.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m18.xq (+3/-0) test/rbkt/Queries/zorba/string/Regex/regex_m19.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m2.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m20.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m21.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m22.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m23.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m24.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m25.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m26.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m27.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m28.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m29.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m3.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m30.xq (+1/-0) 
test/rbkt/Queries/zorba/string/Regex/regex_m31.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m32.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m33.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m34.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m35.xq (+4/-0) test/rbkt/Queries/zorba/string/Regex/regex_m36.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m37.xq (+4/-0) test/rbkt/Queries/zorba/string/Regex/regex_m38.xq (+4/-0) test/rbkt/Queries/zorba/string/Regex/regex_m39.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m4.xq (+6/-0) test/rbkt/Queries/zorba/string/Regex/regex_m40.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m41.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m42.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m43.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m44.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m45.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m46.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m47.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m48.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m49.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m5.xq (+6/-0) test/rbkt/Queries/zorba/string/Regex/regex_m50.xq (+2/-0) test/rbkt/Queries/zorba/string/Regex/regex_m51.xq (+2/-0) test/rbkt/Queries/zorba/string/Regex/regex_m52.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m53.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_m6.xq (+6/-0) test/rbkt/Queries/zorba/string/Regex/regex_m7.xq (+6/-0) test/rbkt/Queries/zorba/string/Regex/regex_m8.xq (+7/-0) test/rbkt/Queries/zorba/string/Regex/regex_m9.xq (+7/-0) test/rbkt/Queries/zorba/string/Regex/regex_prime1.xq (+17/-0) test/rbkt/Queries/zorba/string/Regex/regex_r1.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_r10.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_r11.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_r12.xq (+7/-0) 
test/rbkt/Queries/zorba/string/Regex/regex_r2.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_r3.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_r4.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_r5.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_r6.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_r7_err.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_r7_err.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_r8_err.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_r8_err.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_r9.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_t1.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_t2.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_t3_err.spec (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_t3_err.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/regex_t4.xq (+2/-0) test/rbkt/Queries/zorba/string/Regex/regex_t5.xq (+1/-0) test/rbkt/Queries/zorba/string/Regex/zorba.html (+242/-0) test/rbkt/Queries/zorba/string/Regex/zorba2.html (+5/-0) test/rbkt/Queries/zorba/testdriver/bom_bug.xq (+1/-0) test/unit/CMakeLists.txt (+4/-1) test/unit/string_test.cpp (+8/-0) test/update/CMakeLists.txt (+9/-0) |
To merge this branch: | bzr merge lp:~zorba-coders/zorba/no_unicode |
Related bugs: |
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
Matthias Brantner | Needs Fixing | ||
Markos Zaharioudakis | Pending | ||
Review via email: mp+85142@code.launchpad.net |
This proposal supersedes a proposal from 2011-12-09.
This proposal has been superseded by a proposal from 2012-01-18.
Commit message
"No Unicode" is now "No ICU."
Description of the change
"No Unicode" is now "No ICU."
Matthias Brantner (matthias-brantner) wrote : | # |
Matthias Brantner (matthias-brantner) : | # |
Zorba Build Bot (zorba-buildbot) wrote : | # |
There are additional revisions which have not been approved in review. Please seek review and approval of these new revisions.
Matthias Brantner (matthias-brantner) wrote : | # |
The test suite doesn't run clean on my system (Linux) without ICU. This prevents us from adding the build to the remote queue. For example, the following tests fail without ICU (some of them also seem to fail with ICU):
1294 - test/rbkt/
1548 - test/rbkt/
1560 - test/rbkt/
1574 - test/rbkt/
1581 - test/rbkt/
1587 - test/rbkt/
1600 - test/rbkt/
1605 - test/rbkt/
1612 - test/rbkt/
1635 - test/rbkt/
1637 - test/rbkt/
1643 - test/rbkt/
1789 - test/rbkt/
2345 - test/unit/
2534 - test/update/
2544 - doc/cxx/
Please make sure the test suite runs clean.
- 10512. By Paul J. Lucas
-
Merge from trunk.
- 10513. By Daniel Turcanu
-
Fix bug in FnAnalyzeStringIterator
- 10514. By Daniel Turcanu
-
Removed regex_err12 test from expected failures
- 10515. By Paul J. Lucas
-
Merge from trunk.
- 10516. By Paul J. Lucas
-
Merge from trunk.
Paul J. Lucas (paul-lucas) wrote : | # |
Try it now.
- 10517. By Paul J. Lucas
-
Merge from trunk.
- 10518. By Paul J. Lucas
-
Merge from trunk.
- 10519. By Paul J. Lucas
-
Merge from trunk.
- 10520. By Paul J. Lucas
-
Merge from trunk.
- 10521. By Rodolfo Ochoa
-
-Fixes for windows compiling
- 10522. By Rodolfo Ochoa
-
Merge from trunk
- 10523. By Rodolfo Ochoa
-
- Fixes for committing on windows with ZORBA_NO_ICU
- Fixes for committing on windows with ZORBA_NO_FULL_TEXT
- 10524. By Rodolfo Ochoa
-
-Fix for no precompiled headers usage
- 10525. By Rodolfo Ochoa
-
Merge from trunk
- 10526. By Rodolfo Ochoa
-
fixes for windows
- 10527. By Rodolfo Ochoa
-
Fix for linux
- 10528. By Rodolfo Ochoa
-
more fixes for linux
- 10529. By Rodolfo Ochoa
-
Final fix for windows
- 10530. By Paul J. Lucas
-
1. Added fix for not catching bad regexes like "^^".
2. Added if="!defined(ZORBA_NO_ICU)" for some entries in the diagnostics
dictionary.
- 10531. By Paul J. Lucas
-
Merge from trunk.
- 10532. By Paul J. Lucas
-
Fix for '^' bug.
- 10533. By Paul J. Lucas
-
1. Fixed yet another '^' bug.
2. Marked some regex tests as expected failure with correct bug numbers.
- 10534. By Paul J. Lucas
-
No longer doing some stuff when q_flag is set.
- 10535. By Paul J. Lucas
-
Tweaked one error message.
- 10536. By Paul J. Lucas
-
Merge from trunk.
- 10537. By Rodolfo Ochoa
-
Merge from trunk
- 10538. By Rodolfo Ochoa
-
Strange error on include guards
- 10539. By Rodolfo Ochoa
-
merge from trunk
- 10540. By Rodolfo Ochoa
-
fix for regex errors in RQ
Unmerged revisions
Preview Diff
1 | === modified file 'CMakeConfiguration.txt' |
2 | --- CMakeConfiguration.txt 2012-01-11 17:30:25 +0000 |
3 | +++ CMakeConfiguration.txt 2012-01-18 18:33:36 +0000 |
4 | @@ -135,14 +135,14 @@ |
5 | SET (ZORBA_DEBUG_STRING ${ZORBA_DEBUG_STRING} CACHE BOOL "debug strings") |
6 | MESSAGE (STATUS "ZORBA_DEBUG_STRING: " ${ZORBA_DEBUG_STRING}) |
7 | |
8 | -SET(ZORBA_NO_UNICODE OFF CACHE BOOL "disable ICU") |
9 | -MESSAGE(STATUS "ZORBA_NO_UNICODE: " ${ZORBA_NO_UNICODE}) |
10 | +SET(ZORBA_NO_ICU OFF CACHE BOOL "disable ICU") |
11 | +MESSAGE(STATUS "ZORBA_NO_ICU: " ${ZORBA_NO_ICU}) |
12 | |
13 | -IF (ZORBA_NO_UNICODE) |
14 | +IF (ZORBA_NO_ICU) |
15 | SET (no_full_text ON) |
16 | -ELSE (ZORBA_NO_UNICODE) |
17 | +ELSE (ZORBA_NO_ICU) |
18 | SET (no_full_text OFF) |
19 | -ENDIF (ZORBA_NO_UNICODE) |
20 | +ENDIF (ZORBA_NO_ICU) |
21 | SET (ZORBA_NO_FULL_TEXT ${no_full_text} CACHE BOOL "disable XQuery Full-Text support") |
22 | MESSAGE(STATUS "ZORBA_NO_FULL_TEXT: " ${ZORBA_NO_FULL_TEXT}) |
23 | |
24 | |
25 | === modified file 'CMakeLists.txt' |
26 | --- CMakeLists.txt 2012-01-04 09:47:54 +0000 |
27 | +++ CMakeLists.txt 2012-01-18 18:33:36 +0000 |
28 | @@ -123,10 +123,14 @@ |
29 | CHECK_TYPE_SIZE("int64_t" ZORBA_HAVE_INT64_T) |
30 | |
31 | CHECK_CXX_SOURCE_COMPILES ("#include <type_traits>\nint main() { std::enable_if<true,int> x; }" ZORBA_CXX_ENABLE_IF) |
32 | -CHECK_CXX_SOURCE_COMPILES ("int main() { int *p = nullptr; }" ZORBA_CXX_NULLPTR) |
33 | -CHECK_CXX_SOURCE_COMPILES ("int main() { static_assert(1,\"\"); }" ZORBA_CXX_STATIC_ASSERT) |
34 | +SET(CMAKE_EXTRA_INCLUDE_FILES wchar.h) |
35 | +CHECK_TYPE_SIZE("wchar_t" ZORBA_SIZEOF_WCHAR_T) |
36 | +SET(CMAKE_EXTRA_INCLUDE_FILES) |
37 | CHECK_CXX_SOURCE_COMPILES ("#include <memory>\nint main() { std::unique_ptr<int> p; }" ZORBA_CXX_UNIQUE_PTR) |
38 | |
39 | +CHECK_CXX_SOURCE_COMPILES("int main() { int *p = nullptr; }" ZORBA_CXX_NULLPTR) |
40 | +CHECK_CXX_SOURCE_COMPILES("int main() { static_assert(1,\"\"); }" ZORBA_CXX_STATIC_ASSERT) |
41 | + |
42 | ################################################################################ |
43 | # Various cmake macros |
44 | |
45 | |
46 | === modified file 'ChangeLog' |
47 | --- ChangeLog 2012-01-18 12:18:59 +0000 |
48 | +++ ChangeLog 2012-01-18 18:33:36 +0000 |
49 | @@ -1,5 +1,9 @@ |
50 | Zorba - The XQuery Processor |
51 | |
52 | +version 2.x |
53 | + |
54 | + * Added support for NO_ICU (to not use ICU for unicode processing) |
55 | + |
56 | version 2.2 |
57 | |
58 | * No-copy optimization: avoids copying nodes during node-constructor expressions. |
59 | @@ -78,7 +82,9 @@ |
60 | * Fixed bug when parsing a document with a base-uri attribute. |
61 | * Fixed bug #863320 (Sentence is incorrectly incremented when token characters end without sentence terminator) |
62 | * Fixed bug #863730 (static delete-node* functions don't raise ZDDY0012) |
63 | + * Implemented the probe-index-range-value for general indexes |
64 | * Removed ZSTR0005 and ZSTR0006 error codes |
65 | + * Fixed bug #867662 ("nullptr" warning) |
66 | * Fixed bug #868258 (Assertion failure with two delete collection) |
67 | * Fixed bug #871623 and #871629 (assertion failures with insertions in dynamic collections) |
68 | * Fixed bug #867262 (allow reuse of iterator over ExtFuncArgItemSequence) |
69 | @@ -87,6 +93,8 @@ |
70 | * New node-reference module. References can be obtained for any node, and |
71 | different nodes cannot have the same identifier. |
72 | * Fixed bug #872697 (segmentation fault with validation of NMTOKENS) |
73 | + * General index cannot be declared as unique if the type of its key is |
74 | + xs:anyAtomicType or xs:untypedAtomic. |
75 | * Added undo for node revalidation |
76 | * Optimization for count(collection()) expressions |
77 | * Fixed bug #872796 (validate-in-place can interfere with other update primitives) |
78 | @@ -105,6 +113,8 @@ |
79 | * Fixed bug #855715 (Invalid escaped characters in regex not caught) |
80 | * Fixed bug #862089 (Split binary/xq install directories for modules) by |
81 | splitting "module path" into separate URI and Library paths |
82 | + * New node-position module. This module allows to obtain a representation of a node position, which |
83 | + can be used to assess structural relationships with other nodes. |
84 | * Fixed bug #872502 (validation of the JSON module xqdoc fails) |
85 | * Fixed bug #897619 (testdriver_mt can not run the XQueryX tests) |
86 | * Fixed bug #867107 (xqdoc dependency to zorba is wrong) |
87 | |
88 | === modified file 'KNOWN_ISSUES.txt' |
89 | --- KNOWN_ISSUES.txt 2011-10-07 08:28:43 +0000 |
90 | +++ KNOWN_ISSUES.txt 2012-01-18 18:33:36 +0000 |
91 | @@ -37,7 +37,7 @@ |
92 | * The serializer currently doesn't implement character maps as specified |
93 | (http://www.w3.org/TR/xslt-xquery-serialization/#character-maps) |
94 | |
95 | -* In the 2.0 release, setting the CMake variables ZORBA_NO_UNICODE to |
96 | +* In the 2.0 release, setting the CMake variables ZORBA_NO_ICU to |
97 | ON is not supported. |
98 | |
99 | * The PHP language binding is not supported on Mac OS X. For details, |
100 | |
101 | === modified file 'doc/cxx/examples/context.cpp' |
102 | --- doc/cxx/examples/context.cpp 2011-07-22 08:12:31 +0000 |
103 | +++ doc/cxx/examples/context.cpp 2012-01-18 18:33:36 +0000 |
104 | @@ -149,7 +149,11 @@ |
105 | outStream2 << lQuery << std::endl; |
106 | std::cout << outStream2.str() << std::endl; |
107 | |
108 | +#ifndef ZORBA_NO_ICU |
109 | if (outStream2.str() != "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\nBook 1.1\n") |
110 | +#else |
111 | + if (outStream2.str() != "<?xml version=\"1.0\"?>\nBook 1.1\n") |
112 | +#endif /* ZORBA_NO_ICU */ |
113 | { |
114 | std::cerr << "Test 4 failed with a wrong result : " << std::endl |
115 | << outStream2.str() << std::endl; |
116 | |
117 | === modified file 'include/zorba/config.h.cmake' |
118 | --- include/zorba/config.h.cmake 2012-01-11 17:30:25 +0000 |
119 | +++ include/zorba/config.h.cmake 2012-01-18 18:33:36 +0000 |
120 | @@ -93,6 +93,8 @@ |
121 | typedef __int64 int64_t; |
122 | #endif /* ZORBA_HAVE_INT64_T */ |
123 | |
124 | +#cmakedefine ZORBA_SIZEOF_WCHAR_T @ZORBA_SIZEOF_WCHAR_T@ |
125 | + |
126 | // Compiler |
127 | #cmakedefine CLANG |
128 | #cmakedefine MSVC |
129 | @@ -145,7 +147,7 @@ |
130 | |
131 | // Zorba features |
132 | #cmakedefine ZORBA_NO_FULL_TEXT |
133 | -#cmakedefine ZORBA_NO_UNICODE |
134 | +#cmakedefine ZORBA_NO_ICU |
135 | #cmakedefine ZORBA_NO_XMLSCHEMA |
136 | #cmakedefine ZORBA_NUMERIC_OPTIMIZATION |
137 | #cmakedefine ZORBA_VERIFY_PEER_SSL_CERTIFICATE |
138 | |
139 | === modified file 'include/zorba/util/time.h' |
140 | --- include/zorba/util/time.h 2011-06-16 16:40:44 +0000 |
141 | +++ include/zorba/util/time.h 2012-01-18 18:33:36 +0000 |
142 | @@ -178,7 +178,7 @@ |
143 | |
144 | inline long get_walltime_in_millis(const walltime& t) |
145 | { |
146 | - return t.time * 1000 + t.millitm; |
147 | + return (long)(t.time * 1000 + t.millitm); |
148 | } |
149 | |
150 | #else /* not Windows, and no clock_gettime() */ |
151 | |
152 | === modified file 'src/api/serialization/serializer.cpp' |
153 | --- src/api/serialization/serializer.cpp 2012-01-11 17:30:25 +0000 |
154 | +++ src/api/serialization/serializer.cpp 2012-01-18 18:33:36 +0000 |
155 | @@ -180,7 +180,6 @@ |
156 | for (; chars < chars_end; chars++ ) |
157 | { |
158 | |
159 | -#ifndef ZORBA_NO_UNICODE |
160 | // the input string is UTF-8 |
161 | int char_length = utf8::char_length(*chars); |
162 | if (char_length == 0) |
163 | @@ -217,7 +216,6 @@ |
164 | |
165 | continue; |
166 | } |
167 | -#endif//ZORBA_NO_UNICODE |
168 | |
169 | // raise an error iff (1) the serialization format is XML 1.0 and (2) the given character is an invalid XML 1.0 character |
170 | if (ser && ser->method == PARAMETER_VALUE_XML && |
171 | @@ -332,14 +330,12 @@ |
172 | { |
173 | tr << (char)0xEF << (char)0xBB << (char)0xBF; |
174 | } |
175 | -#ifndef ZORBA_NO_UNICODE |
176 | else if (ser->encoding == PARAMETER_VALUE_UTF_16) |
177 | { |
178 | // Little-endian |
179 | tr.verbatim((char)0xFF); |
180 | tr.verbatim((char)0xFE); |
181 | } |
182 | -#endif |
183 | } |
184 | } |
185 | |
186 | @@ -834,13 +830,17 @@ |
187 | emitter::emit_declaration(); |
188 | |
189 | if (ser->omit_xml_declaration == PARAMETER_VALUE_NO) { |
190 | - tr << "<?xml version=\"" << ser->version << "\" encoding=\""; |
191 | - if (ser->encoding == PARAMETER_VALUE_UTF_8) { |
192 | - tr << "UTF-8"; |
193 | -#ifndef ZORBA_NO_UNICODE |
194 | - } else if (ser->encoding == PARAMETER_VALUE_UTF_16) { |
195 | - tr << "UTF-16"; |
196 | -#endif |
197 | + tr << "<?xml version=\"" << ser->version; |
198 | + switch (ser->encoding) { |
199 | + case PARAMETER_VALUE_UTF_8: |
200 | + case PARAMETER_VALUE_UTF_16: |
201 | + tr << "\" encoding=\""; |
202 | + switch (ser->encoding) { |
203 | + case PARAMETER_VALUE_UTF_8 : tr << "UTF-8" ; break; |
204 | + case PARAMETER_VALUE_UTF_16: tr << "UTF-16"; break; |
205 | + default : ZORBA_ASSERT(false); |
206 | + } |
207 | + break; |
208 | } |
209 | tr << "\""; |
210 | |
211 | @@ -1146,14 +1146,18 @@ |
212 | } |
213 | |
214 | tr << "<meta http-equiv=\"content-type\" content=\"" |
215 | - << ser->media_type << "; charset="; |
216 | - |
217 | - if (ser->encoding == PARAMETER_VALUE_UTF_8) |
218 | - tr << "UTF-8"; |
219 | -#ifndef ZORBA_NO_UNICODE |
220 | - else if (ser->encoding == PARAMETER_VALUE_UTF_16) |
221 | - tr << "UTF-16"; |
222 | -#endif |
223 | + << ser->media_type; |
224 | + switch (ser->encoding) { |
225 | + case PARAMETER_VALUE_UTF_8: |
226 | + case PARAMETER_VALUE_UTF_16: |
227 | + tr << "\" charset=\""; |
228 | + switch (ser->encoding) { |
229 | + case PARAMETER_VALUE_UTF_8 : tr << "UTF-8" ; break; |
230 | + case PARAMETER_VALUE_UTF_16: tr << "UTF-16"; break; |
231 | + default : ZORBA_ASSERT(false); |
232 | + } |
233 | + break; |
234 | + } |
235 | tr << "\""; |
236 | // closed_parent_tag = 1; |
237 | } |
238 | @@ -1343,14 +1347,18 @@ |
239 | } |
240 | |
241 | tr << "<meta http-equiv=\"content-type\" content=\"" |
242 | - << ser->media_type << "; charset="; |
243 | - |
244 | - if (ser->encoding == PARAMETER_VALUE_UTF_8) |
245 | - tr << "UTF-8"; |
246 | -#ifndef ZORBA_NO_UNICODE |
247 | - else if (ser->encoding == PARAMETER_VALUE_UTF_16) |
248 | - tr << "UTF-16"; |
249 | -#endif |
250 | + << ser->media_type; |
251 | + switch (ser->encoding) { |
252 | + case PARAMETER_VALUE_UTF_8: |
253 | + case PARAMETER_VALUE_UTF_16: |
254 | + tr << "\" charset=\""; |
255 | + switch (ser->encoding) { |
256 | + case PARAMETER_VALUE_UTF_8 : tr << "UTF-8" ; break; |
257 | + case PARAMETER_VALUE_UTF_16: tr << "UTF-16"; break; |
258 | + default : ZORBA_ASSERT(false); |
259 | + } |
260 | + break; |
261 | + } |
262 | tr << "\"/"; |
263 | //closed_parent_tag = 1; |
264 | } |
265 | @@ -2052,10 +2060,8 @@ |
266 | { |
267 | if (!strcmp(aValue, "UTF-8")) |
268 | encoding = PARAMETER_VALUE_UTF_8; |
269 | -#ifndef ZORBA_NO_UNICODE |
270 | else if (!strcmp(aValue, "UTF-16")) |
271 | encoding = PARAMETER_VALUE_UTF_16; |
272 | -#endif |
273 | else |
274 | throw XQUERY_EXCEPTION( |
275 | err::SEPM0016, ERROR_PARAMS( aValue, aName, ZED( GoodValuesAreUTF8 ) ) |
276 | @@ -2164,16 +2170,13 @@ |
277 | { |
278 | tr = new transcoder(os, false); |
279 | } |
280 | -#ifndef ZORBA_NO_UNICODE |
281 | else if (encoding == PARAMETER_VALUE_UTF_16) |
282 | { |
283 | tr = new transcoder(os, true); |
284 | } |
285 | -#endif |
286 | else |
287 | { |
288 | - ZORBA_ASSERT(0); |
289 | - return false; |
290 | + ZORBA_ASSERT(false); |
291 | } |
292 | |
293 | if (method == PARAMETER_VALUE_XML) |
294 | |
295 | === modified file 'src/api/serialization/serializer.h' |
296 | --- src/api/serialization/serializer.h 2011-11-11 07:44:01 +0000 |
297 | +++ src/api/serialization/serializer.h 2012-01-18 18:33:36 +0000 |
298 | @@ -70,10 +70,8 @@ |
299 | PARAMETER_VALUE_TEXT, |
300 | PARAMETER_VALUE_BINARY, |
301 | |
302 | - PARAMETER_VALUE_UTF_8 |
303 | -#ifndef ZORBA_NO_UNICODE |
304 | - ,PARAMETER_VALUE_UTF_16 |
305 | -#endif |
306 | + PARAMETER_VALUE_UTF_8, |
307 | + PARAMETER_VALUE_UTF_16 |
308 | } PARAMETER_VALUE_TYPE; |
309 | |
310 | protected: |
311 | |
312 | === modified file 'src/diagnostics/diagnostic_en.xml' |
313 | --- src/diagnostics/diagnostic_en.xml 2011-12-21 14:40:33 +0000 |
314 | +++ src/diagnostics/diagnostic_en.xml 2012-01-18 18:33:36 +0000 |
315 | @@ -3080,85 +3080,171 @@ |
316 | <value>item type is not a subtype of "$3"</value> |
317 | </entry> |
318 | |
319 | - <entry key="U_REGEX_BAD_ESCAPE_SEQUENCE" if="!defined(ZORBA_NO_UNICODE)"> |
320 | + <entry key="U_REGEX_BAD_ESCAPE_SEQUENCE" if="!defined(ZORBA_NO_ICU)"> |
321 | <value>unrecognized backslash escape sequence</value> |
322 | </entry> |
323 | |
324 | - <entry key="U_REGEX_BAD_INTERVAL" if="!defined(ZORBA_NO_UNICODE)"> |
325 | + <entry key="U_REGEX_BAD_INTERVAL" if="!defined(ZORBA_NO_ICU)"> |
326 | <value>error in {min,max} interval</value> |
327 | </entry> |
328 | |
329 | - <entry key="U_REGEX_INTERNAL_ERROR" if="!defined(ZORBA_NO_UNICODE)"> |
330 | + <entry key="U_REGEX_INTERNAL_ERROR" if="!defined(ZORBA_NO_ICU)"> |
331 | <value>an internal ICU error (bug) was detected</value> |
332 | </entry> |
333 | |
334 | - <entry key="U_REGEX_INVALID_BACK_REF" if="!defined(ZORBA_NO_UNICODE)"> |
335 | + <entry key="U_REGEX_INVALID_BACK_REF" if="!defined(ZORBA_NO_ICU)"> |
336 | <value>backreference to a non-existent capture group</value> |
337 | </entry> |
338 | |
339 | - <entry key="U_REGEX_INVALID_FLAG" if="!defined(ZORBA_NO_UNICODE)"> |
340 | + <entry key="U_REGEX_INVALID_FLAG" if="!defined(ZORBA_NO_ICU)"> |
341 | <value>invalid value for match mode flags</value> |
342 | </entry> |
343 | |
344 | - <entry key="U_REGEX_INVALID_RANGE" if="!defined(ZORBA_NO_UNICODE)"> |
345 | + <entry key="U_REGEX_INVALID_RANGE" if="!defined(ZORBA_NO_ICU)"> |
346 | <value>in character range [x-y], x is greater than y</value> |
347 | </entry> |
348 | |
349 | - <entry key="U_REGEX_INVALID_STATE" if="!defined(ZORBA_NO_UNICODE)"> |
350 | + <entry key="U_REGEX_INVALID_STATE" if="!defined(ZORBA_NO_ICU)"> |
351 | <value>RegexMatcher in invalid state for requested operation</value> |
352 | </entry> |
353 | |
354 | - <entry key="U_REGEX_LOOK_BEHIND_LIMIT" if="!defined(ZORBA_NO_UNICODE)"> |
355 | + <entry key="U_REGEX_LOOK_BEHIND_LIMIT" if="!defined(ZORBA_NO_ICU)"> |
356 | <value>look-behind pattern matches must have a bounded maximum length</value> |
357 | </entry> |
358 | |
359 | - <entry key="U_REGEX_MAX_LT_MIN" if="!defined(ZORBA_NO_UNICODE)"> |
360 | + <entry key="U_REGEX_MAX_LT_MIN" if="!defined(ZORBA_NO_ICU)"> |
361 | <value>in {min,max}, max is less than min</value> |
362 | </entry> |
363 | |
364 | - <entry key="U_REGEX_MISMATCHED_PAREN" if="!defined(ZORBA_NO_UNICODE)"> |
365 | + <entry key="U_REGEX_MISMATCHED_PAREN" if="!defined(ZORBA_NO_ICU)"> |
366 | <value>incorrectly nested parentheses</value> |
367 | </entry> |
368 | |
369 | - <entry key="U_REGEX_MISSING_CLOSE_BRACKET" if="!defined(ZORBA_NO_UNICODE)"> |
370 | + <entry key="U_REGEX_MISSING_CLOSE_BRACKET" if="!defined(ZORBA_NO_ICU)"> |
371 | <value>missing ']'</value> |
372 | </entry> |
373 | |
374 | - <entry key="U_REGEX_NUMBER_TOO_BIG" if="!defined(ZORBA_NO_UNICODE)"> |
375 | + <entry key="U_REGEX_NUMBER_TOO_BIG" if="!defined(ZORBA_NO_ICU)"> |
376 | <value>decimal number is too large</value> |
377 | </entry> |
378 | |
379 | - <entry key="U_REGEX_OCTAL_TOO_BIG" if="!defined(ZORBA_NO_UNICODE)"> |
380 | + <entry key="U_REGEX_OCTAL_TOO_BIG" if="!defined(ZORBA_NO_ICU)"> |
381 | <value>octal character constants must be <= 0377</value> |
382 | </entry> |
383 | |
384 | - <entry key="U_REGEX_PROPERTY_SYNTAX" if="!defined(ZORBA_NO_UNICODE)"> |
385 | + <entry key="U_REGEX_PROPERTY_SYNTAX" if="!defined(ZORBA_NO_ICU)"> |
386 | <value>incorrect Unicode property</value> |
387 | </entry> |
388 | |
389 | - <entry key="U_REGEX_RULE_SYNTAX" if="!defined(ZORBA_NO_UNICODE)"> |
390 | + <entry key="U_REGEX_RULE_SYNTAX" if="!defined(ZORBA_NO_ICU)"> |
391 | <value>syntax error</value> |
392 | </entry> |
393 | |
394 | - <entry key="U_REGEX_SET_CONTAINS_STRING" if="!defined(ZORBA_NO_UNICODE)"> |
395 | + <entry key="U_REGEX_SET_CONTAINS_STRING" if="!defined(ZORBA_NO_ICU)"> |
396 | <value>can not have UnicodeSets containing strings</value> |
397 | </entry> |
398 | |
399 | - <entry key="U_REGEX_STACK_OVERFLOW" if="!defined(ZORBA_NO_UNICODE)"> |
400 | + <entry key="U_REGEX_STACK_OVERFLOW" if="!defined(ZORBA_NO_ICU)"> |
401 | <value>backtrack stack overflow</value> |
402 | </entry> |
403 | |
404 | - <entry key="U_REGEX_STOPPED_BY_CALLER" if="!defined(ZORBA_NO_UNICODE)"> |
405 | + <entry key="U_REGEX_STOPPED_BY_CALLER" if="!defined(ZORBA_NO_ICU)"> |
406 | <value>matching operation aborted by user callback fn</value> |
407 | </entry> |
408 | |
409 | - <entry key="U_REGEX_TIME_OUT" if="!defined(ZORBA_NO_UNICODE)"> |
410 | + <entry key="U_REGEX_TIME_OUT" if="!defined(ZORBA_NO_ICU)"> |
411 | <value>maximum allowed match time exceeded</value> |
412 | </entry> |
413 | |
414 | - <entry key="U_REGEX_UNIMPLEMENTED" if="!defined(ZORBA_NO_UNICODE)"> |
415 | - <value>use of regular expression feature that is not yet implemented</value> |
416 | - </entry> |
417 | + <entry key="U_REGEX_UNIMPLEMENTED" if="!defined(ZORBA_NO_ICU)"> |
418 | + <value>use of regular expression feature that is not yet implemented</value> |
419 | + </entry> |
420 | + |
421 | + <!-- Regex Ascii error messages--> |
422 | + <entry key="REGEX_UNIMPLEMENTED" if="defined(ZORBA_NO_ICU)"> |
423 | + <value>use of regular expression feature that is not yet implemented</value> |
424 | + </entry> |
425 | + |
426 | + <entry key="REGEX_MISMATCHED_PAREN" if="defined(ZORBA_NO_ICU)"> |
427 | + <value>incorrectly nested parentheses</value> |
428 | + </entry> |
429 | + |
430 | + <entry key="REGEX_BROKEN_P_CONSTRUCT" if="defined(ZORBA_NO_ICU)"> |
431 | + <value>broken \\p construct</value> |
432 | + </entry> |
433 | + |
434 | + <entry key="REGEX_UNKNOWN_PL_CONSTRUCT" if="defined(ZORBA_NO_ICU)"> |
435 | + <value>unknown \\p{L?} category; supported categories: L, Lu, Ll, Lt, Lm, Lo</value> |
436 | + </entry> |
437 | + |
438 | + <entry key="REGEX_UNKNOWN_PM_CONSTRUCT" if="defined(ZORBA_NO_ICU)"> |
439 | + <value>unknown \\p{M?} category; supported categories: M, Mn, Mc, Me</value> |
440 | + </entry> |
441 | + |
442 | + <entry key="REGEX_UNKNOWN_PN_CONSTRUCT" if="defined(ZORBA_NO_ICU)"> |
443 | + <value>unknown \\p{N?} category; supported categories: N, Nd, Nl, No</value> |
444 | + </entry> |
445 | + |
446 | + <entry key="REGEX_UNKNOWN_PP_CONSTRUCT" if="defined(ZORBA_NO_ICU)"> |
447 | + <value>unknown \\p{P?} category; supported categories: P, Pc, Pd, Ps, Pe, Pi, Pf, Po</value> |
448 | + </entry> |
449 | + |
450 | + <entry key="REGEX_UNKNOWN_PZ_CONSTRUCT" if="defined(ZORBA_NO_ICU)"> |
451 | + <value>unknown \\p{Z?} category; supported categories: Z, Zs, Zl, Zp</value> |
452 | + </entry> |
453 | + |
454 | + <entry key="REGEX_UNKNOWN_PS_CONSTRUCT" if="defined(ZORBA_NO_ICU)"> |
455 | + <value>unknown \\p{S?} category; supported categories: S, Sm, Sc, Sk, So</value> |
456 | + </entry> |
457 | + |
458 | + <entry key="REGEX_UNKNOWN_PC_CONSTRUCT" if="defined(ZORBA_NO_ICU)"> |
459 | + <value>unknown \\p{C?} category; supported categories: C, Cc, Cf, Co, Cn(for not assigned)</value> |
460 | + </entry> |
461 | + |
462 | + <entry key="REGEX_BROKEN_PIs_CONSTRUCT" if="defined(ZORBA_NO_ICU)"> |
463 | + <value>broken \\p{Is} construct; valid characters are [a-zA-Z0-9-]</value> |
464 | + </entry> |
465 | + |
466 | + <entry key="REGEX_UNKNOWN_PIs_CONSTRUCT" if="defined(ZORBA_NO_ICU)"> |
467 | + <value>unknown \\p{Is} category block; see supported block escapes here: http://www.w3.org/TR/xmlschema-2/#charcter-classes</value> |
468 | + </entry> |
469 | + |
470 | + <entry key="REGEX_INVALID_UNICODE_CODEPOINT_u" if="defined(ZORBA_NO_ICU)"> |
471 | + <value>invalid unicode hex, should be in form \\uXXXX or \\UXXXXXXXX</value> |
472 | + </entry> |
473 | + |
474 | + <entry key="REGEX_UNKNOWN_ESC_CHAR" if="defined(ZORBA_NO_ICU)"> |
475 | + <value>unknown \\? escape char; supported escapes are: \\[nrt\\|.?*+(){}[]-^$] for char escapes, \\[pP] for categories and \\[sSiIcCdDwW] for multichar groups</value> |
476 | + </entry> |
477 | + |
478 | + <entry key="REGEX_INVALID_BACK_REF" if="defined(ZORBA_NO_ICU)"> |
479 | + <value>\\$3 backreference to a non-existent capture group ($4 groups so far)</value> |
480 | + </entry> |
481 | + |
482 | + <entry key="REGEX_INVALID_ATOM_CHAR" if="defined(ZORBA_NO_ICU)"> |
483 | + <value>$3 - invalid character for an atom; forbidden characters are: [{}?*+|^]</value> |
484 | + </entry> |
485 | + |
486 | + <entry key="REGEX_INVALID_SUBCLASS" if="defined(ZORBA_NO_ICU)"> |
487 | + <value>malformed class subtraction</value> |
488 | + </entry> |
489 | + |
490 | + <entry key="REGEX_INVALID_USE_OF_SUBCLASS" if="defined(ZORBA_NO_ICU)"> |
491 | + <value>improper use of class subtraction: it must be the last construct in a class group [xxx-[yyy]]</value> |
492 | + </entry> |
493 | + |
494 | + <entry key="REGEX_MULTICHAR_IN_CHAR_RANGE" if="defined(ZORBA_NO_ICU)"> |
495 | + <value>multichars or char categories cannot be part of a char range</value> |
496 | + </entry> |
497 | + |
498 | + <entry key="REGEX_MISSING_CLOSE_BRACKET" if="defined(ZORBA_NO_ICU)"> |
499 | + <value>missing close bracket in char group</value> |
500 | + </entry> |
501 | + |
502 | + <entry key="REGEX_MAX_LT_MIN" if="defined(ZORBA_NO_ICU)"> |
503 | + <value>in {min,max}, max is less than min</value> |
504 | + </entry> |
505 | + |
506 | |
507 | <entry key="UnaryArithOp"> |
508 | <value>unary arithmetic operator</value> |
509 | |
510 | === modified file 'src/diagnostics/pregenerated/dict_en.cpp' |
511 | --- src/diagnostics/pregenerated/dict_en.cpp 2011-12-21 14:40:33 +0000 |
512 | +++ src/diagnostics/pregenerated/dict_en.cpp 2012-01-18 18:33:36 +0000 |
513 | @@ -565,6 +565,69 @@ |
514 | { "~ParserNoCreateTree", "XML tree creation failed" }, |
515 | { "~PromotionImpossible", "promotion not possible" }, |
516 | { "~QuotedColon_23", "\"$2\": $3" }, |
517 | +#if defined(ZORBA_NO_ICU) |
518 | + { "~REGEX_BROKEN_PIs_CONSTRUCT", "broken \\p{Is} construct; valid characters are [a-zA-Z0-9-]" }, |
519 | +#endif |
520 | +#if defined(ZORBA_NO_ICU) |
521 | + { "~REGEX_BROKEN_P_CONSTRUCT", "broken \\p construct" }, |
522 | +#endif |
523 | +#if defined(ZORBA_NO_ICU) |
524 | + { "~REGEX_INVALID_ATOM_CHAR", "$3 - invalid character for an atom; forbidden characters are: [{}?*+|^]" }, |
525 | +#endif |
526 | +#if defined(ZORBA_NO_ICU) |
527 | + { "~REGEX_INVALID_BACK_REF", "\\$3 backreference to a non-existent capture group ($4 groups so far)" }, |
528 | +#endif |
529 | +#if defined(ZORBA_NO_ICU) |
530 | + { "~REGEX_INVALID_SUBCLASS", "malformed class subtraction" }, |
531 | +#endif |
532 | +#if defined(ZORBA_NO_ICU) |
533 | + { "~REGEX_INVALID_UNICODE_CODEPOINT_u", "invalid unicode hex, should be in form \\uXXXX or \\UXXXXXXXX" }, |
534 | +#endif |
535 | +#if defined(ZORBA_NO_ICU) |
536 | + { "~REGEX_INVALID_USE_OF_SUBCLASS", "improper use of class subtraction: it must be the last construct in a class group [xxx-[yyy]]" }, |
537 | +#endif |
538 | +#if defined(ZORBA_NO_ICU) |
539 | + { "~REGEX_MAX_LT_MIN", "in {min,max}, max is less than min" }, |
540 | +#endif |
541 | +#if defined(ZORBA_NO_ICU) |
542 | + { "~REGEX_MISMATCHED_PAREN", "incorrectly nested parentheses" }, |
543 | +#endif |
544 | +#if defined(ZORBA_NO_ICU) |
545 | + { "~REGEX_MISSING_CLOSE_BRACKET", "missing close bracket in char group" }, |
546 | +#endif |
547 | +#if defined(ZORBA_NO_ICU) |
548 | + { "~REGEX_MULTICHAR_IN_CHAR_RANGE", "multichars or char categories cannot be part of a char range" }, |
549 | +#endif |
550 | +#if defined(ZORBA_NO_ICU) |
551 | + { "~REGEX_UNIMPLEMENTED", "use of regular expression feature that is not yet implemented" }, |
552 | +#endif |
553 | +#if defined(ZORBA_NO_ICU) |
554 | + { "~REGEX_UNKNOWN_ESC_CHAR", "unknown \\? escape char; supported escapes are: \\[nrt\\|.?*+(){}[]-^$] for char escapes, \\[pP] for categories and \\[sSiIcCdDwW] for multichar groups" }, |
555 | +#endif |
556 | +#if defined(ZORBA_NO_ICU) |
557 | + { "~REGEX_UNKNOWN_PC_CONSTRUCT", "unknown \\p{C?} category; supported categories: C, Cc, Cf, Co, Cn(for not assigned)" }, |
558 | +#endif |
559 | +#if defined(ZORBA_NO_ICU) |
560 | + { "~REGEX_UNKNOWN_PIs_CONSTRUCT", "unknown \\p{Is} category block; see supported block escapes here: http://www.w3.org/TR/xmlschema-2/#charcter-classes" }, |
561 | +#endif |
562 | +#if defined(ZORBA_NO_ICU) |
563 | + { "~REGEX_UNKNOWN_PL_CONSTRUCT", "unknown \\p{L?} category; supported categories: L, Lu, Ll, Lt, Lm, Lo" }, |
564 | +#endif |
565 | +#if defined(ZORBA_NO_ICU) |
566 | + { "~REGEX_UNKNOWN_PM_CONSTRUCT", "unknown \\p{M?} category; supported categories: M, Mn, Mc, Me" }, |
567 | +#endif |
568 | +#if defined(ZORBA_NO_ICU) |
569 | + { "~REGEX_UNKNOWN_PN_CONSTRUCT", "unknown \\p{N?} category; supported categories: N, Nd, Nl, No" }, |
570 | +#endif |
571 | +#if defined(ZORBA_NO_ICU) |
572 | + { "~REGEX_UNKNOWN_PP_CONSTRUCT", "unknown \\p{P?} category; supported categories: P, Pc, Pd, Ps, Pe, Pi, Pf, Po" }, |
573 | +#endif |
574 | +#if defined(ZORBA_NO_ICU) |
575 | + { "~REGEX_UNKNOWN_PS_CONSTRUCT", "unknown \\p{S?} category; supported categories: S, Sm, Sc, Sk, So" }, |
576 | +#endif |
577 | +#if defined(ZORBA_NO_ICU) |
578 | + { "~REGEX_UNKNOWN_PZ_CONSTRUCT", "unknown \\p{Z?} category; supported categories: Z, Zs, Zl, Zp" }, |
579 | +#endif |
580 | { "~SEPM0009_Not10", "the version parameter has a value other than \"1.0\" and the doctype-system parameter is specified" }, |
581 | { "~SEPM0009_NotOmit", "the standalone attribute has a value other than \"omit\"" }, |
582 | { "~SchemaAttributeName", "schema-attribute name" }, |
583 | @@ -588,64 +651,64 @@ |
584 | { "~TwoDecimalFormatsSameName_2", "\"$2\": two decimal formats with this name" }, |
585 | { "~TwoDefaultDecimalFormats", "two default decimal formats" }, |
586 | { "~TypeIsNotSubtype", "item type is not a subtype of \"$3\"" }, |
587 | -#if !defined(ZORBA_NO_UNICODE) |
588 | +#if !defined(ZORBA_NO_ICU) |
589 | { "~U_REGEX_BAD_ESCAPE_SEQUENCE", "unrecognized backslash escape sequence" }, |
590 | #endif |
591 | -#if !defined(ZORBA_NO_UNICODE) |
592 | +#if !defined(ZORBA_NO_ICU) |
593 | { "~U_REGEX_BAD_INTERVAL", "error in {min,max} interval" }, |
594 | #endif |
595 | -#if !defined(ZORBA_NO_UNICODE) |
596 | +#if !defined(ZORBA_NO_ICU) |
597 | { "~U_REGEX_INTERNAL_ERROR", "an internal ICU error (bug) was detected" }, |
598 | #endif |
599 | -#if !defined(ZORBA_NO_UNICODE) |
600 | +#if !defined(ZORBA_NO_ICU) |
601 | { "~U_REGEX_INVALID_BACK_REF", "backreference to a non-existent capture group" }, |
602 | #endif |
603 | -#if !defined(ZORBA_NO_UNICODE) |
604 | +#if !defined(ZORBA_NO_ICU) |
605 | { "~U_REGEX_INVALID_FLAG", "invalid value for match mode flags" }, |
606 | #endif |
607 | -#if !defined(ZORBA_NO_UNICODE) |
608 | +#if !defined(ZORBA_NO_ICU) |
609 | { "~U_REGEX_INVALID_RANGE", "in character range [x-y], x is greater than y" }, |
610 | #endif |
611 | -#if !defined(ZORBA_NO_UNICODE) |
612 | +#if !defined(ZORBA_NO_ICU) |
613 | { "~U_REGEX_INVALID_STATE", "RegexMatcher in invalid state for requested operation" }, |
614 | #endif |
615 | -#if !defined(ZORBA_NO_UNICODE) |
616 | +#if !defined(ZORBA_NO_ICU) |
617 | { "~U_REGEX_LOOK_BEHIND_LIMIT", "look-behind pattern matches must have a bounded maximum length" }, |
618 | #endif |
619 | -#if !defined(ZORBA_NO_UNICODE) |
620 | +#if !defined(ZORBA_NO_ICU) |
621 | { "~U_REGEX_MAX_LT_MIN", "in {min,max}, max is less than min" }, |
622 | #endif |
623 | -#if !defined(ZORBA_NO_UNICODE) |
624 | +#if !defined(ZORBA_NO_ICU) |
625 | { "~U_REGEX_MISMATCHED_PAREN", "incorrectly nested parentheses" }, |
626 | #endif |
627 | -#if !defined(ZORBA_NO_UNICODE) |
628 | +#if !defined(ZORBA_NO_ICU) |
629 | { "~U_REGEX_MISSING_CLOSE_BRACKET", "missing ']'" }, |
630 | #endif |
631 | -#if !defined(ZORBA_NO_UNICODE) |
632 | +#if !defined(ZORBA_NO_ICU) |
633 | { "~U_REGEX_NUMBER_TOO_BIG", "decimal number is too large" }, |
634 | #endif |
635 | -#if !defined(ZORBA_NO_UNICODE) |
636 | +#if !defined(ZORBA_NO_ICU) |
637 | { "~U_REGEX_OCTAL_TOO_BIG", "octal character constants must be <= 0377" }, |
638 | #endif |
639 | -#if !defined(ZORBA_NO_UNICODE) |
640 | +#if !defined(ZORBA_NO_ICU) |
641 | { "~U_REGEX_PROPERTY_SYNTAX", "incorrect Unicode property" }, |
642 | #endif |
643 | -#if !defined(ZORBA_NO_UNICODE) |
644 | +#if !defined(ZORBA_NO_ICU) |
645 | { "~U_REGEX_RULE_SYNTAX", "syntax error" }, |
646 | #endif |
647 | -#if !defined(ZORBA_NO_UNICODE) |
648 | +#if !defined(ZORBA_NO_ICU) |
649 | { "~U_REGEX_SET_CONTAINS_STRING", "can not have UnicodeSets containing strings" }, |
650 | #endif |
651 | -#if !defined(ZORBA_NO_UNICODE) |
652 | +#if !defined(ZORBA_NO_ICU) |
653 | { "~U_REGEX_STACK_OVERFLOW", "backtrack stack overflow" }, |
654 | #endif |
655 | -#if !defined(ZORBA_NO_UNICODE) |
656 | +#if !defined(ZORBA_NO_ICU) |
657 | { "~U_REGEX_STOPPED_BY_CALLER", "matching operation aborted by user callback fn" }, |
658 | #endif |
659 | -#if !defined(ZORBA_NO_UNICODE) |
660 | +#if !defined(ZORBA_NO_ICU) |
661 | { "~U_REGEX_TIME_OUT", "maximum allowed match time exceeded" }, |
662 | #endif |
663 | -#if !defined(ZORBA_NO_UNICODE) |
664 | +#if !defined(ZORBA_NO_ICU) |
665 | { "~U_REGEX_UNIMPLEMENTED", "use of regular expression feature that is not yet implemented" }, |
666 | #endif |
667 | { "~UnaryArithOp", "unary arithmetic operator" }, |
668 | |
669 | === modified file 'src/runtime/full_text/CMakeLists.txt' |
670 | --- src/runtime/full_text/CMakeLists.txt 2011-08-31 13:17:59 +0000 |
671 | +++ src/runtime/full_text/CMakeLists.txt 2012-01-18 18:33:36 +0000 |
672 | @@ -42,11 +42,11 @@ |
673 | default_tokenizer.cpp |
674 | ) |
675 | |
676 | -IF (ZORBA_NO_UNICODE) |
677 | +IF (ZORBA_NO_ICU) |
678 | LIST(APPEND FULLTEXT_SRCS latin_tokenizer.cpp) |
679 | -ELSE (ZORBA_NO_UNICODE) |
680 | +ELSE (ZORBA_NO_ICU) |
681 | LIST(APPEND FULLTEXT_SRCS icu_tokenizer.cpp) |
682 | -ENDIF (ZORBA_NO_UNICODE) |
683 | +ENDIF (ZORBA_NO_ICU) |
684 | |
685 | ADD_SRC_SUBFOLDER(FULLTEXT_SRCS stemmer LIBSTEMMER_SRCS) |
686 | |
687 | |
688 | === modified file 'src/runtime/full_text/default_tokenizer.cpp' |
689 | --- src/runtime/full_text/default_tokenizer.cpp 2011-08-31 02:53:07 +0000 |
690 | +++ src/runtime/full_text/default_tokenizer.cpp 2012-01-18 18:33:36 +0000 |
691 | @@ -19,22 +19,22 @@ |
692 | #include <zorba/config.h> |
693 | |
694 | #include "default_tokenizer.h" |
695 | -#ifdef ZORBA_NO_UNICODE |
696 | +#ifdef ZORBA_NO_ICU |
697 | # include "latin_tokenizer.h" |
698 | #else |
699 | # include "icu_tokenizer.h" |
700 | -#endif /* ZORBA_NO_UNICODE */ |
701 | +#endif /* ZORBA_NO_ICU */ |
702 | |
703 | namespace zorba { |
704 | |
705 | /////////////////////////////////////////////////////////////////////////////// |
706 | |
707 | TokenizerProvider const& default_tokenizer_provider() { |
708 | -#ifdef ZORBA_NO_UNICODE |
709 | +#ifdef ZORBA_NO_ICU |
710 | static LatinTokenizerProvider const instance; |
711 | #else |
712 | static ICU_TokenizerProvider const instance; |
713 | -#endif /* ZORBA_NO_UNICODE */ |
714 | +#endif /* ZORBA_NO_ICU */ |
715 | return instance; |
716 | }; |
717 | |
718 | |
719 | === modified file 'src/runtime/full_text/latin_tokenizer.cpp' |
720 | --- src/runtime/full_text/latin_tokenizer.cpp 2011-08-31 03:39:32 +0000 |
721 | +++ src/runtime/full_text/latin_tokenizer.cpp 2012-01-18 18:33:36 +0000 |
722 | @@ -18,8 +18,9 @@ |
723 | #include <functional> |
724 | |
725 | #include <zorba/diagnostic_list.h> |
726 | -#include <zorba/xquery_exception.h> |
727 | -#include <zorba/zorba.h> |
728 | + |
729 | +#include "diagnostics/dict.h" |
730 | +#include "diagnostics/xquery_exception.h" |
731 | |
732 | #include "latin_tokenizer.h" |
733 | |
734 | |
735 | === modified file 'src/runtime/full_text/latin_tokenizer.h' |
736 | --- src/runtime/full_text/latin_tokenizer.h 2011-08-31 03:39:32 +0000 |
737 | +++ src/runtime/full_text/latin_tokenizer.h 2012-01-18 18:33:36 +0000 |
738 | @@ -14,12 +14,12 @@ |
739 | * limitations under the License. |
740 | */ |
741 | |
742 | -#ifndef ZORBA_WESTERN_TOKENIZER_H |
743 | -#define ZORBA_WESTERN_TOKENIZER_H |
744 | +#ifndef ZORBA_LATIN_TOKENIZER_H |
745 | +#define ZORBA_LATIN_TOKENIZER_H |
746 | |
747 | #include <zorba/config.h> |
748 | |
749 | -#ifdef ZORBA_NO_FULL_TEXT |
750 | +#ifdef ZORBA_NO_ICU |
751 | |
752 | #include <zorba/tokenizer.h> |
753 | #include "zorbatypes/zstring.h" |
754 | @@ -38,8 +38,8 @@ |
755 | |
756 | // inherited |
757 | void destroy() const; |
758 | - void tokenize( char const*, size_type, iso639_1::type, bool, Callback&, |
759 | - void* ); |
760 | + void tokenize( char const*, size_type, locale::iso639_1::type, bool, |
761 | + Callback&, void* ); |
762 | |
763 | private: |
764 | typedef zstring string_type; |
765 | @@ -64,13 +64,14 @@ |
766 | class LatinTokenizerProvider : public TokenizerProvider { |
767 | public: |
768 | // inherited |
769 | - Tokenizer::ptr getTokenizer( iso639_1::type, Tokenizer::Numbers& ) const; |
770 | + Tokenizer::ptr getTokenizer( locale::iso639_1::type, |
771 | + Tokenizer::Numbers& ) const; |
772 | }; |
773 | |
774 | /////////////////////////////////////////////////////////////////////////////// |
775 | |
776 | } // namespace zorba |
777 | |
778 | -#endif /* ZORBA_NO_FULL_TEXT */ |
779 | -#endif /* ZORBA_WESTERN_TOKENIZER_H */ |
780 | +#endif /* ZORBA_NO_ICU */ |
781 | +#endif /* ZORBA_LATIN_TOKENIZER_H */ |
782 | /* vim:set et sw=2 ts=2: */ |
783 | |
784 | === modified file 'src/runtime/numerics/format_integer_impl.cpp' |
785 | --- src/runtime/numerics/format_integer_impl.cpp 2011-07-07 12:47:14 +0000 |
786 | +++ src/runtime/numerics/format_integer_impl.cpp 2012-01-18 18:33:36 +0000 |
787 | @@ -881,7 +881,7 @@ |
788 | utf8_result += (*valueit); |
789 | } |
790 | else |
791 | - utf8_result += (0x2080 + *valueit - '0'); |
792 | + utf8_result += (unicode::code_point)(0x2080 + *valueit - '0'); |
793 | } |
794 | } |
795 | else if((c0 == 0x2460) || //CIRCLED DIGIT ONE (1-20) |
796 | |
797 | === modified file 'src/runtime/numerics/numerics_impl.cpp' |
798 | --- src/runtime/numerics/numerics_impl.cpp 2011-07-10 14:55:46 +0000 |
799 | +++ src/runtime/numerics/numerics_impl.cpp 2012-01-18 18:33:36 +0000 |
800 | @@ -490,7 +490,7 @@ |
801 | minus( "-" ) |
802 | { |
803 | utf8_string<zstring> u_per_mille( per_mille ); |
804 | - u_per_mille = 0x2030; |
805 | + u_per_mille = (unicode::code_point)0x2030; |
806 | } |
807 | |
808 | void readFormat(const DecimalFormat_t& df_t) |
809 | |
810 | === modified file 'src/runtime/strings/strings_impl.cpp' |
811 | --- src/runtime/strings/strings_impl.cpp 2012-01-11 17:30:25 +0000 |
812 | +++ src/runtime/strings/strings_impl.cpp 2012-01-18 18:33:36 +0000 |
813 | @@ -806,7 +806,9 @@ |
814 | zstring normForm; |
815 | zstring resStr; |
816 | unicode::normalization::type normType; |
817 | +#ifndef ZORBA_NO_ICU |
818 | bool success; |
819 | +#endif /* ZORBA_NO_ICU */ |
820 | |
821 | PlanIteratorState* state; |
822 | DEFAULT_STACK_INIT(PlanIteratorState, state, planState); |
823 | @@ -856,10 +858,10 @@ |
824 | } |
825 | |
826 | item0->getStringValue2(resStr); |
827 | -#ifndef ZORBA_NO_UNICODE |
828 | +#ifndef ZORBA_NO_ICU |
829 | success = utf8::normalize(resStr, normType, &resStr); |
830 | ZORBA_ASSERT(success); |
831 | -#endif//#ifndef ZORBA_NO_UNICODE |
832 | +#endif//#ifndef ZORBA_NO_ICU |
833 | STACK_PUSH(GENV_ITEMFACTORY->createString(result, resStr), state ); |
834 | } |
835 | else |
836 | @@ -988,7 +990,7 @@ |
837 | trans_map[ *map_i ] = *trans_i; |
838 | |
839 | for ( ; map_i != map_end; ++map_i ) |
840 | - trans_map[ *map_i ] = ~0; |
841 | + trans_map[ *map_i ] = static_cast<unicode::code_point>( ~0 ); |
842 | } |
843 | |
844 | utf8_string<zstring> u_result_string( result_string ); |
845 | @@ -1003,7 +1005,7 @@ |
846 | cp_map_type::const_iterator const found_i = trans_map.find( cp ); |
847 | if ( found_i != trans_map.end() ) { |
848 | cp = found_i->second; |
849 | - if ( cp == ~0 ) |
850 | + if ( cp == static_cast<unicode::code_point>( ~0 ) ) |
851 | continue; |
852 | } |
853 | u_result_string += cp; |
854 | @@ -1791,16 +1793,33 @@ |
855 | int &utf8start, |
856 | unsigned int &bytestart, |
857 | int utf8end, |
858 | + unsigned int byteend, |
859 | zstring &out) |
860 | { |
861 | +#ifndef ZORBA_NO_ICU |
862 | utf8::size_type clen; |
863 | - while(utf8start < utf8end) |
864 | - { |
865 | - clen = utf8::char_length(*sin); |
866 | - out.append(sin, clen); |
867 | - utf8start++; |
868 | - bytestart += clen; |
869 | - sin += clen; |
870 | + if(utf8end) |
871 | + { |
872 | + while(utf8start < utf8end) |
873 | + { |
874 | + clen = utf8::char_length(*sin); |
875 | + if(clen == 0) |
876 | + clen = 1; |
877 | + out.append(sin, clen); |
878 | + utf8start++; |
879 | + bytestart += clen; |
880 | + sin += clen; |
881 | + } |
882 | + } |
883 | + else |
884 | +#endif |
885 | + { |
886 | + if(!utf8end) |
887 | + utf8end = byteend; |
888 | + out.append(sin, utf8end-bytestart); |
889 | + sin += utf8end-bytestart; |
890 | + utf8start = utf8end; |
891 | + bytestart = utf8end; |
892 | } |
893 | } |
894 | |
895 | @@ -1808,6 +1827,7 @@ |
896 | int &match_end1, |
897 | unsigned int &match_end1_bytes, |
898 | int match_start2, |
899 | + unsigned int match_start2_bytes, |
900 | const char *&strin) |
901 | { |
902 | store::Item_t non_match_elem; |
903 | @@ -1829,7 +1849,7 @@ |
904 | // utf8_it++; |
905 | // match_end1++; |
906 | //} |
907 | - copyUtf8Chars(strin, match_end1, match_end1_bytes, match_start2, non_match_str); |
908 | + copyUtf8Chars(strin, match_end1, match_end1_bytes, match_start2, match_start2_bytes, non_match_str); |
909 | store::Item_t non_match_text_item; |
910 | GENV_ITEMFACTORY->createTextNode(non_match_text_item, non_match_elem, non_match_str); |
911 | } |
912 | @@ -1860,19 +1880,31 @@ |
913 | i--; |
914 | break; |
915 | } |
916 | +#ifndef ZORBA_NO_ICU |
917 | match_startg = rx.get_match_start(i+1); |
918 | if((match_startg < 0) && (gparent < 0)) |
919 | continue; |
920 | +#else |
921 | + int temp_endg; |
922 | + match_startg = -1; |
923 | + temp_endg = -1; |
924 | + if(!rx.get_match_start_end_bytes(i+1, &match_startg, &temp_endg) && (gparent < 0)) |
925 | + continue; |
926 | +#endif |
927 | if(match_endgood < match_startg) |
928 | { |
929 | //add non-group match text |
930 | zstring non_group_str; |
931 | |
932 | - copyUtf8Chars(sin, match_endgood, match_end1_bytes, match_startg, non_group_str); |
933 | + copyUtf8Chars(sin, match_endgood, match_end1_bytes, match_startg, 0, non_group_str); |
934 | store::Item_t non_group_text_item; |
935 | GENV_ITEMFACTORY->createTextNode(non_group_text_item, parent.getp(), non_group_str); |
936 | } |
937 | +#ifndef ZORBA_NO_ICU |
938 | match_endg = rx.get_match_end(i+1); |
939 | +#else |
940 | + match_endg = temp_endg; |
941 | +#endif |
942 | //add group match text |
943 | GENV_ITEMFACTORY->createQName(group_element_name, |
944 | static_context::W3C_FN_NS, "fn", "group"); |
945 | @@ -1903,7 +1935,7 @@ |
946 | } |
947 | zstring group_str; |
948 | |
949 | - copyUtf8Chars(sin, match_startg, match_end1_bytes, match_endg, group_str); |
950 | + copyUtf8Chars(sin, match_startg, match_end1_bytes, match_endg, 0, group_str); |
951 | store::Item_t group_text_item; |
952 | GENV_ITEMFACTORY->createTextNode(group_text_item, group_elem.getp(), group_str); |
953 | } |
954 | @@ -1912,7 +1944,7 @@ |
955 | { |
956 | zstring non_group_str; |
957 | |
958 | - copyUtf8Chars(sin, match_endgood, match_end1_bytes, match_end2, non_group_str); |
959 | + copyUtf8Chars(sin, match_endgood, match_end1_bytes, match_end2, 0, non_group_str); |
960 | store::Item_t non_group_text_item; |
961 | GENV_ITEMFACTORY->createTextNode(non_group_text_item, parent, non_group_str); |
962 | } |
963 | @@ -2140,8 +2172,14 @@ |
964 | reachedEnd = false; |
965 | while(rx.find_next_match(&reachedEnd)) |
966 | { |
967 | - int match_start2 = rx.get_match_start(); |
968 | - int match_end2 = rx.get_match_end(); |
969 | + int match_start2; |
970 | + int match_end2; |
971 | +#ifndef ZORBA_NO_ICU |
972 | + match_start2 = rx.get_match_start(); |
973 | + match_end2 = rx.get_match_end(); |
974 | +#else |
975 | + rx.get_match_start_end_bytes(0, &match_start2, &match_end2); |
976 | +#endif |
977 | ZORBA_ASSERT(match_start2 >= 0); |
978 | |
979 | if(is_input_stream && reachedEnd && !instream->eof()) |
980 | @@ -2153,7 +2191,7 @@ |
981 | //construct the fn:non-match |
982 | if(match_start2 > match_end1) |
983 | { |
984 | - addNonMatchElement(result, match_end1, match_end1_bytes, match_start2, instr); |
985 | + addNonMatchElement(result, match_end1, match_end1_bytes, match_start2, 0, instr); |
986 | } |
987 | |
988 | //construct the fn:match |
989 | @@ -2161,7 +2199,7 @@ |
990 | match_end1 = match_end2; |
991 | } |
992 | |
993 | - if(is_input_stream && reachedEnd && !instream->eof()) |
994 | + if(is_input_stream && !instream->eof()) |
995 | { |
996 | //load some more data, maybe the match will be different |
997 | if(match_end1_bytes) |
998 | @@ -2209,7 +2247,7 @@ |
999 | else |
1000 | { |
1001 | if(match_end1_bytes < streambuf_read) |
1002 | - addNonMatchElement(result, match_end1, match_end1_bytes, streambuf_read, instr); |
1003 | + addNonMatchElement(result, match_end1, match_end1_bytes, 0, streambuf_read, instr); |
1004 | if(is_input_stream && instream->eof()) |
1005 | reachedEnd = true; |
1006 | } |
1007 | |
1008 | === modified file 'src/system/globalenv.cpp' |
1009 | --- src/system/globalenv.cpp 2012-01-11 17:30:25 +0000 |
1010 | +++ src/system/globalenv.cpp 2012-01-18 18:33:36 +0000 |
1011 | @@ -17,11 +17,11 @@ |
1012 | |
1013 | #include "common/common.h" |
1014 | |
1015 | -#ifndef ZORBA_NO_UNICODE |
1016 | +#ifndef ZORBA_NO_ICU |
1017 | # include <unicode/uclean.h> |
1018 | # include <unicode/utypes.h> |
1019 | # include <unicode/udata.h> |
1020 | -#endif /* ZORBA_NO_UNICODE */ |
1021 | +#endif /* ZORBA_NO_ICU */ |
1022 | |
1023 | #ifdef ZORBA_WITH_BIG_INTEGER |
1024 | # include "zorbatypes/m_apm.h" |
1025 | @@ -208,7 +208,7 @@ |
1026 | // from one thread only |
1027 | // see http://www.icu-project.org/userguide/design.html#Init_and_Termination |
1028 | // and http://www.icu-project.org/apiref/icu4c/uclean_8h.html |
1029 | -#ifndef ZORBA_NO_UNICODE |
1030 | +#ifndef ZORBA_NO_ICU |
1031 | # if defined U_STATIC_IMPLEMENTATION && (defined WIN32 || defined WINCE) |
1032 | { |
1033 | TCHAR self_path[1024]; |
1034 | @@ -238,13 +238,13 @@ |
1035 | udata_setCommonData(icu_appdata, &data_err); |
1036 | ZORBA_ASSERT(data_err == U_ZERO_ERROR); |
1037 | |
1038 | - // u_setDataDirectory(self_path); |
1039 | + // u_setDataDirectory(self_path); |
1040 | } |
1041 | # endif |
1042 | UErrorCode lICUInitStatus = U_ZERO_ERROR; |
1043 | u_init(&lICUInitStatus); |
1044 | ZORBA_ASSERT(lICUInitStatus == U_ZERO_ERROR); |
1045 | -#endif//ifndef ZORBA_NO_UNICODE |
1046 | +#endif /* ZORBA_NO_ICU */ |
1047 | } |
1048 | |
1049 | |
1050 | @@ -256,12 +256,12 @@ |
1051 | // releases statically initialized memory and prevents |
1052 | // valgrind from reporting those problems at the end |
1053 | // see http://www.icu-project.org/apiref/icu4c/uclean_8h.html#93f27d0ddc7c196a1da864763f2d8920 |
1054 | -#ifndef ZORBA_NO_UNICODE |
1055 | +#ifndef ZORBA_NO_ICU |
1056 | u_cleanup(); |
1057 | # if defined U_STATIC_IMPLEMENTATION && (defined WIN32 || defined WINCE) |
1058 | delete[] icu_appdata; |
1059 | # endif |
1060 | -#endif//ifndef ZORBA_NO_UNICODE |
1061 | +#endif /* ZORBA_NO_ICU */ |
1062 | } |
1063 | |
1064 | |
1065 | |
1066 | === modified file 'src/util/CMakeLists.txt' |
1067 | --- src/util/CMakeLists.txt 2011-07-18 14:25:21 +0000 |
1068 | +++ src/util/CMakeLists.txt 2012-01-18 18:33:36 +0000 |
1069 | @@ -38,9 +38,9 @@ |
1070 | LIST(APPEND UTIL_SRCS mmap_file.cpp) |
1071 | ENDIF(ZORBA_WITH_FILE_ACCESS) |
1072 | |
1073 | -IF(ZORBA_NO_UNICODE) |
1074 | - LIST(APPEND UTIL_SRCS regex_ascii.cpp) |
1075 | -ENDIF(ZORBA_NO_UNICODE) |
1076 | +IF(ZORBA_NO_ICU) |
1077 | + LIST(APPEND UTIL_SRCS regex_xquery.cpp) |
1078 | +ENDIF(ZORBA_NO_ICU) |
1079 | |
1080 | HEADER_GROUP_SUBFOLDER(UTIL_SRCS fx) |
1081 | HEADER_GROUP_SUBFOLDER(UTIL_SRCS win32) |
1082 | |
1083 | === modified file 'src/util/regex.cpp' |
1084 | --- src/util/regex.cpp 2011-09-24 00:16:36 +0000 |
1085 | +++ src/util/regex.cpp 2012-01-18 18:33:36 +0000 |
1086 | @@ -33,8 +33,7 @@ |
1087 | #define INVALID_RE_EXCEPTION(...) \ |
1088 | XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS( __VA_ARGS__ ) ) |
1089 | |
1090 | - |
1091 | -#ifndef ZORBA_NO_UNICODE |
1092 | +#ifndef ZORBA_NO_ICU |
1093 | # include <unicode/uversion.h> |
1094 | U_NAMESPACE_USE |
1095 | |
1096 | @@ -442,11 +441,11 @@ |
1097 | } |
1098 | |
1099 | } // namespace unicode |
1100 | - |
1101 | -}//namespace zorba |
1102 | - |
1103 | - |
1104 | -#else /* ZORBA_NO_UNICODE */ |
1105 | +} // namespace zorba |
1106 | + |
1107 | +/////////////////////////////////////////////////////////////////////////////// |
1108 | + |
1109 | +#else /* ZORBA_NO_ICU */ |
1110 | |
1111 | #include "zorbatypes/zstring.h" |
1112 | |
1113 | @@ -470,7 +469,7 @@ |
1114 | case 'i': flags |= REGEX_ASCII_CASE_INSENSITIVE; break; |
1115 | case 's': flags |= REGEX_ASCII_DOTALL; break; |
1116 | case 'm': flags |= REGEX_ASCII_MULTILINE; break; |
1117 | - case 'x': flags |= REGEX_ASCII_COMMENTS; break; |
1118 | + case 'x': flags |= REGEX_ASCII_NO_WHITESPACE; break; |
1119 | case 'q': flags |= REGEX_ASCII_LITERAL; break; |
1120 | default: |
1121 | throw XQUERY_EXCEPTION( err::FORX0001, ERROR_PARAMS( *p ) ); |
1122 | @@ -483,6 +482,7 @@ |
1123 | void regex::compile( char const *pattern, char const *flags) |
1124 | { |
1125 | parsed_flags = parse_regex_flags(flags); |
1126 | + regex_xquery::CRegexXQuery_parser regex_parser; |
1127 | regex_matcher = regex_parser.parse(pattern, parsed_flags); |
1128 | if(!regex_matcher) |
1129 | throw INVALID_RE_EXCEPTION(pattern); |
1130 | @@ -517,6 +517,8 @@ |
1131 | bool regex::next_token( char const *s, size_type *pos, zstring *token, |
1132 | bool *matched) |
1133 | { |
1134 | + if(!s[*pos]) |
1135 | + return false; |
1136 | bool retval; |
1137 | int match_pos; |
1138 | int matched_len; |
1139 | @@ -528,14 +530,8 @@ |
1140 | token->assign(s+*pos, match_pos); |
1141 | *pos += match_pos + matched_len; |
1142 | if(matched) |
1143 | - if(match_pos) |
1144 | - *matched = true; |
1145 | - else |
1146 | - *matched = false; |
1147 | - if(match_pos) |
1148 | - return true; |
1149 | - else |
1150 | - return false; |
1151 | + *matched = true; |
1152 | + return true; |
1153 | } |
1154 | else |
1155 | { |
1156 | @@ -544,7 +540,7 @@ |
1157 | *pos += strlen(s+*pos); |
1158 | if(matched) |
1159 | *matched = false; |
1160 | - return s[*pos] != 0; |
1161 | + return true; |
1162 | } |
1163 | } |
1164 | |
1165 | @@ -554,13 +550,9 @@ |
1166 | int matched_pos; |
1167 | int matched_len; |
1168 | |
1169 | - bool prev_align = regex_matcher->set_align_begin(true); |
1170 | - retval = regex_matcher->match_from(s, parsed_flags, &matched_pos, &matched_len); |
1171 | - regex_matcher->set_align_begin(prev_align); |
1172 | + retval = regex_matcher->match_anywhere(s, parsed_flags|REGEX_ASCII_WHOLE_MATCH, &matched_pos, &matched_len); |
1173 | if(!retval) |
1174 | return false; |
1175 | - if(matched_len != strlen(s)) |
1176 | - return false; |
1177 | return true; |
1178 | } |
1179 | |
1180 | @@ -587,14 +579,19 @@ |
1181 | //look for dollars |
1182 | if(*temprepl == '\\') |
1183 | { |
1184 | - temprepl++; |
1185 | - if(!*temprepl || (*temprepl != '\\') || (*temprepl != '$'))//Invalid replacement string. |
1186 | - throw XQUERY_EXCEPTION( err::FORX0004, ERROR_PARAMS( replacement ) ); |
1187 | + if(!(parsed_flags & REGEX_ASCII_LITERAL)) |
1188 | + { |
1189 | + temprepl++; |
1190 | + if(!*temprepl) |
1191 | + temprepl--; |
1192 | + else if((*temprepl != '\\') && (*temprepl != '$'))//Invalid replacement string. |
1193 | + throw XQUERY_EXCEPTION( err::FORX0004, ERROR_PARAMS( replacement ) ); |
1194 | + } |
1195 | result->append(1, *temprepl); |
1196 | temprepl++; |
1197 | continue; |
1198 | } |
1199 | - if(*temprepl == '$') |
1200 | + if((*temprepl == '$') && !(parsed_flags & REGEX_ASCII_LITERAL)) |
1201 | { |
1202 | temprepl++; |
1203 | index = 0; |
1204 | @@ -648,7 +645,7 @@ |
1205 | if(retval) |
1206 | { |
1207 | m_match_pos += m_pos; |
1208 | - m_pos = m_match_pos = m_matched_len; |
1209 | + m_pos = m_match_pos + m_matched_len; |
1210 | } |
1211 | else |
1212 | { |
1213 | @@ -666,35 +663,30 @@ |
1214 | return (int)regex_matcher->get_indexed_regex_count(); |
1215 | } |
1216 | |
1217 | -int regex::get_match_start( int groupId ) |
1218 | -{ |
1219 | - if(groupId == 0) |
1220 | - return m_match_pos; |
1221 | - if(groupId > (int)regex_matcher->get_indexed_regex_count()) |
1222 | - return -1; |
1223 | - const char *submatched_source; |
1224 | - int submatched_len; |
1225 | - if(!regex_matcher->get_indexed_match(groupId, &submatched_source, &submatched_len)) |
1226 | - return -1; |
1227 | - return submatched_source - s_in_.c_str(); |
1228 | -} |
1229 | - |
1230 | -int regex::get_match_end( int groupId ) |
1231 | -{ |
1232 | - if(groupId == 0) |
1233 | - return m_match_pos + m_matched_len; |
1234 | - if(groupId > (int)regex_matcher->get_indexed_regex_count()) |
1235 | - return -1; |
1236 | - const char *submatched_source; |
1237 | - int submatched_len; |
1238 | - if(!regex_matcher->get_indexed_match(groupId, &submatched_source, &submatched_len)) |
1239 | - return -1; |
1240 | - return submatched_source - s_in_.c_str() + submatched_len; |
1241 | +bool regex::get_match_start_end_bytes( int groupId, int *start, int *end ) |
1242 | +{ |
1243 | + *start = -1; |
1244 | + *end = -1; |
1245 | + if(groupId == 0) |
1246 | + { |
1247 | + *start = m_match_pos; |
1248 | + *end = m_match_pos + m_matched_len; |
1249 | + return true; |
1250 | + } |
1251 | + if(groupId > (int)regex_matcher->get_indexed_regex_count()) |
1252 | + return false; |
1253 | + const char *submatched_source; |
1254 | + int submatched_len; |
1255 | + if(!regex_matcher->get_indexed_match(groupId, &submatched_source, &submatched_len)) |
1256 | + return false; |
1257 | + *start = submatched_source - s_in_.c_str(); |
1258 | + *end = *start + submatched_len; |
1259 | + return true; |
1260 | } |
1261 | |
1262 | } // namespace unicode |
1263 | } // namespace zorba |
1264 | -#endif /* ZORBA_NO_UNICODE */ |
1265 | +#endif /* ZORBA_NO_ICU */ |
1266 | |
1267 | /////////////////////////////////////////////////////////////////////////////// |
1268 | |
1269 | |
1270 | === modified file 'src/util/regex.h' |
1271 | --- src/util/regex.h 2011-07-18 14:25:21 +0000 |
1272 | +++ src/util/regex.h 2012-01-18 18:33:36 +0000 |
1273 | @@ -17,15 +17,13 @@ |
1274 | #ifndef ZORBA_REGEX_H |
1275 | #define ZORBA_REGEX_H |
1276 | |
1277 | -#ifndef ZORBA_NO_UNICODE |
1278 | -#include <unicode/regex.h> |
1279 | -#endif |
1280 | - |
1281 | #include "cxx_util.h" |
1282 | #include "unicode_util.h" |
1283 | #include "zorbatypes/zstring.h" |
1284 | |
1285 | -#ifndef ZORBA_NO_UNICODE |
1286 | +#ifndef ZORBA_NO_ICU |
1287 | + |
1288 | +#include <unicode/regex.h> |
1289 | |
1290 | namespace zorba { |
1291 | |
1292 | @@ -496,15 +494,17 @@ |
1293 | } // namespace unicode |
1294 | } // namespace zorba |
1295 | |
1296 | -#else ///ZORBA_NO_UNICODE (ascii part:) |
1297 | - |
1298 | -#include "util/regex_ascii.h" |
1299 | +/////////////////////////////////////////////////////////////////////////////// |
1300 | + |
1301 | +#else /* ZORBA_NO_ICU */ |
1302 | + |
1303 | +#include "util/regex_xquery.h" |
1304 | #include <string> |
1305 | |
1306 | namespace zorba{ |
1307 | /** |
1308 | * Converts an XQuery regular expression to the form used by the regular |
1309 | - * expression library Zorba is using (here regex_ascii). |
1310 | + * expression library Zorba is using (here regex_xquery). |
1311 | * |
1312 | * @param xq_re The XQuery regular expression. |
1313 | * @param lib_re A pointer to the resuling library regular expression. |
1314 | @@ -525,7 +525,7 @@ |
1315 | /** |
1316 | * Constructs a %regex. |
1317 | */ |
1318 | - regex() : regex_matcher( NULL ) { } |
1319 | + regex() : regex_matcher( nullptr ) { } |
1320 | |
1321 | /** |
1322 | * Destroys a %regex. |
1323 | @@ -835,31 +835,21 @@ |
1324 | |
1325 | /** |
1326 | * Get the start position of the matched group. |
1327 | - * If groupId is zero, then the start position of the whole match is returned. |
1328 | - * If groupId is non-zero, then the start position of that group is returned. |
1329 | - * If that group has not been matched, -1 is returned. |
1330 | + * If groupId is zero, then the start and end position of the whole match is returned. |
1331 | + * If groupId is non-zero, then the start and end position of that group is returned. |
1332 | + * If that group has not been matched, false is returned. |
1333 | * |
1334 | * @param groupId the id of the group, either zero for the entire regex, |
1335 | * or [1 .. group_count] for that specific group |
1336 | - * @return the start position, zero based, or -1 if that group didn't match |
1337 | + * @param start to return start position in bytes |
1338 | + * @param end to return end position in bytes |
1339 | + * @return true if that group exists and has been matched |
1340 | */ |
1341 | - int get_match_start( int groupId = 0 ); |
1342 | + bool get_match_start_end_bytes( int groupId, int *start, int *end ); |
1343 | |
1344 | - /** |
1345 | - * Get the end position of the matched group. |
1346 | - * If groupId is zero, then the end position of the whole match is returned. |
1347 | - * If groupId is non-zero, then the end position of that group is returned. |
1348 | - * If that group has not been matched, -1 is returned. |
1349 | - * |
1350 | - * @param groupId the id of the group, either zero for the entire regex, |
1351 | - * or [1 .. group_count] for that specific group |
1352 | - * @return the end position, zero based, or -1 if that group didn't match |
1353 | - */ |
1354 | - int get_match_end( int groupId = 0 ); |
1355 | |
1356 | private: |
1357 | - regex_ascii::CRegexAscii_parser regex_parser; |
1358 | - regex_ascii::CRegexAscii_regex *regex_matcher; |
1359 | + regex_xquery::CRegexXQuery_regex *regex_matcher; |
1360 | uint32_t parsed_flags; |
1361 | |
1362 | zstring s_in_; |
1363 | @@ -873,15 +863,13 @@ |
1364 | regex( regex const& ); |
1365 | regex& operator=( regex const& ); |
1366 | }; |
1367 | + |
1368 | +/////////////////////////////////////////////////////////////////////////////// |
1369 | + |
1370 | } // namespace unicode |
1371 | } // namespace zorba |
1372 | |
1373 | -#endif /* ZORBA_NO_UNICODE */ |
1374 | - |
1375 | - |
1376 | -/////////////////////////////////////////////////////////////////////////////// |
1377 | - |
1378 | - |
1379 | +#endif /* ZORBA_NO_ICU */ |
1380 | #endif /* ZORBA_REGEX_H */ |
1381 | /* |
1382 | * Local variables: |
1383 | |
1384 | === renamed file 'src/util/regex_ascii.cpp' => 'src/util/regex_xquery.cpp' |
1385 | --- src/util/regex_ascii.cpp 2011-08-05 02:21:55 +0000 |
1386 | +++ src/util/regex_xquery.cpp 2012-01-18 18:33:36 +0000 |
1387 | @@ -1,4 +1,4 @@ |
1388 | -a/* |
1389 | +/* |
1390 | * Copyright 2006-2008 The FLWOR Foundation. |
1391 | * |
1392 | * Licensed under the Apache License, Version 2.0 (the "License"); |
1393 | @@ -18,12 +18,15 @@ |
1394 | |
1395 | #include "diagnostics/xquery_diagnostics.h" |
1396 | |
1397 | -#include "regex_ascii.h" |
1398 | +#include "regex_xquery.h" |
1399 | #include <string.h> |
1400 | #include "zorbatypes/chartype.h" |
1401 | +#include "util/unicode_categories.h" |
1402 | +#include "util/ascii_util.h" |
1403 | +#include "util/utf8_string.h" |
1404 | |
1405 | namespace zorba { |
1406 | - namespace regex_ascii{ |
1407 | + namespace regex_xquery{ |
1408 | //ascii regular expression matching |
1409 | |
1410 | /*http://www.w3.org/TR/xmlschema-2/#regexs |
1411 | @@ -62,96 +65,138 @@ |
1412 | + http://www.w3.org/TR/xquery-operators/#regex-syntax (not implemented) |
1413 | */ |
1414 | |
1415 | + |
1416 | +static bool compare_ascii_i(const char *str1, const char *str2) |
1417 | +{ |
1418 | + while(*str1 && *str2) |
1419 | + { |
1420 | + if(ascii::to_lower(*str1) != ascii::to_lower(*str2)) |
1421 | + return false; |
1422 | + str1++; |
1423 | + str2++; |
1424 | + } |
1425 | + if(*str1 || *str2) |
1426 | + return false; |
1427 | + return true; |
1428 | +} |
1429 | + |
1430 | +static bool compare_unicode_ni(const char *str1, const char *str2, int len) |
1431 | +{ |
1432 | + while(len > 0) |
1433 | + { |
1434 | + const char *temp_str1 = str1; |
1435 | + const char *temp_str2 = str2; |
1436 | + unicode::code_point cp1 = unicode::to_upper(utf8::next_char(temp_str1)); |
1437 | + unicode::code_point cp2 = unicode::to_upper(utf8::next_char(temp_str2)); |
1438 | + if(cp1 != cp2) |
1439 | + return false; |
1440 | + len -= temp_str1-str1; |
1441 | + str1 = temp_str1; |
1442 | + str2 = temp_str2; |
1443 | + } |
1444 | + return true; |
1445 | +} |
1446 | +static utf8::size_type myutf8len(const char *source) |
1447 | +{ |
1448 | + utf8::size_type len = utf8::char_length(*source); |
1449 | + if(!len) |
1450 | + return 1; |
1451 | + else |
1452 | + return len; |
1453 | +} |
1454 | //////////////////////////////////// |
1455 | ////Regular expression parsing and building of the tree |
1456 | //////////////////////////////////// |
1457 | |
1458 | -CRegexAscii_regex* CRegexAscii_parser::parse(const char *pattern, unsigned int flags) |
1459 | +CRegexXQuery_regex* CRegexXQuery_parser::parse(const char *pattern, unsigned int flags) |
1460 | { |
1461 | this->flags = flags; |
1462 | - bool align_begin = false; |
1463 | |
1464 | - if(!(flags & REGEX_ASCII_LITERAL) && (pattern[0] == '^')) |
1465 | - align_begin = true; |
1466 | - |
1467 | int regex_len; |
1468 | - CRegexAscii_regex* regex = parse_regexp(pattern + (align_begin?1:0), &regex_len); |
1469 | + CRegexXQuery_regex* regex = parse_regexp(pattern, &regex_len); |
1470 | |
1471 | - if(regex) |
1472 | - regex->set_align_begin(align_begin); |
1473 | - |
1474 | return regex; |
1475 | } |
1476 | |
1477 | //until '\0' or ')' |
1478 | -CRegexAscii_regex* CRegexAscii_parser::parse_regexp(const char *pattern, |
1479 | +CRegexXQuery_regex* CRegexXQuery_parser::parse_regexp(const char *pattern, |
1480 | int *regex_len) |
1481 | { |
1482 | *regex_len = 0; |
1483 | int branch_len; |
1484 | regex_depth++; |
1485 | - CRegexAscii_regex *regex = new CRegexAscii_regex(current_regex); |
1486 | + std::auto_ptr<CRegexXQuery_regex> regex(new CRegexXQuery_regex(current_regex)); |
1487 | if(!current_regex) |
1488 | - current_regex = regex; |
1489 | + current_regex = regex.get(); |
1490 | if(regex_depth >= 2) |
1491 | { |
1492 | //mark this as group if it does not start with ?: |
1493 | if(pattern[0] != '?' || pattern[1] != ':') |
1494 | - current_regex->subregex.push_back(regex); |
1495 | + current_regex->subregex.push_back(regex.get()); |
1496 | else |
1497 | *regex_len = 2; |
1498 | } |
1499 | - CRegexAscii_branch *branch; |
1500 | + CRegexXQuery_branch *branch; |
1501 | + bool must_read_another_branch = true; |
1502 | while(pattern[*regex_len] && (pattern[*regex_len] != ')')) |
1503 | { |
1504 | branch = parse_branch(pattern+*regex_len, &branch_len); |
1505 | if(!branch) |
1506 | { |
1507 | regex_depth--; |
1508 | - delete regex; |
1509 | return NULL; |
1510 | } |
1511 | regex->add_branch(branch); |
1512 | *regex_len += branch_len; |
1513 | + if(pattern[*regex_len] == '|') |
1514 | + (*regex_len)++; |
1515 | + else |
1516 | + must_read_another_branch = false; |
1517 | } |
1518 | - if((current_regex == regex) && (pattern[*regex_len] == ')')) |
1519 | + if((current_regex == regex.get()) && (pattern[*regex_len] == ')')) |
1520 | { |
1521 | - throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(U_REGEX_MISMATCHED_PAREN)) ); |
1522 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MISMATCHED_PAREN)) ); |
1523 | } |
1524 | if(pattern[*regex_len]) |
1525 | (*regex_len)++; |
1526 | + if(must_read_another_branch) |
1527 | + regex->add_branch(new CRegexXQuery_branch(current_regex));//add empty branch |
1528 | regex->flags = 0;//finished initialization |
1529 | regex_depth--; |
1530 | - return regex; |
1531 | + return regex.release(); |
1532 | } |
1533 | |
1534 | -CRegexAscii_branch* CRegexAscii_parser::parse_branch(const char *pattern, int *branch_len) |
1535 | +CRegexXQuery_branch* CRegexXQuery_parser::parse_branch(const char *pattern, int *branch_len) |
1536 | { |
1537 | int piece_len; |
1538 | |
1539 | - CRegexAscii_branch *branch = new CRegexAscii_branch(current_regex); |
1540 | - CRegexAscii_piece *piece; |
1541 | + std::auto_ptr<CRegexXQuery_branch> branch(new CRegexXQuery_branch(current_regex)); |
1542 | + CRegexXQuery_piece *piece; |
1543 | *branch_len = 0; |
1544 | while(pattern[*branch_len] && (pattern[*branch_len] != '|') && (pattern[*branch_len] != ')')) |
1545 | { |
1546 | piece = parse_piece(pattern+*branch_len, &piece_len); |
1547 | if(!piece) |
1548 | { |
1549 | - delete branch; |
1550 | return NULL; |
1551 | } |
1552 | + if(branch->piece_list.size() && dynamic_cast<CRegexXQuery_pinstart*>(piece->atom)) |
1553 | + { |
1554 | + //found ^ that is not at the beginning of branch |
1555 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_ATOM_CHAR), '^') ); |
1556 | + } |
1557 | branch->add_piece(piece); |
1558 | *branch_len += piece_len; |
1559 | } |
1560 | - if(pattern[*branch_len] == '|') |
1561 | - (*branch_len)++; |
1562 | - return branch; |
1563 | + //if(pattern[*branch_len] == '|') |
1564 | + // (*branch_len)++; |
1565 | + return branch.release(); |
1566 | } |
1567 | |
1568 | //piece = atom + quantifier |
1569 | -CRegexAscii_piece* CRegexAscii_parser::parse_piece(const char *pattern, int *piece_len) |
1570 | +CRegexXQuery_piece* CRegexXQuery_parser::parse_piece(const char *pattern, int *piece_len) |
1571 | { |
1572 | - CRegexAscii_piece *piece = new CRegexAscii_piece; |
1573 | + std::auto_ptr<CRegexXQuery_piece> piece(new CRegexXQuery_piece); |
1574 | IRegexAtom *atom; |
1575 | *piece_len = 0; |
1576 | |
1577 | @@ -160,19 +205,18 @@ |
1578 | atom = read_atom(pattern, &atom_len); |
1579 | if(!atom) |
1580 | { |
1581 | - delete piece; |
1582 | return NULL; |
1583 | } |
1584 | piece->set_atom(atom); |
1585 | if(!(flags & REGEX_ASCII_LITERAL)) |
1586 | - read_quantifier(piece, pattern+atom_len, &quantif_len); |
1587 | + read_quantifier(piece.get(), pattern+atom_len, &quantif_len); |
1588 | |
1589 | *piece_len += atom_len + quantif_len; |
1590 | |
1591 | - return piece; |
1592 | + return piece.release(); |
1593 | } |
1594 | |
1595 | -char CRegexAscii_parser::myishex(char c) |
1596 | +char CRegexXQuery_parser::myishex(char c) |
1597 | { |
1598 | if((c >= '0') && (c <= '9')) |
1599 | return c-'0'+1; |
1600 | @@ -183,26 +227,125 @@ |
1601 | return 0;//not a hex |
1602 | } |
1603 | |
1604 | -bool CRegexAscii_parser::myisdigit(char c) |
1605 | -{ |
1606 | - return (c >= '0') || (c <= '9'); |
1607 | -} |
1608 | - |
1609 | -char CRegexAscii_parser::readChar(const char *pattern, int *char_len, bool *is_multichar) |
1610 | +bool CRegexXQuery_parser::myisdigit(char c) |
1611 | +{ |
1612 | + return (c >= '0') && (c <= '9'); |
1613 | +} |
1614 | + |
1615 | +bool CRegexXQuery_parser::myisletterAZ(char c) |
1616 | +{ |
1617 | + return ((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')); |
1618 | +} |
1619 | + |
1620 | +static const unicode::code_point specials_extcp[] = {0xFFF0, 0xFFFD, 0}; |
1621 | + |
1622 | +static CRegexXQuery_parser::block_escape_t block_escape[] = |
1623 | +{ |
1624 | +{{0x0000, 0x007F}, NULL, "BasicLatin"}, |
1625 | +{{0x0080, 0x00FF}, NULL, "Latin-1Supplement"}, |
1626 | +{{0x0100, 0x017F}, NULL, "LatinExtended-A"}, |
1627 | +{{0x0180, 0x024F}, NULL, "LatinExtended-B"}, |
1628 | +{{0x0250, 0x02AF}, NULL, "IPAExtensions"}, |
1629 | +{{0x02B0, 0x02FF}, NULL, "SpacingModifierLetters"}, |
1630 | +{{0x0300, 0x036F}, NULL, "CombiningDiacriticalMarks"}, |
1631 | +{{0x0370, 0x03FF}, NULL, "Greek"}, |
1632 | +{{0x0400, 0x04FF}, NULL, "Cyrillic"}, |
1633 | +{{0x0530, 0x058F}, NULL, "Armenian"}, |
1634 | +{{0x0590, 0x05FF}, NULL, "Hebrew"}, |
1635 | +{{0x0600, 0x06FF}, NULL, "Arabic"}, |
1636 | +{{0x0700, 0x074F}, NULL, "Syriac"}, |
1637 | +{{0x0780, 0x07BF}, NULL, "Thaana"}, |
1638 | +{{0x0900, 0x097F}, NULL, "Devanagari"}, |
1639 | +{{0x0980, 0x09FF}, NULL, "Bengali"}, |
1640 | +{{0x0A00, 0x0A7F}, NULL, "Gurmukhi"}, |
1641 | +{{0x0A80, 0x0AFF}, NULL, "Gujarati"}, |
1642 | +{{0x0B00, 0x0B7F}, NULL, "Oriya"}, |
1643 | +{{0x0B80, 0x0BFF}, NULL, "Tamil"}, |
1644 | +{{0x0C00, 0x0C7F}, NULL, "Telugu"}, |
1645 | +{{0x0C80, 0x0CFF}, NULL, "Kannada"}, |
1646 | +{{0x0D00, 0x0D7F}, NULL, "Malayalam"}, |
1647 | +{{0x0D80, 0x0DFF}, NULL, "Sinhala"}, |
1648 | +{{0x0E00, 0x0E7F}, NULL, "Thai"}, |
1649 | +{{0x0E80, 0x0EFF}, NULL, "Lao"}, |
1650 | +{{0x0F00, 0x0FFF}, NULL, "Tibetan"}, |
1651 | +{{0x1000, 0x109F}, NULL, "Myanmar"}, |
1652 | +{{0x10A0, 0x10FF}, NULL, "Georgian"}, |
1653 | +{{0x1100, 0x11FF}, NULL, "HangulJamo"}, |
1654 | +{{0x1200, 0x137F}, NULL, "Ethiopic"}, |
1655 | +{{0x13A0, 0x13FF}, NULL, "Cherokee"}, |
1656 | +{{0x1400, 0x167F}, NULL, "UnifiedCanadianAboriginalSyllabics"}, |
1657 | +{{0x1680, 0x169F}, NULL, "Ogham"}, |
1658 | +{{0x16A0, 0x16FF}, NULL, "Runic"}, |
1659 | +{{0x1780, 0x17FF}, NULL, "Khmer"}, |
1660 | +{{0x1800, 0x18AF}, NULL, "Mongolian"}, |
1661 | +{{0x1E00, 0x1EFF}, NULL, "LatinExtendedAdditional"}, |
1662 | +{{0x1F00, 0x1FFF}, NULL, "GreekExtended"}, |
1663 | +{{0x2000, 0x206F}, NULL, "GeneralPunctuation"}, |
1664 | +{{0x2070, 0x209F}, NULL, "SuperscriptsandSubscripts"}, |
1665 | +{{0x20A0, 0x20CF}, NULL, "CurrencySymbols"}, |
1666 | +{{0x20D0, 0x20FF}, NULL, "CombiningMarksforSymbols"}, |
1667 | +{{0x2100, 0x214F}, NULL, "LetterlikeSymbols"}, |
1668 | +{{0x2150, 0x218F}, NULL, "NumberForms"}, |
1669 | +{{0x2190, 0x21FF}, NULL, "Arrows"}, |
1670 | +{{0x2200, 0x22FF}, NULL, "MathematicalOperators"}, |
1671 | +{{0x2300, 0x23FF}, NULL, "MiscellaneousTechnical"}, |
1672 | +{{0x2400, 0x243F}, NULL, "ControlPictures"}, |
1673 | +{{0x2440, 0x245F}, NULL, "OpticalCharacterRecognition"}, |
1674 | +{{0x2460, 0x24FF}, NULL, "EnclosedAlphanumerics"}, |
1675 | +{{0x2500, 0x257F}, NULL, "BoxDrawing"}, |
1676 | +{{0x2580, 0x259F}, NULL, "BlockElements"}, |
1677 | +{{0x25A0, 0x25FF}, NULL, "GeometricShapes"}, |
1678 | +{{0x2600, 0x26FF}, NULL, "MiscellaneousSymbols"}, |
1679 | +{{0x2700, 0x27BF}, NULL, "Dingbats"}, |
1680 | +{{0x2800, 0x28FF}, NULL, "BraillePatterns"}, |
1681 | +{{0x2E80, 0x2EFF}, NULL, "CJKRadicalsSupplement"}, |
1682 | +{{0x2F00, 0x2FDF}, NULL, "KangxiRadicals"}, |
1683 | +{{0x2FF0, 0x2FFF}, NULL, "IdeographicDescriptionCharacters"}, |
1684 | +{{0x3000, 0x303F}, NULL, "CJKSymbolsandPunctuation"}, |
1685 | +{{0x3040, 0x309F}, NULL, "Hiragana"}, |
1686 | +{{0x30A0, 0x30FF}, NULL, "Katakana"}, |
1687 | +{{0x3100, 0x312F}, NULL, "Bopomofo"}, |
1688 | +{{0x3130, 0x318F}, NULL, "HangulCompatibilityJamo"}, |
1689 | +{{0x3190, 0x319F}, NULL, "Kanbun"}, |
1690 | +{{0x31A0, 0x31BF}, NULL, "BopomofoExtended"}, |
1691 | +{{0x3200, 0x32FF}, NULL, "EnclosedCJKLettersandMonths"}, |
1692 | +{{0x3300, 0x33FF}, NULL, "CJKCompatibility"}, |
1693 | +{{0x3400, 0x4DB5}, NULL, "CJKUnifiedIdeographsExtensionA"}, |
1694 | +{{0x4E00, 0x9FFF}, NULL, "CJKUnifiedIdeographs"}, |
1695 | +{{0xA000, 0xA48F}, NULL, "YiSyllables"}, |
1696 | +{{0xA490, 0xA4CF}, NULL, "YiRadicals"}, |
1697 | +{{0xAC00, 0xD7A3}, NULL, "HangulSyllables"}, |
1698 | +{{0xE000, 0xF8FF}, NULL, "PrivateUse"}, |
1699 | +{{0xF900, 0xFAFF}, NULL, "CJKCompatibilityIdeographs"}, |
1700 | +{{0xFB00, 0xFB4F}, NULL, "AlphabeticPresentationForms"}, |
1701 | +{{0xFB50, 0xFDFF}, NULL, "ArabicPresentationForms-A"}, |
1702 | +{{0xFE20, 0xFE2F}, NULL, "CombiningHalfMarks"}, |
1703 | +{{0xFE30, 0xFE4F}, NULL, "CJKCompatibilityForms"}, |
1704 | +{{0xFE50, 0xFE6F}, NULL, "SmallFormVariants"}, |
1705 | +{{0xFE70, 0xFEFE}, NULL, "ArabicPresentationForms-B"}, |
1706 | +{{0xFEFF, 0xFEFF}, specials_extcp, "Specials"}, |
1707 | +{{0xFF00, 0xFFEF}, NULL, "HalfwidthandFullwidthForms"} |
1708 | +}; |
1709 | + |
1710 | +CRegexXQuery_charmatch* CRegexXQuery_parser::readChar(const char *pattern, |
1711 | + int *char_len, |
1712 | + enum CHARGROUP_t *multichar_type) |
1713 | { |
1714 | char c = 0; |
1715 | *char_len = 0; |
1716 | - *is_multichar = false; |
1717 | + *multichar_type = CHARGROUP_NO_MULTICHAR; |
1718 | switch(pattern[*char_len]) |
1719 | { |
1720 | case '\\': |
1721 | - { (*char_len)++; |
1722 | + { |
1723 | + (*char_len)++; |
1724 | switch(pattern[*char_len]) |
1725 | { |
1726 | - case 'n': c = '\n';break; |
1727 | - case 'r': c = '\r';break; |
1728 | - case 't': c = '\t';break; |
1729 | + case 'n': c = '\n';(*char_len)++;return new CRegexXQuery_char_ascii(current_regex, c); |
1730 | + case 'r': c = '\r';(*char_len)++;return new CRegexXQuery_char_ascii(current_regex, c); |
1731 | + case 't': c = '\t';(*char_len)++;return new CRegexXQuery_char_ascii(current_regex, c); |
1732 | case '\\': |
1733 | + case '/'://+ |
1734 | case '|': |
1735 | case '.': |
1736 | case '?': |
1737 | @@ -216,19 +359,205 @@ |
1738 | case '['://#x5B |
1739 | case ']'://#x5D |
1740 | case '^'://#x5E |
1741 | + case '$'://+ |
1742 | c = pattern[*char_len]; |
1743 | - break; |
1744 | + (*char_len)++; |
1745 | + *multichar_type = CHARGROUP_FLAGS_ONECHAR_ASCII; |
1746 | + return new CRegexXQuery_char_ascii(current_regex, c); |
1747 | case 'p'://catEsc |
1748 | case 'P'://complEsc |
1749 | + { |
1750 | //ignore the prop for now |
1751 | - c = pattern[*char_len]; |
1752 | - *is_multichar = true; |
1753 | - if(pattern[*char_len+1] == '{') |
1754 | - { |
1755 | - while(pattern[*char_len] != '}') |
1756 | + *multichar_type = CHARGROUP_FLAGS_MULTICHAR_p;//(CHARGROUP_t)((pattern[*char_len] == 'P') ? 128 : 0); |
1757 | + bool is_reverse = (pattern[*char_len] == 'P'); |
1758 | + c = 0; |
1759 | + if(pattern[(*char_len)+1] != '{') |
1760 | + { |
1761 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_P_CONSTRUCT)) ); |
1762 | + } |
1763 | + (*char_len) += 2; |
1764 | + switch(pattern[*char_len]) |
1765 | + {//IsCategory |
1766 | + case 'L': |
1767 | + { |
1768 | + switch(pattern[(*char_len)+1]) |
1769 | + { |
1770 | + case '}': |
1771 | + c = unicode::UNICODE_Ll + 50;break; |
1772 | + case 'u': |
1773 | + c = unicode::UNICODE_Lu; (*char_len)++;break; |
1774 | + case 'l': |
1775 | + c = unicode::UNICODE_Ll; (*char_len)++;break; |
1776 | + case 't': |
1777 | + c = unicode::UNICODE_Lt; (*char_len)++;break; |
1778 | + case 'm': |
1779 | + c = unicode::UNICODE_Lm; (*char_len)++;break; |
1780 | + case 'o': |
1781 | + c = unicode::UNICODE_Lo; (*char_len)++;break; |
1782 | + default: |
1783 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PL_CONSTRUCT)) ); |
1784 | + } |
1785 | + }break; |
1786 | + case 'M': |
1787 | + { |
1788 | + switch(pattern[(*char_len)+1]) |
1789 | + { |
1790 | + case '}': |
1791 | + c = unicode::UNICODE_Mc + 50;break; |
1792 | + case 'n': |
1793 | + c = unicode::UNICODE_Mn; (*char_len)++;break; |
1794 | + case 'c': |
1795 | + c = unicode::UNICODE_Mc; (*char_len)++;break; |
1796 | + case 'e': |
1797 | + c = unicode::UNICODE_Me; (*char_len)++;break; |
1798 | + default: |
1799 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PM_CONSTRUCT)) ); |
1800 | + } |
1801 | + }break; |
1802 | + case 'N': |
1803 | + { |
1804 | + switch(pattern[(*char_len)+1]) |
1805 | + { |
1806 | + case '}': |
1807 | + c = unicode::UNICODE_Nd + 50;break; |
1808 | + case 'd': |
1809 | + c = unicode::UNICODE_Nd; (*char_len)++;break; |
1810 | + case 'l': |
1811 | + c = unicode::UNICODE_Nl; (*char_len)++;break; |
1812 | + case 'o': |
1813 | + c = unicode::UNICODE_No; (*char_len)++;break; |
1814 | + default: |
1815 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PN_CONSTRUCT)) ); |
1816 | + } |
1817 | + }break; |
1818 | + case 'P': |
1819 | + { |
1820 | + switch(pattern[(*char_len)+1]) |
1821 | + { |
1822 | + case '}': |
1823 | + c = unicode::UNICODE_Pc + 50;break; |
1824 | + case 'c': |
1825 | + c = unicode::UNICODE_Pc; (*char_len)++;break; |
1826 | + case 'd': |
1827 | + c = unicode::UNICODE_Pd; (*char_len)++;break; |
1828 | + case 's': |
1829 | + c = unicode::UNICODE_Ps; (*char_len)++;break; |
1830 | + case 'e': |
1831 | + c = unicode::UNICODE_Pe; (*char_len)++;break; |
1832 | + case 'i': |
1833 | + c = unicode::UNICODE_Pi; (*char_len)++;break; |
1834 | + case 'f': |
1835 | + c = unicode::UNICODE_Pf; (*char_len)++;break; |
1836 | + case 'o': |
1837 | + c = unicode::UNICODE_Po; (*char_len)++;break; |
1838 | + default: |
1839 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PP_CONSTRUCT)) ); |
1840 | + } |
1841 | + }break; |
1842 | + case 'Z': |
1843 | + { |
1844 | + switch(pattern[(*char_len)+1]) |
1845 | + { |
1846 | + case '}': |
1847 | + c = unicode::UNICODE_Zl + 50;break; |
1848 | + case 's': |
1849 | + c = unicode::UNICODE_Zs; (*char_len)++;break; |
1850 | + case 'l': |
1851 | + c = unicode::UNICODE_Zl; (*char_len)++;break; |
1852 | + case 'p': |
1853 | + c = unicode::UNICODE_Zp; (*char_len)++;break; |
1854 | + default: |
1855 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PZ_CONSTRUCT)) ); |
1856 | + } |
1857 | + }break; |
1858 | + case 'S': |
1859 | + { |
1860 | + switch(pattern[(*char_len)+1]) |
1861 | + { |
1862 | + case '}': |
1863 | + c = unicode::UNICODE_Sc + 50;break; |
1864 | + case 'm': |
1865 | + c = unicode::UNICODE_Sm; (*char_len)++;break; |
1866 | + case 'c': |
1867 | + c = unicode::UNICODE_Sc; (*char_len)++;break; |
1868 | + case 'k': |
1869 | + c = unicode::UNICODE_Sk; (*char_len)++;break; |
1870 | + case 'o': |
1871 | + c = unicode::UNICODE_So; (*char_len)++;break; |
1872 | + default: |
1873 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PS_CONSTRUCT)) ); |
1874 | + } |
1875 | + }break; |
1876 | + case 'C': |
1877 | + { |
1878 | + switch(pattern[(*char_len)+1]) |
1879 | + { |
1880 | + case '}': |
1881 | + c = unicode::UNICODE_Cc + 50;break; |
1882 | + case 'c': |
1883 | + c = unicode::UNICODE_Cc; (*char_len)++;break; |
1884 | + case 'f': |
1885 | + c = unicode::UNICODE_Cf; (*char_len)++;break; |
1886 | + case 'o': |
1887 | + c = unicode::UNICODE_Co; (*char_len)++;break; |
1888 | + case 'n': |
1889 | + c = unicode::UNICODE_Cn; (*char_len)++;break; |
1890 | + default: |
1891 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PC_CONSTRUCT)) ); |
1892 | + } |
1893 | + }break; |
1894 | + }//end switch |
1895 | + if(c) |
1896 | + { |
1897 | + if(pattern[(*char_len) + 1] != '}') |
1898 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_P_CONSTRUCT)) ); |
1899 | + (*char_len)++; |
1900 | + (*char_len)++; |
1901 | + return new CRegexXQuery_multicharP(current_regex, c, is_reverse); |
1902 | + } |
1903 | + if(pattern[*char_len] == 'I') |
1904 | + { |
1905 | + if(pattern[(*char_len)+1] == 's')//IsBlock |
1906 | + { |
1907 | + *multichar_type = CHARGROUP_FLAGS_MULTICHAR_Is; |
1908 | + (*char_len) += 2; |
1909 | + zstring block_name; |
1910 | + char tempc = pattern[(*char_len)]; |
1911 | + while(tempc && (tempc != '}')) |
1912 | + { |
1913 | + if(!myisletterAZ(tempc) && !myisdigit(tempc) && (tempc != '-')) |
1914 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_PIs_CONSTRUCT)) ); |
1915 | + block_name.append(1, tempc); |
1916 | + (*char_len)++; |
1917 | + tempc = pattern[(*char_len)]; |
1918 | + } |
1919 | + if(!tempc) |
1920 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_PIs_CONSTRUCT)) ); |
1921 | + //search for the block name |
1922 | + int i; |
1923 | + int nr_blocks = sizeof(block_escape)/sizeof(CRegexXQuery_parser::block_escape_t); |
1924 | + for(i=0;i<nr_blocks;i++) |
1925 | + { |
1926 | + if(compare_ascii_i(block_name.c_str(), block_escape[i].group_name)) |
1927 | + { |
1928 | + c = i; |
1929 | + break; |
1930 | + } |
1931 | + } |
1932 | + if(i==nr_blocks) |
1933 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_PIs_CONSTRUCT)) ); |
1934 | (*char_len)++; |
1935 | - } |
1936 | - break; |
1937 | + return new CRegexXQuery_multicharIs(current_regex, i, is_reverse); |
1938 | + } |
1939 | + else |
1940 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_PIs_CONSTRUCT)) ); |
1941 | + } |
1942 | + else |
1943 | + { |
1944 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_BROKEN_P_CONSTRUCT)) ); |
1945 | + } |
1946 | + break;//unreachable |
1947 | + }//end case 'p' |
1948 | //multiCharEsc |
1949 | case 's': |
1950 | case 'S': |
1951 | @@ -240,40 +569,104 @@ |
1952 | case 'D': |
1953 | case 'w': |
1954 | case 'W': |
1955 | - *is_multichar = true; |
1956 | + *multichar_type = CHARGROUP_FLAGS_MULTICHAR_OTHER; |
1957 | c = pattern[*char_len]; |
1958 | - break; |
1959 | - } |
1960 | - break; |
1961 | - } |
1962 | - case '#':///might be #xXX |
1963 | - { |
1964 | - if((pattern[*char_len+1] == 'x') && |
1965 | - myishex(pattern[*char_len+2]) && myishex(pattern[*char_len+3])) |
1966 | - { |
1967 | - c = (myishex(pattern[*char_len+2])-1)<<4 | (myishex(pattern[*char_len+3])-1); |
1968 | - *char_len += 3; |
1969 | - break; |
1970 | - } |
1971 | - } |
1972 | + (*char_len)++; |
1973 | + return new CRegexXQuery_multicharOther(current_regex, c); |
1974 | + case 'u'://unicode codepoint \uXXXX |
1975 | + { |
1976 | + unicode::code_point utf8c = 0; |
1977 | + (*char_len)++; |
1978 | + for(int i=0;i<4;i++) |
1979 | + { |
1980 | + char hex = myishex(pattern[*char_len]); |
1981 | + if(!hex) |
1982 | + { |
1983 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_UNICODE_CODEPOINT_u)) ); |
1984 | + } |
1985 | + utf8c <<= 4; |
1986 | + utf8c |= (hex-1) & 0x0f; |
1987 | + (*char_len)++; |
1988 | + } |
1989 | + return create_charmatch(utf8c, NULL, 0, multichar_type); |
1990 | + } |
1991 | + case 'U'://unicode codepoint \UXXXXXXXX |
1992 | + { |
1993 | + unicode::code_point utf8c = 0; |
1994 | + (*char_len)++; |
1995 | + for(int i=0;i<8;i++) |
1996 | + { |
1997 | + char hex = myishex(pattern[*char_len]); |
1998 | + if(!hex) |
1999 | + { |
2000 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_UNICODE_CODEPOINT_u)) ); |
2001 | + } |
2002 | + utf8c <<= 4; |
2003 | + utf8c |= (hex-1) & 0x0f; |
2004 | + (*char_len)++; |
2005 | + } |
2006 | + return create_charmatch(utf8c, NULL, 0, multichar_type); |
2007 | + } |
2008 | + default: |
2009 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_UNKNOWN_ESC_CHAR)) ); |
2010 | + } |
2011 | + assert(false); |
2012 | + break;//unreachable |
2013 | + }//end case '\' |
2014 | default: |
2015 | - c = pattern[*char_len]; |
2016 | - break; |
2017 | - } |
2018 | - |
2019 | - (*char_len)++; |
2020 | - return c; |
2021 | -} |
2022 | - |
2023 | - |
2024 | - |
2025 | -IRegexAtom* CRegexAscii_parser::read_atom(const char *pattern, int *atom_len) |
2026 | + { |
2027 | + const char *temp_pattern = pattern; |
2028 | + unicode::code_point utf8c = utf8::next_char(temp_pattern); |
2029 | + (*char_len) = temp_pattern - pattern; |
2030 | + return create_charmatch(utf8c, pattern, *char_len, multichar_type); |
2031 | + } |
2032 | + } |
2033 | + return NULL; |
2034 | +} |
2035 | + |
2036 | +CRegexXQuery_charmatch *CRegexXQuery_parser::create_charmatch(unicode::code_point utf8c, |
2037 | + const char *pattern, int utf8len, |
2038 | + enum CHARGROUP_t *multichar_type) |
2039 | +{ |
2040 | + if(utf8c <= 0x7F) |
2041 | + { |
2042 | + *multichar_type = CHARGROUP_FLAGS_ONECHAR_ASCII; |
2043 | + if(flags & REGEX_ASCII_CASE_INSENSITIVE) |
2044 | + return new CRegexXQuery_char_ascii_i(current_regex, (char)utf8c); |
2045 | + else |
2046 | + return new CRegexXQuery_char_ascii(current_regex, (char)utf8c); |
2047 | + } |
2048 | + else |
2049 | + { |
2050 | + *multichar_type = CHARGROUP_FLAGS_ONECHAR_UNICODE; |
2051 | + if(flags & REGEX_ASCII_CASE_INSENSITIVE) |
2052 | + return new CRegexXQuery_char_unicode_i(current_regex, utf8c); |
2053 | + else |
2054 | + { |
2055 | + if(pattern) |
2056 | + return new CRegexXQuery_char_unicode(current_regex, pattern, utf8len); |
2057 | + else |
2058 | + return new CRegexXQuery_char_unicode_cp(current_regex, utf8c); |
2059 | + } |
2060 | + } |
2061 | +} |
2062 | + |
2063 | +IRegexAtom* CRegexXQuery_parser::read_atom(const char *pattern, int *atom_len) |
2064 | { |
2065 | *atom_len = 0; |
2066 | - char c; |
2067 | - bool is_end_line = false; |
2068 | - c = pattern[*atom_len]; |
2069 | - if((!(flags & REGEX_ASCII_LITERAL)) && (c == '\\')) |
2070 | + if(flags & REGEX_ASCII_LITERAL) |
2071 | + { |
2072 | + unicode::code_point utf8c; |
2073 | + //bool is_end_line = false; |
2074 | + const char *temp_pattern = pattern; |
2075 | + utf8c = utf8::next_char(temp_pattern); |
2076 | + *atom_len = temp_pattern - pattern; |
2077 | + enum CHARGROUP_t multichar_type; |
2078 | + return create_charmatch(utf8c, pattern, *atom_len, &multichar_type); |
2079 | + } |
2080 | + |
2081 | + char c = *pattern; |
2082 | + if(c == '\\') |
2083 | { |
2084 | //check for back reference |
2085 | if(myisdigit(pattern[(*atom_len)+1])) |
2086 | @@ -281,13 +674,13 @@ |
2087 | (*atom_len)++; |
2088 | if(pattern[*atom_len] == '0') |
2089 | { |
2090 | - throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(U_REGEX_INVALID_BACK_REF)) ); |
2091 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_BACK_REF), 0, current_regex->subregex.size()) ); |
2092 | } |
2093 | unsigned int backref = pattern[*atom_len] - '0'; |
2094 | if((backref > current_regex->subregex.size()) || |
2095 | (current_regex->subregex.at(backref-1)->flags != 0)) |
2096 | { |
2097 | - throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(U_REGEX_INVALID_BACK_REF)) ); |
2098 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_BACK_REF), backref, current_regex->subregex.size()) ); |
2099 | } |
2100 | while(current_regex->subregex.size() >= backref*10) |
2101 | { |
2102 | @@ -303,70 +696,86 @@ |
2103 | break; |
2104 | } |
2105 | } |
2106 | - return new CRegexAscii_backref(current_regex, backref); |
2107 | + (*atom_len)++; |
2108 | + return new CRegexXQuery_backref(current_regex, backref); |
2109 | } |
2110 | } |
2111 | + if(c == '^') |
2112 | + { |
2113 | + (*atom_len)++; |
2114 | + return new CRegexXQuery_pinstart(current_regex); |
2115 | + } |
2116 | + if((c == '}') || (c == '{') || (c == '?') || (c == '*') || (c == '+') || (c == '|')) |
2117 | + { |
2118 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_ATOM_CHAR), c) ); |
2119 | + } |
2120 | switch(c) |
2121 | { |
2122 | case '[': |
2123 | { |
2124 | - if(!(flags & REGEX_ASCII_LITERAL)) |
2125 | - { |
2126 | - (*atom_len)++; |
2127 | - CRegexAscii_chargroup *chargroup = NULL; |
2128 | - int chargroup_len; |
2129 | - chargroup = readchargroup(pattern+*atom_len, &chargroup_len); |
2130 | - *atom_len += chargroup_len; |
2131 | - return chargroup; |
2132 | - } |
2133 | + (*atom_len)++; |
2134 | + CRegexXQuery_chargroup *chargroup = NULL; |
2135 | + int chargroup_len; |
2136 | + chargroup = readchargroup(pattern+*atom_len, &chargroup_len); |
2137 | + *atom_len += chargroup_len; |
2138 | + return chargroup; |
2139 | } |
2140 | case '.'://WildCharEsc |
2141 | { |
2142 | - if(!(flags & REGEX_ASCII_LITERAL)) |
2143 | - { |
2144 | - CRegexAscii_wildchar *wildchar = new CRegexAscii_wildchar(current_regex); |
2145 | - (*atom_len)++; |
2146 | - return wildchar; |
2147 | - } |
2148 | + (*atom_len)++; |
2149 | + return new CRegexXQuery_wildchar(current_regex); |
2150 | } |
2151 | case '('://begin an embedded reg exp |
2152 | { |
2153 | - if(!(flags & REGEX_ASCII_LITERAL)) |
2154 | - { |
2155 | - (*atom_len)++; |
2156 | - CRegexAscii_regex *emb_regex = NULL; |
2157 | - int regex_len; |
2158 | - emb_regex = parse_regexp(pattern + *atom_len, ®ex_len); |
2159 | - *atom_len += regex_len; |
2160 | - return emb_regex; |
2161 | - } |
2162 | + (*atom_len)++; |
2163 | + CRegexXQuery_regex *emb_regex = NULL; |
2164 | + int regex_len; |
2165 | + emb_regex = parse_regexp(pattern + *atom_len, ®ex_len); |
2166 | + *atom_len += regex_len; |
2167 | + return emb_regex; |
2168 | } |
2169 | case '$'://end line |
2170 | - if(!(flags & REGEX_ASCII_LITERAL)) |
2171 | - { |
2172 | - is_end_line = true; |
2173 | - } |
2174 | + //is_end_line = true; |
2175 | + (*atom_len)++; |
2176 | + return new CRegexXQuery_endline(current_regex); |
2177 | default: |
2178 | { |
2179 | - char c; |
2180 | + //char c; |
2181 | + CRegexXQuery_charmatch *charmatch = NULL; |
2182 | int c_len; |
2183 | - bool is_multichar = false; |
2184 | - if(!(flags & REGEX_ASCII_LITERAL)) |
2185 | - c = readChar(pattern+*atom_len, &c_len, &is_multichar); |
2186 | - else |
2187 | + CHARGROUP_t multichar_type = CHARGROUP_NO_MULTICHAR; |
2188 | + *atom_len = 0; |
2189 | + while(pattern[*atom_len]) |
2190 | { |
2191 | - c = pattern[*atom_len]; |
2192 | - c_len = 1; |
2193 | + charmatch = readChar(pattern+*atom_len, &c_len, &multichar_type); |
2194 | + *atom_len += c_len; |
2195 | + if((flags & REGEX_ASCII_NO_WHITESPACE) && (multichar_type == CHARGROUP_FLAGS_ONECHAR_ASCII)) |
2196 | + { |
2197 | + char c = (char)charmatch->get_c(); |
2198 | + if((c == ' ') || (c == '\t') || (c == '\r') || (c == '\n')) |
2199 | + { |
2200 | + //ignore this whitespace |
2201 | + delete charmatch; |
2202 | + continue; |
2203 | + } |
2204 | + else |
2205 | + break; |
2206 | + } |
2207 | + else |
2208 | + break; |
2209 | } |
2210 | - CRegexAscii_chargroup *chargroup = new CRegexAscii_chargroup(current_regex); |
2211 | - if(is_multichar) |
2212 | - chargroup->addMultiChar(c); |
2213 | + /* |
2214 | + std::auto_ptr<CRegexXQuery_chargroup> chargroup(new CRegexXQuery_chargroup(current_regex)); |
2215 | + if(multichar_type) |
2216 | + chargroup->addMultiChar(c, multichar_type); |
2217 | else if(is_end_line) |
2218 | chargroup->addEndLine(); |
2219 | else |
2220 | - chargroup->addCharRange(c, c); |
2221 | + chargroup->addOneChar(c); |
2222 | *atom_len += c_len; |
2223 | - return chargroup; |
2224 | + return chargroup.release(); |
2225 | + */ |
2226 | + return charmatch; |
2227 | } |
2228 | } |
2229 | } |
2230 | @@ -374,81 +783,119 @@ |
2231 | //read until ']' |
2232 | //posCharGroup ::= ( charRange | charClassEsc )+ |
2233 | //charRange ::= seRange | XmlCharIncDash |
2234 | -CRegexAscii_chargroup* CRegexAscii_parser::readchargroup(const char *pattern, int *chargroup_len) |
2235 | +CRegexXQuery_chargroup* CRegexXQuery_parser::readchargroup(const char *pattern, int *chargroup_len) |
2236 | { |
2237 | - CRegexAscii_chargroup *chargroup = NULL; |
2238 | + std::auto_ptr<CRegexXQuery_chargroup> chargroup; |
2239 | *chargroup_len = 0; |
2240 | if(pattern[*chargroup_len] == '^')//negative group |
2241 | { |
2242 | (*chargroup_len)++; |
2243 | - chargroup = new CRegexAscii_negchargroup(current_regex); |
2244 | + chargroup.reset(new CRegexXQuery_negchargroup(current_regex)); |
2245 | } |
2246 | else |
2247 | - chargroup = new CRegexAscii_chargroup(current_regex); |
2248 | + chargroup.reset(new CRegexXQuery_chargroup(current_regex)); |
2249 | while(pattern[*chargroup_len] && (pattern[*chargroup_len]!=']')) |
2250 | { |
2251 | - char c1, c2; |
2252 | - bool is_multichar; |
2253 | + //char c1, c2; |
2254 | + CHARGROUP_t multichar_type = CHARGROUP_NO_MULTICHAR; |
2255 | int c1_len; |
2256 | - c1 = pattern[*chargroup_len]; |
2257 | - c2 = pattern[*chargroup_len+1]; |
2258 | - if((c1 == '-') && (c2 == '['))//charClassSub |
2259 | + if((pattern[*chargroup_len] == '-') && (pattern[(*chargroup_len)+1] == '['))//charClassSub |
2260 | { |
2261 | int classsub_len; |
2262 | - CRegexAscii_chargroup *classsub = readchargroup(pattern + *chargroup_len+1 + 1, &classsub_len); |
2263 | + CRegexXQuery_chargroup *classsub = readchargroup(pattern + (*chargroup_len)+1 + 1, &classsub_len); |
2264 | if(!classsub) |
2265 | { |
2266 | - delete chargroup; |
2267 | - return NULL; |
2268 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_SUBCLASS)) ); |
2269 | } |
2270 | chargroup->addClassSub(classsub); |
2271 | *chargroup_len += 2 + classsub_len + 1; |
2272 | if(pattern[*chargroup_len-1] != ']') |
2273 | { |
2274 | - delete chargroup; |
2275 | - return NULL; |
2276 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_INVALID_USE_OF_SUBCLASS)) ); |
2277 | } |
2278 | - return chargroup; |
2279 | + return chargroup.release(); |
2280 | } |
2281 | |
2282 | - c1 = readChar(pattern+*chargroup_len, &c1_len, &is_multichar); |
2283 | - if(is_multichar)//first char is multichar |
2284 | + std::unique_ptr<CRegexXQuery_charmatch> charmatch(readChar(pattern+*chargroup_len, &c1_len, &multichar_type)); |
2285 | + if((multichar_type == CHARGROUP_FLAGS_MULTICHAR_p) || |
2286 | + (multichar_type == CHARGROUP_FLAGS_MULTICHAR_Is) || |
2287 | + (multichar_type == CHARGROUP_FLAGS_MULTICHAR_OTHER))//first char is multichar |
2288 | { |
2289 | - chargroup->addMultiChar(c1); |
2290 | + if((pattern[*chargroup_len+c1_len] == '-') &&///should not be a range |
2291 | + (pattern[*chargroup_len+c1_len+1] != ']')) |
2292 | + { |
2293 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MULTICHAR_IN_CHAR_RANGE)) ); |
2294 | + } |
2295 | + //chargroup->addMultiChar(c1, multichar_type); |
2296 | + chargroup->addCharMatch(charmatch.release()); |
2297 | *chargroup_len += c1_len; |
2298 | continue; |
2299 | } |
2300 | - if(pattern[*chargroup_len+c1_len] == '-')///might be a range |
2301 | + (*chargroup_len) += c1_len; |
2302 | + if(pattern[*chargroup_len] == '-')///might be a range |
2303 | { |
2304 | - if(pattern[*chargroup_len+c1_len+1] == ']')//no range, just the last char is '-' |
2305 | + if(pattern[(*chargroup_len)+1] == ']')//no range, just the last char is '-' |
2306 | { |
2307 | - chargroup->addCharRange(c1, c1); |
2308 | - chargroup->addCharRange('-', '-'); |
2309 | - *chargroup_len += c1_len + 1; |
2310 | + //chargroup->addOneChar(c1); |
2311 | + //chargroup->addOneChar('-'); |
2312 | + chargroup->addCharMatch(charmatch.release()); |
2313 | + chargroup->addCharMatch(new CRegexXQuery_char_ascii(current_regex, '-')); |
2314 | + (*chargroup_len)++; |
2315 | continue; |
2316 | } |
2317 | - else |
2318 | + else if(pattern[(*chargroup_len)+1] != '[') |
2319 | { |
2320 | //it is a range |
2321 | - char c3; |
2322 | - int c3_len; |
2323 | - c3 = readChar(pattern+*chargroup_len+c1_len+1, &c3_len, &is_multichar); |
2324 | - if(is_multichar) |
2325 | - return NULL;//error |
2326 | - chargroup->addCharRange(c1, c3); |
2327 | - *chargroup_len += c1_len + 1 + c3_len; |
2328 | + (*chargroup_len)++; |
2329 | + std::unique_ptr<CRegexXQuery_charmatch> charmatch2; |
2330 | + CHARGROUP_t multichar_type2 = CHARGROUP_NO_MULTICHAR; |
2331 | + int c2_len; |
2332 | + charmatch2.reset(readChar(pattern+(*chargroup_len), &c2_len, &multichar_type2)); |
2333 | + if((multichar_type2 != CHARGROUP_FLAGS_ONECHAR_ASCII) && |
2334 | + (multichar_type2 != CHARGROUP_FLAGS_ONECHAR_ASCII))//second char in range is multichar |
2335 | + { |
2336 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MULTICHAR_IN_CHAR_RANGE)) ); |
2337 | + } |
2338 | + //chargroup->addCharRange(c1, c3); |
2339 | + if((multichar_type == CHARGROUP_FLAGS_ONECHAR_ASCII) && (multichar_type2 == CHARGROUP_FLAGS_ONECHAR_ASCII)) |
2340 | + { |
2341 | + if(flags & REGEX_ASCII_CASE_INSENSITIVE) |
2342 | + chargroup->addCharMatch(new CRegexXQuery_char_range_ascii_i(current_regex, |
2343 | + (char)charmatch->get_c(), |
2344 | + (char)charmatch2->get_c())); |
2345 | + else |
2346 | + chargroup->addCharMatch(new CRegexXQuery_char_range_ascii(current_regex, |
2347 | + (char)charmatch->get_c(), |
2348 | + (char)charmatch2->get_c())); |
2349 | + } |
2350 | + else |
2351 | + { |
2352 | + if(flags & REGEX_ASCII_CASE_INSENSITIVE) |
2353 | + chargroup->addCharMatch(new CRegexXQuery_char_range_unicode_i(current_regex, |
2354 | + charmatch->get_c(), |
2355 | + charmatch2->get_c())); |
2356 | + else |
2357 | + chargroup->addCharMatch(new CRegexXQuery_char_range_unicode(current_regex, |
2358 | + charmatch->get_c(), |
2359 | + charmatch2->get_c())); |
2360 | + } |
2361 | + *chargroup_len += c2_len; |
2362 | continue; |
2363 | } |
2364 | } |
2365 | - chargroup->addCharRange(c1, c1); |
2366 | - *chargroup_len += c1_len; |
2367 | + //chargroup->addOneChar(c1); |
2368 | + chargroup->addCharMatch(charmatch.release()); |
2369 | } |
2370 | if(pattern[*chargroup_len]) |
2371 | (*chargroup_len)++; |
2372 | - return chargroup; |
2373 | + else |
2374 | + { |
2375 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MISSING_CLOSE_BRACKET)) ); |
2376 | + } |
2377 | + return chargroup.release(); |
2378 | } |
2379 | |
2380 | -void CRegexAscii_parser::read_quantifier(CRegexAscii_piece *piece, |
2381 | +void CRegexXQuery_parser::read_quantifier(CRegexXQuery_piece *piece, |
2382 | const char *pattern, int *quantif_len) |
2383 | { |
2384 | *quantif_len = 0; |
2385 | @@ -496,6 +943,10 @@ |
2386 | max = max*10 + pattern[*quantif_len] - '0'; |
2387 | (*quantif_len)++; |
2388 | } |
2389 | + if(max < min) |
2390 | + { |
2391 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(pattern, ZED(REGEX_MAX_LT_MIN)) ); |
2392 | + } |
2393 | piece->set_quantifier_min_max(min, max, true); |
2394 | } |
2395 | while(pattern[*quantif_len] && (pattern[*quantif_len] != '}')) |
2396 | @@ -524,23 +975,25 @@ |
2397 | ///Constructors and destructors and internal functions |
2398 | //////////////////////////// |
2399 | |
2400 | -CRegexAscii_regex::CRegexAscii_regex(CRegexAscii_regex *topregex) : IRegexAtom(topregex?topregex:this) |
2401 | +CRegexXQuery_regex::CRegexXQuery_regex(CRegexXQuery_regex *topregex) : IRegexAtom(topregex?topregex:this) |
2402 | { |
2403 | matched_source = NULL; |
2404 | matched_len = 0; |
2405 | +// backup_matched_source = NULL; |
2406 | +// backup_matched_len = 0; |
2407 | flags = 128;//set to 0 after initialization |
2408 | } |
2409 | |
2410 | -CRegexAscii_regex::~CRegexAscii_regex() |
2411 | +CRegexXQuery_regex::~CRegexXQuery_regex() |
2412 | { |
2413 | - std::list<CRegexAscii_branch*>::iterator branch_it; |
2414 | + std::list<CRegexXQuery_branch*>::iterator branch_it; |
2415 | |
2416 | for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++) |
2417 | { |
2418 | delete (*branch_it); |
2419 | } |
2420 | /* |
2421 | - std::vector<CRegexAscii_regex*>::iterator subregex_it; |
2422 | + std::vector<CRegexXQuery_regex*>::iterator subregex_it; |
2423 | for(subregex_it = subregex.begin(); subregex_it != subregex.end(); subregex_it++) |
2424 | { |
2425 | delete (*subregex_it); |
2426 | @@ -548,25 +1001,18 @@ |
2427 | */ |
2428 | } |
2429 | |
2430 | -bool CRegexAscii_regex::set_align_begin(bool align_begin) |
2431 | -{ |
2432 | - bool prev_align = this->align_begin; |
2433 | - this->align_begin = align_begin; |
2434 | - return prev_align; |
2435 | -} |
2436 | - |
2437 | -void CRegexAscii_regex::add_branch(CRegexAscii_branch *branch) |
2438 | +void CRegexXQuery_regex::add_branch(CRegexXQuery_branch *branch) |
2439 | { |
2440 | branch_list.push_back(branch); |
2441 | } |
2442 | |
2443 | -bool CRegexAscii_regex::get_indexed_match(int index, |
2444 | +bool CRegexXQuery_regex::get_indexed_match(int index, |
2445 | const char **matched_source, |
2446 | int *matched_len) |
2447 | { |
2448 | if(!index || index > (int)subregex.size()) |
2449 | return false; |
2450 | - CRegexAscii_regex *subr = subregex[index-1]; |
2451 | + CRegexXQuery_regex *subr = subregex[index-1]; |
2452 | *matched_source = subr->matched_source; |
2453 | if(!*matched_source) |
2454 | return false; |
2455 | @@ -574,145 +1020,209 @@ |
2456 | return true; |
2457 | } |
2458 | |
2459 | -unsigned int CRegexAscii_regex::get_indexed_regex_count() |
2460 | +unsigned int CRegexXQuery_regex::get_indexed_regex_count() |
2461 | { |
2462 | return subregex.size(); |
2463 | } |
2464 | |
2465 | -CRegexAscii_branch::CRegexAscii_branch(CRegexAscii_regex* regex) : |
2466 | - IRegexMatcher(regex) |
2467 | +CRegexXQuery_branch::CRegexXQuery_branch(CRegexXQuery_regex* regex) |
2468 | + //: |
2469 | + //IRegexMatcher(regex) |
2470 | { |
2471 | } |
2472 | |
2473 | -CRegexAscii_branch::~CRegexAscii_branch() |
2474 | +CRegexXQuery_branch::~CRegexXQuery_branch() |
2475 | { |
2476 | - std::list<CRegexAscii_piece*>::iterator piece_it; |
2477 | + std::list<RegexAscii_pieceinfo>::iterator piece_it; |
2478 | |
2479 | for(piece_it = piece_list.begin(); piece_it != piece_list.end(); piece_it++) |
2480 | { |
2481 | - delete (*piece_it); |
2482 | + delete (*piece_it).piece; |
2483 | } |
2484 | } |
2485 | |
2486 | -void CRegexAscii_branch::add_piece(CRegexAscii_piece *piece) |
2487 | +void CRegexXQuery_branch::add_piece(CRegexXQuery_piece *piece) |
2488 | { |
2489 | piece_list.push_back(piece); |
2490 | } |
2491 | |
2492 | -CRegexAscii_piece::CRegexAscii_piece() |
2493 | +CRegexXQuery_piece::CRegexXQuery_piece() |
2494 | { |
2495 | + atom = NULL; |
2496 | + regex_atom = NULL; |
2497 | } |
2498 | |
2499 | -CRegexAscii_piece::~CRegexAscii_piece() |
2500 | +CRegexXQuery_piece::~CRegexXQuery_piece() |
2501 | { |
2502 | delete atom; |
2503 | } |
2504 | |
2505 | -void CRegexAscii_piece::set_atom(IRegexAtom *atom) |
2506 | +void CRegexXQuery_piece::set_atom(IRegexAtom *atom) |
2507 | { |
2508 | this->atom = atom; |
2509 | + this->regex_atom = dynamic_cast<CRegexXQuery_regex*>(atom); |
2510 | } |
2511 | |
2512 | -void CRegexAscii_piece::set_quantifier_min_max(int min, int max, bool strict_max) |
2513 | +void CRegexXQuery_piece::set_quantifier_min_max(int min, int max, bool strict_max) |
2514 | { |
2515 | this->min = min; |
2516 | this->max = max; |
2517 | this->strict_max = strict_max; |
2518 | } |
2519 | -void CRegexAscii_piece::set_is_reluctant(bool is_reluctant) |
2520 | +void CRegexXQuery_piece::set_is_reluctant(bool is_reluctant) |
2521 | { |
2522 | this->is_reluctant = is_reluctant; |
2523 | } |
2524 | -void CRegexAscii_piece::get_quantifier(int *min, int *max, bool *strict_max) |
2525 | +void CRegexXQuery_piece::get_quantifier(int *min, int *max, bool *strict_max) |
2526 | { |
2527 | *min = this->min; |
2528 | *max = this->max; |
2529 | *strict_max = this->strict_max; |
2530 | } |
2531 | -bool CRegexAscii_piece::get_is_reluctant() |
2532 | +bool CRegexXQuery_piece::get_is_reluctant() |
2533 | { |
2534 | + if(atom->regex_intern->flags & REGEX_ASCII_MINIMAL_MATCH) |
2535 | + return true; |
2536 | return is_reluctant; |
2537 | } |
2538 | |
2539 | |
2540 | -CRegexAscii_chargroup::CRegexAscii_chargroup(CRegexAscii_regex* regex) : |
2541 | +CRegexXQuery_charmatch::CRegexXQuery_charmatch(CRegexXQuery_regex* regex) : |
2542 | + IRegexAtom(regex) |
2543 | +{ |
2544 | +} |
2545 | +CRegexXQuery_multicharP::CRegexXQuery_multicharP(CRegexXQuery_regex* regex, char type, bool is_reverse) : |
2546 | + CRegexXQuery_charmatch(regex) |
2547 | +{ |
2548 | + this->multichar_type = type; this->is_reverse = is_reverse; |
2549 | +} |
2550 | +CRegexXQuery_multicharIs::CRegexXQuery_multicharIs(CRegexXQuery_regex* regex, int block_index, bool is_reverse) : |
2551 | + CRegexXQuery_charmatch(regex) |
2552 | +{ |
2553 | + this->block_index = block_index; this->is_reverse = is_reverse; |
2554 | +} |
2555 | +CRegexXQuery_multicharOther::CRegexXQuery_multicharOther(CRegexXQuery_regex* regex, char type) : |
2556 | + CRegexXQuery_charmatch(regex) |
2557 | +{ |
2558 | + this->multichar_type = type; |
2559 | +} |
2560 | +CRegexXQuery_char_ascii::CRegexXQuery_char_ascii(CRegexXQuery_regex* regex, char c) : |
2561 | + CRegexXQuery_charmatch(regex) |
2562 | +{ |
2563 | + this->c = c; |
2564 | +} |
2565 | +CRegexXQuery_char_ascii_i::CRegexXQuery_char_ascii_i(CRegexXQuery_regex* regex, char c) : |
2566 | + CRegexXQuery_char_ascii(regex, toupper(c)) |
2567 | +{ |
2568 | +} |
2569 | +CRegexXQuery_char_range_ascii::CRegexXQuery_char_range_ascii(CRegexXQuery_regex* regex, char c1, char c2) : |
2570 | + CRegexXQuery_charmatch(regex) |
2571 | +{ |
2572 | + this->c1 = c1; this->c2 = c2; |
2573 | +} |
2574 | +CRegexXQuery_char_range_ascii_i::CRegexXQuery_char_range_ascii_i(CRegexXQuery_regex* regex, char c1, char c2) : |
2575 | + CRegexXQuery_char_range_ascii(regex, toupper(c1), toupper(c2)) |
2576 | +{ |
2577 | +} |
2578 | +CRegexXQuery_char_unicode::CRegexXQuery_char_unicode(CRegexXQuery_regex* regex, const char *source, int len) : |
2579 | + CRegexXQuery_charmatch(regex) |
2580 | +{ |
2581 | + this->len = len; |
2582 | + memcpy(c, source, len); |
2583 | +} |
2584 | +CRegexXQuery_char_unicode_cp::CRegexXQuery_char_unicode_cp(CRegexXQuery_regex* regex, unicode::code_point c) : |
2585 | + CRegexXQuery_charmatch(regex) |
2586 | +{ |
2587 | + this->c = c; |
2588 | +} |
2589 | +CRegexXQuery_char_unicode_i::CRegexXQuery_char_unicode_i(CRegexXQuery_regex* regex, unicode::code_point c) : |
2590 | + CRegexXQuery_char_unicode_cp(regex, unicode::to_upper(c)) |
2591 | +{ |
2592 | +} |
2593 | +CRegexXQuery_char_range_unicode::CRegexXQuery_char_range_unicode(CRegexXQuery_regex* regex, unicode::code_point c1, unicode::code_point c2) : |
2594 | + CRegexXQuery_charmatch(regex) |
2595 | +{ |
2596 | + this->c1 = c1; this->c2 = c2; |
2597 | +} |
2598 | +CRegexXQuery_char_range_unicode_i::CRegexXQuery_char_range_unicode_i(CRegexXQuery_regex* regex, unicode::code_point c1, unicode::code_point c2) : |
2599 | + CRegexXQuery_char_range_unicode(regex, unicode::to_upper(c1), unicode::to_upper(c2)) |
2600 | +{ |
2601 | +} |
2602 | +CRegexXQuery_endline::CRegexXQuery_endline(CRegexXQuery_regex* regex) : |
2603 | + CRegexXQuery_charmatch(regex) |
2604 | +{ |
2605 | +} |
2606 | + |
2607 | +unicode::code_point CRegexXQuery_char_unicode::get_c() |
2608 | +{ |
2609 | + const char *temp_c = (const char*)c; |
2610 | + return utf8::next_char(temp_c); |
2611 | +} |
2612 | + |
2613 | + |
2614 | +CRegexXQuery_chargroup::CRegexXQuery_chargroup(CRegexXQuery_regex* regex) : |
2615 | IRegexAtom(regex) |
2616 | { |
2617 | classsub = NULL; |
2618 | } |
2619 | |
2620 | -CRegexAscii_chargroup::~CRegexAscii_chargroup() |
2621 | +CRegexXQuery_chargroup::~CRegexXQuery_chargroup() |
2622 | { |
2623 | delete classsub; |
2624 | -} |
2625 | - |
2626 | -void CRegexAscii_chargroup::addMultiChar(char c) |
2627 | -{ |
2628 | - chargroup_t cgt; |
2629 | - cgt.flags = CHARGROUP_FLAGS_MULTICHAR; |
2630 | - cgt.c1 = c; |
2631 | - cgt.c2 = 0; |
2632 | - chargroup_list.push_back(cgt); |
2633 | -} |
2634 | - |
2635 | -void CRegexAscii_chargroup::addEndLine() |
2636 | -{ |
2637 | - chargroup_t cgt; |
2638 | - cgt.flags = CHARGROUP_FLAGS_ENDLINE; |
2639 | - cgt.c1 = '$'; |
2640 | - cgt.c2 = 0; |
2641 | - chargroup_list.push_back(cgt); |
2642 | -} |
2643 | - |
2644 | -void CRegexAscii_chargroup::addCharRange(char c1, char c2) |
2645 | -{ |
2646 | - chargroup_t cgt; |
2647 | - cgt.flags = 0; |
2648 | - cgt.c1 = c1; |
2649 | - cgt.c2 = c2; |
2650 | - chargroup_list.push_back(cgt); |
2651 | -} |
2652 | - |
2653 | -void CRegexAscii_chargroup::addClassSub(CRegexAscii_chargroup* classsub) |
2654 | + std::list<CRegexXQuery_charmatch* >::iterator charmatch_it; |
2655 | + for(charmatch_it=chargroup_list.begin(); charmatch_it != chargroup_list.end(); charmatch_it++) |
2656 | + delete (*charmatch_it); |
2657 | +} |
2658 | + |
2659 | +void CRegexXQuery_chargroup::addCharMatch(CRegexXQuery_charmatch *charmatch) |
2660 | +{ |
2661 | + chargroup_list.push_back(charmatch); |
2662 | +} |
2663 | +void CRegexXQuery_chargroup::addClassSub(CRegexXQuery_chargroup* classsub) |
2664 | { |
2665 | this->classsub = classsub; |
2666 | } |
2667 | |
2668 | -CRegexAscii_negchargroup::CRegexAscii_negchargroup(CRegexAscii_regex* regex) : |
2669 | - CRegexAscii_chargroup(regex) |
2670 | -{ |
2671 | -} |
2672 | - |
2673 | -CRegexAscii_negchargroup::~CRegexAscii_negchargroup() |
2674 | -{ |
2675 | -} |
2676 | - |
2677 | -CRegexAscii_wildchar::CRegexAscii_wildchar(CRegexAscii_regex* regex) : |
2678 | +CRegexXQuery_negchargroup::CRegexXQuery_negchargroup(CRegexXQuery_regex* regex) : |
2679 | + CRegexXQuery_chargroup(regex) |
2680 | +{ |
2681 | +} |
2682 | + |
2683 | +CRegexXQuery_negchargroup::~CRegexXQuery_negchargroup() |
2684 | +{ |
2685 | +} |
2686 | + |
2687 | +CRegexXQuery_wildchar::CRegexXQuery_wildchar(CRegexXQuery_regex* regex) : |
2688 | IRegexAtom(regex) |
2689 | { |
2690 | } |
2691 | |
2692 | -CRegexAscii_wildchar::~CRegexAscii_wildchar() |
2693 | +CRegexXQuery_wildchar::~CRegexXQuery_wildchar() |
2694 | { |
2695 | } |
2696 | |
2697 | -CRegexAscii_backref::CRegexAscii_backref(CRegexAscii_regex* regex, unsigned int backref_) : |
2698 | +CRegexXQuery_backref::CRegexXQuery_backref(CRegexXQuery_regex* regex, unsigned int backref_) : |
2699 | IRegexAtom(regex), |
2700 | backref(backref_) |
2701 | { |
2702 | } |
2703 | |
2704 | -CRegexAscii_backref::~CRegexAscii_backref() |
2705 | -{ |
2706 | -} |
2707 | - |
2708 | -CRegexAscii_parser::CRegexAscii_parser() |
2709 | +CRegexXQuery_backref::~CRegexXQuery_backref() |
2710 | +{ |
2711 | +} |
2712 | + |
2713 | +CRegexXQuery_pinstart::CRegexXQuery_pinstart(CRegexXQuery_regex* regex): |
2714 | + IRegexAtom(regex) |
2715 | +{ |
2716 | +} |
2717 | + |
2718 | +CRegexXQuery_parser::CRegexXQuery_parser() |
2719 | { |
2720 | current_regex = NULL; |
2721 | regex_depth = 0; |
2722 | } |
2723 | |
2724 | -CRegexAscii_parser::~CRegexAscii_parser() |
2725 | +CRegexXQuery_parser::~CRegexXQuery_parser() |
2726 | { |
2727 | } |
2728 | |
2729 | @@ -720,9 +1230,68 @@ |
2730 | ////////////////////////////////////////// |
2731 | ////Matching the pattern on a string |
2732 | ///////////////////////////////////////// |
2733 | +static std::list<RegexAscii_pieceinfo> empty_pieces;//empty list of pieces |
2734 | +/* |
2735 | +std::list<RegexAscii_pieceinfo>::iterator |
2736 | +IRegexAtom::choose_next_piece(const char *source, int *matched_len, |
2737 | + std::list<RegexAscii_pieceinfo>::iterator this_piece, |
2738 | + std::list<RegexAscii_pieceinfo>::iterator end_piece) |
2739 | +{ |
2740 | + //if this_piece is repetition, repeat until max, then go to next piece |
2741 | + int min, max; |
2742 | + bool strict_max; |
2743 | + while(this_piece != end_piece) |
2744 | + { |
2745 | + (*this_piece).piece->get_quantifier(&min, &max, &strict_max); |
2746 | + if(max <= ((*this_piece).nr_matches))//finished this piece |
2747 | + { |
2748 | + this_piece++; |
2749 | + } |
2750 | + else |
2751 | + break; |
2752 | + } |
2753 | + return this_piece; |
2754 | +} |
2755 | +*/ |
2756 | + |
2757 | +bool IRegexAtom::match(const char *source, int *start_from_branch, int *matched_len, |
2758 | + std::list<RegexAscii_pieceinfo>::iterator this_piece, |
2759 | + std::list<RegexAscii_pieceinfo>::iterator end_piece) |
2760 | +{ |
2761 | + *start_from_branch = 0; |
2762 | + bool retmatch; |
2763 | + retmatch = match_internal(source, start_from_branch, matched_len); |
2764 | + if(!retmatch) |
2765 | + return false; |
2766 | + |
2767 | + if(this_piece == end_piece) |
2768 | + return true; |
2769 | + |
2770 | + (*this_piece).nr_matches++; |
2771 | + int min,max; |
2772 | + bool strict_max; |
2773 | + (*this_piece).piece->get_quantifier(&min, &max, &strict_max); |
2774 | + std::list<RegexAscii_pieceinfo>::iterator init_piece = this_piece; |
2775 | + if(((min == 1) && (max == 1)) || //the simple common case |
2776 | + ((*matched_len == 0) && ((*this_piece).nr_matches>=min)))//to avoid infinite loop |
2777 | + { |
2778 | + this_piece++; |
2779 | + if(this_piece == end_piece) |
2780 | + return true; |
2781 | + } |
2782 | + int matched_len2; |
2783 | + retmatch = (*this_piece).piece->match_piece(this_piece, end_piece, source + *matched_len, &matched_len2); |
2784 | + if(!retmatch) |
2785 | + { |
2786 | + (*init_piece).nr_matches--; |
2787 | + return false; |
2788 | + } |
2789 | + *matched_len += matched_len2; |
2790 | + return true; |
2791 | +} |
2792 | |
2793 | //try every position in source to match the pattern |
2794 | -bool CRegexAscii_regex::match_anywhere(const char *source, unsigned int flags, |
2795 | +bool CRegexXQuery_regex::match_anywhere(const char *source, unsigned int flags, |
2796 | int *match_pos, int *matched_len) |
2797 | { |
2798 | *match_pos = 0; |
2799 | @@ -730,43 +1299,66 @@ |
2800 | return match_from(source, flags, match_pos, matched_len); |
2801 | } |
2802 | |
2803 | -bool CRegexAscii_regex::match_from(const char *source, unsigned int flags, |
2804 | +bool CRegexXQuery_regex::match_from(const char *source, unsigned int flags, |
2805 | int *match_pos, int *matched_len) |
2806 | { |
2807 | this->flags = flags; |
2808 | + this->source_start = source; |
2809 | reachedEnd = false; |
2810 | |
2811 | - std::vector<CRegexAscii_regex*>::iterator regex_it; |
2812 | + std::vector<CRegexXQuery_regex*>::iterator regex_it; |
2813 | for(regex_it = subregex.begin(); regex_it != subregex.end(); regex_it++) |
2814 | { |
2815 | (*regex_it)->matched_source = NULL; |
2816 | } |
2817 | -// if(!source[0]) |
2818 | -// { |
2819 | -// if(branch_list.empty()) |
2820 | -// return true; |
2821 | -// else |
2822 | -// return false; |
2823 | -// } |
2824 | - |
2825 | - bool skip_first_match = false; |
2826 | - if(*match_pos && align_begin) |
2827 | - skip_first_match = true; |
2828 | + |
2829 | + std::vector<std::pair<const char*, int> > saved_subregex; |
2830 | + |
2831 | + if(*match_pos && (flags & REGEX_ASCII_WHOLE_MATCH)) |
2832 | + return false; |
2833 | + |
2834 | do |
2835 | { |
2836 | - if(!skip_first_match) |
2837 | - { |
2838 | - if(match(source + *match_pos, matched_len)) |
2839 | - return true; |
2840 | - } |
2841 | - skip_first_match = false; |
2842 | - if(align_begin) |
2843 | + int start_from_branch = 0; |
2844 | + int longest_match = -1; |
2845 | + while(1) |
2846 | + { |
2847 | + if(!match(source + *match_pos, &start_from_branch, matched_len, empty_pieces.begin(), empty_pieces.end())) |
2848 | + break; |
2849 | + if(longest_match < *matched_len) |
2850 | + { |
2851 | + longest_match = *matched_len; |
2852 | + if(start_from_branch && (flags & REGEX_ASCII_GET_LONGEST_BRANCH)) |
2853 | + save_subregex_list(saved_subregex); |
2854 | + } |
2855 | + if(!start_from_branch || !(flags & REGEX_ASCII_GET_LONGEST_BRANCH)) |
2856 | + break; |
2857 | + //else try the other branches to see which is longer |
2858 | + } |
2859 | + if(longest_match != -1) |
2860 | + { |
2861 | + *matched_len = longest_match; |
2862 | + if(saved_subregex.size()) |
2863 | + load_subregex_list(saved_subregex); |
2864 | + if(flags & REGEX_ASCII_WHOLE_MATCH) |
2865 | + { |
2866 | + if(!source[*match_pos+*matched_len]) |
2867 | + return true; |
2868 | + if((flags & REGEX_ASCII_MULTILINE) && |
2869 | + ((source[*match_pos+*matched_len] == '\n') || (source[*match_pos+*matched_len] == '\r'))) |
2870 | + return true; |
2871 | + return false; |
2872 | + } |
2873 | + return true; |
2874 | + } |
2875 | + |
2876 | + if(flags & REGEX_ASCII_WHOLE_MATCH) |
2877 | { |
2878 | if(flags & REGEX_ASCII_MULTILINE) |
2879 | { |
2880 | - //goto the next line |
2881 | + //go to next line |
2882 | while(source[*match_pos] && (source[*match_pos] != '\n') && (source[*match_pos] != '\r')) |
2883 | - (*match_pos)++; |
2884 | + (*match_pos) += myutf8len(source); |
2885 | if(source[*match_pos] == '\n') |
2886 | { |
2887 | (*match_pos)++; |
2888 | @@ -780,190 +1372,1039 @@ |
2889 | (*match_pos)++; |
2890 | } |
2891 | if(!source[*match_pos]) |
2892 | - return false; |
2893 | + break; |
2894 | continue; |
2895 | } |
2896 | - return false; |
2897 | + break; |
2898 | } |
2899 | if(!source[*match_pos]) |
2900 | break; |
2901 | - (*match_pos)++; |
2902 | + (*match_pos) += myutf8len(source); |
2903 | } |
2904 | while(source[*match_pos]); |
2905 | +// if(!source[*match_pos]) |
2906 | +// { |
2907 | +// reachedEnd = true; |
2908 | +// } |
2909 | return false; |
2910 | } |
2911 | |
2912 | +void CRegexXQuery_regex::reset_match() |
2913 | +{ |
2914 | +// this->backup_matched_source = this->matched_source; |
2915 | +// this->backup_matched_len = this->matched_len; |
2916 | + this->matched_source = NULL; |
2917 | + this->matched_len = 0; |
2918 | + std::list<CRegexXQuery_branch*>::iterator branch_it; |
2919 | + for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++) |
2920 | + { |
2921 | + (*branch_it)->reset(); |
2922 | + } |
2923 | +} |
2924 | +/* |
2925 | +void CRegexXQuery_regex::restore_match() |
2926 | +{ |
2927 | + this->matched_source = this->backup_matched_source; |
2928 | + this->matched_len = this->backup_matched_len; |
2929 | + std::list<CRegexXQuery_branch*>::iterator branch_it; |
2930 | + for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++) |
2931 | + { |
2932 | + (*branch_it)->restore(); |
2933 | + } |
2934 | +} |
2935 | +*/ |
2936 | //match any of the branches |
2937 | -bool CRegexAscii_regex::match(const char *source, int *matched_len) |
2938 | +bool CRegexXQuery_regex::match(const char *source, int *start_from_branch, int *matched_len, |
2939 | + std::list<RegexAscii_pieceinfo>::iterator next_piece, |
2940 | + std::list<RegexAscii_pieceinfo>::iterator end_piece) |
2941 | { |
2942 | reachedEnd = false; |
2943 | - std::list<CRegexAscii_branch*>::iterator branch_it; |
2944 | - |
2945 | - for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++) |
2946 | - { |
2947 | - if((*branch_it)->match(source, matched_len)) |
2948 | - { |
2949 | - matched_source = source; |
2950 | - this->matched_len = *matched_len; |
2951 | + if(!(flags & REGEX_ASCII_GROUPING_LEN_WHOLE_PIECE) || |
2952 | + (this->matched_source == NULL) || ((this->matched_source + this->matched_len) != source)) |
2953 | + this->matched_source = source; |
2954 | + *matched_len = 0; |
2955 | + std::list<CRegexXQuery_branch*>::iterator branch_it; |
2956 | + |
2957 | + if(*start_from_branch == 0) |
2958 | + { |
2959 | + for(branch_it = branch_list.begin(); branch_it != branch_list.end(); branch_it++) |
2960 | + { |
2961 | + (*branch_it)->reset(); |
2962 | + } |
2963 | + } |
2964 | + |
2965 | + branch_it = branch_list.begin(); |
2966 | + if(*start_from_branch) |
2967 | + { |
2968 | + for(int i=0;i<*start_from_branch;i++) |
2969 | + branch_it++; |
2970 | + } |
2971 | + (*start_from_branch)++; |
2972 | + for(; branch_it != branch_list.end(); branch_it++,(*start_from_branch)++) |
2973 | + { |
2974 | + if((*branch_it)->match(source, matched_len, this, next_piece, end_piece)) |
2975 | + { |
2976 | + //matched_source = source; |
2977 | + //this->matched_len = *matched_len; |
2978 | return true; |
2979 | } |
2980 | } |
2981 | - matched_source = NULL; |
2982 | - matched_len = 0; |
2983 | + *start_from_branch = 0; |
2984 | + if(this->matched_source == source) |
2985 | + this->matched_source = NULL; |
2986 | + *matched_len = 0; |
2987 | return false; |
2988 | } |
2989 | |
2990 | +void CRegexXQuery_regex::save_subregex_list(std::vector<std::pair<const char*, int> > &saved_subregex) |
2991 | +{ |
2992 | + saved_subregex.resize(0); |
2993 | + saved_subregex.reserve(subregex.size()); |
2994 | + std::vector<CRegexXQuery_regex*>::iterator it; |
2995 | + for(it=subregex.begin(); it != subregex.end(); it++) |
2996 | + { |
2997 | + saved_subregex.push_back(std::pair<const char*, int>((*it)->matched_source, (*it)->matched_len)); |
2998 | + } |
2999 | +} |
3000 | + |
3001 | +void CRegexXQuery_regex::load_subregex_list(std::vector<std::pair<const char*, int> > &saved_subregex) |
3002 | +{ |
3003 | + std::vector<std::pair<const char*, int> >::iterator it; |
3004 | + std::vector<CRegexXQuery_regex*>::iterator subit; |
3005 | + for(it=saved_subregex.begin(), subit = subregex.begin(); it != saved_subregex.end(); it++, subit++) |
3006 | + { |
3007 | + (*subit)->matched_source = (*it).first; |
3008 | + (*subit)->matched_len = (*it).second; |
3009 | + } |
3010 | +} |
3011 | + |
3012 | +void CRegexXQuery_branch::reset() |
3013 | +{ |
3014 | + std::list<RegexAscii_pieceinfo>::iterator piece_it; |
3015 | + for(piece_it = piece_list.begin(); piece_it != piece_list.end(); piece_it++) |
3016 | + { |
3017 | + (*piece_it).piece->atom->reset_match(); |
3018 | + } |
3019 | +} |
3020 | +/* |
3021 | +void CRegexXQuery_branch::restore() |
3022 | +{ |
3023 | + std::list<RegexAscii_pieceinfo>::iterator piece_it; |
3024 | + for(piece_it = piece_list.begin(); piece_it != piece_list.end(); piece_it++) |
3025 | + { |
3026 | + (*piece_it).piece->atom->restore_match(); |
3027 | + } |
3028 | +} |
3029 | +*/ |
3030 | //match all the pieces |
3031 | -bool CRegexAscii_branch::match(const char *source, int *matched_len) |
3032 | +bool CRegexXQuery_branch::match(const char *source, int *matched_len, |
3033 | + CRegexXQuery_regex* group_regex, |
3034 | + std::list<RegexAscii_pieceinfo>::iterator next_piece, |
3035 | + std::list<RegexAscii_pieceinfo>::iterator end_piece) |
3036 | { |
3037 | - std::list<CRegexAscii_piece*>::iterator piece_it; |
3038 | + std::list<RegexAscii_pieceinfo>::iterator piece_it; |
3039 | |
3040 | piece_it = piece_list.begin(); |
3041 | + //if(piece_it == piece_list.end()) |
3042 | + //if(!source[0]) |
3043 | + // return true; |
3044 | + //else |
3045 | + // return false; |
3046 | if(piece_it == piece_list.end()) |
3047 | - if(source[0]) |
3048 | - return false; |
3049 | + { |
3050 | + piece_it = next_piece; |
3051 | + if(next_piece == end_piece) |
3052 | + { |
3053 | + group_regex->matched_len = 0; |
3054 | + return true; |
3055 | + } |
3056 | + } |
3057 | + |
3058 | + std::list<RegexAscii_pieceinfo> temp_pieces(piece_list); |
3059 | + temp_pieces.push_back(group_regex);//this will be used to store the group match |
3060 | + temp_pieces.insert(temp_pieces.end(), next_piece, end_piece); |
3061 | + |
3062 | + return (*piece_it).piece->match_piece(temp_pieces.begin(), temp_pieces.end(), source, matched_len); |
3063 | +} |
3064 | + |
3065 | +bool CRegexXQuery_piece::match_piece(std::list<RegexAscii_pieceinfo>::iterator piece_it, |
3066 | + std::list<RegexAscii_pieceinfo>::iterator end_it, |
3067 | + const char *source, int *matched_len) |
3068 | +{ |
3069 | + if((*piece_it).nr_matches < 0) |
3070 | + { |
3071 | + //special case, store the group match |
3072 | + (*piece_it).group_regex->matched_len = source - (*piece_it).group_regex->matched_source; |
3073 | + piece_it++; |
3074 | + if(piece_it == end_it) |
3075 | + return true; |
3076 | else |
3077 | - return true; |
3078 | - if(!(*piece_it)->get_is_reluctant()) |
3079 | - return match_piece_iter_normal(piece_it, source, matched_len); |
3080 | + return (*piece_it).piece->match_piece(piece_it, end_it, source, matched_len); |
3081 | + } |
3082 | + |
3083 | + if(!get_is_reluctant()) |
3084 | + return match_piece_iter_normal(piece_it, end_it, source, matched_len); |
3085 | else |
3086 | - return match_piece_iter_reluctant(piece_it, source, matched_len); |
3087 | -} |
3088 | - |
3089 | -//match as less as possible |
3090 | -bool CRegexAscii_branch::match_piece_iter_reluctant( |
3091 | - std::list<CRegexAscii_piece*>::iterator piece_it, |
3092 | + return match_piece_iter_reluctant(piece_it, end_it, source, matched_len); |
3093 | +} |
3094 | + |
3095 | +int CRegexXQuery_piece::choose_another_branch(std::vector<std::pair<int,int> > &match_lens) |
3096 | +{ |
3097 | + int i = match_lens.size()-1; |
3098 | + i--; |
3099 | + while((i >= 0) && (match_lens.at(i).second == 0)) |
3100 | + i--; |
3101 | + if(i < 0) |
3102 | + return -1;//no more branches |
3103 | + match_lens.resize(i+1); |
3104 | + i++; |
3105 | + return i; |
3106 | +} |
3107 | + |
3108 | +bool CRegexXQuery_piece::is_regex_atom() |
3109 | +{ |
3110 | + return regex_atom != NULL; |
3111 | +} |
3112 | + |
3113 | +//match as less as possible (shortest string) |
3114 | +bool CRegexXQuery_piece::match_piece_iter_reluctant( |
3115 | + std::list<RegexAscii_pieceinfo>::iterator piece_it, |
3116 | + std::list<RegexAscii_pieceinfo>::iterator end_it, |
3117 | const char *source, int *matched_len) |
3118 | { |
3119 | *matched_len = 0; |
3120 | - if(piece_it == piece_list.end()) |
3121 | + if(piece_it == end_it) |
3122 | return true; |
3123 | |
3124 | int min, max; |
3125 | bool strict_max; |
3126 | //std::vector<int> match_lens; |
3127 | - (*piece_it)->get_quantifier(&min, &max, &strict_max); |
3128 | - if(strict_max && (max >= 0)) |
3129 | + (*piece_it).piece->get_quantifier(&min, &max, &strict_max); |
3130 | + |
3131 | + std::vector<std::pair<const char*, int> > saved_subregex; |
3132 | + |
3133 | + if(is_regex_atom()) |
3134 | { |
3135 | - int timeslen; |
3136 | - //check if the piece doesn't exceed the max match |
3137 | - if((*piece_it)->match_piece_times(source, ×len, max+1, NULL)) |
3138 | - return false;///too many matches |
3139 | + //recursive |
3140 | + bool retmatch; |
3141 | + atom->regex_intern->save_subregex_list(saved_subregex); |
3142 | + if((*piece_it).nr_matches >= min) |
3143 | + { |
3144 | + //go to next piece |
3145 | + std::list<RegexAscii_pieceinfo>::iterator next_it = piece_it; |
3146 | + next_it++; |
3147 | + if(next_it == end_it) |
3148 | + return true; |
3149 | + retmatch = (*next_it).piece->match_piece(next_it, end_it, source, matched_len); |
3150 | + if(retmatch) |
3151 | + return true; |
3152 | + } |
3153 | + if(((max == -1) || ((*piece_it).nr_matches < max)) &&//try further with this piece |
3154 | + (((*piece_it).nr_matches < min) || ((*piece_it).nr_matches == 0) || ((*piece_it).piece->regex_atom->matched_len)))//if matched_len is zero, avoid infinite loop |
3155 | + { |
3156 | + int start_from_branch = 0; |
3157 | + int shortest_len = -1; |
3158 | + bool branch_saved = false; |
3159 | + //try all branches to get the shortest len |
3160 | + (*piece_it).nr_matches++; |
3161 | + while(atom->match(source, &start_from_branch, matched_len, piece_it, end_it)) |
3162 | + { |
3163 | + if((shortest_len == -1) || (shortest_len > *matched_len)) |
3164 | + { |
3165 | + shortest_len = *matched_len; |
3166 | + if(start_from_branch && (atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH)) |
3167 | + { |
3168 | + atom->regex_intern->save_subregex_list(saved_subregex); |
3169 | + branch_saved = true; |
3170 | + } |
3171 | + } |
3172 | + if(!start_from_branch || !(atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH)) |
3173 | + break; |
3174 | + } |
3175 | + if(shortest_len != -1) |
3176 | + { |
3177 | + *matched_len = shortest_len; |
3178 | + if(branch_saved) |
3179 | + atom->regex_intern->load_subregex_list(saved_subregex); |
3180 | + return true; |
3181 | + } |
3182 | + else |
3183 | + { |
3184 | + (*piece_it).nr_matches--; |
3185 | + atom->regex_intern->load_subregex_list(saved_subregex); |
3186 | + return false; |
3187 | + } |
3188 | + } |
3189 | + else |
3190 | + { |
3191 | + atom->regex_intern->load_subregex_list(saved_subregex); |
3192 | + return false; |
3193 | + } |
3194 | } |
3195 | |
3196 | - int i=min; |
3197 | - std::list<CRegexAscii_piece*>::iterator next_it = piece_it; |
3198 | + int i=0; |
3199 | + int shortest_len = -1; |
3200 | + int otherpieces_shortest = -1; |
3201 | + int i_shortest = -1; |
3202 | + std::list<RegexAscii_pieceinfo>::iterator next_it = piece_it; |
3203 | + std::vector<std::pair<int,int> > match_lens; |
3204 | next_it++; |
3205 | int pieceslen = 0; |
3206 | while(1) |
3207 | { |
3208 | - if((max > 0) && (i>max)) |
3209 | - break; |
3210 | - int piecelen = 0; |
3211 | - if((*piece_it)->match_piece_times(source+pieceslen, &piecelen, !pieceslen ? i : 1, NULL)) |
3212 | - { |
3213 | - pieceslen += piecelen; |
3214 | + int piecelen = 0; |
3215 | + bool retmatch; |
3216 | + retmatch = match_piece_times(source, &piecelen, i < min ? min : i, &match_lens); |
3217 | + i = match_lens.size()-1;//number of matches |
3218 | + if(i<0) |
3219 | + i = 0; |
3220 | + if((i>=min)) |
3221 | + { |
3222 | + pieceslen = piecelen; |
3223 | + if((shortest_len >= 0) && (shortest_len <= pieceslen))//this branch is longer |
3224 | + {//try another branch |
3225 | + i = choose_another_branch(match_lens); |
3226 | + if(i >= 0) |
3227 | + continue;//try another branch |
3228 | + else |
3229 | + break; |
3230 | + } |
3231 | int otherpieces = 0; |
3232 | - if((next_it == piece_list.end()) || |
3233 | - ((*next_it)->get_is_reluctant() && match_piece_iter_reluctant(next_it, source+pieceslen, &otherpieces)) || |
3234 | - (!(*next_it)->get_is_reluctant() && match_piece_iter_normal(next_it, source+pieceslen, &otherpieces))) |
3235 | - { |
3236 | - *matched_len = pieceslen + otherpieces; |
3237 | - return true; |
3238 | - } |
3239 | + if((next_it == end_it) || |
3240 | + (*next_it).piece->match_piece(next_it, end_it, source+pieceslen, &otherpieces) |
3241 | + ) |
3242 | + { |
3243 | + if((i == pieceslen) || (match_lens.at(0).second == 0) ||//minimum achieved already, cannot go lower than that |
3244 | + !(atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH)) |
3245 | + { |
3246 | + *matched_len = pieceslen + otherpieces; |
3247 | + return true; |
3248 | + } |
3249 | + if((shortest_len < 0) || (shortest_len > pieceslen)) |
3250 | + { |
3251 | + shortest_len = pieceslen; |
3252 | + otherpieces_shortest = otherpieces; |
3253 | + i_shortest = i; |
3254 | + if(match_lens.at(0).second != 0) |
3255 | + atom->regex_intern->save_subregex_list(saved_subregex); |
3256 | + } |
3257 | + i = choose_another_branch(match_lens); |
3258 | + if(i >= 0) |
3259 | + continue;//try another branch |
3260 | + else |
3261 | + break; |
3262 | + } |
3263 | + else |
3264 | + { |
3265 | + //try further |
3266 | + if(retmatch) |
3267 | + { |
3268 | + i++; |
3269 | + if((max < 0) || (i<=max)) |
3270 | + continue; |
3271 | + i--; |
3272 | + } |
3273 | + } |
3274 | + } |
3275 | + |
3276 | + if(i==0) |
3277 | + { |
3278 | + break; |
3279 | } |
3280 | else |
3281 | - break; |
3282 | - i++; |
3283 | + { |
3284 | + i = choose_another_branch(match_lens); |
3285 | + if(i >= 0) |
3286 | + continue;//try another branch |
3287 | + else |
3288 | + break; |
3289 | + } |
3290 | } |
3291 | |
3292 | + if(shortest_len >= 0) |
3293 | + { |
3294 | + if(strict_max && (max>=0) && (i_shortest > max)) |
3295 | + return false; |
3296 | + *matched_len = shortest_len + otherpieces_shortest; |
3297 | + if(saved_subregex.size()) |
3298 | + atom->regex_intern->load_subregex_list(saved_subregex); |
3299 | + return true; |
3300 | + } |
3301 | return false; |
3302 | } |
3303 | |
3304 | //match as much as possible |
3305 | -bool CRegexAscii_branch::match_piece_iter_normal( |
3306 | - std::list<CRegexAscii_piece*>::iterator piece_it, |
3307 | +bool CRegexXQuery_piece::match_piece_iter_normal( |
3308 | + std::list<RegexAscii_pieceinfo>::iterator piece_it, |
3309 | + std::list<RegexAscii_pieceinfo>::iterator end_it, |
3310 | const char *source, int *matched_len) |
3311 | { |
3312 | *matched_len = 0; |
3313 | |
3314 | int min, max; |
3315 | bool strict_max; |
3316 | - std::vector<int> match_lens; |
3317 | - (*piece_it)->get_quantifier(&min, &max, &strict_max); |
3318 | - int timeslen; |
3319 | - if(strict_max && (max >= 0)) |
3320 | + std::vector<std::pair<int,int> > match_lens; |
3321 | + (*piece_it).piece->get_quantifier(&min, &max, &strict_max); |
3322 | + int timeslen = 0; |
3323 | + std::vector<std::pair<const char*, int> > saved_subregex; |
3324 | + |
3325 | + if(is_regex_atom()) |
3326 | { |
3327 | - //check if the piece doesn't exceed the max match |
3328 | - //if((*piece_it)->match_piece_times(source, ×len, max+1, &match_lens)) |
3329 | - // return false;///too many matches |
3330 | - (*piece_it)->match_piece_times(source, ×len, max, &match_lens); |
3331 | + //recursive |
3332 | + bool retmatch; |
3333 | + atom->regex_intern->save_subregex_list(saved_subregex); |
3334 | + if(((max == -1) || ((*piece_it).nr_matches < max)) && //try further with this piece |
3335 | + (((*piece_it).nr_matches < min) || ((*piece_it).nr_matches == 0) || ((*piece_it).piece->regex_atom->matched_len)))//if matched_len is zero, avoid infinite loop |
3336 | + { |
3337 | + int start_from_branch = 0; |
3338 | + int longest_len = -1; |
3339 | + bool branch_saved = false; |
3340 | + //try all branches to get the longest len |
3341 | + (*piece_it).nr_matches++; |
3342 | + while(atom->match(source, &start_from_branch, matched_len, piece_it, end_it)) |
3343 | + { |
3344 | + if((longest_len < *matched_len)) |
3345 | + { |
3346 | + longest_len = *matched_len; |
3347 | + if(start_from_branch && (atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH)) |
3348 | + { |
3349 | + atom->regex_intern->save_subregex_list(saved_subregex); |
3350 | + branch_saved = true; |
3351 | + } |
3352 | + } |
3353 | + if(!start_from_branch || !(atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH)) |
3354 | + break; |
3355 | + } |
3356 | + if(longest_len != -1) |
3357 | + { |
3358 | + *matched_len = longest_len; |
3359 | + if(branch_saved) |
3360 | + atom->regex_intern->load_subregex_list(saved_subregex); |
3361 | + return true; |
3362 | + } |
3363 | + else |
3364 | + { |
3365 | + atom->regex_intern->load_subregex_list(saved_subregex); |
3366 | + (*piece_it).nr_matches--; |
3367 | + } |
3368 | + } |
3369 | + if((*piece_it).nr_matches >= min) |
3370 | + { |
3371 | + //go to next piece |
3372 | + std::list<RegexAscii_pieceinfo>::iterator next_it = piece_it; |
3373 | + next_it++; |
3374 | + if(next_it == end_it) |
3375 | + return true; |
3376 | + retmatch = (*next_it).piece->match_piece(next_it, end_it, source, matched_len); |
3377 | + if(!retmatch) |
3378 | + atom->regex_intern->load_subregex_list(saved_subregex); |
3379 | + return retmatch; |
3380 | + } |
3381 | + else |
3382 | + { |
3383 | + // regex_atom->restore_match(); |
3384 | + atom->regex_intern->load_subregex_list(saved_subregex); |
3385 | + return false; |
3386 | + } |
3387 | } |
3388 | - else if(!strict_max && (max >= 0)) |
3389 | - (*piece_it)->match_piece_times(source, ×len, max, &match_lens); |
3390 | - else |
3391 | - (*piece_it)->match_piece_times(source, ×len, -1, &match_lens); |
3392 | |
3393 | - int i; |
3394 | - std::list<CRegexAscii_piece*>::iterator next_it = piece_it; |
3395 | + int longest_len = -1; |
3396 | + int otherpieces_longest = -1; |
3397 | + int i_longest = -1; |
3398 | + int i = max; |
3399 | + std::list<RegexAscii_pieceinfo>::iterator next_it = piece_it; |
3400 | next_it++; |
3401 | - if(next_it == piece_list.end()) |
3402 | + |
3403 | + bool retmatch; |
3404 | + while(1) |
3405 | { |
3406 | - if((int)match_lens.size() > min) |
3407 | - { |
3408 | - *matched_len = timeslen; |
3409 | - return true; |
3410 | + retmatch = match_piece_times(source, ×len, i, &match_lens); |
3411 | + i=match_lens.size()-1;//number of matches |
3412 | + if((i>=min)) |
3413 | + { |
3414 | + if(timeslen < longest_len) |
3415 | + {//this branch is no use |
3416 | + i = choose_another_branch(match_lens); |
3417 | + if(i >= 0) |
3418 | + { |
3419 | + i = max; |
3420 | + continue;//try another branch |
3421 | + } |
3422 | + else |
3423 | + break; |
3424 | + } |
3425 | + //int piecelen = 0; |
3426 | + int otherpieces = 0; |
3427 | + if((next_it == end_it) || |
3428 | + (*next_it).piece->match_piece(next_it, end_it, source+timeslen, &otherpieces) |
3429 | + ) |
3430 | + { |
3431 | + if(timeslen > longest_len) |
3432 | + { |
3433 | + longest_len = timeslen; |
3434 | + otherpieces_longest = otherpieces; |
3435 | + i_longest = i; |
3436 | + if(!(atom->regex_intern->flags & REGEX_ASCII_GET_LONGEST_BRANCH)) |
3437 | + { |
3438 | + *matched_len = longest_len + otherpieces_longest; |
3439 | + return true; |
3440 | + } |
3441 | + else |
3442 | + { |
3443 | + if(match_lens.at(0).second) |
3444 | + atom->regex_intern->save_subregex_list(saved_subregex); |
3445 | + } |
3446 | + } |
3447 | + } |
3448 | + else |
3449 | + { |
3450 | + if(!match_lens.at(0).second) |
3451 | + { |
3452 | + match_lens.resize(match_lens.size()-1); |
3453 | + i--; |
3454 | + if(i >= 0) |
3455 | + continue;//try smaller |
3456 | + else |
3457 | + break; |
3458 | + } |
3459 | + else |
3460 | + { |
3461 | + i = choose_another_branch(match_lens); |
3462 | + if(i >= 0) |
3463 | + continue;//try another branch |
3464 | + else |
3465 | + break; |
3466 | + } |
3467 | + } |
3468 | + } |
3469 | + //now try another branch |
3470 | + i = choose_another_branch(match_lens); |
3471 | + if(i >= 0) |
3472 | + { |
3473 | + i = max; |
3474 | + continue;//try another branch |
3475 | } |
3476 | else |
3477 | - return false; |
3478 | - } |
3479 | - for(i=match_lens.size()-1; i>=min; i--) |
3480 | + break; |
3481 | + }//end while |
3482 | + |
3483 | + if(longest_len >= 0) |
3484 | { |
3485 | - int piecelen = 0; |
3486 | - int otherpieces = 0; |
3487 | - if(((*next_it)->get_is_reluctant() && match_piece_iter_reluctant(next_it, source+match_lens[i]+piecelen, &otherpieces)) || |
3488 | - (!(*next_it)->get_is_reluctant() && match_piece_iter_normal(next_it, source+match_lens[i]+piecelen, &otherpieces))) |
3489 | - { |
3490 | - *matched_len = match_lens[i] + piecelen + otherpieces; |
3491 | - return true; |
3492 | - } |
3493 | + *matched_len = longest_len + otherpieces_longest; |
3494 | + if(saved_subregex.size()) |
3495 | + atom->regex_intern->load_subregex_list(saved_subregex); |
3496 | + return true; |
3497 | } |
3498 | |
3499 | return false; |
3500 | } |
3501 | |
3502 | -bool CRegexAscii_piece::match_piece_times(const char *source, |
3503 | +bool CRegexXQuery_piece::match_piece_times(const char *source, |
3504 | int *piecelen, |
3505 | int times, |
3506 | - std::vector<int> *match_lens) |
3507 | + std::vector<std::pair<int,int> > *match_lens) |
3508 | { |
3509 | - *piecelen = 0; |
3510 | - for(int i=0;(times < 0) || (i<times);i++) |
3511 | - { |
3512 | + int i=0; |
3513 | + if(match_lens && match_lens->size()) |
3514 | + { |
3515 | + i = match_lens->size()-1; |
3516 | + } |
3517 | + if(match_lens && match_lens->size()) |
3518 | + *piecelen = match_lens->at(match_lens->size()-1).first; |
3519 | + else |
3520 | + *piecelen = 0; |
3521 | + if((times >= 0) && (i>=times)) |
3522 | + return true; |
3523 | + for(;(times < 0) || (i<times);i++) |
3524 | + { |
3525 | + int atomlen; |
3526 | + int start_from_branch = 0; |
3527 | + if(match_lens && (i<(int)match_lens->size())) |
3528 | + start_from_branch = match_lens->at(i).second; |
3529 | + bool first_branch = (start_from_branch == 0); |
3530 | + if(!atom->match(source+*piecelen, &start_from_branch, &atomlen, empty_pieces.begin(), empty_pieces.end())) |
3531 | + { |
3532 | + if(match_lens) |
3533 | + { |
3534 | + if(i >= (int)match_lens->size()) |
3535 | + match_lens->push_back(std::pair<int,int>(*piecelen, 0)); |
3536 | + else |
3537 | + (*match_lens)[i] = std::pair<int,int>(*piecelen, 0); |
3538 | + } |
3539 | + return false; |
3540 | + } |
3541 | if(match_lens) |
3542 | - match_lens->push_back(*piecelen); |
3543 | - int atomlen; |
3544 | - if(!atom->match(source+*piecelen, &atomlen)) |
3545 | - return false; |
3546 | + { |
3547 | + if(i >= (int)match_lens->size()) |
3548 | + match_lens->push_back(std::pair<int,int>(*piecelen, start_from_branch)); |
3549 | + else |
3550 | + (*match_lens)[i] = std::pair<int,int>(*piecelen, start_from_branch); |
3551 | + } |
3552 | *piecelen += atomlen; |
3553 | if(!atomlen && !source[*piecelen]) |
3554 | { |
3555 | - atom->regex_intern->reachedEnd = true; |
3556 | + // atom->regex_intern->set_reachedEnd(source); |
3557 | + break; |
3558 | + } |
3559 | + if(first_branch && (atomlen == 0))//avoid infinite loop |
3560 | + { |
3561 | break; |
3562 | } |
3563 | } |
3564 | if(match_lens) |
3565 | - match_lens->push_back(*piecelen); |
3566 | + { |
3567 | + // if(i >= match_lens->size()) |
3568 | + match_lens->push_back(std::pair<int,int>(*piecelen, 0)); |
3569 | + // else |
3570 | + // (*match_lens)[i] = std::pair<int,int>(*piecelen, 0); |
3571 | + } |
3572 | |
3573 | return true; |
3574 | } |
3575 | |
3576 | +bool CRegexXQuery_multicharP::match_internal(const char *source, int *start_from_branch, int *matched_len) |
3577 | +{ |
3578 | + if(!source[0]) |
3579 | + { |
3580 | + regex_intern->set_reachedEnd(source); |
3581 | + return false; |
3582 | + } |
3583 | + bool found = false; |
3584 | + const char *temp_source = source; |
3585 | + unicode::code_point utf8c = utf8::next_char(temp_source); |
3586 | + switch(multichar_type) |
3587 | + { |
3588 | + case unicode::UNICODE_Ll + 50: |
3589 | + if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Ll) || |
3590 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Lm) || |
3591 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Lo) || |
3592 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Lt) || |
3593 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Lu)) |
3594 | + { |
3595 | + if(!is_reverse) |
3596 | + found = true; |
3597 | + } |
3598 | + else |
3599 | + { |
3600 | + if(is_reverse) |
3601 | + found = true; |
3602 | + } |
3603 | + break; |
3604 | + case unicode::UNICODE_Mc + 50: |
3605 | + if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Mn) || |
3606 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Mc) || |
3607 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Me)) |
3608 | + { |
3609 | + if(!is_reverse) |
3610 | + found = true; |
3611 | + } |
3612 | + else |
3613 | + { |
3614 | + if(is_reverse) |
3615 | + found = true; |
3616 | + } |
3617 | + break; |
3618 | + case unicode::UNICODE_Nd + 50: |
3619 | + if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Nd) || |
3620 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Nl) || |
3621 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_No)) |
3622 | + { |
3623 | + if(!is_reverse) |
3624 | + found = true; |
3625 | + } |
3626 | + else |
3627 | + { |
3628 | + if(is_reverse) |
3629 | + found = true; |
3630 | + } |
3631 | + break; |
3632 | + case unicode::UNICODE_Pc + 50: |
3633 | + if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pc) || |
3634 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pd) || |
3635 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Ps) || |
3636 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pe) || |
3637 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pi) || |
3638 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pf) || |
3639 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Po)) |
3640 | + { |
3641 | + if(!is_reverse) |
3642 | + found = true; |
3643 | + } |
3644 | + else |
3645 | + { |
3646 | + if(is_reverse) |
3647 | + found = true; |
3648 | + } |
3649 | + break; |
3650 | + case unicode::UNICODE_Zl + 50: |
3651 | + if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zs) || |
3652 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zl) || |
3653 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zp)) |
3654 | + { |
3655 | + if(!is_reverse) |
3656 | + found = true; |
3657 | + } |
3658 | + else |
3659 | + { |
3660 | + if(is_reverse) |
3661 | + found = true; |
3662 | + } |
3663 | + break; |
3664 | + case unicode::UNICODE_Sc + 50: |
3665 | + if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Sm) || |
3666 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Sc) || |
3667 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Sk) || |
3668 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_So)) |
3669 | + { |
3670 | + if(!is_reverse) |
3671 | + found = true; |
3672 | + } |
3673 | + else |
3674 | + { |
3675 | + if(is_reverse) |
3676 | + found = true; |
3677 | + } |
3678 | + break; |
3679 | + case unicode::UNICODE_Cc + 50: |
3680 | + if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Cc) || |
3681 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Cf) || |
3682 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Co))//ignore unicode::UNICODE_Cn |
3683 | + { |
3684 | + if(!is_reverse) |
3685 | + found = true; |
3686 | + } |
3687 | + else |
3688 | + { |
3689 | + if(is_reverse) |
3690 | + found = true; |
3691 | + } |
3692 | + break; |
3693 | + default: |
3694 | + if(unicode::check_codepoint_category(utf8c, (unicode::category)multichar_type)) |
3695 | + { |
3696 | + if(!is_reverse) |
3697 | + found = true; |
3698 | + } |
3699 | + else |
3700 | + { |
3701 | + if(is_reverse) |
3702 | + found = true; |
3703 | + } |
3704 | + break; |
3705 | + } |
3706 | + |
3707 | + if(found) |
3708 | + { |
3709 | + *matched_len = temp_source - source; |
3710 | + } |
3711 | + return found; |
3712 | +} |
3713 | + |
3714 | +bool CRegexXQuery_multicharIs::match_internal(const char *source, int *start_from_branch, int *matched_len) |
3715 | +{ |
3716 | + if(!source[0]) |
3717 | + { |
3718 | + regex_intern->set_reachedEnd(source); |
3719 | + return false; |
3720 | + } |
3721 | + bool found = false; |
3722 | + const char *temp_source = source; |
3723 | + unicode::code_point utf8c = utf8::next_char(temp_source); |
3724 | + const unicode::code_point *cp = block_escape[block_index].cp; |
3725 | + if((utf8c >= cp[0]) && (utf8c <= cp[1])) |
3726 | + { |
3727 | + if(!is_reverse) |
3728 | + found = true; |
3729 | + } |
3730 | + else if(block_escape[block_index].ext_cp) |
3731 | + { |
3732 | + cp = block_escape[block_index].ext_cp; |
3733 | + while(*cp) |
3734 | + { |
3735 | + if((utf8c >= cp[0]) && (utf8c <= cp[1])) |
3736 | + break; |
3737 | + cp += 2; |
3738 | + } |
3739 | + if(*cp) |
3740 | + { |
3741 | + if(!is_reverse) |
3742 | + found = true; |
3743 | + } |
3744 | + else |
3745 | + { |
3746 | + if(is_reverse) |
3747 | + found = true; |
3748 | + } |
3749 | + } |
3750 | + else |
3751 | + { |
3752 | + if(is_reverse) |
3753 | + found = true; |
3754 | + } |
3755 | + if(found) |
3756 | + { |
3757 | + *matched_len = temp_source - source; |
3758 | + } |
3759 | + return found; |
3760 | +} |
3761 | + |
3762 | +bool CRegexXQuery_multicharOther::match_internal(const char *source, int *start_from_branch, int *matched_len) |
3763 | +{ |
3764 | + if(!source[0]) |
3765 | + { |
3766 | + regex_intern->set_reachedEnd(source); |
3767 | + return false; |
3768 | + } |
3769 | + bool found = false; |
3770 | + bool value_true = true; |
3771 | + const char *temp_source = source; |
3772 | + unicode::code_point utf8c = utf8::next_char(temp_source); |
3773 | + switch(multichar_type) |
3774 | + { |
3775 | + case 'S':value_true = false;//[^\s] |
3776 | + case 's'://[#x20\t\n\r] |
3777 | + switch(utf8c) |
3778 | + { |
3779 | + case '\t': |
3780 | + case '\r': |
3781 | + case '\n': |
3782 | + case ' ': |
3783 | + found = true; |
3784 | + default: |
3785 | + break; |
3786 | + } |
3787 | + break; |
3788 | + case 'I':value_true = false;//[^\i] |
3789 | + case 'i'://the set of initial name characters, those matched by Letter | '_' | ':' |
3790 | + if((utf8c == '_') || |
3791 | + (utf8c == ':') || |
3792 | + XQCharType::isLetter(utf8c)) |
3793 | + { |
3794 | + found = true; |
3795 | + } |
3796 | + break; |
3797 | + case 'C':value_true = false;//[^\c] |
3798 | + case 'c'://the set of name characters, those matched by NameChar |
3799 | + if(XQCharType::isNameChar(utf8c)) |
3800 | + { |
3801 | + found = true; |
3802 | + } |
3803 | + break; |
3804 | + case 'D':value_true = false;//[^\d] |
3805 | + case 'd': |
3806 | + if(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Nd)) |
3807 | + found = true; |
3808 | + break; |
3809 | + case 'W':value_true = false;//[^\w] |
3810 | + case 'w': |
3811 | + found = !(unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pc) || |
3812 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pd) || |
3813 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Ps) || |
3814 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pe) || |
3815 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pi) || |
3816 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Pf) || |
3817 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Po) || |
3818 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zs) || |
3819 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zl) || |
3820 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Zp) || |
3821 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Cc) || |
3822 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Cf) || |
3823 | + unicode::check_codepoint_category(utf8c, unicode::UNICODE_Co));//ignore unicode::UNICODE_Cn |
3824 | + break; |
3825 | + default: |
3826 | + throw XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS(source, ZED(REGEX_UNIMPLEMENTED)) ); |
3827 | + } |
3828 | + if((found && value_true) || (!found && !value_true)) |
3829 | + { |
3830 | + *matched_len = temp_source - source; |
3831 | + return true; |
3832 | + } |
3833 | + else |
3834 | + { |
3835 | + return false; |
3836 | + } |
3837 | +} |
3838 | + |
3839 | +bool CRegexXQuery_char_ascii::match_internal(const char *source, int *start_from_branch, int *matched_len) |
3840 | +{ |
3841 | + if(!source[0]) |
3842 | + { |
3843 | + regex_intern->set_reachedEnd(source); |
3844 | + return false; |
3845 | + } |
3846 | + if(source[0] == c) |
3847 | + { |
3848 | + *matched_len = 1; |
3849 | + return true; |
3850 | + } |
3851 | + else |
3852 | + return false; |
3853 | +} |
3854 | + |
3855 | +bool CRegexXQuery_char_ascii_i::match_internal(const char *source, int *start_from_branch, int *matched_len) |
3856 | +{ |
3857 | + if(!source[0]) |
3858 | + { |
3859 | + regex_intern->set_reachedEnd(source); |
3860 | + return false; |
3861 | + } |
3862 | + char sup = toupper(source[0]); |
3863 | + if(sup == c) |
3864 | + { |
3865 | + *matched_len = 1; |
3866 | + return true; |
3867 | + } |
3868 | + else |
3869 | + return false; |
3870 | +} |
3871 | + |
3872 | +bool CRegexXQuery_char_range_ascii::match_internal(const char *source, int *start_from_branch, int *matched_len) |
3873 | +{ |
3874 | + if(!source[0]) |
3875 | + { |
3876 | + regex_intern->set_reachedEnd(source); |
3877 | + return false; |
3878 | + } |
3879 | + if((source[0] >= c1) && (source[0] <= c2)) |
3880 | + { |
3881 | + *matched_len = 1; |
3882 | + return true; |
3883 | + } |
3884 | + else |
3885 | + return false; |
3886 | +} |
3887 | + |
3888 | +bool CRegexXQuery_char_range_ascii_i::match_internal(const char *source, int *start_from_branch, int *matched_len) |
3889 | +{ |
3890 | + if(!source[0]) |
3891 | + { |
3892 | + regex_intern->set_reachedEnd(source); |
3893 | + return false; |
3894 | + } |
3895 | + char sup = toupper(source[0]); |
3896 | + if((sup >= c1) && (sup <= c2)) |
3897 | + { |
3898 | + *matched_len = 1; |
3899 | + return true; |
3900 | + } |
3901 | + else |
3902 | + return false; |
3903 | +} |
3904 | + |
3905 | +bool CRegexXQuery_char_unicode::match_internal(const char *source, int *start_from_branch, int *matched_len) |
3906 | +{ |
3907 | + if(!source[0]) |
3908 | + { |
3909 | + regex_intern->set_reachedEnd(source); |
3910 | + return false; |
3911 | + } |
3912 | + if(!memcmp(source, c, len)) |
3913 | + { |
3914 | + *matched_len = len; |
3915 | + return true; |
3916 | + } |
3917 | + else |
3918 | + return false; |
3919 | +} |
3920 | + |
3921 | +bool CRegexXQuery_char_unicode_cp::match_internal(const char *source, int *start_from_branch, int *matched_len) |
3922 | +{ |
3923 | + if(!source[0]) |
3924 | + { |
3925 | + regex_intern->set_reachedEnd(source); |
3926 | + return false; |
3927 | + } |
3928 | + const char *temp_source = source; |
3929 | + unicode::code_point utf8c = utf8::next_char(temp_source); |
3930 | + if(utf8c == c) |
3931 | + { |
3932 | + *matched_len = temp_source - source; |
3933 | + return true; |
3934 | + } |
3935 | + else |
3936 | + return false; |
3937 | +} |
3938 | + |
3939 | +bool CRegexXQuery_char_unicode_i::match_internal(const char *source, int *start_from_branch, int *matched_len) |
3940 | +{ |
3941 | + if(!source[0]) |
3942 | + { |
3943 | + regex_intern->set_reachedEnd(source); |
3944 | + return false; |
3945 | + } |
3946 | + const char *temp_source = source; |
3947 | + unicode::code_point sup = unicode::to_upper(utf8::next_char(temp_source)); |
3948 | + if(sup == c) |
3949 | + { |
3950 | + *matched_len = temp_source - source; |
3951 | + return true; |
3952 | + } |
3953 | + else |
3954 | + return false; |
3955 | +} |
3956 | + |
3957 | +bool CRegexXQuery_char_range_unicode::match_internal(const char *source, int *start_from_branch, int *matched_len) |
3958 | +{ |
3959 | + if(!source[0]) |
3960 | + { |
3961 | + regex_intern->set_reachedEnd(source); |
3962 | + return false; |
3963 | + } |
3964 | + const char *temp_source = source; |
3965 | + unicode::code_point utf8c = utf8::next_char(temp_source); |
3966 | + if((utf8c >= c1) && (utf8c <= c2)) |
3967 | + { |
3968 | + *matched_len = temp_source - source; |
3969 | + return true; |
3970 | + } |
3971 | + else |
3972 | + return false; |
3973 | +} |
3974 | + |
3975 | +bool CRegexXQuery_char_range_unicode_i::match_internal(const char *source, int *start_from_branch, int *matched_len) |
3976 | +{ |
3977 | + if(!source[0]) |
3978 | + { |
3979 | + regex_intern->set_reachedEnd(source); |
3980 | + return false; |
3981 | + } |
3982 | + const char *temp_source = source; |
3983 | + unicode::code_point sup = unicode::to_upper(utf8::next_char(temp_source)); |
3984 | + if((sup >= c1) && (sup <= c2)) |
3985 | + { |
3986 | + *matched_len = temp_source - source; |
3987 | + return true; |
3988 | + } |
3989 | + else |
3990 | + return false; |
3991 | +} |
3992 | + |
3993 | +bool CRegexXQuery_endline::match_internal(const char *source, int *start_from_branch, int *matched_len) |
3994 | +{ |
3995 | + *matched_len = 0; |
3996 | + if(!source[0]) |
3997 | + { |
3998 | + // regex_intern->reachedEnd = true; |
3999 | + return true; |
4000 | + } |
4001 | + if((source[0] == 0x0A) || ((source[0] == 0x0D) && (source[1] == 0x0A))) |
4002 | + { |
4003 | + if(regex_intern->get_flags() & REGEX_ASCII_MULTILINE) |
4004 | + { |
4005 | + // regex_intern->reachedEnd = true; |
4006 | + return true; |
4007 | + } |
4008 | + } |
4009 | + return false; |
4010 | +} |
4011 | + |
4012 | + |
4013 | //match any of chargroups |
4014 | -bool CRegexAscii_chargroup::match(const char *source, int *matched_len) |
4015 | +bool CRegexXQuery_chargroup::match_internal(const char *source, int *start_from_branch, int *matched_len) |
4016 | { |
4017 | *matched_len = 0; |
4018 | - std::list<chargroup_t>::iterator cgt_it; |
4019 | - |
4020 | + std::list<CRegexXQuery_charmatch* >::iterator cgt_it; |
4021 | +/* |
4022 | if(!source[0]) |
4023 | { |
4024 | regex_intern->reachedEnd = true; |
4025 | @@ -975,113 +2416,21 @@ |
4026 | return false; |
4027 | } |
4028 | |
4029 | - if(source[0] == 0x0A) |
4030 | + if((source[0] == 0x0A) || ((source[0] == 0x0D) && (source[1] == 0x0A))) |
4031 | { |
4032 | if((regex_intern->flags & REGEX_ASCII_MULTILINE) && |
4033 | (chargroup_list.size() == 1) && (chargroup_list.begin()->flags == CHARGROUP_FLAGS_ENDLINE)) |
4034 | { |
4035 | - *matched_len = 1; |
4036 | + // *matched_len = 1; |
4037 | return true; |
4038 | } |
4039 | } |
4040 | - |
4041 | +*/ |
4042 | + //bool found = false; |
4043 | for(cgt_it = chargroup_list.begin(); cgt_it != chargroup_list.end(); cgt_it++) |
4044 | { |
4045 | - if(cgt_it->flags == CHARGROUP_FLAGS_MULTICHAR) |
4046 | - { |
4047 | - switch(cgt_it->c1) |
4048 | - { |
4049 | - case 'p'://catEsc |
4050 | - case 'P'://complEsc |
4051 | - //ignore the prop for now |
4052 | - throw XQUERY_EXCEPTION( err::FORX0002 ); |
4053 | - case 's'://[#x20\t\n\r] |
4054 | - switch(source[0]) |
4055 | - { |
4056 | - case '\t': |
4057 | - case '\r': |
4058 | - case '\n': |
4059 | - case ' ': |
4060 | - *matched_len = 1; |
4061 | - return true; |
4062 | - default: |
4063 | - return false; |
4064 | - } |
4065 | - case 'S'://[^\s] |
4066 | - switch(source[0]) |
4067 | - { |
4068 | - case 0: |
4069 | - regex_intern->reachedEnd = true; |
4070 | - case '\t': |
4071 | - case '\r': |
4072 | - case '\n': |
4073 | - case ' ': |
4074 | - return false; |
4075 | - default: |
4076 | - *matched_len = 1; |
4077 | - return true; |
4078 | - } |
4079 | - case 'i'://the set of initial name characters, those matched by Letter | '_' | ':' |
4080 | - if((source[0] == '_') || |
4081 | - (source[0] == ':') || |
4082 | - XQCharType::isLetter(source[0])) |
4083 | - { |
4084 | - *matched_len = 1; |
4085 | - return true; |
4086 | - } |
4087 | - return false; |
4088 | - case 'I': |
4089 | - if((source[0] == '_') || |
4090 | - (source[0] == ':') || |
4091 | - XQCharType::isLetter(source[0])) |
4092 | - { |
4093 | - return false; |
4094 | - } |
4095 | - *matched_len = 1; |
4096 | - return true; |
4097 | - case 'c'://the set of name characters, those matched by NameChar |
4098 | - if(XQCharType::isNameChar(source[0])) |
4099 | - { |
4100 | - *matched_len = 1; |
4101 | - return true; |
4102 | - } |
4103 | - return false; |
4104 | - case 'C': |
4105 | - if(XQCharType::isNameChar(source[0])) |
4106 | - { |
4107 | - return false; |
4108 | - } |
4109 | - *matched_len = 1; |
4110 | - return true; |
4111 | - case 'd': |
4112 | - case 'D': |
4113 | - case 'w': |
4114 | - case 'W': |
4115 | - default: |
4116 | - throw XQUERY_EXCEPTION( err::FORX0002 ); |
4117 | - } |
4118 | - return false; |
4119 | - } |
4120 | - else if(cgt_it->flags == CHARGROUP_FLAGS_ENDLINE) |
4121 | - { |
4122 | - return false; |
4123 | - } |
4124 | - else |
4125 | - { |
4126 | - if(regex_intern->flags & REGEX_ASCII_CASE_INSENSITIVE) |
4127 | - { |
4128 | - char sup = toupper(source[0]); |
4129 | - if((sup >= toupper(cgt_it->c1)) && |
4130 | - (sup <= toupper(cgt_it->c2))) |
4131 | - break; |
4132 | - } |
4133 | - else |
4134 | - { |
4135 | - if((source[0] >= cgt_it->c1) && |
4136 | - (source[0] <= cgt_it->c2)) |
4137 | - break; |
4138 | - } |
4139 | - } |
4140 | + if((*cgt_it)->match_internal(source, start_from_branch, matched_len)) |
4141 | + break; |
4142 | } |
4143 | if(cgt_it == chargroup_list.end()) |
4144 | return false; |
4145 | @@ -1089,53 +2438,48 @@ |
4146 | if(classsub) |
4147 | { |
4148 | int classsub_len; |
4149 | - if(classsub->match(source, &classsub_len)) |
4150 | + if(classsub->match_internal(source, NULL, &classsub_len)) |
4151 | return false; |
4152 | } |
4153 | |
4154 | - *matched_len = 1; |
4155 | + //*matched_len = 1; |
4156 | return true; |
4157 | } |
4158 | |
4159 | -bool CRegexAscii_negchargroup::match(const char *source, int *matched_len) |
4160 | +bool CRegexXQuery_negchargroup::match_internal(const char *source, int *start_from_branch, int *matched_len) |
4161 | { |
4162 | if(!source[0]) |
4163 | { |
4164 | - regex_intern->reachedEnd = true; |
4165 | + regex_intern->set_reachedEnd(source); |
4166 | return false; |
4167 | } |
4168 | - if(!CRegexAscii_chargroup::match(source, matched_len)) |
4169 | + if(!CRegexXQuery_chargroup::match_internal(source, start_from_branch, matched_len)) |
4170 | { |
4171 | - *matched_len = 1; |
4172 | + *matched_len = myutf8len(source); |
4173 | return true; |
4174 | } |
4175 | return false; |
4176 | } |
4177 | |
4178 | -bool CRegexAscii_wildchar::match(const char *source, int *matched_len) |
4179 | +bool CRegexXQuery_wildchar::match_internal(const char *source, int *start_from_branch, int *matched_len) |
4180 | { |
4181 | *matched_len = 0; |
4182 | - if(source[0]) |
4183 | - { |
4184 | - if((regex_intern->flags & REGEX_ASCII_DOTALL) || |
4185 | - (source[0] != '\n') && (source[0] != '\r')) |
4186 | - { |
4187 | - *matched_len = 1; |
4188 | - return true; |
4189 | - } |
4190 | - else |
4191 | - return false; |
4192 | + if(!source[0]) |
4193 | + { |
4194 | + regex_intern->set_reachedEnd(source); |
4195 | + return false; |
4196 | + } |
4197 | + if((regex_intern->flags & REGEX_ASCII_DOTALL) || |
4198 | + ((source[0] != '\n') && (source[0] != '\r'))) |
4199 | + { |
4200 | + *matched_len = myutf8len(source); |
4201 | + return true; |
4202 | } |
4203 | else |
4204 | - { |
4205 | - if(!source[0]) |
4206 | - regex_intern->reachedEnd = true; |
4207 | - *matched_len = 0; |
4208 | return false; |
4209 | - } |
4210 | } |
4211 | |
4212 | -bool CRegexAscii_backref::match(const char *source, int *matched_len) |
4213 | +bool CRegexXQuery_backref::match_internal(const char *source, int *start_from_branch, int *matched_len) |
4214 | { |
4215 | const char *submatch = regex_intern->subregex.at(backref-1)->matched_source; |
4216 | if(!submatch) |
4217 | @@ -1143,15 +2487,42 @@ |
4218 | *matched_len = 0; |
4219 | return true; |
4220 | } |
4221 | + if(!source[0]) |
4222 | + { |
4223 | + regex_intern->set_reachedEnd(source); |
4224 | + return false; |
4225 | + } |
4226 | *matched_len = regex_intern->subregex.at(backref-1)->matched_len; |
4227 | - if(!strncmp(source, submatch, *matched_len)) |
4228 | - { |
4229 | - return true; |
4230 | - } |
4231 | - *matched_len = 0; |
4232 | - return false; |
4233 | -} |
4234 | - |
4235 | - }//end namespace regex_ascii |
4236 | + if(regex_intern->flags & REGEX_ASCII_CASE_INSENSITIVE) |
4237 | + { |
4238 | + if(compare_unicode_ni(source, submatch, *matched_len)) |
4239 | + { |
4240 | + return true; |
4241 | + } |
4242 | + } |
4243 | + else |
4244 | + { |
4245 | + if(!memcmp(source, submatch, *matched_len)) |
4246 | + { |
4247 | + return true; |
4248 | + } |
4249 | + } |
4250 | + *matched_len = 0; |
4251 | + return false; |
4252 | +} |
4253 | + |
4254 | +bool CRegexXQuery_pinstart::match_internal(const char *source, int *start_from_branch, int *matched_len) |
4255 | +{ |
4256 | + *matched_len = 0; |
4257 | + if(source == regex_intern->source_start) |
4258 | + return true; |
4259 | + if((regex_intern->flags & REGEX_ASCII_MULTILINE) && |
4260 | + ((source[-1] == '\n') || (source[-1] == '\r'))) |
4261 | + return true; |
4262 | + |
4263 | + return false; |
4264 | +} |
4265 | + |
4266 | + }//end namespace regex_xquery |
4267 | }//end namespace zorba |
4268 | /* vim:set et sw=2 ts=2: */ |
4269 | |
4270 | === renamed file 'src/util/regex_ascii.h' => 'src/util/regex_xquery.h' |
4271 | --- src/util/regex_ascii.h 2011-07-18 14:25:21 +0000 |
4272 | +++ src/util/regex_xquery.h 2012-01-18 18:33:36 +0000 |
4273 | @@ -21,103 +21,142 @@ |
4274 | #include <vector> |
4275 | |
4276 | #include <zorba/config.h> |
4277 | +#include <zorba/internal/unique_ptr.h> |
4278 | +#include "util/unicode_util.h" |
4279 | |
4280 | namespace zorba { |
4281 | - namespace regex_ascii{ |
4282 | + namespace regex_xquery{ |
4283 | |
4284 | //matching flags |
4285 | -#define REGEX_ASCII_CASE_INSENSITIVE 1 |
4286 | -#define REGEX_ASCII_DOTALL 2 |
4287 | -#define REGEX_ASCII_MULTILINE 4 |
4288 | -#define REGEX_ASCII_COMMENTS 8 |
4289 | -#define REGEX_ASCII_LITERAL 16 |
4290 | - |
4291 | -class CRegexAscii_regex; |
4292 | - |
4293 | -class IRegexMatcher |
4294 | +#define REGEX_ASCII_CASE_INSENSITIVE 1 //i |
4295 | +#define REGEX_ASCII_DOTALL 2 //s |
4296 | +#define REGEX_ASCII_MULTILINE 4 //m |
4297 | +#define REGEX_ASCII_NO_WHITESPACE 8 //x |
4298 | +#define REGEX_ASCII_LITERAL 16 //q |
4299 | + |
4300 | +#define REGEX_ASCII_GET_LONGEST_BRANCH 32 //try all branches and get the longest match (or shortest for reluctant pieces) |
4301 | +#define REGEX_ASCII_MINIMAL_MATCH 64 //consider all pieces as reluctant |
4302 | +#define REGEX_ASCII_WHOLE_MATCH 128 //match only all string, like having "^regex$" |
4303 | +#define REGEX_ASCII_GROUPING_LEN_WHOLE_PIECE 256 //compute the len of a grouping as for the whole piece ( for example (a)+ when matching "aa" and referred as $1 will get string len 2 instead of last 1) |
4304 | + |
4305 | +class CRegexXQuery_regex; |
4306 | +class CRegexXQuery_piece; |
4307 | + |
4308 | +struct RegexAscii_pieceinfo |
4309 | { |
4310 | -public: |
4311 | - CRegexAscii_regex *regex_intern; |
4312 | -public: |
4313 | - IRegexMatcher(CRegexAscii_regex* regex) : regex_intern(regex) {} |
4314 | - virtual ~IRegexMatcher() {} |
4315 | + union |
4316 | + { |
4317 | + CRegexXQuery_piece* piece; |
4318 | + CRegexXQuery_regex* group_regex; |
4319 | + }; |
4320 | + int nr_matches; |
4321 | |
4322 | - virtual bool match(const char *source, int *matched_len) = 0; |
4323 | + RegexAscii_pieceinfo(CRegexXQuery_piece* piece) {nr_matches=0;this->piece=piece;} |
4324 | + RegexAscii_pieceinfo(CRegexXQuery_regex* group_regex) {nr_matches=-1;this->group_regex=group_regex;} |
4325 | }; |
4326 | |
4327 | -class IRegexAtom : public IRegexMatcher |
4328 | + |
4329 | +class IRegexAtom |
4330 | { |
4331 | +protected: |
4332 | + friend class CRegexXQuery_piece; |
4333 | + CRegexXQuery_regex *regex_intern; |
4334 | public: |
4335 | - IRegexAtom(CRegexAscii_regex* regex) : IRegexMatcher(regex) {} |
4336 | + IRegexAtom(CRegexXQuery_regex* regex) : regex_intern(regex) {} |
4337 | virtual ~IRegexAtom() {} |
4338 | + |
4339 | + virtual bool match(const char *source, int *start_from_branch, int *matched_len, |
4340 | + std::list<RegexAscii_pieceinfo>::iterator next_piece, |
4341 | + std::list<RegexAscii_pieceinfo>::iterator end_piece); |
4342 | + virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len) = 0; |
4343 | + virtual void reset_match() {} |
4344 | +// virtual void restore_match() {} |
4345 | }; |
4346 | |
4347 | -class CRegexAscii_branch; |
4348 | -class CRegexAscii_piece; |
4349 | -class CRegexAscii_chargroup; |
4350 | -class CRegexAscii_parser; |
4351 | +class CRegexXQuery_branch; |
4352 | +class CRegexXQuery_piece; |
4353 | +class CRegexXQuery_chargroup; |
4354 | +class CRegexXQuery_parser; |
4355 | |
4356 | -class CRegexAscii_regex : public IRegexAtom |
4357 | +class CRegexXQuery_regex : public IRegexAtom |
4358 | { |
4359 | - friend class CRegexAscii_parser; |
4360 | - friend class CRegexAscii_branch; |
4361 | - friend class CRegexAscii_piece; |
4362 | - friend class CRegexAscii_chargroup; |
4363 | - friend class CRegexAscii_negchargroup; |
4364 | - friend class CRegexAscii_wildchar; |
4365 | - friend class CRegexAscii_backref; |
4366 | + friend class CRegexXQuery_parser; |
4367 | + friend class CRegexXQuery_branch; |
4368 | + friend class CRegexXQuery_piece; |
4369 | + friend class CRegexXQuery_chargroup; |
4370 | + friend class CRegexXQuery_negchargroup; |
4371 | + friend class CRegexXQuery_wildchar; |
4372 | + friend class CRegexXQuery_backref; |
4373 | + friend class CRegexXQuery_pinstart; |
4374 | public: |
4375 | - CRegexAscii_regex(CRegexAscii_regex *); |
4376 | - virtual ~CRegexAscii_regex(); |
4377 | + CRegexXQuery_regex(CRegexXQuery_regex *); |
4378 | + virtual ~CRegexXQuery_regex(); |
4379 | |
4380 | bool match_anywhere(const char *source, unsigned int flags, int *match_pos, int *matched_len); |
4381 | bool match_from(const char *source, unsigned int flags, int *match_pos, int *matched_len); |
4382 | - virtual bool match(const char *source, int *matched_len); |
4383 | |
4384 | //for replace $1, $2 ... |
4385 | bool get_indexed_match(int index, const char **matched_source, int *matched_len); |
4386 | unsigned int get_indexed_regex_count(); |
4387 | |
4388 | bool get_reachedEnd() {return reachedEnd;} |
4389 | - bool set_align_begin(bool align_begin); |
4390 | + void set_reachedEnd(const char *source) {if(source > source_start) reachedEnd = true;} |
4391 | + unsigned int get_flags() {return flags;} |
4392 | +public: |
4393 | + virtual bool match(const char *source, int *start_from_branch, int *matched_len, |
4394 | + std::list<RegexAscii_pieceinfo>::iterator next_piece, |
4395 | + std::list<RegexAscii_pieceinfo>::iterator end_piece); |
4396 | + virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len) {return false;}//not impl |
4397 | + virtual void reset_match(); |
4398 | +// virtual void restore_match(); |
4399 | private: |
4400 | - void add_branch(CRegexAscii_branch *branch); |
4401 | + void add_branch(CRegexXQuery_branch *branch); |
4402 | + |
4403 | + void save_subregex_list(std::vector<std::pair<const char*, int> > &saved_subregex); |
4404 | + void load_subregex_list(std::vector<std::pair<const char*, int> > &saved_subregex); |
4405 | private: |
4406 | unsigned int flags; |
4407 | - std::list<CRegexAscii_branch*> branch_list; |
4408 | - bool align_begin; |
4409 | + std::list<CRegexXQuery_branch*> branch_list; |
4410 | + |
4411 | + const char *source_start; |
4412 | |
4413 | const char *matched_source; |
4414 | int matched_len; |
4415 | - std::vector<CRegexAscii_regex*> subregex;//for grouping |
4416 | +// const unicode::code_point *backup_matched_source; |
4417 | +// int backup_matched_len; |
4418 | + std::vector<CRegexXQuery_regex*> subregex;//for grouping |
4419 | |
4420 | bool reachedEnd; |
4421 | }; |
4422 | |
4423 | -class CRegexAscii_branch : public IRegexMatcher |
4424 | +class CRegexXQuery_branch |
4425 | { |
4426 | - friend class CRegexAscii_parser; |
4427 | + friend class CRegexXQuery_parser; |
4428 | public: |
4429 | - CRegexAscii_branch(CRegexAscii_regex* regex); |
4430 | - ~CRegexAscii_branch(); |
4431 | + CRegexXQuery_branch(CRegexXQuery_regex* regex); |
4432 | + ~CRegexXQuery_branch(); |
4433 | |
4434 | - virtual bool match(const char *source, int *matched_len); |
4435 | -private: |
4436 | - std::list<CRegexAscii_piece*> piece_list; |
4437 | -private: |
4438 | - void add_piece(CRegexAscii_piece *piece); |
4439 | + bool match(const char *source, int *matched_len, |
4440 | + CRegexXQuery_regex* group_regex, |
4441 | + std::list<RegexAscii_pieceinfo>::iterator next_piece, |
4442 | + std::list<RegexAscii_pieceinfo>::iterator end_piece); |
4443 | + void reset(); |
4444 | +// void restore(); |
4445 | +private: |
4446 | + std::list<RegexAscii_pieceinfo> piece_list; |
4447 | +private: |
4448 | + void add_piece(CRegexXQuery_piece *piece); |
4449 | |
4450 | - bool match_piece_iter_reluctant(std::list<CRegexAscii_piece*>::iterator piece_it, |
4451 | - const char *source, int *matched_len); |
4452 | - bool match_piece_iter_normal(std::list<CRegexAscii_piece*>::iterator piece_it, |
4453 | - const char *source, int *matched_len); |
4454 | }; |
4455 | |
4456 | -class CRegexAscii_piece //: public IRegexMatcher |
4457 | +class CRegexXQuery_piece //: public IRegexMatcher |
4458 | { |
4459 | - friend class CRegexAscii_parser; |
4460 | -public: |
4461 | + friend class CRegexXQuery_parser; |
4462 | + friend class CRegexXQuery_branch; |
4463 | + |
4464 | IRegexAtom *atom; |
4465 | + CRegexXQuery_regex *regex_atom; |
4466 | + |
4467 | //quantifier |
4468 | bool strict_max; |
4469 | int min; |
4470 | @@ -125,8 +164,8 @@ |
4471 | bool is_reluctant; |
4472 | |
4473 | public: |
4474 | - CRegexAscii_piece(); |
4475 | - ~CRegexAscii_piece(); |
4476 | + CRegexXQuery_piece(); |
4477 | + ~CRegexXQuery_piece(); |
4478 | public: |
4479 | void set_atom(IRegexAtom *atom); |
4480 | void set_quantifier_min_max(int min, int max, bool strict_max); |
4481 | @@ -134,95 +173,294 @@ |
4482 | void get_quantifier(int *min, int *max, bool *strict_max); |
4483 | bool get_is_reluctant(); |
4484 | // bool match(const char *source, int *matched_len); |
4485 | + bool match_piece(std::list<RegexAscii_pieceinfo>::iterator next_piece, |
4486 | + std::list<RegexAscii_pieceinfo>::iterator end_piece, |
4487 | + const char *source, int *matched_len); |
4488 | +protected: |
4489 | bool match_piece_times(const char *source, |
4490 | int *piecelen, |
4491 | int times, |
4492 | - std::vector<int> *match_lens); |
4493 | -}; |
4494 | - |
4495 | -#define CHARGROUP_FLAGS_MULTICHAR 1 |
4496 | -#define CHARGROUP_FLAGS_ENDLINE 2 |
4497 | - |
4498 | -class CRegexAscii_chargroup : public IRegexAtom |
4499 | -{ |
4500 | - friend class CRegexAscii_parser; |
4501 | -public: |
4502 | - CRegexAscii_chargroup(CRegexAscii_regex* regex); |
4503 | - virtual ~CRegexAscii_chargroup(); |
4504 | + std::vector<std::pair<int,int> > *match_lens); |
4505 | + int choose_another_branch(std::vector<std::pair<int,int> > &match_lens); |
4506 | + bool match_piece_iter_reluctant(std::list<RegexAscii_pieceinfo>::iterator next_piece, |
4507 | + std::list<RegexAscii_pieceinfo>::iterator end_piece, |
4508 | + const char *source, int *matched_len); |
4509 | + bool match_piece_iter_normal(std::list<RegexAscii_pieceinfo>::iterator next_piece, |
4510 | + std::list<RegexAscii_pieceinfo>::iterator end_piece, |
4511 | + const char *source, int *matched_len); |
4512 | + bool is_regex_atom(); |
4513 | +}; |
4514 | + |
4515 | + |
4516 | +enum CHARGROUP_t |
4517 | +{ |
4518 | +CHARGROUP_NO_MULTICHAR = 0, |
4519 | +//CHARGROUP_FLAGS_CHAR_RANGE, |
4520 | +CHARGROUP_FLAGS_MULTICHAR_p, |
4521 | +CHARGROUP_FLAGS_MULTICHAR_Is, |
4522 | +CHARGROUP_FLAGS_MULTICHAR_OTHER, |
4523 | +CHARGROUP_FLAGS_ONECHAR_ASCII, |
4524 | +CHARGROUP_FLAGS_ONECHAR_UNICODE |
4525 | +//CHARGROUP_FLAGS_ENDLINE |
4526 | +}; |
4527 | + |
4528 | + |
4529 | +class CRegexXQuery_charmatch : public IRegexAtom |
4530 | +{ |
4531 | + friend class CRegexXQuery_parser; |
4532 | +protected: |
4533 | + //enum CHARGROUP_t type; |
4534 | +public: |
4535 | + CRegexXQuery_charmatch(CRegexXQuery_regex* regex);//, enum CHARGROUP_t type); |
4536 | + virtual ~CRegexXQuery_charmatch() {} |
4537 | + virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len) = 0; |
4538 | + virtual unicode::code_point get_c() {return 0;} |
4539 | +}; |
4540 | + |
4541 | +class CRegexXQuery_multicharP : public CRegexXQuery_charmatch |
4542 | +{ |
4543 | + char multichar_type; |
4544 | + bool is_reverse; |
4545 | +public: |
4546 | + CRegexXQuery_multicharP(CRegexXQuery_regex* regex, char type, bool is_reverse); |
4547 | + virtual ~CRegexXQuery_multicharP() {} |
4548 | + |
4549 | + virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len); |
4550 | +}; |
4551 | + |
4552 | +class CRegexXQuery_multicharIs : public CRegexXQuery_charmatch |
4553 | +{ |
4554 | + int block_index; |
4555 | + bool is_reverse; |
4556 | +public: |
4557 | + CRegexXQuery_multicharIs(CRegexXQuery_regex* regex, int block_index, bool is_reverse); |
4558 | + virtual ~CRegexXQuery_multicharIs() {} |
4559 | + |
4560 | + virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len); |
4561 | +}; |
4562 | + |
4563 | +class CRegexXQuery_multicharOther : public CRegexXQuery_charmatch |
4564 | +{ |
4565 | + char multichar_type; |
4566 | +public: |
4567 | + CRegexXQuery_multicharOther(CRegexXQuery_regex* regex, char type); |
4568 | + virtual ~CRegexXQuery_multicharOther() {} |
4569 | + |
4570 | + virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len); |
4571 | +}; |
4572 | + |
4573 | +class CRegexXQuery_char_ascii : public CRegexXQuery_charmatch |
4574 | +{ |
4575 | + friend class CRegexXQuery_parser; |
4576 | +protected: |
4577 | + char c; |
4578 | +public: |
4579 | + CRegexXQuery_char_ascii(CRegexXQuery_regex* regex, char c); |
4580 | + virtual ~CRegexXQuery_char_ascii() {} |
4581 | + |
4582 | + virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len); |
4583 | + virtual unicode::code_point get_c() {return c;} |
4584 | +}; |
4585 | + |
4586 | +class CRegexXQuery_char_ascii_i : public CRegexXQuery_char_ascii |
4587 | +{ |
4588 | +public: |
4589 | + CRegexXQuery_char_ascii_i(CRegexXQuery_regex* regex, char c); |
4590 | + virtual ~CRegexXQuery_char_ascii_i() {} |
4591 | + |
4592 | + virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len); |
4593 | + virtual unicode::code_point get_c() {return c;} |
4594 | +}; |
4595 | + |
4596 | +class CRegexXQuery_char_range_ascii : public CRegexXQuery_charmatch |
4597 | +{ |
4598 | +protected: |
4599 | + char c1; |
4600 | + char c2; |
4601 | +public: |
4602 | + CRegexXQuery_char_range_ascii(CRegexXQuery_regex* regex, char c1, char c2); |
4603 | + virtual ~CRegexXQuery_char_range_ascii() {} |
4604 | + |
4605 | + virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len); |
4606 | +}; |
4607 | + |
4608 | +class CRegexXQuery_char_range_ascii_i : public CRegexXQuery_char_range_ascii |
4609 | +{ |
4610 | +public: |
4611 | + CRegexXQuery_char_range_ascii_i(CRegexXQuery_regex* regex, char c1, char c2); |
4612 | + virtual ~CRegexXQuery_char_range_ascii_i() {} |
4613 | + |
4614 | + virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len); |
4615 | +}; |
4616 | + |
4617 | +class CRegexXQuery_char_unicode : public CRegexXQuery_charmatch |
4618 | +{ |
4619 | + unsigned char c[6]; |
4620 | + int len; |
4621 | +public: |
4622 | + CRegexXQuery_char_unicode(CRegexXQuery_regex* regex, const char *c, int len); |
4623 | + virtual ~CRegexXQuery_char_unicode() {} |
4624 | + |
4625 | + virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len); |
4626 | + virtual unicode::code_point get_c(); |
4627 | +}; |
4628 | + |
4629 | +class CRegexXQuery_char_unicode_cp : public CRegexXQuery_charmatch |
4630 | +{ |
4631 | +protected: |
4632 | + unicode::code_point c; |
4633 | +public: |
4634 | + CRegexXQuery_char_unicode_cp(CRegexXQuery_regex* regex, unicode::code_point c); |
4635 | + virtual ~CRegexXQuery_char_unicode_cp() {} |
4636 | + |
4637 | + virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len); |
4638 | + virtual unicode::code_point get_c() {return c;} |
4639 | +}; |
4640 | + |
4641 | +class CRegexXQuery_char_unicode_i : public CRegexXQuery_char_unicode_cp |
4642 | +{ |
4643 | +public: |
4644 | + CRegexXQuery_char_unicode_i(CRegexXQuery_regex* regex, unicode::code_point c); |
4645 | + virtual ~CRegexXQuery_char_unicode_i() {} |
4646 | + |
4647 | + virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len); |
4648 | + virtual unicode::code_point get_c() {return c;} |
4649 | +}; |
4650 | + |
4651 | +class CRegexXQuery_char_range_unicode : public CRegexXQuery_charmatch |
4652 | +{ |
4653 | +protected: |
4654 | + unicode::code_point c1; |
4655 | + unicode::code_point c2; |
4656 | +public: |
4657 | + CRegexXQuery_char_range_unicode(CRegexXQuery_regex* regex, unicode::code_point c1, unicode::code_point c2); |
4658 | + virtual ~CRegexXQuery_char_range_unicode() {} |
4659 | + |
4660 | + virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len); |
4661 | +}; |
4662 | + |
4663 | +class CRegexXQuery_char_range_unicode_i : public CRegexXQuery_char_range_unicode |
4664 | +{ |
4665 | +public: |
4666 | + CRegexXQuery_char_range_unicode_i(CRegexXQuery_regex* regex, unicode::code_point c1, unicode::code_point c2); |
4667 | + virtual ~CRegexXQuery_char_range_unicode_i() {} |
4668 | + |
4669 | + virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len); |
4670 | +}; |
4671 | + |
4672 | +class CRegexXQuery_endline : public CRegexXQuery_charmatch |
4673 | +{ |
4674 | +public: |
4675 | + CRegexXQuery_endline(CRegexXQuery_regex* regex); |
4676 | + virtual ~CRegexXQuery_endline() {} |
4677 | + |
4678 | + virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len); |
4679 | +}; |
4680 | + |
4681 | + |
4682 | +class CRegexXQuery_chargroup : public IRegexAtom |
4683 | +{ |
4684 | + friend class CRegexXQuery_parser; |
4685 | +public: |
4686 | + CRegexXQuery_chargroup(CRegexXQuery_regex* regex); |
4687 | + virtual ~CRegexXQuery_chargroup(); |
4688 | private: |
4689 | - typedef struct |
4690 | +/* typedef struct |
4691 | { |
4692 | - unsigned char flags; |
4693 | + CHARGROUP_t flags; |
4694 | char c1; |
4695 | char c2; |
4696 | }chargroup_t; |
4697 | - std::list<chargroup_t> chargroup_list; |
4698 | - CRegexAscii_chargroup *classsub; |
4699 | -public: |
4700 | - void addMultiChar(char c); |
4701 | - void addEndLine(); |
4702 | - void addCharRange(char c1, char c2); |
4703 | - void addClassSub(CRegexAscii_chargroup* classsub); |
4704 | - |
4705 | - virtual bool match(const char *source, int *matched_len); |
4706 | -}; |
4707 | - |
4708 | -class CRegexAscii_negchargroup : public CRegexAscii_chargroup |
4709 | -{ |
4710 | -public: |
4711 | - CRegexAscii_negchargroup(CRegexAscii_regex* regex); |
4712 | - virtual ~CRegexAscii_negchargroup(); |
4713 | - |
4714 | - virtual bool match(const char *source, int *matched_len); |
4715 | -}; |
4716 | - |
4717 | -class CRegexAscii_wildchar : public IRegexAtom |
4718 | -{ |
4719 | -public: |
4720 | - CRegexAscii_wildchar(CRegexAscii_regex* regex); |
4721 | - virtual ~CRegexAscii_wildchar(); |
4722 | - |
4723 | - virtual bool match(const char *source, int *matched_len); |
4724 | -}; |
4725 | - |
4726 | -class CRegexAscii_backref : public IRegexAtom |
4727 | -{ |
4728 | -public: |
4729 | - CRegexAscii_backref(CRegexAscii_regex* regex, unsigned int backref); |
4730 | - virtual ~CRegexAscii_backref(); |
4731 | - |
4732 | - virtual bool match(const char *source, int *matched_len); |
4733 | +*/ |
4734 | + std::list<CRegexXQuery_charmatch* > chargroup_list; |
4735 | + CRegexXQuery_chargroup *classsub; |
4736 | +public: |
4737 | + //void addMultiChar(char c, CHARGROUP_t multichar_type); |
4738 | + //void addEndLine(); |
4739 | + //void addCharRange(char c1, char c2); |
4740 | + //void addOneChar(char c); |
4741 | + void addCharMatch(CRegexXQuery_charmatch *charmatch); |
4742 | + void addClassSub(CRegexXQuery_chargroup* classsub); |
4743 | + |
4744 | + virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len); |
4745 | +}; |
4746 | + |
4747 | +class CRegexXQuery_negchargroup : public CRegexXQuery_chargroup |
4748 | +{ |
4749 | +public: |
4750 | + CRegexXQuery_negchargroup(CRegexXQuery_regex* regex); |
4751 | + virtual ~CRegexXQuery_negchargroup(); |
4752 | + |
4753 | + virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len); |
4754 | +}; |
4755 | + |
4756 | +class CRegexXQuery_wildchar : public IRegexAtom |
4757 | +{ |
4758 | +public: |
4759 | + CRegexXQuery_wildchar(CRegexXQuery_regex* regex); |
4760 | + virtual ~CRegexXQuery_wildchar(); |
4761 | + |
4762 | + virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len); |
4763 | +}; |
4764 | + |
4765 | +class CRegexXQuery_backref : public IRegexAtom |
4766 | +{ |
4767 | +public: |
4768 | + CRegexXQuery_backref(CRegexXQuery_regex* regex, unsigned int backref); |
4769 | + virtual ~CRegexXQuery_backref(); |
4770 | + |
4771 | + virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len); |
4772 | private: |
4773 | unsigned int backref; |
4774 | }; |
4775 | |
4776 | -class CRegexAscii_parser |
4777 | -{ |
4778 | -public: |
4779 | - CRegexAscii_parser(); |
4780 | - ~CRegexAscii_parser(); |
4781 | - |
4782 | -public: |
4783 | - CRegexAscii_regex* parse(const char *pattern, unsigned int flags); |
4784 | +class CRegexXQuery_pinstart : public IRegexAtom |
4785 | +{ |
4786 | +public: |
4787 | + CRegexXQuery_pinstart(CRegexXQuery_regex* regex); |
4788 | + |
4789 | + virtual bool match_internal(const char *source, int *start_from_branch, int *matched_len); |
4790 | +}; |
4791 | + |
4792 | +class CRegexXQuery_parser |
4793 | +{ |
4794 | +public: |
4795 | + typedef struct |
4796 | + { |
4797 | + const unicode::code_point cp[2];//in pairs start, end |
4798 | + const unicode::code_point *ext_cp; |
4799 | + const char *group_name; |
4800 | + }block_escape_t; |
4801 | + |
4802 | + CRegexXQuery_parser(); |
4803 | + ~CRegexXQuery_parser(); |
4804 | + |
4805 | +public: |
4806 | + CRegexXQuery_regex* parse(const char *pattern, unsigned int flags); |
4807 | |
4808 | protected: |
4809 | - CRegexAscii_regex* parse_regexp(const char *pattern, int *regex_len); |
4810 | - CRegexAscii_branch* parse_branch(const char *pattern, int *branch_len); |
4811 | - CRegexAscii_piece* parse_piece(const char *pattern, int *piece_len); |
4812 | + CRegexXQuery_regex* parse_regexp(const char *pattern, int *regex_len); |
4813 | + CRegexXQuery_branch* parse_branch(const char *pattern, int *branch_len); |
4814 | + CRegexXQuery_piece* parse_piece(const char *pattern, int *piece_len); |
4815 | char myishex(char c); |
4816 | bool myisdigit(char c); |
4817 | - char readChar(const char *pattern, int *char_len, bool *is_multichar); |
4818 | + bool myisletterAZ(char c); |
4819 | + CRegexXQuery_charmatch* readChar(const char *pattern, int *char_len, CHARGROUP_t *multichar_type); |
4820 | + CRegexXQuery_charmatch *create_charmatch(unicode::code_point utf8c, |
4821 | + const char *pattern, int utf8len, |
4822 | + enum CHARGROUP_t *multichar_type); |
4823 | IRegexAtom* read_atom(const char *pattern, int *atom_len); |
4824 | - CRegexAscii_chargroup* readchargroup(const char *pattern, int *chargroup_len); |
4825 | - void read_quantifier(CRegexAscii_piece *piece, const char *pattern, int *quantif_len); |
4826 | + CRegexXQuery_chargroup* readchargroup(const char *pattern, int *chargroup_len); |
4827 | + void read_quantifier(CRegexXQuery_piece *piece, const char *pattern, int *quantif_len); |
4828 | |
4829 | private: |
4830 | - CRegexAscii_regex *current_regex; |
4831 | + CRegexXQuery_regex *current_regex; |
4832 | int regex_depth; |
4833 | unsigned int flags; |
4834 | }; |
4835 | |
4836 | -}}//end namespace zorba::regex_ascii |
4837 | +} |
4838 | +}//end namespace zorba::regex_xquery |
4839 | |
4840 | #endif |
4841 | /* vim:set et sw=2 ts=2: */ |
4842 | |
4843 | === modified file 'src/util/unicode_categories.cpp' |
4844 | --- src/util/unicode_categories.cpp 2011-06-14 17:26:33 +0000 |
4845 | +++ src/util/unicode_categories.cpp 2012-01-18 18:33:36 +0000 |
4846 | @@ -65812,7 +65812,7 @@ |
4847 | { 0x100000, 0x100000, UNICODE_Co}, |
4848 | }; |
4849 | |
4850 | -bool check_codepoint_category(code_point cp, UnicodeCategoriesEnum categ) |
4851 | +bool check_codepoint_category(code_point cp, category categ) |
4852 | { |
4853 | if(cp < 0x10000) |
4854 | return codepoints_categories[cp] == categ; |
4855 | @@ -65824,10 +65824,10 @@ |
4856 | if(cp >= codepoints_categories2[i].cp1) |
4857 | return codepoints_categories2[i].category == categ; |
4858 | else |
4859 | - return false; |
4860 | + return categ ? false : true; |
4861 | } |
4862 | } |
4863 | - return false; |
4864 | + return categ ? false : true; |
4865 | } |
4866 | |
4867 | /* |
4868 | |
4869 | === modified file 'src/util/unicode_categories.h' |
4870 | --- src/util/unicode_categories.h 2011-06-14 17:26:33 +0000 |
4871 | +++ src/util/unicode_categories.h 2012-01-18 18:33:36 +0000 |
4872 | @@ -22,46 +22,53 @@ |
4873 | namespace zorba { |
4874 | namespace unicode { |
4875 | |
4876 | -//Unicode codepoint categories, as from http://www.fileformat.info/info/unicode/category/index.htm |
4877 | +/////////////////////////////////////////////////////////////////////////////// |
4878 | |
4879 | -enum UnicodeCategoriesEnum { |
4880 | -UNICODE_Cc, //Other, Control |
4881 | -UNICODE_Cf, //Other, Format |
4882 | -UNICODE_Co, //Other, Private Use |
4883 | -UNICODE_Cs, //Other, Surrogate |
4884 | -UNICODE_Ll, //Letter, Lowercase |
4885 | -UNICODE_Lm, //Letter, Modifier |
4886 | -UNICODE_Lo, //Letter, Other |
4887 | -UNICODE_Lt, //Letter, Titlecase |
4888 | -UNICODE_Lu, //Letter, Uppercase |
4889 | -UNICODE_Mc, //Mark, Spacing Combining |
4890 | -UNICODE_Me, //Mark, Enclosing |
4891 | -UNICODE_Mn, //Mark, Nonspacing |
4892 | -UNICODE_Nd, //Number, Decimal Digit |
4893 | -UNICODE_Nl, //Number, Letter |
4894 | -UNICODE_No, //Number, Other |
4895 | -UNICODE_Pc, //Punctuation, Connector |
4896 | -UNICODE_Pd, //Punctuation, Dash |
4897 | -UNICODE_Pe, //Punctuation, Close |
4898 | -UNICODE_Pf, //Punctuation, Final quote (may behave like Ps or Pe depending on usage) |
4899 | -UNICODE_Pi, //Punctuation, Initial quote (may behave like Ps or Pe depending on usage) |
4900 | -UNICODE_Po, //Punctuation, Other |
4901 | -UNICODE_Ps, //Punctuation, Open |
4902 | -UNICODE_Sc, //Symbol, Currency |
4903 | -UNICODE_Sk, //Symbol, Modifier |
4904 | -UNICODE_Sm, //Symbol, Math |
4905 | -UNICODE_So, //Symbol, Other |
4906 | -UNICODE_Zl, //Separator, Line |
4907 | -UNICODE_Zp, //Separator, Paragraph |
4908 | -UNICODE_Zs //Separator, Space |
4909 | +/** |
4910 | + * Unicode codepoint categories. |
4911 | + * See: http://www.fileformat.info/info/unicode/category/ |
4912 | + */ |
4913 | +enum category { |
4914 | + UNICODE_Cn, // Not Assigned |
4915 | + UNICODE_Cc, // Other, Control |
4916 | + UNICODE_Cf, // Other, Format |
4917 | + UNICODE_Co, // Other, Private Use |
4918 | + UNICODE_Cs, // Other, Surrogate |
4919 | + UNICODE_Ll, // Letter, Lowercase |
4920 | + UNICODE_Lm, // Letter, Modifier |
4921 | + UNICODE_Lo, // Letter, Other |
4922 | + UNICODE_Lt, // Letter, Titlecase |
4923 | + UNICODE_Lu, // Letter, Uppercase |
4924 | + UNICODE_Mc, // Mark, Spacing Combining |
4925 | + UNICODE_Me, // Mark, Enclosing |
4926 | + UNICODE_Mn, // Mark, Nonspacing |
4927 | + UNICODE_Nd, // Number, Decimal Digit |
4928 | + UNICODE_Nl, // Number, Letter |
4929 | + UNICODE_No, // Number, Other |
4930 | + UNICODE_Pc, // Punctuation, Connector |
4931 | + UNICODE_Pd, // Punctuation, Dash |
4932 | + UNICODE_Pe, // Punctuation, Close |
4933 | + UNICODE_Pf, // Punctuation, Final quote (like Ps or Pe depending on usage) |
4934 | + UNICODE_Pi, // Punctuation, Initial quote (like Ps or Pe depending on usage) |
4935 | + UNICODE_Po, // Punctuation, Other |
4936 | + UNICODE_Ps, // Punctuation, Open |
4937 | + UNICODE_Sc, // Symbol, Currency |
4938 | + UNICODE_Sk, // Symbol, Modifier |
4939 | + UNICODE_Sm, // Symbol, Math |
4940 | + UNICODE_So, // Symbol, Other |
4941 | + UNICODE_Zl, // Separator, Line |
4942 | + UNICODE_Zp, // Separator, Paragraph |
4943 | + UNICODE_Zs // Separator, Space |
4944 | }; |
4945 | |
4946 | bool is_UnicodeNd(code_point cp, code_point *ret_zero); |
4947 | |
4948 | -bool check_codepoint_category(code_point cp, UnicodeCategoriesEnum categ); |
4949 | - |
4950 | -} |
4951 | -} |
4952 | - |
4953 | -#endif |
4954 | +bool check_codepoint_category(code_point cp, category categ); |
4955 | + |
4956 | +/////////////////////////////////////////////////////////////////////////////// |
4957 | + |
4958 | +} // namespace unicode |
4959 | +} // namespace zorba |
4960 | + |
4961 | +#endif /* ZORBA_UNICODE_CATEGORIES */ |
4962 | /* vim:set et sw=2 ts=2: */ |
4963 | |
4964 | === modified file 'src/util/unicode_util.cpp' |
4965 | --- src/util/unicode_util.cpp 2011-07-17 00:10:56 +0000 |
4966 | +++ src/util/unicode_util.cpp 2012-01-18 18:33:36 +0000 |
4967 | @@ -22,15 +22,19 @@ |
4968 | #include <functional> /* for binary_function */ |
4969 | #include <utility> /* for pair */ |
4970 | |
4971 | -#include <unicode/normlzr.h> |
4972 | -#include <unicode/ustring.h> |
4973 | +#ifndef ZORBA_NO_ICU |
4974 | +# include <unicode/normlzr.h> |
4975 | +# include <unicode/ustring.h> |
4976 | +#endif /* ZORBA_NO_ICU */ |
4977 | |
4978 | #include "cxx_util.h" |
4979 | #include "unicode_util.h" |
4980 | #include "utf8_util.h" |
4981 | |
4982 | using namespace std; |
4983 | +#ifndef ZORBA_NO_ICU |
4984 | U_NAMESPACE_USE |
4985 | +#endif /* ZORBA_NO_ICU */ |
4986 | |
4987 | namespace zorba { |
4988 | namespace unicode { |
4989 | @@ -2208,6 +2212,8 @@ |
4990 | return to_case<upper>( c ); |
4991 | } |
4992 | |
4993 | +#ifndef ZORBA_NO_ICU |
4994 | + |
4995 | bool normalize( string const &in, normalization::type n, string *out ) { |
4996 | UErrorCode status = U_ZERO_ERROR; |
4997 | UNormalizationMode icu_mode; |
4998 | @@ -2230,8 +2236,11 @@ |
4999 | return U_SUCCESS( status ) == TRUE; |
5000 | } |
Compiling with ZORBA_NO_ICU=ON fails on Linux:
[ 1%] Building CXX object src/CMakeFiles/zorba_simplestore.dir/api/zorba_string.cpp.o
In file included from /home/mbrantner/zorba/sandbox/src/util/regex.h:501:0,
                 from /home/mbrantner/zorba/sandbox/src/api/zorba_string.cpp:23:
/home/mbrantner/zorba/sandbox/src/util/regex_xquery.h:209:3: error: a class-key must be used when declaring a friend
/home/mbrantner/zorba/sandbox/src/util/regex_xquery.h:209:3: error: friend declaration does not name a class or function
/home/mbrantner/zorba/sandbox/src/util/regex_xquery.h:253:3: error: a class-key must be used when declaring a friend
/home/mbrantner/zorba/sandbox/src/util/regex_xquery.h:253:3: error: friend declaration does not name a class or function
make[2]: *** [src/CMakeFiles/zorba_simplestore.dir/api/zorba_string.cpp.o] Erro