Merge lp:~paul-lucas/zorba/pjl-misc into lp:zorba
- pjl-misc
- Merge into trunk
Status: | Merged |
---|---|
Approved by: | Matthias Brantner |
Approved revision: | 11173 |
Merged at revision: | 11370 |
Proposed branch: | lp:~paul-lucas/zorba/pjl-misc |
Merge into: | lp:zorba |
Diff against target: |
1898 lines (+847/-756) 13 files modified
src/runtime/full_text/latin_tokenizer.cpp (+4/-4) src/runtime/full_text/latin_tokenizer.h (+4/-2) src/runtime/strings/strings_impl.cpp (+7/-8) src/util/CMakeLists.txt (+11/-10) src/util/icu_regex.cpp (+48/-291) src/util/icu_regex.h (+55/-427) src/util/passthru_streambuf.cpp (+2/-2) src/util/passthru_streambuf.h (+1/-1) src/util/regex.h (+34/-0) src/util/zorba_regex.cpp (+265/-0) src/util/zorba_regex.h (+406/-0) src/util/zorba_regex_engine.cpp (+8/-9) src/util/zorba_regex_engine.h (+2/-2) |
To merge this branch: | bzr merge lp:~paul-lucas/zorba/pjl-misc |
Related bugs: |
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
Matthias Brantner | Approve | ||
Paul J. Lucas | Approve | ||
Review via email: mp+158525@code.launchpad.net |
Commit message
regex clean-up:
* Split off Zorba's own regex engine from ICU's.
* Clean-up of some of the hack changes that Daniel made.
Also fixed build when ZORBA_NO_ICU=1.
Description of the change
regex clean-up:
* Split off Zorba's own regex engine from ICU's.
* Clean-up of some of the hack changes that Daniel made.
Also fixed build when ZORBA_NO_ICU=1.
Paul J. Lucas (paul-lucas) : | # |
Zorba Build Bot (zorba-buildbot) wrote : | # |
Zorba Build Bot (zorba-buildbot) wrote : | # |
The attempt to merge lp:~paul-lucas/zorba/pjl-misc into lp:zorba failed. Below is the output from the failed tests.
CMake Error at /home/ceej/
Validation queue job pjl-misc-
final status was:
26 tests did not succeed - changes not committed.
Error in read script: /home/ceej/
Zorba Build Bot (zorba-buildbot) wrote : | # |
There are additional revisions which have not been approved in review. Please seek review and approval of these new revisions.
Zorba Build Bot (zorba-buildbot) wrote : | # |
Validation queue starting for merge proposal.
Log at: http://
Zorba Build Bot (zorba-buildbot) wrote : | # |
The attempt to merge lp:~paul-lucas/zorba/pjl-misc into lp:zorba failed. Below is the output from the failed tests.
CMake Error at /home/ceej/
Validation queue job pjl-misc-
final status was:
1 test did not succeed - changes not committed.
Error in read script: /home/ceej/
Zorba Build Bot (zorba-buildbot) wrote : | # |
Validation queue starting for merge proposal.
Log at: http://
Zorba Build Bot (zorba-buildbot) wrote : | # |
Validation queue job pjl-misc-
All tests succeeded!
Zorba Build Bot (zorba-buildbot) wrote : | # |
Voting does not meet specified criteria. Required: Approve > 1, Disapprove < 1, Needs Fixing < 1, Pending < 1, Needs Information < 1, Resubmit < 1. Got: 1 Approve.
Matthias Brantner (matthias-brantner) : | # |
Zorba Build Bot (zorba-buildbot) wrote : | # |
Validation queue starting for merge proposal.
Log at: http://
Zorba Build Bot (zorba-buildbot) wrote : | # |
Validation queue job pjl-misc-
All tests succeeded!
Preview Diff
1 | === modified file 'src/runtime/full_text/latin_tokenizer.cpp' |
2 | --- src/runtime/full_text/latin_tokenizer.cpp 2013-02-07 17:24:36 +0000 |
3 | +++ src/runtime/full_text/latin_tokenizer.cpp 2013-04-12 05:29:35 +0000 |
4 | @@ -210,7 +210,7 @@ |
5 | // no break; |
6 | case '!': |
7 | case '?': |
8 | - ++numbers().sent; |
9 | + ++state().sent; |
10 | } |
11 | } // for |
12 | |
13 | @@ -229,11 +229,11 @@ |
14 | << ": \"" << token << "\"\n"; |
15 | #endif /* PRINT_TOKENS */ |
16 | |
17 | - callback( |
18 | + callback.token( |
19 | token.data(), token.size(), lang, |
20 | - numbers().token, numbers().sent, numbers().para, item |
21 | + state().token, state().sent, state().para, item |
22 | ); |
23 | - ++numbers().token; |
24 | + ++state().token; |
25 | return true; |
26 | } |
27 | return false; |
28 | |
29 | === modified file 'src/runtime/full_text/latin_tokenizer.h' |
30 | --- src/runtime/full_text/latin_tokenizer.h 2013-02-07 17:24:36 +0000 |
31 | +++ src/runtime/full_text/latin_tokenizer.h 2013-04-12 05:29:35 +0000 |
32 | @@ -39,8 +39,8 @@ |
33 | // inherited |
34 | void destroy() const; |
35 | void properties( Properties* ) const; |
36 | - void tokenize_string( char const*, size_type, iso639_1::type, bool, Callback&, |
37 | - Item const* ); |
38 | + void tokenize_string( char const*, size_type, locale::iso639_1::type, bool, |
39 | + Callback&, Item const* ); |
40 | |
41 | private: |
42 | typedef zstring string_type; |
43 | @@ -65,6 +65,8 @@ |
44 | |
45 | class LatinTokenizerProvider : public TokenizerProvider { |
46 | public: |
47 | + LatinTokenizerProvider() { } // needed to work-around compiler bug |
48 | + |
49 | // inherited |
50 | bool getTokenizer( locale::iso639_1::type, Tokenizer::State* = 0, |
51 | Tokenizer::ptr* = 0 ) const; |
52 | |
53 | === modified file 'src/runtime/strings/strings_impl.cpp' |
54 | --- src/runtime/strings/strings_impl.cpp 2013-03-15 08:22:41 +0000 |
55 | +++ src/runtime/strings/strings_impl.cpp 2013-04-12 05:29:35 +0000 |
56 | @@ -1900,7 +1900,7 @@ |
57 | break; |
58 | } |
59 | #ifndef ZORBA_NO_ICU |
60 | - match_startg = rx.get_match_start(i+1); |
61 | + match_startg = rx.get_group_start(i+1); |
62 | if((match_startg < 0) && (gparent < 0)) |
63 | continue; |
64 | #else |
65 | @@ -1920,7 +1920,7 @@ |
66 | GENV_ITEMFACTORY->createTextNode(non_group_text_item, parent.getp(), non_group_str); |
67 | } |
68 | #ifndef ZORBA_NO_ICU |
69 | - match_endg = rx.get_match_end(i+1); |
70 | + match_endg = rx.get_group_end(i+1); |
71 | #else |
72 | match_endg = temp_endg; |
73 | #endif |
74 | @@ -2123,17 +2123,16 @@ |
75 | |
76 | unicode::regex rx; |
77 | rx.compile(lib_pattern, flags.c_str()); |
78 | - int nr_pattern_groups = rx.get_pattern_group_count(); |
79 | + int nr_pattern_groups = rx.get_group_count(); |
80 | std::vector<int> group_parent; |
81 | computePatternGroupsParents(xquery_pattern, group_parent); |
82 | |
83 | //see if regex can match empty strings |
84 | bool reachedEnd = false; |
85 | rx.set_string("", 0); |
86 | - if (rx.find_next_match(&reachedEnd)) |
87 | + if (rx.next_match(&reachedEnd)) |
88 | { |
89 | throw XQUERY_EXCEPTION(err::FORX0003, ERROR_PARAMS(lib_pattern)); |
90 | - |
91 | } |
92 | |
93 | store::Item_t null_parent; |
94 | @@ -2187,13 +2186,13 @@ |
95 | int match_end1 = 0; |
96 | unsigned int match_end1_bytes = 0; |
97 | reachedEnd = false; |
98 | - while(rx.find_next_match(&reachedEnd)) |
99 | + while(rx.next_match(&reachedEnd)) |
100 | { |
101 | int match_start2; |
102 | int match_end2; |
103 | #ifndef ZORBA_NO_ICU |
104 | - match_start2 = rx.get_match_start(); |
105 | - match_end2 = rx.get_match_end(); |
106 | + match_start2 = rx.get_group_start(); |
107 | + match_end2 = rx.get_group_end(); |
108 | #else |
109 | rx.get_match_start_end_bytes(0, &match_start2, &match_end2); |
110 | #endif |
111 | |
112 | === modified file 'src/util/CMakeLists.txt' |
113 | --- src/util/CMakeLists.txt 2013-03-16 20:44:27 +0000 |
114 | +++ src/util/CMakeLists.txt 2013-04-12 05:29:35 +0000 |
115 | @@ -24,7 +24,6 @@ |
116 | json_parser.cpp |
117 | json_util.cpp |
118 | mem_streambuf.cpp |
119 | - regex.cpp |
120 | stream_util.cpp |
121 | string_util.cpp |
122 | time_util.cpp |
123 | @@ -44,18 +43,20 @@ |
124 | LIST(APPEND UTIL_SRCS cxx_util.cpp) |
125 | ENDIF (NOT ZORBA_CXX_NULLPTR) |
126 | |
127 | -IF(ZORBA_WITH_FILE_ACCESS) |
128 | +IF (ZORBA_WITH_FILE_ACCESS) |
129 | LIST(APPEND UTIL_SRCS mmap_file.cpp) |
130 | -ENDIF(ZORBA_WITH_FILE_ACCESS) |
131 | +ENDIF (ZORBA_WITH_FILE_ACCESS) |
132 | |
133 | -IF(ZORBA_NO_ICU) |
134 | - LIST(APPEND UTIL_SRCS |
135 | - regex_xquery.cpp |
136 | - passthru_streambuf.cpp) |
137 | -ELSE(ZORBA_NO_ICU) |
138 | - LIST(APPEND UTIL_SRCS |
139 | +IF (ZORBA_NO_ICU) |
140 | + LIST(APPEND UTIL_SRCS |
141 | + passthru_streambuf.cpp |
142 | + zorba_regex.cpp |
143 | + zorba_regex_engine.cpp) |
144 | +ELSE (ZORBA_NO_ICU) |
145 | + LIST(APPEND UTIL_SRCS |
146 | + icu_regex.cpp |
147 | icu_streambuf.cpp) |
148 | -ENDIF(ZORBA_NO_ICU) |
149 | +ENDIF (ZORBA_NO_ICU) |
150 | |
151 | HEADER_GROUP_SUBFOLDER(UTIL_SRCS fx) |
152 | HEADER_GROUP_SUBFOLDER(UTIL_SRCS win32) |
153 | |
154 | === renamed file 'src/util/regex.cpp' => 'src/util/icu_regex.cpp' |
155 | --- src/util/regex.cpp 2013-04-11 17:45:40 +0000 |
156 | +++ src/util/icu_regex.cpp 2013-04-12 05:29:35 +0000 |
157 | @@ -15,14 +15,17 @@ |
158 | */ |
159 | #include "stdafx.h" |
160 | |
161 | +#include <zorba/config.h> |
162 | + |
163 | +#ifndef ZORBA_NO_ICU |
164 | + |
165 | #include <cstring> |
166 | #include <vector> |
167 | |
168 | #include <zorba/diagnostic_list.h> |
169 | -#include "diagnostics/xquery_exception.h" |
170 | - |
171 | #include "diagnostics/assert.h" |
172 | #include "diagnostics/dict.h" |
173 | +#include "diagnostics/xquery_exception.h" |
174 | |
175 | #include "ascii_util.h" |
176 | #include "cxx_util.h" |
177 | @@ -32,13 +35,12 @@ |
178 | #define INVALID_RE_EXCEPTION(...) \ |
179 | XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS( __VA_ARGS__ ) ) |
180 | |
181 | -#ifndef ZORBA_NO_ICU |
182 | -# include <unicode/uversion.h> |
183 | +#include <unicode/uversion.h> |
184 | U_NAMESPACE_USE |
185 | |
186 | -# ifndef U_ICU_VERSION_MAJOR_NUM |
187 | -# error "U_ICU_VERSION_MAJOR_NUM not defined" |
188 | -# elif U_ICU_VERSION_MAJOR_NUM < 4 |
189 | +#ifndef U_ICU_VERSION_MAJOR_NUM |
190 | +# error "U_ICU_VERSION_MAJOR_NUM not defined" |
191 | +#elif U_ICU_VERSION_MAJOR_NUM < 4 |
192 | // |
193 | // UREGEX_LITERAL is only in ICU since 4.0. For earlier versions, we |
194 | // define it ourselves. Of course it won't have any effect since it's not |
195 | @@ -46,8 +48,8 @@ |
196 | // though the constant is defined in 4.0, it's not actually implemented as |
197 | // of 4.4. |
198 | // |
199 | -# define UREGEX_LITERAL 16 |
200 | -# endif /* U_ICU_VERSION_MAJOR_NUM */ |
201 | +# define UREGEX_LITERAL 16 |
202 | +#endif /* U_ICU_VERSION_MAJOR_NUM */ |
203 | |
204 | using namespace std; |
205 | |
206 | @@ -559,7 +561,6 @@ |
207 | |
208 | /////////////////////////////////////////////////////////////////////////////// |
209 | |
210 | - |
211 | namespace unicode { |
212 | |
213 | void regex::compile( string const &u_pattern, char const *flags, |
214 | @@ -582,10 +583,27 @@ |
215 | } |
216 | } |
217 | |
218 | +int regex::get_group_count() { |
219 | + ZORBA_ASSERT( matcher_ ); |
220 | + return matcher_->groupCount(); |
221 | +} |
222 | + |
223 | +int regex::get_group_start( int group ) { |
224 | + ZORBA_ASSERT( matcher_ ); |
225 | + UErrorCode status = U_ZERO_ERROR; |
226 | + return matcher_->start( group, status ); |
227 | +} |
228 | + |
229 | +int regex::get_group_end( int group ) { |
230 | + ZORBA_ASSERT( matcher_ ); |
231 | + UErrorCode status = U_ZERO_ERROR; |
232 | + return matcher_->end( group, status ); |
233 | +} |
234 | + |
235 | bool regex::match_part( string const &s ) { |
236 | ZORBA_ASSERT( matcher_ ); |
237 | matcher_->reset( s ); |
238 | - return matcher_->find() != 0; |
239 | + return !!matcher_->find(); |
240 | } |
241 | |
242 | bool regex::match_whole( string const &s ) { |
243 | @@ -645,6 +663,19 @@ |
244 | return false; |
245 | } |
246 | |
247 | +bool regex::next_match( bool *reached_end ) { |
248 | + ZORBA_ASSERT( matcher_ ); |
249 | + bool const found = !!matcher_->find(); |
250 | + if ( reached_end ) { |
251 | +#if U_ICU_VERSION_MAJOR_NUM >= 4 |
252 | + *reached_end = !!matcher_->hitEnd(); |
253 | +#else |
254 | + *reached_end = true; |
255 | +#endif /* U_ICU_VERSION_MAJOR_NUM */ |
256 | + } |
257 | + return found; |
258 | +} |
259 | + |
260 | bool regex::replace_all( string const &in, string const &replacement, |
261 | string *out ) { |
262 | ZORBA_ASSERT( matcher_ ); |
263 | @@ -663,290 +694,16 @@ |
264 | replace_all( u_in, u_replacement, out ); |
265 | } |
266 | |
267 | -void regex::set_string( char const *in, size_type len ) { |
268 | - ZORBA_ASSERT( matcher_ ); |
269 | - to_string( in, len, &s_in_ ); |
270 | - matcher_->reset( s_in_ ); |
271 | -} |
272 | - |
273 | -bool regex::find_next_match( bool *reachedEnd ) { |
274 | - ZORBA_ASSERT( matcher_ ); |
275 | - UBool retfind = matcher_->find(); |
276 | - if ( reachedEnd ) { |
277 | -#if U_ICU_VERSION_MAJOR_NUM >= 4 |
278 | - *reachedEnd = matcher_->hitEnd() != 0; |
279 | -#else |
280 | - *reachedEnd = true; |
281 | -#endif |
282 | - } |
283 | - return retfind != 0; |
284 | -} |
285 | - |
286 | -int regex::get_pattern_group_count() { |
287 | - ZORBA_ASSERT( matcher_ ); |
288 | - return matcher_->groupCount(); |
289 | -} |
290 | - |
291 | -int regex::get_match_start( int groupId ) { |
292 | - ZORBA_ASSERT( matcher_ ); |
293 | - UErrorCode status = U_ZERO_ERROR; |
294 | - return matcher_->start( groupId, status ); |
295 | -} |
296 | - |
297 | -int regex::get_match_end( int groupId ) { |
298 | - ZORBA_ASSERT( matcher_ ); |
299 | - UErrorCode status = U_ZERO_ERROR; |
300 | - return matcher_->end( groupId, status ); |
301 | -} |
302 | - |
303 | -} // namespace unicode |
304 | -} // namespace zorba |
305 | +void regex::set_string( char const *s, size_type s_len ) { |
306 | + ZORBA_ASSERT( matcher_ ); |
307 | + to_string( s, s_len, &s_ ); |
308 | + matcher_->reset( s_ ); |
309 | +} |
310 | |
311 | /////////////////////////////////////////////////////////////////////////////// |
312 | |
313 | -#else /* ZORBA_NO_ICU */ |
314 | - |
315 | -#include "zorbatypes/zstring.h" |
316 | - |
317 | -namespace zorba { |
318 | -//no convertion |
319 | -void convert_xquery_re( zstring const &xq_re, zstring *lib_re, |
320 | - char const *flags) |
321 | -{ |
322 | - *lib_re = xq_re; |
323 | -} |
324 | - |
325 | -namespace unicode { |
326 | - |
327 | -uint32_t regex::parse_regex_flags(const char* flag_cstr) |
328 | -{ |
329 | - uint32_t flags = 0; |
330 | - for (const char* p = flag_cstr; *p != '\0'; ++p) |
331 | - { |
332 | - switch (*p) |
333 | - { |
334 | - case 'i': flags |= REGEX_ASCII_CASE_INSENSITIVE; break; |
335 | - case 's': flags |= REGEX_ASCII_DOTALL; break; |
336 | - case 'm': flags |= REGEX_ASCII_MULTILINE; break; |
337 | - case 'x': flags |= REGEX_ASCII_NO_WHITESPACE; break; |
338 | - case 'q': flags |= REGEX_ASCII_LITERAL; break; |
339 | - default: |
340 | - throw XQUERY_EXCEPTION( err::FORX0001, ERROR_PARAMS( *p ) ); |
341 | - break; |
342 | - } |
343 | - } |
344 | - return flags; |
345 | -} |
346 | - |
347 | -void regex::compile( char const *pattern, char const *flags) |
348 | -{ |
349 | - parsed_flags = parse_regex_flags(flags); |
350 | - regex_xquery::CRegexXQuery_parser regex_parser; |
351 | - regex_matcher = regex_parser.parse(pattern, parsed_flags); |
352 | - if(!regex_matcher) |
353 | - throw INVALID_RE_EXCEPTION(pattern); |
354 | -} |
355 | - |
356 | -bool regex::match_part( char const *s ) |
357 | -{ |
358 | - bool retval; |
359 | - int match_pos; |
360 | - int matched_len; |
361 | - |
362 | - retval = regex_matcher->match_anywhere(s, parsed_flags, &match_pos, &matched_len); |
363 | - |
364 | - return retval; |
365 | -} |
366 | - |
367 | -bool regex::next_match( char const *s, size_type *pos, zstring *match ) |
368 | -{ |
369 | - bool retval; |
370 | - int match_pos; |
371 | - int matched_len; |
372 | - |
373 | - retval = regex_matcher->match_anywhere(s+*pos, parsed_flags, &match_pos, &matched_len); |
374 | - if(retval) |
375 | - { |
376 | - match->assign(s+*pos+match_pos, matched_len); |
377 | - *pos += match_pos + matched_len; |
378 | - } |
379 | - return retval; |
380 | -} |
381 | - |
382 | -bool regex::next_token( char const *s, size_type *pos, zstring *token, |
383 | - bool *matched) |
384 | -{ |
385 | - if(!s[*pos]) |
386 | - return false; |
387 | - bool retval; |
388 | - int match_pos; |
389 | - int matched_len; |
390 | - |
391 | - retval = regex_matcher->match_anywhere(s+*pos, parsed_flags, &match_pos, &matched_len); |
392 | - if(retval) |
393 | - { |
394 | - if(token) |
395 | - token->assign(s+*pos, match_pos); |
396 | - *pos += match_pos + matched_len; |
397 | - if(matched) |
398 | - *matched = true; |
399 | - return true; |
400 | - } |
401 | - else |
402 | - { |
403 | - if(token) |
404 | - token->assign(s+*pos); |
405 | - *pos += strlen(s+*pos); |
406 | - if(matched) |
407 | - *matched = false; |
408 | - return true; |
409 | - } |
410 | -} |
411 | - |
412 | -bool regex::match_whole( char const *s ) |
413 | -{ |
414 | - bool retval; |
415 | - int matched_pos; |
416 | - int matched_len; |
417 | - |
418 | - retval = regex_matcher->match_anywhere(s, parsed_flags|REGEX_ASCII_WHOLE_MATCH, &matched_pos, &matched_len); |
419 | - if(!retval) |
420 | - return false; |
421 | - return true; |
422 | -} |
423 | - |
424 | -bool regex::replace_all( char const *in, char const *replacement, zstring *result ) |
425 | -{ |
426 | - int match_pos; |
427 | - int matched_len; |
428 | - |
429 | - const char *start_str = in; |
430 | - int subregex_count = regex_matcher->get_indexed_regex_count(); |
431 | - bool retval = false; |
432 | - |
433 | - while(regex_matcher->match_anywhere(start_str, parsed_flags, &match_pos, &matched_len)) |
434 | - { |
435 | - if(match_pos) |
436 | - result->append(start_str , match_pos); |
437 | - retval = true; |
438 | - const char *temprepl = replacement; |
439 | - const char *submatched_source; |
440 | - int submatched_len; |
441 | - int index; |
442 | - while(*temprepl) |
443 | - { |
444 | - //look for dollars |
445 | - if(*temprepl == '\\') |
446 | - { |
447 | - if(!(parsed_flags & REGEX_ASCII_LITERAL)) |
448 | - { |
449 | - temprepl++; |
450 | - if(!*temprepl) |
451 | - temprepl--; |
452 | - else if((*temprepl != '\\') && (*temprepl != '$'))//Invalid replacement string. |
453 | - throw XQUERY_EXCEPTION( err::FORX0004, ERROR_PARAMS( replacement ) ); |
454 | - } |
455 | - result->append(1, *temprepl); |
456 | - temprepl++; |
457 | - continue; |
458 | - } |
459 | - if((*temprepl == '$') && !(parsed_flags & REGEX_ASCII_LITERAL)) |
460 | - { |
461 | - temprepl++; |
462 | - index = 0; |
463 | - int nr_digits = 0; |
464 | - while(isdigit(*temprepl)) |
465 | - { |
466 | - if(nr_digits && ((index*10 + (*temprepl)-'0') > subregex_count)) |
467 | - break; |
468 | - index = index*10 + (*temprepl)-'0'; |
469 | - temprepl++; |
470 | - nr_digits++; |
471 | - } |
472 | - if(!nr_digits)//Invalid replacement string. |
473 | - throw XQUERY_EXCEPTION( err::FORX0004, ERROR_PARAMS( replacement ) ); |
474 | - else if(!index) |
475 | - { |
476 | - result->append(start_str+match_pos, matched_len); |
477 | - } |
478 | - else if(regex_matcher->get_indexed_match(index, &submatched_source, &submatched_len)) |
479 | - { |
480 | - if(submatched_source && submatched_len) |
481 | - result->append(submatched_source, submatched_len); |
482 | - } |
483 | - } |
484 | - else |
485 | - { |
486 | - result->append(1, *temprepl); |
487 | - temprepl++; |
488 | - } |
489 | - } |
490 | - start_str += match_pos + matched_len; |
491 | - } |
492 | - result->append(start_str); |
493 | - |
494 | - return retval; |
495 | -} |
496 | - |
497 | -void regex::set_string( const char* in, size_type len ) |
498 | -{ |
499 | - s_in_.assign(in, len); |
500 | - m_pos = 0; |
501 | - m_match_pos = 0; |
502 | - m_matched_len = 0; |
503 | -} |
504 | - |
505 | -bool regex::find_next_match( bool *reachedEnd ) |
506 | -{ |
507 | - bool retval; |
508 | - |
509 | - retval = regex_matcher->match_anywhere(s_in_.c_str()+m_pos, parsed_flags, &m_match_pos, &m_matched_len); |
510 | - if(retval) |
511 | - { |
512 | - m_match_pos += m_pos; |
513 | - m_pos = m_match_pos + m_matched_len; |
514 | - } |
515 | - else |
516 | - { |
517 | - m_pos = s_in_.length(); |
518 | - m_match_pos = 0; |
519 | - m_matched_len = 0; |
520 | - } |
521 | - if(reachedEnd) |
522 | - *reachedEnd = regex_matcher->get_reachedEnd(); |
523 | - return retval; |
524 | -} |
525 | - |
526 | -int regex::get_pattern_group_count() |
527 | -{ |
528 | - return (int)regex_matcher->get_indexed_regex_count(); |
529 | -} |
530 | - |
531 | -bool regex::get_match_start_end_bytes( int groupId, int *start, int *end ) |
532 | -{ |
533 | - *start = -1; |
534 | - *end = -1; |
535 | - if(groupId == 0) |
536 | - { |
537 | - *start = m_match_pos; |
538 | - *end = m_match_pos + m_matched_len; |
539 | - return true; |
540 | - } |
541 | - if(groupId > (int)regex_matcher->get_indexed_regex_count()) |
542 | - return false; |
543 | - const char *submatched_source; |
544 | - int submatched_len; |
545 | - if(!regex_matcher->get_indexed_match(groupId, &submatched_source, &submatched_len)) |
546 | - return false; |
547 | - *start = submatched_source - s_in_.c_str(); |
548 | - *end = *start + submatched_len; |
549 | - return true; |
550 | -} |
551 | - |
552 | } // namespace unicode |
553 | } // namespace zorba |
554 | + |
555 | #endif /* ZORBA_NO_ICU */ |
556 | - |
557 | -/////////////////////////////////////////////////////////////////////////////// |
558 | - |
559 | /* vim:set et sw=2 ts=2: */ |
560 | |
561 | === renamed file 'src/util/regex.h' => 'src/util/icu_regex.h' |
562 | --- src/util/regex.h 2013-02-07 17:24:36 +0000 |
563 | +++ src/util/icu_regex.h 2013-04-12 05:29:35 +0000 |
564 | @@ -14,8 +14,8 @@ |
565 | * limitations under the License. |
566 | */ |
567 | #pragma once |
568 | -#ifndef ZORBA_REGEX_H |
569 | -#define ZORBA_REGEX_H |
570 | +#ifndef ZORBA_ICU_REGEX_H |
571 | +#define ZORBA_ICU_REGEX_H |
572 | |
573 | #include "cxx_util.h" |
574 | #include "unicode_util.h" |
575 | @@ -42,7 +42,6 @@ |
576 | |
577 | namespace unicode { |
578 | |
579 | - |
580 | /** |
581 | * The %regex class wraps the underlying Unicode regular expression library. |
582 | */ |
583 | @@ -421,59 +420,61 @@ |
584 | return replace_all( in.c_str(), replacement.c_str(), out ); |
585 | } |
586 | |
587 | - |
588 | - /** |
589 | - * Set the string to work on, without doing matching yet. |
590 | - * |
591 | - * @param in The UTF-8 input string. |
592 | - * @param len the size in bytes. |
593 | - */ |
594 | - void set_string( const char* in, size_type len ); |
595 | - |
596 | - /** |
597 | - * Find the next match in string set by set_string(). |
598 | - * After finding a match, call get_match_start() and get_match_end() to get the position in the string. |
599 | - * |
600 | - * @param reachedEnd returns true if the end of string was reached while doing the match. |
601 | - * This works only for ICU greater than 4.0. For smaller versions, reachedEnd is always true. |
602 | - */ |
603 | - bool find_next_match( bool *reachedEnd ); |
604 | - |
605 | - /** |
606 | - * Get the number of parenthesized groups in the regular expression. |
607 | - * This number depends only on regular expression, and not on the working string. |
608 | - * |
609 | - * @return the number of parenthesized groups in the regular expression |
610 | - */ |
611 | - int get_pattern_group_count(); |
612 | - |
613 | - /** |
614 | - * Get the start position of the matched group. |
615 | - * If groupId is zero, then the start position of the whole match is returned. |
616 | - * If groupId is non-zero, then the start position of that group is returned. |
617 | - * If that group has not been matched, -1 is returned. |
618 | - * |
619 | - * @param groupId the id of the group, either zero for the entire regex, |
620 | - * or [1 .. group_count] for that specific group |
621 | - * @return the start position, zero based, or -1 if that group didn't match |
622 | - */ |
623 | - int get_match_start( int groupId = 0 ); |
624 | - |
625 | - /** |
626 | - * Get the end position of the matched group. |
627 | - * If groupId is zero, then the end position of the whole match is returned. |
628 | - * If groupId is non-zero, then the end position of that group is returned. |
629 | - * If that group has not been matched, -1 is returned. |
630 | - * |
631 | - * @param groupId the id of the group, either zero for the entire regex, |
632 | - * or [1 .. group_count] for that specific group |
633 | - * @return the end position, zero based, or -1 if that group didn't match |
634 | - */ |
635 | - int get_match_end( int groupId = 0 ); |
636 | + ////////// capturing subgroups ////////////////////////////////////////////// |
637 | + |
638 | + /** |
639 | + * Gets the number of capturing subgroups in the regular expression. |
640 | + * |
641 | + * @return Returns said number. |
642 | + */ |
643 | + int get_group_count(); |
644 | + |
645 | + /** |
646 | + * Gets the start character position of the matched capturing subgroup. |
647 | + * |
648 | + * @param group The ID of the capturing subgroup [1-N] where N is the result |
649 | + * of get_group_count(), or 0 for the whole match. |
650 | + * @return Returns the start position (zero-based) or -1 if \a group didn't |
651 | + * match. |
652 | + */ |
653 | + int get_group_start( int group = 0 ); |
654 | + |
655 | + /** |
656 | + * Gets the end character position of the matched group. |
657 | + * |
658 | + * @param group The ID of the capturing subgroup [1-N] where N is the result |
659 | + * of get_group_count(), or 0 for the whole match. |
660 | + * @return Returns the end position (zero-based) or -1 if \a group didn't |
661 | + * match. |
662 | + */ |
663 | + int get_group_end( int group = 0 ); |
664 | + |
665 | + /** |
666 | + * Sets the string to work on, without doing matching yet. |
667 | + * |
668 | + * @param s The UTF-8 input string. |
669 | + * @param s_len The length of \a s in bytes. |
670 | + */ |
671 | + void set_string( char const *s, size_type s_len ); |
672 | + |
673 | + /** |
674 | + * Finds the next match in the string set by set_string(). After finding a |
675 | + * match, call get_group_start() and get_group_end() to get the position in |
676 | + * the string. |
677 | + * |
678 | + * @param reached_end If not \c nullptr, set to \c true only if the end of |
679 | + * string has been reached while doing the match. (This works only for ICU |
680 | + * version 4.0 or later; for earlier versions, this is always set to |
681 | + * <code>true</code>.) |
682 | + * @return Returns \c true only if the next match was found. |
683 | + */ |
684 | + bool next_match( bool *reached_end ); |
685 | + |
686 | + ///////////////////////////////////////////////////////////////////////////// |
687 | |
688 | private: |
689 | U_NAMESPACE_QUALIFIER RegexMatcher *matcher_; |
690 | - string s_in_; |
691 | + string s_; |
692 | |
693 | enum re_type_t { |
694 | re_is_match, // RE specifies what to match |
695 | @@ -491,386 +492,13 @@ |
696 | regex& operator=( regex const& ); |
697 | }; |
698 | |
699 | -} // namespace unicode |
700 | -} // namespace zorba |
701 | - |
702 | -/////////////////////////////////////////////////////////////////////////////// |
703 | - |
704 | -#else /* ZORBA_NO_ICU */ |
705 | - |
706 | -#include "util/regex_xquery.h" |
707 | -#include <string> |
708 | - |
709 | -namespace zorba{ |
710 | -/** |
711 | - * Converts an XQuery regular expression to the form used by the regular |
712 | - * expression library Zorba is using (here regex_xquery). |
713 | - * |
714 | - * @param xq_re The XQuery regular expression. |
715 | - * @param lib_re A pointer to the resuling library regular expression. |
716 | - * @param flags The flags to use, if any. |
717 | - */ |
718 | -void convert_xquery_re( zstring const &xq_re, zstring *lib_re, |
719 | - char const *flags = "" ); |
720 | - |
721 | -namespace unicode{ |
722 | -////////// classes //////////////////////////////////////////////////////////// |
723 | - |
724 | - |
725 | -/** |
726 | - * The %regex class wraps the underlying Unicode regular expression library. |
727 | - */ |
728 | -class regex { |
729 | -public: |
730 | - /** |
731 | - * Constructs a %regex. |
732 | - */ |
733 | - regex() : regex_matcher( nullptr ) { } |
734 | - |
735 | - /** |
736 | - * Destroys a %regex. |
737 | - */ |
738 | - ~regex() { |
739 | - delete regex_matcher; |
740 | - } |
741 | - |
742 | - ////////// compile pattern ////////////////////////////////////////////////// |
743 | - |
744 | - /** |
745 | - * Compiles a regular expression. One of the compile functions must be |
746 | - * called prior to calling one of the match functions. |
747 | - * |
748 | - * @param pattern The regular expression pattern to compile. |
749 | - * @param flags The regular expression flags, if any. |
750 | - * @param throws err:FORX0002 if the regular expression is invalid. |
751 | - */ |
752 | - void compile( char const *pattern, char const *flags = "" ) ; |
753 | - |
754 | - /** |
755 | - * Compiles a regular expression. One of the compile functions must be |
756 | - * called prior to calling one of the match functions. |
757 | - * |
758 | - * @tparam StringType The pattern string type. |
759 | - * @param pattern The regular expression pattern to compile. |
760 | - * @param flags The regular expression flags, if any. |
761 | - * @param throws err:FORX0002 if the regular expression is invalid. |
762 | - */ |
763 | - template<class StringType> |
764 | - void compile( StringType const &pattern, char const *flags = "" ) { |
765 | - compile( pattern.c_str(), flags ); |
766 | - } |
767 | - |
768 | - /** |
769 | - * Compiles a regular expression. One of the compile functions must be |
770 | - * called prior to calling one of the match functions. |
771 | - * |
772 | - * @tparam PatternStringType The pattern string type. |
773 | - * @tparam FlagsStringType The flags string type. |
774 | - * @param pattern The regular expression pattern to compile. |
775 | - * @param flags The regular expression flags, if any. |
776 | - * @param throws err:FORX0002 if the regular expression is invalid. |
777 | - */ |
778 | - template<class PatternStringType,class FlagsStringType> |
779 | - void compile( PatternStringType const &pattern, |
780 | - FlagsStringType const &flags ) { |
781 | - compile( pattern.c_str(), flags.c_str() ); |
782 | - } |
783 | - |
784 | - ////////// partial match //////////////////////////////////////////////////// |
785 | - |
786 | - /** |
787 | - * Checks whether the given string partially patches the previosuly compiled |
788 | - * regular expression. A "partial match" means that at least part of the |
789 | - * string matches, e.g., "b" matches the regular expression "aba". |
790 | - * |
791 | - * @param s The null-terminated UTF-8 C string to attempt to match. |
792 | - * @return Returns \c true only if the string partially matches. |
793 | - */ |
794 | - bool match_part( char const *s ); |
795 | - |
796 | - /** |
797 | - * Checks whether the given string partially patches the previosuly compiled |
798 | - * regular expression. A "partial match" means that at least part of the |
799 | - * string matches, e.g., "b" matches the regular expression "aba". |
800 | - * |
801 | - * @param s The UTF-8 C string to attempt to match. |
802 | - * @param s_len The length of the string in bytes. |
803 | - * @return Returns \c true only if the string partially matches. |
804 | - */ |
805 | - bool match_part( char const *s, size_type s_len ) |
806 | - { |
807 | - zstring scut(s, s_len); |
808 | - return match_part(scut.c_str()); |
809 | - } |
810 | - |
811 | - /** |
812 | - * Checks whether the given string partially patches the previosuly compiled |
813 | - * regular expression. A "partial match" means that at least part of the |
814 | - * string matches, e.g., "b" matches the regular expression "aba". |
815 | - * |
816 | - * @tparam StringType The string type. |
817 | - * @param s The UTF-8 string to attempt to match. |
818 | - * @return Returns \c true only if the string partially matches. |
819 | - */ |
820 | - template<class StringType> |
821 | - bool match_part( StringType const &s ) { |
822 | - return match_part(s.c_str()); |
823 | - } |
824 | - |
825 | - ////////// partial match with substrings/tokenization /////////////////////// |
826 | - |
827 | - /** |
828 | - * Finds the next substring matching the pattern this %regex was compiled |
829 | - * with. |
830 | - * |
831 | - * @param s The C string to attempt to match. |
832 | - * @param pos A pointer to the position to start looking for a match. On |
833 | - * successful return, the position is updated to be one past the last |
834 | - * character of the match. |
835 | - * @param match A pointer to the string that is to be set to the substring |
836 | - * matching the pattern or NULL if the substring is not needed. |
837 | - * @return Returns \c true only if there is a match. |
838 | - */ |
839 | - bool next_match( char const *s, size_type *pos, zstring *match ); |
840 | - |
841 | - /** |
842 | - * Finds the next substring matching the pattern this %regex was compiled |
843 | - * with. |
844 | - * |
845 | - * @param s The C string to attempt to match. |
846 | - * @param s_len The length of the C string. |
847 | - * @param pos A pointer to the position to start looking for a match. On |
848 | - * successful return, the position is updated to be one past the last |
849 | - * character of the match. |
850 | - * @param match A pointer to the string that is to be set to the substring |
851 | - * matching the pattern or NULL if the substring is not needed. |
852 | - * @return Returns \c true only if there is a match. |
853 | - */ |
854 | - bool next_match( char const *s, size_type s_len, size_type *pos, |
855 | - zstring *match ) |
856 | - { |
857 | - zstring scut(s, s_len); |
858 | - return next_match(scut.c_str(), pos, match); |
859 | - } |
860 | - |
861 | - /** |
862 | - * Finds the next substring matching the pattern this %regex was compiled |
863 | - * with. |
864 | - * |
865 | - * @tparam StringType The string type. |
866 | - * @param s The string to attempt to match. |
867 | - * @param pos A pointer to the position to start looking for a match. On |
868 | - * successful return, the position is updated to be one past the last |
869 | - * character of the match. |
870 | - * @param match A pointer to the string that is to be set to the substring |
871 | - * matching the pattern or NULL if the substring is not needed. |
872 | - * @return Returns \c true only if there is a match. |
873 | - */ |
874 | - template<class StringType> |
875 | - bool next_match( StringType const &s, size_type *pos, zstring *match ) { |
876 | - return next_match(s.c_str(), pos, match); |
877 | - } |
878 | - |
879 | - |
880 | - /** |
881 | - * Finds the next substring separated by the pattern this %regex was compiled |
882 | - * with (similar to <code>strtok</code>(3)). |
883 | - * |
884 | - * @param s The C string to attempt to find a token in. |
885 | - * @param pos A pointer to the position to start looking for a token. On |
886 | - * successful return, the position is updated to be one past the last |
887 | - * character of the token. |
888 | - * @param token A pointer to the string that is to be set to the substring |
889 | - * separated by the pattern or \c NULL if the substring is not needed. |
890 | - * @param matched A pointer to a \c bool to indicate whether the pattern |
891 | - * matched for the token or \c NULL if this is not needed. If not \c NULL, |
892 | - * it is set to \c false either if there is no token or the token is the |
893 | - * final token after the last separator; it is set to \c true only for |
894 | - * non-last tokens. |
895 | - * @return Returns \c true only if there is a token. |
896 | - */ |
897 | - bool next_token( char const *s, size_type *pos, zstring *token, |
898 | - bool *matched = NULL ); |
899 | - |
900 | - /** |
901 | - * Finds the next substring separated by the pattern this %regex was compiled |
902 | - * with (similar to <code>strtok</code>(3)). |
903 | - * |
904 | - * @param s The C string to attempt to find a token in. |
905 | - * @param s_len The length of the C string. |
906 | - * @param pos A pointer to the position to start looking for a token. On |
907 | - * successful return, the position is updated to be one past the last |
908 | - * character of the token. |
909 | - * @param token A pointer to the string that is to be set to the substring |
910 | - * separated by the pattern or \c NULL if the substring is not needed. |
911 | - * @param matched A pointer to a \c bool to indicate whether the pattern |
912 | - * matched for the token or \c NULL if this is not needed. If not \c NULL, |
913 | - * it is set to \c false either if there is no token or the token is the |
914 | - * final token after the last separator; it is set to \c true only for |
915 | - * non-last tokens. |
916 | - * @return Returns \c true only if there is a token. |
917 | - */ |
918 | - bool next_token( char const *s, size_type s_len, size_type *pos, |
919 | - zstring *token, bool *matched = NULL ) |
920 | - { |
921 | - zstring scut(s, s_len); |
922 | - return next_token(scut.c_str(), pos, token, matched); |
923 | - } |
924 | - |
925 | - /** |
926 | - * Finds the next substring separated by the pattern this %regex was compiled |
927 | - * with (similar to <code>strtok</code>(3)). |
928 | - * |
929 | - * @tparam StringType The string type. |
930 | - * @param s The string to attempt to find a token in. |
931 | - * @param pos A pointer to the position to start looking for a token. On |
932 | - * successful return, the position is updated to be one past the last |
933 | - * character of the token. |
934 | - * @param token A pointer to the string that is to be set to the substring |
935 | - * separated by the pattern or \c NULL if the substring is not needed. |
936 | - * @return Returns \c true only if there is a token. |
937 | - */ |
938 | - template<class StringType> |
939 | - bool next_token( StringType const &s, size_type *pos, zstring *token, |
940 | - bool *matched = NULL ) { |
941 | - return next_token(s.c_str(), pos, token, matched); |
942 | - } |
943 | - |
944 | - ////////// whole match ////////////////////////////////////////////////////// |
945 | - |
946 | - /** |
947 | - * Checks whether the given string completely patches the previosuly compiled |
948 | - * regular expression. A "complete match" means that the entire string must |
949 | - * match the regular expression as if the pattern were "^pattern$". |
950 | - * |
951 | - * @param s The null-terminated UTF-8 C string to attempt to match. |
952 | - * @return Returns \c true only if the string completely matches. |
953 | - */ |
954 | - bool match_whole( char const *s ); |
955 | - |
956 | - /** |
957 | - * Checks whether the given string completely patches the previosuly compiled |
958 | - * regular expression. A "complete match" means that the entire string must |
959 | - * match the regular expression as if the pattern were "^pattern$". |
960 | - * |
961 | - * @param s The UTF-8 C string to attempt to match. |
962 | - * @param s_len The length of the string in bytes. |
963 | - * @return Returns \c true only if the string completely matches. |
964 | - */ |
965 | - bool match_whole( char const *s, size_type s_len ) |
966 | - { |
967 | - zstring scut(s, s_len); |
968 | - return match_whole(scut.c_str()); |
969 | - } |
970 | - |
971 | - /** |
972 | - * Checks whether the given string completely patches the previosuly compiled |
973 | - * regular expression. A "complete match" means that the entire string must |
974 | - * match the regular expression as if the pattern were "^pattern$". |
975 | - * |
976 | - * @tparam StringType The string type. |
977 | - * @param s The UTF-8 string to attempt to match. |
978 | - * @return Returns \c true only if the string completely matches. |
979 | - */ |
980 | - template<class StringType> |
981 | - bool match_whole( StringType const &s ) { |
982 | - return match_whole(s.c_str()); |
983 | - } |
984 | - |
985 | - ////////// replacement ////////////////////////////////////////////////////// |
986 | - |
987 | - /** |
988 | - * Replaces all occurrences of substrings that match the pattern this %regex |
989 | - * was compiled with. |
990 | - * |
991 | - * @param in The UTF-8 input string. |
992 | - * @param replacement The replacement string. |
993 | - * @param out The output string. |
994 | - * @param Returns \c true only if at least one replacement was made. |
995 | - */ |
996 | - bool replace_all( char const *in, char const *replacement, zstring *out ); |
997 | - |
998 | - /** |
999 | - * Replaces all occurrences of substrings that match the pattern this %regex |
1000 | - * was compiled with. |
1001 | - * |
1002 | - * @tparam InputStringType The input string type. |
1003 | - * @tparam ReplacementStringType The replacement string type. |
1004 | - * @param in The input string. |
1005 | - * @param replacement The replacement string. |
1006 | - * @param out The output string. |
1007 | - * @param Returns \c true only if at least one replacement was made. |
1008 | - */ |
1009 | - template<class InputStringType,class ReplacementStringType> |
1010 | - bool replace_all( InputStringType const &in, |
1011 | - ReplacementStringType const &replacement, |
1012 | - zstring *out ) { |
1013 | - return replace_all( in.c_str(), replacement.c_str(), out ); |
1014 | - } |
1015 | - |
1016 | - |
1017 | - /** |
1018 | - * Set the string to work on, without doing matching yet. |
1019 | - * |
1020 | - * @param in The UTF-8 input string. |
1021 | - * @param len the size in bytes. |
1022 | - */ |
1023 | - void set_string( const char* in, size_type len ); |
1024 | - |
1025 | - /** |
1026 | - * Find the next match in string set by set_string(). |
1027 | - * After finding a match, call get_match_start() and get_match_end() to get the position in the string. |
1028 | - * |
1029 | - * @param reachedEnd returns true if the end of string was reached while doing the match. |
1030 | - */ |
1031 | - bool find_next_match( bool *reachedEnd ); |
1032 | - |
1033 | - /** |
1034 | - * Get the number of parenthesized groups in the regular expression. |
1035 | - * This number depends only on regular expression, and not on the working string. |
1036 | - * |
1037 | - * @return the number of parenthesized groups in the regular expression |
1038 | - */ |
1039 | - int get_pattern_group_count(); |
1040 | - |
1041 | - /** |
1042 | - * Get the start position of the matched group. |
1043 | - * If groupId is zero, then the start and end position of the whole match is returned. |
1044 | - * If groupId is non-zero, then the start and end position of that group is returned. |
1045 | - * If that group has not been matched, false is returned. |
1046 | - * |
1047 | - * @param groupId the id of the group, either zero for the entire regex, |
1048 | - * or [1 .. group_count] for that specific group |
1049 | - * @param start to return start position in bytes |
1050 | - * @param end to return end position in bytes |
1051 | - * @return true if that group exists and has been matched |
1052 | - */ |
1053 | - bool get_match_start_end_bytes( int groupId, int *start, int *end ); |
1054 | - |
1055 | - |
1056 | -private: |
1057 | - regex_xquery::CRegexXQuery_regex *regex_matcher; |
1058 | - uint32_t parsed_flags; |
1059 | - |
1060 | - zstring s_in_; |
1061 | - int m_pos; |
1062 | - int m_match_pos; |
1063 | - int m_matched_len; |
1064 | - |
1065 | - uint32_t parse_regex_flags(const char* flag_cstr); |
1066 | - |
1067 | - // forbid |
1068 | - regex( regex const& ); |
1069 | - regex& operator=( regex const& ); |
1070 | -}; |
1071 | - |
1072 | /////////////////////////////////////////////////////////////////////////////// |
1073 | |
1074 | } // namespace unicode |
1075 | } // namespace zorba |
1076 | |
1077 | #endif /* ZORBA_NO_ICU */ |
1078 | -#endif /* ZORBA_REGEX_H */ |
1079 | +#endif /* ZORBA_ICU_REGEX_H */ |
1080 | /* |
1081 | * Local variables: |
1082 | * mode: c++ |
1083 | |
1084 | === modified file 'src/util/passthru_streambuf.cpp' |
1085 | --- src/util/passthru_streambuf.cpp 2013-02-07 17:24:36 +0000 |
1086 | +++ src/util/passthru_streambuf.cpp 2013-04-12 05:29:35 +0000 |
1087 | @@ -26,7 +26,7 @@ |
1088 | /////////////////////////////////////////////////////////////////////////////// |
1089 | |
1090 | passthru_streambuf::passthru_streambuf( char const*, streambuf *orig ) : |
1091 | - proxy_streambuf( orig ) |
1092 | + internal::proxy_streambuf( orig ) |
1093 | { |
1094 | if ( !orig ) |
1095 | throw invalid_argument( "null streambuf" ); |
1096 | @@ -85,7 +85,7 @@ |
1097 | |
1098 | passthru_streambuf::int_type passthru_streambuf::pbackfail( int_type c ) { |
1099 | return traits_type::eq_int_type( c, traits_type::eof() ) ? |
1100 | - c : proxy_buf_->sputbackc( traits_type::to_char_type( c ) ); |
1101 | + c : original()->sputbackc( traits_type::to_char_type( c ) ); |
1102 | } |
1103 | |
1104 | passthru_streambuf::int_type passthru_streambuf::uflow() { |
1105 | |
1106 | === modified file 'src/util/passthru_streambuf.h' |
1107 | --- src/util/passthru_streambuf.h 2013-02-07 17:24:36 +0000 |
1108 | +++ src/util/passthru_streambuf.h 2013-04-12 05:29:35 +0000 |
1109 | @@ -30,7 +30,7 @@ |
1110 | * A %passthru_streambuf is-a std::streambuf that simply passes through |
1111 | * characters unchanged. |
1112 | */ |
1113 | -class passthru_streambuf : public proxy_streambuf { |
1114 | +class passthru_streambuf : public internal::proxy_streambuf { |
1115 | public: |
1116 | #ifdef WIN32 |
1117 | // These typedefs are needed (but shouldn't be) when using MSVC++. |
1118 | |
1119 | === added file 'src/util/regex.h' |
1120 | --- src/util/regex.h 1970-01-01 00:00:00 +0000 |
1121 | +++ src/util/regex.h 2013-04-12 05:29:35 +0000 |
1122 | @@ -0,0 +1,34 @@ |
1123 | +/* |
1124 | + * Copyright 2006-2008 The FLWOR Foundation. |
1125 | + * |
1126 | + * Licensed under the Apache License, Version 2.0 (the "License"); |
1127 | + * you may not use this file except in compliance with the License. |
1128 | + * You may obtain a copy of the License at |
1129 | + * |
1130 | + * http://www.apache.org/licenses/LICENSE-2.0 |
1131 | + * |
1132 | + * Unless required by applicable law or agreed to in writing, software |
1133 | + * distributed under the License is distributed on an "AS IS" BASIS, |
1134 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
1135 | + * See the License for the specific language governing permissions and |
1136 | + * limitations under the License. |
1137 | + */ |
1138 | +#pragma once |
1139 | +#ifndef ZORBA_REGEX_H |
1140 | +#define ZORBA_REGEX_H |
1141 | + |
1142 | +#include <zorba/config.h> |
1143 | + |
1144 | +#ifdef ZORBA_NO_ICU |
1145 | +#include "zorba_regex.h" |
1146 | +#else |
1147 | +#include "icu_regex.h" |
1148 | +#endif /* ZORBA_NO_ICU */ |
1149 | + |
1150 | +#endif /* ZORBA_REGEX_H */ |
1151 | +/* |
1152 | + * Local variables: |
1153 | + * mode: c++ |
1154 | + * End: |
1155 | + */ |
1156 | +/* vim:set et sw=2 ts=2: */ |
1157 | |
1158 | === added file 'src/util/zorba_regex.cpp' |
1159 | --- src/util/zorba_regex.cpp 1970-01-01 00:00:00 +0000 |
1160 | +++ src/util/zorba_regex.cpp 2013-04-12 05:29:35 +0000 |
1161 | @@ -0,0 +1,265 @@ |
1162 | +/* |
1163 | + * Copyright 2006-2008 The FLWOR Foundation. |
1164 | + * |
1165 | + * Licensed under the Apache License, Version 2.0 (the "License"); |
1166 | + * you may not use this file except in compliance with the License. |
1167 | + * You may obtain a copy of the License at |
1168 | + * |
1169 | + * http://www.apache.org/licenses/LICENSE-2.0 |
1170 | + * |
1171 | + * Unless required by applicable law or agreed to in writing, software |
1172 | + * distributed under the License is distributed on an "AS IS" BASIS, |
1173 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
1174 | + * See the License for the specific language governing permissions and |
1175 | + * limitations under the License. |
1176 | + */ |
1177 | +#include "stdafx.h" |
1178 | + |
1179 | +#include <cstring> |
1180 | + |
1181 | +#include <zorba/diagnostic_list.h> |
1182 | +#include "diagnostics/dict.h" |
1183 | +#include "diagnostics/xquery_exception.h" |
1184 | + |
1185 | +#include "stl_util.h" |
1186 | +#include "zorba_regex.h" |
1187 | + |
1188 | +#define INVALID_RE_EXCEPTION(...) \ |
1189 | + XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS( __VA_ARGS__ ) ) |
1190 | + |
1191 | +namespace zorba { |
1192 | +namespace unicode { |
1193 | + |
1194 | +/////////////////////////////////////////////////////////////////////////////// |
1195 | + |
1196 | +uint32_t regex::parse_regex_flags(const char* flag_cstr) |
1197 | +{ |
1198 | + uint32_t flags = 0; |
1199 | + for (const char* p = flag_cstr; *p != '\0'; ++p) |
1200 | + { |
1201 | + switch (*p) |
1202 | + { |
1203 | + case 'i': flags |= REGEX_ASCII_CASE_INSENSITIVE; break; |
1204 | + case 's': flags |= REGEX_ASCII_DOTALL; break; |
1205 | + case 'm': flags |= REGEX_ASCII_MULTILINE; break; |
1206 | + case 'x': flags |= REGEX_ASCII_NO_WHITESPACE; break; |
1207 | + case 'q': flags |= REGEX_ASCII_LITERAL; break; |
1208 | + default: |
1209 | + throw XQUERY_EXCEPTION( err::FORX0001, ERROR_PARAMS( *p ) ); |
1210 | + break; |
1211 | + } |
1212 | + } |
1213 | + return flags; |
1214 | +} |
1215 | + |
1216 | +void regex::compile( char const *pattern, char const *flags) |
1217 | +{ |
1218 | + parsed_flags = parse_regex_flags(flags); |
1219 | + regex_engine::CRegexXQuery_parser regex_parser; |
1220 | + regex_matcher = regex_parser.parse(pattern, parsed_flags); |
1221 | + if(!regex_matcher) |
1222 | + throw INVALID_RE_EXCEPTION(pattern); |
1223 | +} |
1224 | + |
1225 | +bool regex::match_part( char const *s ) |
1226 | +{ |
1227 | + bool retval; |
1228 | + int match_pos; |
1229 | + int matched_len; |
1230 | + |
1231 | + retval = regex_matcher->match_anywhere(s, parsed_flags, &match_pos, &matched_len); |
1232 | + |
1233 | + return retval; |
1234 | +} |
1235 | + |
1236 | +bool regex::next_match( char const *s, size_type *pos, zstring *match ) |
1237 | +{ |
1238 | + bool retval; |
1239 | + int match_pos; |
1240 | + int matched_len; |
1241 | + |
1242 | + retval = regex_matcher->match_anywhere(s+*pos, parsed_flags, &match_pos, &matched_len); |
1243 | + if(retval) |
1244 | + { |
1245 | + match->assign(s+*pos+match_pos, matched_len); |
1246 | + *pos += match_pos + matched_len; |
1247 | + } |
1248 | + return retval; |
1249 | +} |
1250 | + |
1251 | +bool regex::next_token( char const *s, size_type *pos, zstring *token, |
1252 | + bool *matched) |
1253 | +{ |
1254 | + if(!s[*pos]) |
1255 | + return false; |
1256 | + bool retval; |
1257 | + int match_pos; |
1258 | + int matched_len; |
1259 | + |
1260 | + retval = regex_matcher->match_anywhere(s+*pos, parsed_flags, &match_pos, &matched_len); |
1261 | + if(retval) |
1262 | + { |
1263 | + if(token) |
1264 | + token->assign(s+*pos, match_pos); |
1265 | + *pos += match_pos + matched_len; |
1266 | + if(matched) |
1267 | + *matched = true; |
1268 | + return true; |
1269 | + } |
1270 | + else |
1271 | + { |
1272 | + if(token) |
1273 | + token->assign(s+*pos); |
1274 | + *pos += strlen(s+*pos); |
1275 | + if(matched) |
1276 | + *matched = false; |
1277 | + return true; |
1278 | + } |
1279 | +} |
1280 | + |
1281 | +bool regex::match_whole( char const *s ) |
1282 | +{ |
1283 | + bool retval; |
1284 | + int matched_pos; |
1285 | + int matched_len; |
1286 | + |
1287 | + retval = regex_matcher->match_anywhere(s, parsed_flags|REGEX_ASCII_WHOLE_MATCH, &matched_pos, &matched_len); |
1288 | + if(!retval) |
1289 | + return false; |
1290 | + return true; |
1291 | +} |
1292 | + |
1293 | +bool regex::replace_all( char const *in, char const *replacement, zstring *result ) |
1294 | +{ |
1295 | + int match_pos; |
1296 | + int matched_len; |
1297 | + |
1298 | + const char *start_str = in; |
1299 | + int subregex_count = regex_matcher->get_indexed_regex_count(); |
1300 | + bool retval = false; |
1301 | + |
1302 | + while(regex_matcher->match_anywhere(start_str, parsed_flags, &match_pos, &matched_len)) |
1303 | + { |
1304 | + if(match_pos) |
1305 | + result->append(start_str , match_pos); |
1306 | + retval = true; |
1307 | + const char *temprepl = replacement; |
1308 | + const char *submatched_source; |
1309 | + int submatched_len; |
1310 | + int index; |
1311 | + while(*temprepl) |
1312 | + { |
1313 | + //look for dollars |
1314 | + if(*temprepl == '\\') |
1315 | + { |
1316 | + if(!(parsed_flags & REGEX_ASCII_LITERAL)) |
1317 | + { |
1318 | + temprepl++; |
1319 | + if(!*temprepl) |
1320 | + temprepl--; |
1321 | + else if((*temprepl != '\\') && (*temprepl != '$'))//Invalid replacement string. |
1322 | + throw XQUERY_EXCEPTION( err::FORX0004, ERROR_PARAMS( replacement ) ); |
1323 | + } |
1324 | + result->append(1, *temprepl); |
1325 | + temprepl++; |
1326 | + continue; |
1327 | + } |
1328 | + if((*temprepl == '$') && !(parsed_flags & REGEX_ASCII_LITERAL)) |
1329 | + { |
1330 | + temprepl++; |
1331 | + index = 0; |
1332 | + int nr_digits = 0; |
1333 | + while(isdigit(*temprepl)) |
1334 | + { |
1335 | + if(nr_digits && ((index*10 + (*temprepl)-'0') > subregex_count)) |
1336 | + break; |
1337 | + index = index*10 + (*temprepl)-'0'; |
1338 | + temprepl++; |
1339 | + nr_digits++; |
1340 | + } |
1341 | + if(!nr_digits)//Invalid replacement string. |
1342 | + throw XQUERY_EXCEPTION( err::FORX0004, ERROR_PARAMS( replacement ) ); |
1343 | + else if(!index) |
1344 | + { |
1345 | + result->append(start_str+match_pos, matched_len); |
1346 | + } |
1347 | + else if(regex_matcher->get_indexed_match(index, &submatched_source, &submatched_len)) |
1348 | + { |
1349 | + if(submatched_source && submatched_len) |
1350 | + result->append(submatched_source, submatched_len); |
1351 | + } |
1352 | + } |
1353 | + else |
1354 | + { |
1355 | + result->append(1, *temprepl); |
1356 | + temprepl++; |
1357 | + } |
1358 | + } |
1359 | + start_str += match_pos + matched_len; |
1360 | + } |
1361 | + result->append(start_str); |
1362 | + |
1363 | + return retval; |
1364 | +} |
1365 | + |
1366 | +void regex::set_string( const char* in, size_type len ) |
1367 | +{ |
1368 | + s_in_.assign(in, len); |
1369 | + m_pos = 0; |
1370 | + m_match_pos = 0; |
1371 | + m_matched_len = 0; |
1372 | +} |
1373 | + |
1374 | +bool regex::next_match( bool *reachedEnd ) |
1375 | +{ |
1376 | + bool retval; |
1377 | + |
1378 | + retval = regex_matcher->match_anywhere(s_in_.c_str()+m_pos, parsed_flags, &m_match_pos, &m_matched_len); |
1379 | + if(retval) |
1380 | + { |
1381 | + m_match_pos += m_pos; |
1382 | + m_pos = m_match_pos + m_matched_len; |
1383 | + } |
1384 | + else |
1385 | + { |
1386 | + m_pos = s_in_.length(); |
1387 | + m_match_pos = 0; |
1388 | + m_matched_len = 0; |
1389 | + } |
1390 | + if(reachedEnd) |
1391 | + *reachedEnd = regex_matcher->get_reachedEnd(); |
1392 | + return retval; |
1393 | +} |
1394 | + |
1395 | +int regex::get_group_count() |
1396 | +{ |
1397 | + return (int)regex_matcher->get_indexed_regex_count(); |
1398 | +} |
1399 | + |
1400 | +bool regex::get_match_start_end_bytes( int groupId, int *start, int *end ) |
1401 | +{ |
1402 | + *start = -1; |
1403 | + *end = -1; |
1404 | + if(groupId == 0) |
1405 | + { |
1406 | + *start = m_match_pos; |
1407 | + *end = m_match_pos + m_matched_len; |
1408 | + return true; |
1409 | + } |
1410 | + if(groupId > (int)regex_matcher->get_indexed_regex_count()) |
1411 | + return false; |
1412 | + const char *submatched_source; |
1413 | + int submatched_len; |
1414 | + if(!regex_matcher->get_indexed_match(groupId, &submatched_source, &submatched_len)) |
1415 | + return false; |
1416 | + *start = submatched_source - s_in_.c_str(); |
1417 | + *end = *start + submatched_len; |
1418 | + return true; |
1419 | +} |
1420 | + |
1421 | +/////////////////////////////////////////////////////////////////////////////// |
1422 | + |
1423 | +} // namespace unicode |
1424 | +} // namespace zorba |
1425 | + |
1426 | +/* vim:set et sw=2 ts=2: */ |
1427 | |
1428 | === added file 'src/util/zorba_regex.h' |
1429 | --- src/util/zorba_regex.h 1970-01-01 00:00:00 +0000 |
1430 | +++ src/util/zorba_regex.h 2013-04-12 05:29:35 +0000 |
1431 | @@ -0,0 +1,406 @@ |
1432 | +/* |
1433 | + * Copyright 2006-2008 The FLWOR Foundation. |
1434 | + * |
1435 | + * Licensed under the Apache License, Version 2.0 (the "License"); |
1436 | + * you may not use this file except in compliance with the License. |
1437 | + * You may obtain a copy of the License at |
1438 | + * |
1439 | + * http://www.apache.org/licenses/LICENSE-2.0 |
1440 | + * |
1441 | + * Unless required by applicable law or agreed to in writing, software |
1442 | + * distributed under the License is distributed on an "AS IS" BASIS, |
1443 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
1444 | + * See the License for the specific language governing permissions and |
1445 | + * limitations under the License. |
1446 | + */ |
1447 | +#pragma once |
1448 | +#ifndef ZORBA_ZORBA_REGEX_H |
1449 | +#define ZORBA_ZORBA_REGEX_H |
1450 | + |
1451 | +#include <zorba/config.h> |
1452 | + |
1453 | +#ifdef ZORBA_NO_ICU |
1454 | + |
1455 | +#include "cxx_util.h" |
1456 | +#include "unicode_util.h" |
1457 | +#include "zorba_regex_engine.h" |
1458 | +#include "zorbatypes/zstring.h" |
1459 | + |
1460 | +namespace zorba { |
1461 | + |
1462 | +/** |
1463 | + * Converts an XQuery regular expression to the form used by the regular |
1464 | + * expression library Zorba is using (here regex_engine). |
1465 | + * |
1466 | + * @param xq_re The XQuery regular expression. |
1467 | + * @param lib_re A pointer to the resulting library regular expression. |
1468 | + * @param flags The flags to use, if any. |
1469 | + */ |
1470 | +inline void convert_xquery_re( zstring const &xq_re, zstring *lib_re, |
1471 | + char const *flags = "" ) { |
1472 | + *lib_re = xq_re; // no conversion needed |
1473 | +} |
1474 | + |
1475 | +////////// classes //////////////////////////////////////////////////////////// |
1476 | + |
1477 | +namespace unicode { |
1478 | + |
1479 | +/** |
1480 | + * The %regex class wraps the underlying Unicode regular expression library. |
1481 | + */ |
1482 | +class regex { |
1483 | +public: |
1484 | + /** |
1485 | + * Constructs a %regex. |
1486 | + */ |
1487 | + regex() : regex_matcher( nullptr ) { } |
1488 | + |
1489 | + /** |
1490 | + * Destroys a %regex. |
1491 | + */ |
1492 | + ~regex() { |
1493 | + delete regex_matcher; |
1494 | + } |
1495 | + |
1496 | + ////////// compile pattern ////////////////////////////////////////////////// |
1497 | + |
1498 | + /** |
1499 | + * Compiles a regular expression. One of the compile functions must be |
1500 | + * called prior to calling one of the match functions. |
1501 | + * |
1502 | + * @param pattern The regular expression pattern to compile. |
1503 | + * @param flags The regular expression flags, if any. |
1504 | + * @throws err:FORX0002 if the regular expression is invalid. |
1505 | + */ |
1506 | + void compile( char const *pattern, char const *flags = "" ) ; |
1507 | + |
1508 | + /** |
1509 | + * Compiles a regular expression. One of the compile functions must be |
1510 | + * called prior to calling one of the match functions. |
1511 | + * |
1512 | + * @tparam StringType The pattern string type. |
1513 | + * @param pattern The regular expression pattern to compile. |
1514 | + * @param flags The regular expression flags, if any. |
1515 | + * @throws err:FORX0002 if the regular expression is invalid. |
1516 | + */ |
1517 | + template<class StringType> |
1518 | + void compile( StringType const &pattern, char const *flags = "" ) { |
1519 | + compile( pattern.c_str(), flags ); |
1520 | + } |
1521 | + |
1522 | + /** |
1523 | + * Compiles a regular expression. One of the compile functions must be |
1524 | + * called prior to calling one of the match functions. |
1525 | + * |
1526 | + * @tparam PatternStringType The pattern string type. |
1527 | + * @tparam FlagsStringType The flags string type. |
1528 | + * @param pattern The regular expression pattern to compile. |
1529 | + * @param flags The regular expression flags, if any. |
1530 | + * @throws err:FORX0002 if the regular expression is invalid. |
1531 | + */ |
1532 | + template<class PatternStringType,class FlagsStringType> |
1533 | + void compile( PatternStringType const &pattern, |
1534 | + FlagsStringType const &flags ) { |
1535 | + compile( pattern.c_str(), flags.c_str() ); |
1536 | + } |
1537 | + |
1538 | + ////////// partial match //////////////////////////////////////////////////// |
1539 | + |
1540 | + /** |
1541 | + * Checks whether the given string partially matches the previously compiled |
1542 | + * regular expression. A "partial match" means that at least part of the |
1543 | + * string matches, e.g., "b" matches the regular expression "aba". |
1544 | + * |
1545 | + * @param s The null-terminated UTF-8 C string to attempt to match. |
1546 | + * @return Returns \c true only if the string partially matches. |
1547 | + */ |
1548 | + bool match_part( char const *s ); |
1549 | + |
1550 | + /** |
1551 | + * Checks whether the given string partially matches the previously compiled |
1552 | + * regular expression. A "partial match" means that at least part of the |
1553 | + * string matches, e.g., "b" matches the regular expression "aba". |
1554 | + * |
1555 | + * @param s The UTF-8 C string to attempt to match. |
1556 | + * @param s_len The length of the string in bytes. |
1557 | + * @return Returns \c true only if the string partially matches. |
1558 | + */ |
1559 | + bool match_part( char const *s, size_type s_len ) |
1560 | + { |
1561 | + zstring scut(s, s_len); |
1562 | + return match_part(scut.c_str()); |
1563 | + } |
1564 | + |
1565 | + /** |
1566 | + * Checks whether the given string partially matches the previously compiled |
1567 | + * regular expression. A "partial match" means that at least part of the |
1568 | + * string matches, e.g., "b" matches the regular expression "aba". |
1569 | + * |
1570 | + * @tparam StringType The string type. |
1571 | + * @param s The UTF-8 string to attempt to match. |
1572 | + * @return Returns \c true only if the string partially matches. |
1573 | + */ |
1574 | + template<class StringType> |
1575 | + bool match_part( StringType const &s ) { |
1576 | + return match_part(s.c_str()); |
1577 | + } |
1578 | + |
1579 | + ////////// partial match with substrings/tokenization /////////////////////// |
1580 | + |
1581 | + /** |
1582 | + * Finds the next substring matching the pattern this %regex was compiled |
1583 | + * with. |
1584 | + * |
1585 | + * @param s The C string to attempt to match. |
1586 | + * @param pos A pointer to the position to start looking for a match. On |
1587 | + * successful return, the position is updated to be one past the last |
1588 | + * character of the match. |
1589 | + * @param match A pointer to the string that is to be set to the substring |
1590 | + * matching the pattern or NULL if the substring is not needed. |
1591 | + * @return Returns \c true only if there is a match. |
1592 | + */ |
1593 | + bool next_match( char const *s, size_type *pos, zstring *match ); |
1594 | + |
1595 | + /** |
1596 | + * Finds the next substring matching the pattern this %regex was compiled |
1597 | + * with. |
1598 | + * |
1599 | + * @param s The C string to attempt to match. |
1600 | + * @param s_len The length of the C string. |
1601 | + * @param pos A pointer to the position to start looking for a match. On |
1602 | + * successful return, the position is updated to be one past the last |
1603 | + * character of the match. |
1604 | + * @param match A pointer to the string that is to be set to the substring |
1605 | + * matching the pattern or NULL if the substring is not needed. |
1606 | + * @return Returns \c true only if there is a match. |
1607 | + */ |
1608 | + bool next_match( char const *s, size_type s_len, size_type *pos, |
1609 | + zstring *match ) |
1610 | + { |
1611 | + zstring scut(s, s_len); |
1612 | + return next_match(scut.c_str(), pos, match); |
1613 | + } |
1614 | + |
1615 | + /** |
1616 | + * Finds the next substring matching the pattern this %regex was compiled |
1617 | + * with. |
1618 | + * |
1619 | + * @tparam StringType The string type. |
1620 | + * @param s The string to attempt to match. |
1621 | + * @param pos A pointer to the position to start looking for a match. On |
1622 | + * successful return, the position is updated to be one past the last |
1623 | + * character of the match. |
1624 | + * @param match A pointer to the string that is to be set to the substring |
1625 | + * matching the pattern or NULL if the substring is not needed. |
1626 | + * @return Returns \c true only if there is a match. |
1627 | + */ |
1628 | + template<class StringType> |
1629 | + bool next_match( StringType const &s, size_type *pos, zstring *match ) { |
1630 | + return next_match(s.c_str(), pos, match); |
1631 | + } |
1632 | + |
1633 | + |
1634 | + /** |
1635 | + * Finds the next substring separated by the pattern this %regex was compiled |
1636 | + * with (similar to <code>strtok</code>(3)). |
1637 | + * |
1638 | + * @param s The C string to attempt to find a token in. |
1639 | + * @param pos A pointer to the position to start looking for a token. On |
1640 | + * successful return, the position is updated to be one past the last |
1641 | + * character of the token. |
1642 | + * @param token A pointer to the string that is to be set to the substring |
1643 | + * separated by the pattern or \c NULL if the substring is not needed. |
1644 | + * @param matched A pointer to a \c bool to indicate whether the pattern |
1645 | + * matched for the token or \c NULL if this is not needed. If not \c NULL, |
1646 | + * it is set to \c false either if there is no token or the token is the |
1647 | + * final token after the last separator; it is set to \c true only for |
1648 | + * non-last tokens. |
1649 | + * @return Returns \c true only if there is a token. |
1650 | + */ |
1651 | + bool next_token( char const *s, size_type *pos, zstring *token, |
1652 | + bool *matched = NULL ); |
1653 | + |
1654 | + /** |
1655 | + * Finds the next substring separated by the pattern this %regex was compiled |
1656 | + * with (similar to <code>strtok</code>(3)). |
1657 | + * |
1658 | + * @param s The C string to attempt to find a token in. |
1659 | + * @param s_len The length of the C string. |
1660 | + * @param pos A pointer to the position to start looking for a token. On |
1661 | + * successful return, the position is updated to be one past the last |
1662 | + * character of the token. |
1663 | + * @param token A pointer to the string that is to be set to the substring |
1664 | + * separated by the pattern or \c NULL if the substring is not needed. |
1665 | + * @param matched A pointer to a \c bool to indicate whether the pattern |
1666 | + * matched for the token or \c NULL if this is not needed. If not \c NULL, |
1667 | + * it is set to \c false either if there is no token or the token is the |
1668 | + * final token after the last separator; it is set to \c true only for |
1669 | + * non-last tokens. |
1670 | + * @return Returns \c true only if there is a token. |
1671 | + */ |
1672 | + bool next_token( char const *s, size_type s_len, size_type *pos, |
1673 | + zstring *token, bool *matched = NULL ) |
1674 | + { |
1675 | + zstring scut(s, s_len); |
1676 | + return next_token(scut.c_str(), pos, token, matched); |
1677 | + } |
1678 | + |
1679 | + /** |
1680 | + * Finds the next substring separated by the pattern this %regex was compiled |
1681 | + * with (similar to <code>strtok</code>(3)). |
1682 | + * |
1683 | + * @tparam StringType The string type. |
1684 | + * @param s The string to attempt to find a token in. |
1685 | + * @param pos A pointer to the position to start looking for a token. On |
1686 | + * successful return, the position is updated to be one past the last |
1687 | + * character of the token. |
1688 | + * @param token A pointer to the string that is to be set to the substring |
1689 | + * separated by the pattern or \c NULL if the substring is not needed. |
1690 | + * @return Returns \c true only if there is a token. |
1691 | + */ |
1692 | + template<class StringType> |
1693 | + bool next_token( StringType const &s, size_type *pos, zstring *token, |
1694 | + bool *matched = NULL ) { |
1695 | + return next_token(s.c_str(), pos, token, matched); |
1696 | + } |
1697 | + |
1698 | + ////////// whole match ////////////////////////////////////////////////////// |
1699 | + |
1700 | + /** |
1701 | + * Checks whether the given string completely matches the previously compiled |
1702 | + * regular expression. A "complete match" means that the entire string must |
1703 | + * match the regular expression as if the pattern were "^pattern$". |
1704 | + * |
1705 | + * @param s The null-terminated UTF-8 C string to attempt to match. |
1706 | + * @return Returns \c true only if the string completely matches. |
1707 | + */ |
1708 | + bool match_whole( char const *s ); |
1709 | + |
1710 | + /** |
1711 | + * Checks whether the given string completely matches the previously compiled |
1712 | + * regular expression. A "complete match" means that the entire string must |
1713 | + * match the regular expression as if the pattern were "^pattern$". |
1714 | + * |
1715 | + * @param s The UTF-8 C string to attempt to match. |
1716 | + * @param s_len The length of the string in bytes. |
1717 | + * @return Returns \c true only if the string completely matches. |
1718 | + */ |
1719 | + bool match_whole( char const *s, size_type s_len ) |
1720 | + { |
1721 | + zstring scut(s, s_len); |
1722 | + return match_whole(scut.c_str()); |
1723 | + } |
1724 | + |
1725 | + /** |
1726 | + * Checks whether the given string completely matches the previously compiled |
1727 | + * regular expression. A "complete match" means that the entire string must |
1728 | + * match the regular expression as if the pattern were "^pattern$". |
1729 | + * |
1730 | + * @tparam StringType The string type. |
1731 | + * @param s The UTF-8 string to attempt to match. |
1732 | + * @return Returns \c true only if the string completely matches. |
1733 | + */ |
1734 | + template<class StringType> |
1735 | + bool match_whole( StringType const &s ) { |
1736 | + return match_whole(s.c_str()); |
1737 | + } |
1738 | + |
1739 | + ////////// replacement ////////////////////////////////////////////////////// |
1740 | + |
1741 | + /** |
1742 | + * Replaces all occurrences of substrings that match the pattern this %regex |
1743 | + * was compiled with. |
1744 | + * |
1745 | + * @param in The UTF-8 input string. |
1746 | + * @param replacement The replacement string. |
1747 | + * @param out The output string. |
1748 | + * @return Returns \c true only if at least one replacement was made. |
1749 | + */ |
1750 | + bool replace_all( char const *in, char const *replacement, zstring *out ); |
1751 | + |
1752 | + /** |
1753 | + * Replaces all occurrences of substrings that match the pattern this %regex |
1754 | + * was compiled with. |
1755 | + * |
1756 | + * @tparam InputStringType The input string type. |
1757 | + * @tparam ReplacementStringType The replacement string type. |
1758 | + * @param in The input string. |
1759 | + * @param replacement The replacement string. |
1760 | + * @param out The output string. |
1761 | + * @return Returns \c true only if at least one replacement was made. |
1762 | + */ |
1763 | + template<class InputStringType,class ReplacementStringType> |
1764 | + bool replace_all( InputStringType const &in, |
1765 | + ReplacementStringType const &replacement, |
1766 | + zstring *out ) { |
1767 | + return replace_all( in.c_str(), replacement.c_str(), out ); |
1768 | + } |
1769 | + |
1770 | + |
1771 | + /** |
1772 | + * Set the string to work on, without doing matching yet. |
1773 | + * |
1774 | + * @param in The UTF-8 input string. |
1775 | + * @param len the size in bytes. |
1776 | + */ |
1777 | + void set_string( const char* in, size_type len ); |
1778 | + |
1779 | + /** |
1780 | + * Find the next match in string set by set_string(). |
1781 | + * After finding a match, call get_match_start_end_bytes() to get the position in the string. |
1782 | + * |
1783 | + * @param reachedEnd returns true if the end of string was reached while doing the match. |
1784 | + */ |
1785 | + bool next_match( bool *reachedEnd ); |
1786 | + |
1787 | + /** |
1788 | + * Get the number of parenthesized groups in the regular expression. |
1789 | + * This number depends only on regular expression, and not on the working string. |
1790 | + * |
1791 | + * @return the number of parenthesized groups in the regular expression |
1792 | + */ |
1793 | + int get_group_count(); |
1794 | + |
1795 | + /** |
1796 | + * Get the start and end positions (in bytes) of the matched group. |
1797 | + * If groupId is zero, then the start and end position of the whole match is returned. |
1798 | + * If groupId is non-zero, then the start and end position of that group is returned. |
1799 | + * If that group has not been matched, false is returned. |
1800 | + * |
1801 | + * @param groupId the id of the group, either zero for the entire regex, |
1802 | + * or [1 .. group_count] for that specific group |
1803 | + * @param start to return start position in bytes |
1804 | + * @param end to return end position in bytes |
1805 | + * @return true if that group exists and has been matched |
1806 | + */ |
1807 | + bool get_match_start_end_bytes( int groupId, int *start, int *end ); |
1808 | + |
1809 | +private: |
1810 | + regex_engine::CRegexXQuery_regex *regex_matcher; |
1811 | + uint32_t parsed_flags; |
1812 | + |
1813 | + zstring s_in_; |
1814 | + int m_pos; |
1815 | + int m_match_pos; |
1816 | + int m_matched_len; |
1817 | + |
1818 | + uint32_t parse_regex_flags(const char* flag_cstr); |
1819 | + |
1820 | + // forbid |
1821 | + regex( regex const& ); |
1822 | + regex& operator=( regex const& ); |
1823 | +}; |
1824 | + |
1825 | +/////////////////////////////////////////////////////////////////////////////// |
1826 | + |
1827 | +} // namespace unicode |
1828 | +} // namespace zorba |
1829 | + |
1830 | +#endif /* ZORBA_NO_ICU */ |
1831 | +#endif /* ZORBA_ZORBA_REGEX_H */ |
1832 | +/* |
1833 | + * Local variables: |
1834 | + * mode: c++ |
1835 | + * End: |
1836 | + */ |
1837 | +/* vim:set et sw=2 ts=2: */ |
1838 | |
1839 | === renamed file 'src/util/regex_xquery.cpp' => 'src/util/zorba_regex_engine.cpp' |
1840 | --- src/util/regex_xquery.cpp 2013-02-28 11:15:32 +0000 |
1841 | +++ src/util/zorba_regex_engine.cpp 2013-04-12 05:29:35 +0000 |
1842 | @@ -16,19 +16,18 @@ |
1843 | |
1844 | #include "stdafx.h" |
1845 | |
1846 | -#include <ctype.h> |
1847 | -#include <string.h> |
1848 | +#include <cctype> |
1849 | +#include <cstring> |
1850 | |
1851 | +#include "ascii_util.h" |
1852 | #include "diagnostics/xquery_diagnostics.h" |
1853 | -#include "util/ascii_util.h" |
1854 | -#include "util/unicode_util.h" |
1855 | -#include "util/utf8_string.h" |
1856 | +#include "unicode_util.h" |
1857 | +#include "utf8_string.h" |
1858 | +#include "zorba_regex_engine.h" |
1859 | #include "zorbatypes/chartype.h" |
1860 | |
1861 | -#include "regex_xquery.h" |
1862 | - |
1863 | namespace zorba { |
1864 | - namespace regex_xquery{ |
1865 | + namespace regex_engine{ |
1866 | //ascii regular expression matching |
1867 | |
1868 | /*http://www.w3.org/TR/xmlschema-2/#regexs |
1869 | @@ -2525,6 +2524,6 @@ |
1870 | return false; |
1871 | } |
1872 | |
1873 | - }//end namespace regex_xquery |
1874 | + }//end namespace regex_engine |
1875 | }//end namespace zorba |
1876 | /* vim:set et sw=2 ts=2: */ |
1877 | |
1878 | === renamed file 'src/util/regex_xquery.h' => 'src/util/zorba_regex_engine.h' |
1879 | --- src/util/regex_xquery.h 2013-02-07 17:24:36 +0000 |
1880 | +++ src/util/zorba_regex_engine.h 2013-04-12 05:29:35 +0000 |
1881 | @@ -21,7 +21,7 @@ |
1882 | #include <vector> |
1883 | #include <util/unicode_util.h> |
1884 | namespace zorba { |
1885 | - namespace regex_xquery{ |
1886 | + namespace regex_engine{ |
1887 | |
1888 | //matching flags |
1889 | #define REGEX_ASCII_CASE_INSENSITIVE 1 //i |
1890 | @@ -456,7 +456,7 @@ |
1891 | }; |
1892 | |
1893 | } |
1894 | -}//end namespace zorba::regex_xquery |
1895 | +}//end namespace zorba::regex_engine |
1896 | |
1897 | #endif |
1898 | /* vim:set et sw=2 ts=2: */ |
Validation queue starting for merge proposal.
Log at: http://zorbatest.lambda.nu:8080/remotequeue/pjl-misc-2013-04-12T04-39-45.281Z/log.html