Merge lp:~paul-lucas/zorba/pjl-misc into lp:zorba

Proposed by Paul J. Lucas
Status: Merged
Approved by: Matthias Brantner
Approved revision: 11173
Merged at revision: 11370
Proposed branch: lp:~paul-lucas/zorba/pjl-misc
Merge into: lp:zorba
Diff against target: 1898 lines (+847/-756)
13 files modified
src/runtime/full_text/latin_tokenizer.cpp (+4/-4)
src/runtime/full_text/latin_tokenizer.h (+4/-2)
src/runtime/strings/strings_impl.cpp (+7/-8)
src/util/CMakeLists.txt (+11/-10)
src/util/icu_regex.cpp (+48/-291)
src/util/icu_regex.h (+55/-427)
src/util/passthru_streambuf.cpp (+2/-2)
src/util/passthru_streambuf.h (+1/-1)
src/util/regex.h (+34/-0)
src/util/zorba_regex.cpp (+265/-0)
src/util/zorba_regex.h (+406/-0)
src/util/zorba_regex_engine.cpp (+8/-9)
src/util/zorba_regex_engine.h (+2/-2)
To merge this branch: bzr merge lp:~paul-lucas/zorba/pjl-misc
Reviewer Review Type Date Requested Status
Matthias Brantner Approve
Paul J. Lucas Approve
Review via email: mp+158525@code.launchpad.net

Commit message

regex clean-up:

* Split off Zorba's own regex engine from ICU's.
* Clean-up of some of the hack changes that Daniel made.

Also fixed build when ZORBA_NO_ICU=1.

Description of the change

regex clean-up:

* Split off Zorba's own regex engine from ICU's.
* Clean-up of some of the hack changes that Daniel made.

Also fixed build when ZORBA_NO_ICU=1.

To post a comment you must log in.
Revision history for this message
Paul J. Lucas (paul-lucas) :
review: Approve
Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :
Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :

The attempt to merge lp:~paul-lucas/zorba/pjl-misc into lp:zorba failed. Below is the output from the failed tests.

CMake Error at /home/ceej/zo/testing/zorbatest/tester/TarmacLander.cmake:275 (message):
  Validation queue job pjl-misc-2013-04-12T04-39-45.281Z is finished. The
  final status was:

  26 tests did not succeed - changes not committed.

Error in read script: /home/ceej/zo/testing/zorbatest/tester/TarmacLander.cmake

Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :

There are additional revisions which have not been approved in review. Please seek review and approval of these new revisions.

Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :
Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :

The attempt to merge lp:~paul-lucas/zorba/pjl-misc into lp:zorba failed. Below is the output from the failed tests.

CMake Error at /home/ceej/zo/testing/zorbatest/tester/TarmacLander.cmake:275 (message):
  Validation queue job pjl-misc-2013-04-12T05-38-39.33Z is finished. The
  final status was:

  1 tests did not succeed - changes not committed.

Error in read script: /home/ceej/zo/testing/zorbatest/tester/TarmacLander.cmake

Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :
Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :

Validation queue job pjl-misc-2013-04-12T06-13-41.393Z is finished. The final status was:

All tests succeeded!

Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :

Voting does not meet specified criteria. Required: Approve > 1, Disapprove < 1, Needs Fixing < 1, Pending < 1, Needs Information < 1, Resubmit < 1. Got: 1 Approve.

Revision history for this message
Matthias Brantner (matthias-brantner) :
review: Approve
Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :
Revision history for this message
Zorba Build Bot (zorba-buildbot) wrote :

Validation queue job pjl-misc-2013-04-12T20-50-42.062Z is finished. The final status was:

All tests succeeded!

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'src/runtime/full_text/latin_tokenizer.cpp'
2--- src/runtime/full_text/latin_tokenizer.cpp 2013-02-07 17:24:36 +0000
3+++ src/runtime/full_text/latin_tokenizer.cpp 2013-04-12 05:29:35 +0000
4@@ -210,7 +210,7 @@
5 // no break;
6 case '!':
7 case '?':
8- ++numbers().sent;
9+ ++state().sent;
10 }
11 } // for
12
13@@ -229,11 +229,11 @@
14 << ": \"" << token << "\"\n";
15 #endif /* PRINT_TOKENS */
16
17- callback(
18+ callback.token(
19 token.data(), token.size(), lang,
20- numbers().token, numbers().sent, numbers().para, item
21+ state().token, state().sent, state().para, item
22 );
23- ++numbers().token;
24+ ++state().token;
25 return true;
26 }
27 return false;
28
29=== modified file 'src/runtime/full_text/latin_tokenizer.h'
30--- src/runtime/full_text/latin_tokenizer.h 2013-02-07 17:24:36 +0000
31+++ src/runtime/full_text/latin_tokenizer.h 2013-04-12 05:29:35 +0000
32@@ -39,8 +39,8 @@
33 // inherited
34 void destroy() const;
35 void properties( Properties* ) const;
36- void tokenize_string( char const*, size_type, iso639_1::type, bool, Callback&,
37- Item const* );
38+ void tokenize_string( char const*, size_type, locale::iso639_1::type, bool,
39+ Callback&, Item const* );
40
41 private:
42 typedef zstring string_type;
43@@ -65,6 +65,8 @@
44
45 class LatinTokenizerProvider : public TokenizerProvider {
46 public:
47+ LatinTokenizerProvider() { } // needed to work-around compiler bug
48+
49 // inherited
50 bool getTokenizer( locale::iso639_1::type, Tokenizer::State* = 0,
51 Tokenizer::ptr* = 0 ) const;
52
53=== modified file 'src/runtime/strings/strings_impl.cpp'
54--- src/runtime/strings/strings_impl.cpp 2013-03-15 08:22:41 +0000
55+++ src/runtime/strings/strings_impl.cpp 2013-04-12 05:29:35 +0000
56@@ -1900,7 +1900,7 @@
57 break;
58 }
59 #ifndef ZORBA_NO_ICU
60- match_startg = rx.get_match_start(i+1);
61+ match_startg = rx.get_group_start(i+1);
62 if((match_startg < 0) && (gparent < 0))
63 continue;
64 #else
65@@ -1920,7 +1920,7 @@
66 GENV_ITEMFACTORY->createTextNode(non_group_text_item, parent.getp(), non_group_str);
67 }
68 #ifndef ZORBA_NO_ICU
69- match_endg = rx.get_match_end(i+1);
70+ match_endg = rx.get_group_end(i+1);
71 #else
72 match_endg = temp_endg;
73 #endif
74@@ -2123,17 +2123,16 @@
75
76 unicode::regex rx;
77 rx.compile(lib_pattern, flags.c_str());
78- int nr_pattern_groups = rx.get_pattern_group_count();
79+ int nr_pattern_groups = rx.get_group_count();
80 std::vector<int> group_parent;
81 computePatternGroupsParents(xquery_pattern, group_parent);
82
83 //see if regex can match empty strings
84 bool reachedEnd = false;
85 rx.set_string("", 0);
86- if (rx.find_next_match(&reachedEnd))
87+ if (rx.next_match(&reachedEnd))
88 {
89 throw XQUERY_EXCEPTION(err::FORX0003, ERROR_PARAMS(lib_pattern));
90-
91 }
92
93 store::Item_t null_parent;
94@@ -2187,13 +2186,13 @@
95 int match_end1 = 0;
96 unsigned int match_end1_bytes = 0;
97 reachedEnd = false;
98- while(rx.find_next_match(&reachedEnd))
99+ while(rx.next_match(&reachedEnd))
100 {
101 int match_start2;
102 int match_end2;
103 #ifndef ZORBA_NO_ICU
104- match_start2 = rx.get_match_start();
105- match_end2 = rx.get_match_end();
106+ match_start2 = rx.get_group_start();
107+ match_end2 = rx.get_group_end();
108 #else
109 rx.get_match_start_end_bytes(0, &match_start2, &match_end2);
110 #endif
111
112=== modified file 'src/util/CMakeLists.txt'
113--- src/util/CMakeLists.txt 2013-03-16 20:44:27 +0000
114+++ src/util/CMakeLists.txt 2013-04-12 05:29:35 +0000
115@@ -24,7 +24,6 @@
116 json_parser.cpp
117 json_util.cpp
118 mem_streambuf.cpp
119- regex.cpp
120 stream_util.cpp
121 string_util.cpp
122 time_util.cpp
123@@ -44,18 +43,20 @@
124 LIST(APPEND UTIL_SRCS cxx_util.cpp)
125 ENDIF (NOT ZORBA_CXX_NULLPTR)
126
127-IF(ZORBA_WITH_FILE_ACCESS)
128+IF (ZORBA_WITH_FILE_ACCESS)
129 LIST(APPEND UTIL_SRCS mmap_file.cpp)
130-ENDIF(ZORBA_WITH_FILE_ACCESS)
131+ENDIF (ZORBA_WITH_FILE_ACCESS)
132
133-IF(ZORBA_NO_ICU)
134- LIST(APPEND UTIL_SRCS
135- regex_xquery.cpp
136- passthru_streambuf.cpp)
137-ELSE(ZORBA_NO_ICU)
138- LIST(APPEND UTIL_SRCS
139+IF (ZORBA_NO_ICU)
140+ LIST(APPEND UTIL_SRCS
141+ passthru_streambuf.cpp
142+ zorba_regex.cpp
143+ zorba_regex_engine.cpp)
144+ELSE (ZORBA_NO_ICU)
145+ LIST(APPEND UTIL_SRCS
146+ icu_regex.cpp
147 icu_streambuf.cpp)
148-ENDIF(ZORBA_NO_ICU)
149+ENDIF (ZORBA_NO_ICU)
150
151 HEADER_GROUP_SUBFOLDER(UTIL_SRCS fx)
152 HEADER_GROUP_SUBFOLDER(UTIL_SRCS win32)
153
154=== renamed file 'src/util/regex.cpp' => 'src/util/icu_regex.cpp'
155--- src/util/regex.cpp 2013-04-11 17:45:40 +0000
156+++ src/util/icu_regex.cpp 2013-04-12 05:29:35 +0000
157@@ -15,14 +15,17 @@
158 */
159 #include "stdafx.h"
160
161+#include <zorba/config.h>
162+
163+#ifndef ZORBA_NO_ICU
164+
165 #include <cstring>
166 #include <vector>
167
168 #include <zorba/diagnostic_list.h>
169-#include "diagnostics/xquery_exception.h"
170-
171 #include "diagnostics/assert.h"
172 #include "diagnostics/dict.h"
173+#include "diagnostics/xquery_exception.h"
174
175 #include "ascii_util.h"
176 #include "cxx_util.h"
177@@ -32,13 +35,12 @@
178 #define INVALID_RE_EXCEPTION(...) \
179 XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS( __VA_ARGS__ ) )
180
181-#ifndef ZORBA_NO_ICU
182-# include <unicode/uversion.h>
183+#include <unicode/uversion.h>
184 U_NAMESPACE_USE
185
186-# ifndef U_ICU_VERSION_MAJOR_NUM
187-# error "U_ICU_VERSION_MAJOR_NUM not defined"
188-# elif U_ICU_VERSION_MAJOR_NUM < 4
189+#ifndef U_ICU_VERSION_MAJOR_NUM
190+# error "U_ICU_VERSION_MAJOR_NUM not defined"
191+#elif U_ICU_VERSION_MAJOR_NUM < 4
192 //
193 // UREGEX_LITERAL is only in ICU since 4.0. For earlier versions, we
194 // define it ourselves. Of course it won't have any effect since it's not
195@@ -46,8 +48,8 @@
196 // though the constant is defined in 4.0, it's not actually implemented as
197 // of 4.4.
198 //
199-# define UREGEX_LITERAL 16
200-# endif /* U_ICU_VERSION_MAJOR_NUM */
201+# define UREGEX_LITERAL 16
202+#endif /* U_ICU_VERSION_MAJOR_NUM */
203
204 using namespace std;
205
206@@ -559,7 +561,6 @@
207
208 ///////////////////////////////////////////////////////////////////////////////
209
210-
211 namespace unicode {
212
213 void regex::compile( string const &u_pattern, char const *flags,
214@@ -582,10 +583,27 @@
215 }
216 }
217
218+int regex::get_group_count() {
219+ ZORBA_ASSERT( matcher_ );
220+ return matcher_->groupCount();
221+}
222+
223+int regex::get_group_start( int group ) {
224+ ZORBA_ASSERT( matcher_ );
225+ UErrorCode status = U_ZERO_ERROR;
226+ return matcher_->start( group, status );
227+}
228+
229+int regex::get_group_end( int group ) {
230+ ZORBA_ASSERT( matcher_ );
231+ UErrorCode status = U_ZERO_ERROR;
232+ return matcher_->end( group, status );
233+}
234+
235 bool regex::match_part( string const &s ) {
236 ZORBA_ASSERT( matcher_ );
237 matcher_->reset( s );
238- return matcher_->find() != 0;
239+ return !!matcher_->find();
240 }
241
242 bool regex::match_whole( string const &s ) {
243@@ -645,6 +663,19 @@
244 return false;
245 }
246
247+bool regex::next_match( bool *reached_end ) {
248+ ZORBA_ASSERT( matcher_ );
249+ bool const found = !!matcher_->find();
250+ if ( reached_end ) {
251+#if U_ICU_VERSION_MAJOR_NUM >= 4
252+ *reached_end = !!matcher_->hitEnd();
253+#else
254+ *reached_end = true;
255+#endif /* U_ICU_VERSION_MAJOR_NUM */
256+ }
257+ return found;
258+}
259+
260 bool regex::replace_all( string const &in, string const &replacement,
261 string *out ) {
262 ZORBA_ASSERT( matcher_ );
263@@ -663,290 +694,16 @@
264 replace_all( u_in, u_replacement, out );
265 }
266
267-void regex::set_string( char const *in, size_type len ) {
268- ZORBA_ASSERT( matcher_ );
269- to_string( in, len, &s_in_ );
270- matcher_->reset( s_in_ );
271-}
272-
273-bool regex::find_next_match( bool *reachedEnd ) {
274- ZORBA_ASSERT( matcher_ );
275- UBool retfind = matcher_->find();
276- if ( reachedEnd ) {
277-#if U_ICU_VERSION_MAJOR_NUM >= 4
278- *reachedEnd = matcher_->hitEnd() != 0;
279-#else
280- *reachedEnd = true;
281-#endif
282- }
283- return retfind != 0;
284-}
285-
286-int regex::get_pattern_group_count() {
287- ZORBA_ASSERT( matcher_ );
288- return matcher_->groupCount();
289-}
290-
291-int regex::get_match_start( int groupId ) {
292- ZORBA_ASSERT( matcher_ );
293- UErrorCode status = U_ZERO_ERROR;
294- return matcher_->start( groupId, status );
295-}
296-
297-int regex::get_match_end( int groupId ) {
298- ZORBA_ASSERT( matcher_ );
299- UErrorCode status = U_ZERO_ERROR;
300- return matcher_->end( groupId, status );
301-}
302-
303-} // namespace unicode
304-} // namespace zorba
305+void regex::set_string( char const *s, size_type s_len ) {
306+ ZORBA_ASSERT( matcher_ );
307+ to_string( s, s_len, &s_ );
308+ matcher_->reset( s_ );
309+}
310
311 ///////////////////////////////////////////////////////////////////////////////
312
313-#else /* ZORBA_NO_ICU */
314-
315-#include "zorbatypes/zstring.h"
316-
317-namespace zorba {
318-//no convertion
319-void convert_xquery_re( zstring const &xq_re, zstring *lib_re,
320- char const *flags)
321-{
322- *lib_re = xq_re;
323-}
324-
325-namespace unicode {
326-
327-uint32_t regex::parse_regex_flags(const char* flag_cstr)
328-{
329- uint32_t flags = 0;
330- for (const char* p = flag_cstr; *p != '\0'; ++p)
331- {
332- switch (*p)
333- {
334- case 'i': flags |= REGEX_ASCII_CASE_INSENSITIVE; break;
335- case 's': flags |= REGEX_ASCII_DOTALL; break;
336- case 'm': flags |= REGEX_ASCII_MULTILINE; break;
337- case 'x': flags |= REGEX_ASCII_NO_WHITESPACE; break;
338- case 'q': flags |= REGEX_ASCII_LITERAL; break;
339- default:
340- throw XQUERY_EXCEPTION( err::FORX0001, ERROR_PARAMS( *p ) );
341- break;
342- }
343- }
344- return flags;
345-}
346-
347-void regex::compile( char const *pattern, char const *flags)
348-{
349- parsed_flags = parse_regex_flags(flags);
350- regex_xquery::CRegexXQuery_parser regex_parser;
351- regex_matcher = regex_parser.parse(pattern, parsed_flags);
352- if(!regex_matcher)
353- throw INVALID_RE_EXCEPTION(pattern);
354-}
355-
356-bool regex::match_part( char const *s )
357-{
358- bool retval;
359- int match_pos;
360- int matched_len;
361-
362- retval = regex_matcher->match_anywhere(s, parsed_flags, &match_pos, &matched_len);
363-
364- return retval;
365-}
366-
367-bool regex::next_match( char const *s, size_type *pos, zstring *match )
368-{
369- bool retval;
370- int match_pos;
371- int matched_len;
372-
373- retval = regex_matcher->match_anywhere(s+*pos, parsed_flags, &match_pos, &matched_len);
374- if(retval)
375- {
376- match->assign(s+*pos+match_pos, matched_len);
377- *pos += match_pos + matched_len;
378- }
379- return retval;
380-}
381-
382-bool regex::next_token( char const *s, size_type *pos, zstring *token,
383- bool *matched)
384-{
385- if(!s[*pos])
386- return false;
387- bool retval;
388- int match_pos;
389- int matched_len;
390-
391- retval = regex_matcher->match_anywhere(s+*pos, parsed_flags, &match_pos, &matched_len);
392- if(retval)
393- {
394- if(token)
395- token->assign(s+*pos, match_pos);
396- *pos += match_pos + matched_len;
397- if(matched)
398- *matched = true;
399- return true;
400- }
401- else
402- {
403- if(token)
404- token->assign(s+*pos);
405- *pos += strlen(s+*pos);
406- if(matched)
407- *matched = false;
408- return true;
409- }
410-}
411-
412-bool regex::match_whole( char const *s )
413-{
414- bool retval;
415- int matched_pos;
416- int matched_len;
417-
418- retval = regex_matcher->match_anywhere(s, parsed_flags|REGEX_ASCII_WHOLE_MATCH, &matched_pos, &matched_len);
419- if(!retval)
420- return false;
421- return true;
422-}
423-
424-bool regex::replace_all( char const *in, char const *replacement, zstring *result )
425-{
426- int match_pos;
427- int matched_len;
428-
429- const char *start_str = in;
430- int subregex_count = regex_matcher->get_indexed_regex_count();
431- bool retval = false;
432-
433- while(regex_matcher->match_anywhere(start_str, parsed_flags, &match_pos, &matched_len))
434- {
435- if(match_pos)
436- result->append(start_str , match_pos);
437- retval = true;
438- const char *temprepl = replacement;
439- const char *submatched_source;
440- int submatched_len;
441- int index;
442- while(*temprepl)
443- {
444- //look for dollars
445- if(*temprepl == '\\')
446- {
447- if(!(parsed_flags & REGEX_ASCII_LITERAL))
448- {
449- temprepl++;
450- if(!*temprepl)
451- temprepl--;
452- else if((*temprepl != '\\') && (*temprepl != '$'))//Invalid replacement string.
453- throw XQUERY_EXCEPTION( err::FORX0004, ERROR_PARAMS( replacement ) );
454- }
455- result->append(1, *temprepl);
456- temprepl++;
457- continue;
458- }
459- if((*temprepl == '$') && !(parsed_flags & REGEX_ASCII_LITERAL))
460- {
461- temprepl++;
462- index = 0;
463- int nr_digits = 0;
464- while(isdigit(*temprepl))
465- {
466- if(nr_digits && ((index*10 + (*temprepl)-'0') > subregex_count))
467- break;
468- index = index*10 + (*temprepl)-'0';
469- temprepl++;
470- nr_digits++;
471- }
472- if(!nr_digits)//Invalid replacement string.
473- throw XQUERY_EXCEPTION( err::FORX0004, ERROR_PARAMS( replacement ) );
474- else if(!index)
475- {
476- result->append(start_str+match_pos, matched_len);
477- }
478- else if(regex_matcher->get_indexed_match(index, &submatched_source, &submatched_len))
479- {
480- if(submatched_source && submatched_len)
481- result->append(submatched_source, submatched_len);
482- }
483- }
484- else
485- {
486- result->append(1, *temprepl);
487- temprepl++;
488- }
489- }
490- start_str += match_pos + matched_len;
491- }
492- result->append(start_str);
493-
494- return retval;
495-}
496-
497-void regex::set_string( const char* in, size_type len )
498-{
499- s_in_.assign(in, len);
500- m_pos = 0;
501- m_match_pos = 0;
502- m_matched_len = 0;
503-}
504-
505-bool regex::find_next_match( bool *reachedEnd )
506-{
507- bool retval;
508-
509- retval = regex_matcher->match_anywhere(s_in_.c_str()+m_pos, parsed_flags, &m_match_pos, &m_matched_len);
510- if(retval)
511- {
512- m_match_pos += m_pos;
513- m_pos = m_match_pos + m_matched_len;
514- }
515- else
516- {
517- m_pos = s_in_.length();
518- m_match_pos = 0;
519- m_matched_len = 0;
520- }
521- if(reachedEnd)
522- *reachedEnd = regex_matcher->get_reachedEnd();
523- return retval;
524-}
525-
526-int regex::get_pattern_group_count()
527-{
528- return (int)regex_matcher->get_indexed_regex_count();
529-}
530-
531-bool regex::get_match_start_end_bytes( int groupId, int *start, int *end )
532-{
533- *start = -1;
534- *end = -1;
535- if(groupId == 0)
536- {
537- *start = m_match_pos;
538- *end = m_match_pos + m_matched_len;
539- return true;
540- }
541- if(groupId > (int)regex_matcher->get_indexed_regex_count())
542- return false;
543- const char *submatched_source;
544- int submatched_len;
545- if(!regex_matcher->get_indexed_match(groupId, &submatched_source, &submatched_len))
546- return false;
547- *start = submatched_source - s_in_.c_str();
548- *end = *start + submatched_len;
549- return true;
550-}
551-
552 } // namespace unicode
553 } // namespace zorba
554+
555 #endif /* ZORBA_NO_ICU */
556-
557-///////////////////////////////////////////////////////////////////////////////
558-
559 /* vim:set et sw=2 ts=2: */
560
561=== renamed file 'src/util/regex.h' => 'src/util/icu_regex.h'
562--- src/util/regex.h 2013-02-07 17:24:36 +0000
563+++ src/util/icu_regex.h 2013-04-12 05:29:35 +0000
564@@ -14,8 +14,8 @@
565 * limitations under the License.
566 */
567 #pragma once
568-#ifndef ZORBA_REGEX_H
569-#define ZORBA_REGEX_H
570+#ifndef ZORBA_ICU_REGEX_H
571+#define ZORBA_ICU_REGEX_H
572
573 #include "cxx_util.h"
574 #include "unicode_util.h"
575@@ -42,7 +42,6 @@
576
577 namespace unicode {
578
579-
580 /**
581 * The %regex class wraps the underlying Unicode regular expression library.
582 */
583@@ -421,59 +420,61 @@
584 return replace_all( in.c_str(), replacement.c_str(), out );
585 }
586
587-
588- /**
589- * Set the string to work on, without doing matching yet.
590- *
591- * @param in The UTF-8 input string.
592- * @param len the size in bytes.
593- */
594- void set_string( const char* in, size_type len );
595-
596- /**
597- * Find the next match in string set by set_string().
598- * After finding a match, call get_match_start() and get_match_end() to get the position in the string.
599- *
600- * @param reachedEnd returns true if the end of string was reached while doing the match.
601- * This works only for ICU greater than 4.0. For smaller versions, reachedEnd is always true.
602- */
603- bool find_next_match( bool *reachedEnd );
604-
605- /**
606- * Get the number of parenthesized groups in the regular expression.
607- * This number depends only on regular expression, and not on the working string.
608- *
609- * @return the number of parenthesized groups in the regular expression
610- */
611- int get_pattern_group_count();
612-
613- /**
614- * Get the start position of the matched group.
615- * If groupId is zero, then the start position of the whole match is returned.
616- * If groupId is non-zero, then the start position of that group is returned.
617- * If that group has not been matched, -1 is returned.
618- *
619- * @param groupId the id of the group, either zero for the entire regex,
620- * or [1 .. group_count] for that specific group
621- * @return the start position, zero based, or -1 if that group didn't match
622- */
623- int get_match_start( int groupId = 0 );
624-
625- /**
626- * Get the end position of the matched group.
627- * If groupId is zero, then the end position of the whole match is returned.
628- * If groupId is non-zero, then the end position of that group is returned.
629- * If that group has not been matched, -1 is returned.
630- *
631- * @param groupId the id of the group, either zero for the entire regex,
632- * or [1 .. group_count] for that specific group
633- * @return the end position, zero based, or -1 if that group didn't match
634- */
635- int get_match_end( int groupId = 0 );
636+ ////////// capturing subgroups //////////////////////////////////////////////
637+
638+ /**
639+ * Gets the number of capturing subgroups in the regular expression.
640+ *
641+ * @return Returns said number.
642+ */
643+ int get_group_count();
644+
645+ /**
646+ * Gets the start character position of the matched capturing subgroup.
647+ *
648+ * @param group The ID of the capturing subgroup [1-N] where N is the result
649+ * of get_group_count(), or 0 for the whole match.
650+ * @return Returns the start position (zero-based) or -1 if \a group didn't
651+ * match.
652+ */
653+ int get_group_start( int group = 0 );
654+
655+ /**
656+ * Gets the end character position of the matched group.
657+ *
658+ * @param group The ID of the capturing subgroup [1-N] where N is the result
659+ * of get_group_count(), or 0 for the whole match.
660+ * @return Returns the end position (zero-based) or -1 if \a group didn't
661+ * match.
662+ */
663+ int get_group_end( int group = 0 );
664+
665+ /**
666+ * Sets the string to work on, without doing matching yet.
667+ *
668+ * @param s The UTF-8 input string.
669+ * @param s_len The length of \a s in bytes.
670+ */
671+ void set_string( char const *s, size_type s_len );
672+
673+ /**
674+ * Finds the next match in the string set by set_string(). After finding a
675+ * match, call get_group_start() and get_group_end() to get the position in
676+ * the string.
677+ *
678+ * @param reached_end If not \c nullptr, set to \c true only if the end of
679+ * string has been reached while doing the match. (This works only for ICU
680+ * version 4.0 or later; for earlier versions, this is always set to
681+ * <code>true</code>.)
682+ * @return Returns \c true only if the next match was found.
683+ */
684+ bool next_match( bool *reached_end );
685+
686+ /////////////////////////////////////////////////////////////////////////////
687
688 private:
689 U_NAMESPACE_QUALIFIER RegexMatcher *matcher_;
690- string s_in_;
691+ string s_;
692
693 enum re_type_t {
694 re_is_match, // RE specifies what to match
695@@ -491,386 +492,13 @@
696 regex& operator=( regex const& );
697 };
698
699-} // namespace unicode
700-} // namespace zorba
701-
702-///////////////////////////////////////////////////////////////////////////////
703-
704-#else /* ZORBA_NO_ICU */
705-
706-#include "util/regex_xquery.h"
707-#include <string>
708-
709-namespace zorba{
710-/**
711- * Converts an XQuery regular expression to the form used by the regular
712- * expression library Zorba is using (here regex_xquery).
713- *
714- * @param xq_re The XQuery regular expression.
715- * @param lib_re A pointer to the resuling library regular expression.
716- * @param flags The flags to use, if any.
717- */
718-void convert_xquery_re( zstring const &xq_re, zstring *lib_re,
719- char const *flags = "" );
720-
721-namespace unicode{
722-////////// classes ////////////////////////////////////////////////////////////
723-
724-
725-/**
726- * The %regex class wraps the underlying Unicode regular expression library.
727- */
728-class regex {
729-public:
730- /**
731- * Constructs a %regex.
732- */
733- regex() : regex_matcher( nullptr ) { }
734-
735- /**
736- * Destroys a %regex.
737- */
738- ~regex() {
739- delete regex_matcher;
740- }
741-
742- ////////// compile pattern //////////////////////////////////////////////////
743-
744- /**
745- * Compiles a regular expression. One of the compile functions must be
746- * called prior to calling one of the match functions.
747- *
748- * @param pattern The regular expression pattern to compile.
749- * @param flags The regular expression flags, if any.
750- * @param throws err:FORX0002 if the regular expression is invalid.
751- */
752- void compile( char const *pattern, char const *flags = "" ) ;
753-
754- /**
755- * Compiles a regular expression. One of the compile functions must be
756- * called prior to calling one of the match functions.
757- *
758- * @tparam StringType The pattern string type.
759- * @param pattern The regular expression pattern to compile.
760- * @param flags The regular expression flags, if any.
761- * @param throws err:FORX0002 if the regular expression is invalid.
762- */
763- template<class StringType>
764- void compile( StringType const &pattern, char const *flags = "" ) {
765- compile( pattern.c_str(), flags );
766- }
767-
768- /**
769- * Compiles a regular expression. One of the compile functions must be
770- * called prior to calling one of the match functions.
771- *
772- * @tparam PatternStringType The pattern string type.
773- * @tparam FlagsStringType The flags string type.
774- * @param pattern The regular expression pattern to compile.
775- * @param flags The regular expression flags, if any.
776- * @param throws err:FORX0002 if the regular expression is invalid.
777- */
778- template<class PatternStringType,class FlagsStringType>
779- void compile( PatternStringType const &pattern,
780- FlagsStringType const &flags ) {
781- compile( pattern.c_str(), flags.c_str() );
782- }
783-
784- ////////// partial match ////////////////////////////////////////////////////
785-
786- /**
787- * Checks whether the given string partially patches the previosuly compiled
788- * regular expression. A "partial match" means that at least part of the
789- * string matches, e.g., "b" matches the regular expression "aba".
790- *
791- * @param s The null-terminated UTF-8 C string to attempt to match.
792- * @return Returns \c true only if the string partially matches.
793- */
794- bool match_part( char const *s );
795-
796- /**
797- * Checks whether the given string partially patches the previosuly compiled
798- * regular expression. A "partial match" means that at least part of the
799- * string matches, e.g., "b" matches the regular expression "aba".
800- *
801- * @param s The UTF-8 C string to attempt to match.
802- * @param s_len The length of the string in bytes.
803- * @return Returns \c true only if the string partially matches.
804- */
805- bool match_part( char const *s, size_type s_len )
806- {
807- zstring scut(s, s_len);
808- return match_part(scut.c_str());
809- }
810-
811- /**
812- * Checks whether the given string partially patches the previosuly compiled
813- * regular expression. A "partial match" means that at least part of the
814- * string matches, e.g., "b" matches the regular expression "aba".
815- *
816- * @tparam StringType The string type.
817- * @param s The UTF-8 string to attempt to match.
818- * @return Returns \c true only if the string partially matches.
819- */
820- template<class StringType>
821- bool match_part( StringType const &s ) {
822- return match_part(s.c_str());
823- }
824-
825- ////////// partial match with substrings/tokenization ///////////////////////
826-
827- /**
828- * Finds the next substring matching the pattern this %regex was compiled
829- * with.
830- *
831- * @param s The C string to attempt to match.
832- * @param pos A pointer to the position to start looking for a match. On
833- * successful return, the position is updated to be one past the last
834- * character of the match.
835- * @param match A pointer to the string that is to be set to the substring
836- * matching the pattern or NULL if the substring is not needed.
837- * @return Returns \c true only if there is a match.
838- */
839- bool next_match( char const *s, size_type *pos, zstring *match );
840-
841- /**
842- * Finds the next substring matching the pattern this %regex was compiled
843- * with.
844- *
845- * @param s The C string to attempt to match.
846- * @param s_len The length of the C string.
847- * @param pos A pointer to the position to start looking for a match. On
848- * successful return, the position is updated to be one past the last
849- * character of the match.
850- * @param match A pointer to the string that is to be set to the substring
851- * matching the pattern or NULL if the substring is not needed.
852- * @return Returns \c true only if there is a match.
853- */
854- bool next_match( char const *s, size_type s_len, size_type *pos,
855- zstring *match )
856- {
857- zstring scut(s, s_len);
858- return next_match(scut.c_str(), pos, match);
859- }
860-
861- /**
862- * Finds the next substring matching the pattern this %regex was compiled
863- * with.
864- *
865- * @tparam StringType The string type.
866- * @param s The string to attempt to match.
867- * @param pos A pointer to the position to start looking for a match. On
868- * successful return, the position is updated to be one past the last
869- * character of the match.
870- * @param match A pointer to the string that is to be set to the substring
871- * matching the pattern or NULL if the substring is not needed.
872- * @return Returns \c true only if there is a match.
873- */
874- template<class StringType>
875- bool next_match( StringType const &s, size_type *pos, zstring *match ) {
876- return next_match(s.c_str(), pos, match);
877- }
878-
879-
880- /**
881- * Finds the next substring separated by the pattern this %regex was compiled
882- * with (similar to <code>strtok</code>(3)).
883- *
884- * @param s The C string to attempt to find a token in.
885- * @param pos A pointer to the position to start looking for a token. On
886- * successful return, the position is updated to be one past the last
887- * character of the token.
888- * @param token A pointer to the string that is to be set to the substring
889- * separated by the pattern or \c NULL if the substring is not needed.
890- * @param matched A pointer to a \c bool to indicate whether the pattern
891- * matched for the token or \c NULL if this is not needed. If not \c NULL,
892- * it is set to \c false either if there is no token or the token is the
893- * final token after the last separator; it is set to \c true only for
894- * non-last tokens.
895- * @return Returns \c true only if there is a token.
896- */
897- bool next_token( char const *s, size_type *pos, zstring *token,
898- bool *matched = NULL );
899-
900- /**
901- * Finds the next substring separated by the pattern this %regex was compiled
902- * with (similar to <code>strtok</code>(3)).
903- *
904- * @param s The C string to attempt to find a token in.
905- * @param s_len The length of the C string.
906- * @param pos A pointer to the position to start looking for a token. On
907- * successful return, the position is updated to be one past the last
908- * character of the token.
909- * @param token A pointer to the string that is to be set to the substring
910- * separated by the pattern or \c NULL if the substring is not needed.
911- * @param matched A pointer to a \c bool to indicate whether the pattern
912- * matched for the token or \c NULL if this is not needed. If not \c NULL,
913- * it is set to \c false either if there is no token or the token is the
914- * final token after the last separator; it is set to \c true only for
915- * non-last tokens.
916- * @return Returns \c true only if there is a token.
917- */
918- bool next_token( char const *s, size_type s_len, size_type *pos,
919- zstring *token, bool *matched = NULL )
920- {
921- zstring scut(s, s_len);
922- return next_token(scut.c_str(), pos, token, matched);
923- }
924-
925- /**
926- * Finds the next substring separated by the pattern this %regex was compiled
927- * with (similar to <code>strtok</code>(3)).
928- *
929- * @tparam StringType The string type.
930- * @param s The string to attempt to find a token in.
931- * @param pos A pointer to the position to start looking for a token. On
932- * successful return, the position is updated to be one past the last
933- * character of the token.
934- * @param token A pointer to the string that is to be set to the substring
935- * separated by the pattern or \c NULL if the substring is not needed.
936- * @return Returns \c true only if there is a token.
937- */
938- template<class StringType>
939- bool next_token( StringType const &s, size_type *pos, zstring *token,
940- bool *matched = NULL ) {
941- return next_token(s.c_str(), pos, token, matched);
942- }
943-
944- ////////// whole match //////////////////////////////////////////////////////
945-
946- /**
947- * Checks whether the given string completely patches the previosuly compiled
948- * regular expression. A "complete match" means that the entire string must
949- * match the regular expression as if the pattern were "^pattern$".
950- *
951- * @param s The null-terminated UTF-8 C string to attempt to match.
952- * @return Returns \c true only if the string completely matches.
953- */
954- bool match_whole( char const *s );
955-
956- /**
957- * Checks whether the given string completely patches the previosuly compiled
958- * regular expression. A "complete match" means that the entire string must
959- * match the regular expression as if the pattern were "^pattern$".
960- *
961- * @param s The UTF-8 C string to attempt to match.
962- * @param s_len The length of the string in bytes.
963- * @return Returns \c true only if the string completely matches.
964- */
965- bool match_whole( char const *s, size_type s_len )
966- {
967- zstring scut(s, s_len);
968- return match_whole(scut.c_str());
969- }
970-
971- /**
972- * Checks whether the given string completely patches the previosuly compiled
973- * regular expression. A "complete match" means that the entire string must
974- * match the regular expression as if the pattern were "^pattern$".
975- *
976- * @tparam StringType The string type.
977- * @param s The UTF-8 string to attempt to match.
978- * @return Returns \c true only if the string completely matches.
979- */
980- template<class StringType>
981- bool match_whole( StringType const &s ) {
982- return match_whole(s.c_str());
983- }
984-
985- ////////// replacement //////////////////////////////////////////////////////
986-
987- /**
988- * Replaces all occurrences of substrings that match the pattern this %regex
989- * was compiled with.
990- *
991- * @param in The UTF-8 input string.
992- * @param replacement The replacement string.
993- * @param out The output string.
994- * @param Returns \c true only if at least one replacement was made.
995- */
996- bool replace_all( char const *in, char const *replacement, zstring *out );
997-
998- /**
999- * Replaces all occurrences of substrings that match the pattern this %regex
1000- * was compiled with.
1001- *
1002- * @tparam InputStringType The input string type.
1003- * @tparam ReplacementStringType The replacement string type.
1004- * @param in The input string.
1005- * @param replacement The replacement string.
1006- * @param out The output string.
1007- * @param Returns \c true only if at least one replacement was made.
1008- */
1009- template<class InputStringType,class ReplacementStringType>
1010- bool replace_all( InputStringType const &in,
1011- ReplacementStringType const &replacement,
1012- zstring *out ) {
1013- return replace_all( in.c_str(), replacement.c_str(), out );
1014- }
1015-
1016-
1017- /**
1018- * Set the string to work on, without doing matching yet.
1019- *
1020- * @param in The UTF-8 input string.
1021- * @param len the size in bytes.
1022- */
1023- void set_string( const char* in, size_type len );
1024-
1025- /**
1026- * Find the next match in string set by set_string().
1027- * After finding a match, call get_match_start() and get_match_end() to get the position in the string.
1028- *
1029- * @param reachedEnd returns true if the end of string was reached while doing the match.
1030- */
1031- bool find_next_match( bool *reachedEnd );
1032-
1033- /**
1034- * Get the number of parenthesized groups in the regular expression.
1035- * This number depends only on regular expression, and not on the working string.
1036- *
1037- * @return the number of parenthesized groups in the regular expression
1038- */
1039- int get_pattern_group_count();
1040-
1041- /**
1042- * Get the start position of the matched group.
1043- * If groupId is zero, then the start and end position of the whole match is returned.
1044- * If groupId is non-zero, then the start and end position of that group is returned.
1045- * If that group has not been matched, false is returned.
1046- *
1047- * @param groupId the id of the group, either zero for the entire regex,
1048- * or [1 .. group_count] for that specific group
1049- * @param start to return start position in bytes
1050- * @param end to return end position in bytes
1051- * @return true if that group exists and has been matched
1052- */
1053- bool get_match_start_end_bytes( int groupId, int *start, int *end );
1054-
1055-
1056-private:
1057- regex_xquery::CRegexXQuery_regex *regex_matcher;
1058- uint32_t parsed_flags;
1059-
1060- zstring s_in_;
1061- int m_pos;
1062- int m_match_pos;
1063- int m_matched_len;
1064-
1065- uint32_t parse_regex_flags(const char* flag_cstr);
1066-
1067- // forbid
1068- regex( regex const& );
1069- regex& operator=( regex const& );
1070-};
1071-
1072 ///////////////////////////////////////////////////////////////////////////////
1073
1074 } // namespace unicode
1075 } // namespace zorba
1076
1077 #endif /* ZORBA_NO_ICU */
1078-#endif /* ZORBA_REGEX_H */
1079+#endif /* ZORBA_ICU_REGEX_H */
1080 /*
1081 * Local variables:
1082 * mode: c++
1083
1084=== modified file 'src/util/passthru_streambuf.cpp'
1085--- src/util/passthru_streambuf.cpp 2013-02-07 17:24:36 +0000
1086+++ src/util/passthru_streambuf.cpp 2013-04-12 05:29:35 +0000
1087@@ -26,7 +26,7 @@
1088 ///////////////////////////////////////////////////////////////////////////////
1089
1090 passthru_streambuf::passthru_streambuf( char const*, streambuf *orig ) :
1091- proxy_streambuf( orig )
1092+ internal::proxy_streambuf( orig )
1093 {
1094 if ( !orig )
1095 throw invalid_argument( "null streambuf" );
1096@@ -85,7 +85,7 @@
1097
1098 passthru_streambuf::int_type passthru_streambuf::pbackfail( int_type c ) {
1099 return traits_type::eq_int_type( c, traits_type::eof() ) ?
1100- c : proxy_buf_->sputbackc( traits_type::to_char_type( c ) );
1101+ c : original()->sputbackc( traits_type::to_char_type( c ) );
1102 }
1103
1104 passthru_streambuf::int_type passthru_streambuf::uflow() {
1105
1106=== modified file 'src/util/passthru_streambuf.h'
1107--- src/util/passthru_streambuf.h 2013-02-07 17:24:36 +0000
1108+++ src/util/passthru_streambuf.h 2013-04-12 05:29:35 +0000
1109@@ -30,7 +30,7 @@
1110 * A %passthru_streambuf is-a std::streambuf that simply passes through
1111 * characters unchanged.
1112 */
1113-class passthru_streambuf : public proxy_streambuf {
1114+class passthru_streambuf : public internal::proxy_streambuf {
1115 public:
1116 #ifdef WIN32
1117 // These typedefs are needed (but shouldn't be) when using MSVC++.
1118
1119=== added file 'src/util/regex.h'
1120--- src/util/regex.h 1970-01-01 00:00:00 +0000
1121+++ src/util/regex.h 2013-04-12 05:29:35 +0000
1122@@ -0,0 +1,34 @@
1123+/*
1124+ * Copyright 2006-2008 The FLWOR Foundation.
1125+ *
1126+ * Licensed under the Apache License, Version 2.0 (the "License");
1127+ * you may not use this file except in compliance with the License.
1128+ * You may obtain a copy of the License at
1129+ *
1130+ * http://www.apache.org/licenses/LICENSE-2.0
1131+ *
1132+ * Unless required by applicable law or agreed to in writing, software
1133+ * distributed under the License is distributed on an "AS IS" BASIS,
1134+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1135+ * See the License for the specific language governing permissions and
1136+ * limitations under the License.
1137+ */
1138+#pragma once
1139+#ifndef ZORBA_REGEX_H
1140+#define ZORBA_REGEX_H
1141+
1142+#include <zorba/config.h>
1143+
1144+#ifdef ZORBA_NO_ICU
1145+#include "zorba_regex.h"
1146+#else
1147+#include "icu_regex.h"
1148+#endif /* ZORBA_NO_ICU */
1149+
1150+#endif /* ZORBA_REGEX_H */
1151+/*
1152+ * Local variables:
1153+ * mode: c++
1154+ * End:
1155+ */
1156+/* vim:set et sw=2 ts=2: */
1157
1158=== added file 'src/util/zorba_regex.cpp'
1159--- src/util/zorba_regex.cpp 1970-01-01 00:00:00 +0000
1160+++ src/util/zorba_regex.cpp 2013-04-12 05:29:35 +0000
1161@@ -0,0 +1,265 @@
1162+/*
1163+ * Copyright 2006-2008 The FLWOR Foundation.
1164+ *
1165+ * Licensed under the Apache License, Version 2.0 (the "License");
1166+ * you may not use this file except in compliance with the License.
1167+ * You may obtain a copy of the License at
1168+ *
1169+ * http://www.apache.org/licenses/LICENSE-2.0
1170+ *
1171+ * Unless required by applicable law or agreed to in writing, software
1172+ * distributed under the License is distributed on an "AS IS" BASIS,
1173+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1174+ * See the License for the specific language governing permissions and
1175+ * limitations under the License.
1176+ */
1177+#include "stdafx.h"
1178+
1179+#include <cstring>
1180+
1181+#include <zorba/diagnostic_list.h>
1182+#include "diagnostics/dict.h"
1183+#include "diagnostics/xquery_exception.h"
1184+
1185+#include "stl_util.h"
1186+#include "zorba_regex.h"
1187+
1188+#define INVALID_RE_EXCEPTION(...) \
1189+ XQUERY_EXCEPTION( err::FORX0002, ERROR_PARAMS( __VA_ARGS__ ) )
1190+
1191+namespace zorba {
1192+namespace unicode {
1193+
1194+///////////////////////////////////////////////////////////////////////////////
1195+
1196+uint32_t regex::parse_regex_flags(const char* flag_cstr) // Translates the flag letters i/s/m/x/q into an OR-ed REGEX_ASCII_* bitmask.
1197+{
1198+ uint32_t flags = 0;
1199+ for (const char* p = flag_cstr; *p != '\0'; ++p) // flag_cstr must be NUL-terminated and non-null; it is read one char at a time
1200+ {
1201+ switch (*p)
1202+ {
1203+ case 'i': flags |= REGEX_ASCII_CASE_INSENSITIVE; break;
1204+ case 's': flags |= REGEX_ASCII_DOTALL; break;
1205+ case 'm': flags |= REGEX_ASCII_MULTILINE; break;
1206+ case 'x': flags |= REGEX_ASCII_NO_WHITESPACE; break;
1207+ case 'q': flags |= REGEX_ASCII_LITERAL; break;
1208+ default:
1209+ throw XQUERY_EXCEPTION( err::FORX0001, ERROR_PARAMS( *p ) ); // any other letter raises err:FORX0001 carrying the offending character
1210+ break; // never reached: the throw above leaves the function
1211+ }
1212+ }
1213+ return flags; // 0 when flag_cstr is the empty string
1214+}
1215+
1216+void regex::compile( char const *pattern, char const *flags) // Parses flags, then parses pattern into regex_matcher; throws err:FORX0002 if the parser returns null.
1217+{
1218+ parsed_flags = parse_regex_flags(flags); // stored because every later match call passes parsed_flags to the matcher
1219+ regex_engine::CRegexXQuery_parser regex_parser;
1220+ regex_matcher = regex_parser.parse(pattern, parsed_flags); // NOTE(review): a matcher from an earlier compile() is overwritten here without being deleted - confirm compile() is only called once per object
1221+ if(!regex_matcher)
1222+ throw INVALID_RE_EXCEPTION(pattern);
1223+}
1224+
1225+bool regex::match_part( char const *s ) // Returns whether match_anywhere() finds the compiled pattern somewhere in s.
1226+{
1227+ bool retval;
1228+ int match_pos; // required by match_anywhere(); the reported position is not used here
1229+ int matched_len; // required by match_anywhere(); the reported length is not used here
1230+
1231+ retval = regex_matcher->match_anywhere(s, parsed_flags, &match_pos, &matched_len);
1232+
1233+ return retval;
1234+}
1235+
1236+bool regex::next_match( char const *s, size_type *pos, zstring *match ) // Finds the next match at or after s[*pos]; on success *pos is moved just past it. match may be NULL.
1237+{
1238+ bool retval;
1239+ int match_pos; // offset of the match relative to s+*pos, not to s
1240+ int matched_len;
1241+
1242+ retval = regex_matcher->match_anywhere(s+*pos, parsed_flags, &match_pos, &matched_len);
1243+ if(retval)
1244+ {
1245+ if(match) match->assign(s+*pos+match_pos, matched_len); // fix: the header allows NULL when the substring is not needed; it used to be dereferenced unconditionally
1246+ *pos += match_pos + matched_len;
1247+ }
1248+ return retval; // on failure *pos and *match are left untouched
1249+}
1250+
1251+bool regex::next_token( char const *s, size_type *pos, zstring *token, // Yields the text between s[*pos] and the next separator match (or the end of s).
1252+ bool *matched) // token and matched are optional (may be NULL); pos is required
1253+{
1254+ if(!s[*pos]) // already at the terminating NUL: no further token
1255+ return false;
1256+ bool retval;
1257+ int match_pos; // offset of the separator relative to s+*pos
1258+ int matched_len;
1259+
1260+ retval = regex_matcher->match_anywhere(s+*pos, parsed_flags, &match_pos, &matched_len);
1261+ if(retval)
1262+ {
1263+ if(token)
1264+ token->assign(s+*pos, match_pos); // the token is everything before the separator
1265+ *pos += match_pos + matched_len; // resume after the separator on the next call
1266+ if(matched)
1267+ *matched = true;
1268+ return true;
1269+ }
1270+ else
1271+ {
1272+ if(token)
1273+ token->assign(s+*pos); // no separator left: the remainder of s is the final token
1274+ *pos += strlen(s+*pos); // park *pos on the NUL so the next call returns false
1275+ if(matched)
1276+ *matched = false;
1277+ return true;
1278+ }
1279+}
1280+
1281+bool regex::match_whole( char const *s ) // Same matcher call as match_part(), but with REGEX_ASCII_WHOLE_MATCH added to the flags.
1282+{
1283+ bool retval;
1284+ int matched_pos; // required by match_anywhere(); not used afterwards
1285+ int matched_len; // required by match_anywhere(); not used afterwards
1286+
1287+ retval = regex_matcher->match_anywhere(s, parsed_flags|REGEX_ASCII_WHOLE_MATCH, &matched_pos, &matched_len); // the extra flag applies to this call only; parsed_flags itself is unchanged
1288+ if(!retval)
1289+ return false;
1290+ return true;
1291+}
1292+
1293+bool regex::replace_all( char const *in, char const *replacement, zstring *result ) // Appends to *result a copy of in with every match replaced; returns true if at least one match was found.
1294+{
1295+ int match_pos;
1296+ int matched_len;
1297+
1298+ const char *start_str = in; // start of the not-yet-processed tail of the input
1299+ int subregex_count = regex_matcher->get_indexed_regex_count();
1300+ bool retval = false;
1301+
1302+ while(regex_matcher->match_anywhere(start_str, parsed_flags, &match_pos, &matched_len))
1303+ {
1304+ if(match_pos)
1305+ result->append(start_str , match_pos); // copy the unmatched text that precedes this match
1306+ retval = true;
1307+ const char *temprepl = replacement; // the replacement template is re-expanded for every match
1308+ const char *submatched_source;
1309+ int submatched_len;
1310+ int index;
1311+ while(*temprepl)
1312+ {
1313+ // a backslash escapes '\' or '$' unless the 'q' (literal) flag is set
1314+ if(*temprepl == '\\')
1315+ {
1316+ if(!(parsed_flags & REGEX_ASCII_LITERAL))
1317+ {
1318+ temprepl++;
1319+ if(!*temprepl)
1320+ temprepl--; // trailing lone backslash: step back so it is emitted as-is
1321+ else if((*temprepl != '\\') && (*temprepl != '$'))//Invalid replacement string.
1322+ throw XQUERY_EXCEPTION( err::FORX0004, ERROR_PARAMS( replacement ) );
1323+ }
1324+ result->append(1, *temprepl);
1325+ temprepl++;
1326+ continue;
1327+ }
1328+ if((*temprepl == '$') && !(parsed_flags & REGEX_ASCII_LITERAL))
1329+ {
1330+ temprepl++;
1331+ index = 0;
1332+ int nr_digits = 0;
1333+ while(isdigit(static_cast<unsigned char>(*temprepl))) // fix: passing a negative plain char (e.g. a UTF-8 byte) to isdigit() is undefined behaviour
1334+ {
1335+ if(nr_digits && ((index*10 + (*temprepl)-'0') > subregex_count)) // after the first digit, stop before the group number would exceed the group count
1336+ break;
1337+ index = index*10 + (*temprepl)-'0';
1338+ temprepl++;
1339+ nr_digits++;
1340+ }
1341+ if(!nr_digits)//Invalid replacement string.
1342+ throw XQUERY_EXCEPTION( err::FORX0004, ERROR_PARAMS( replacement ) );
1343+ else if(!index)
1344+ {
1345+ result->append(start_str+match_pos, matched_len); // $0 stands for the whole match
1346+ }
1347+ else if(regex_matcher->get_indexed_match(index, &submatched_source, &submatched_len))
1348+ {
1349+ if(submatched_source && submatched_len)
1350+ result->append(submatched_source, submatched_len);
1351+ } // a group for which get_indexed_match() returns false contributes nothing
1352+ }
1353+ else
1354+ {
1355+ result->append(1, *temprepl);
1356+ temprepl++;
1357+ }
1358+ }
1359+ start_str += match_pos + matched_len; // continue searching after this match
1360+ }
1361+ result->append(start_str); // copy whatever follows the last match (the whole input if nothing matched)
1362+
1363+ return retval;
1364+}
1365+
1366+void regex::set_string( const char* in, size_type len ) // Copies len bytes of in into s_in_ and resets the incremental-match state.
1367+{
1368+ s_in_.assign(in, len); // a private copy is kept, so in need not outlive this call
1369+ m_pos = 0; // the next search starts at the beginning of s_in_
1370+ m_match_pos = 0;
1371+ m_matched_len = 0;
1372+}
1373+
1374+bool regex::next_match( bool *reachedEnd ) // Searches s_in_ from m_pos for the next match; reachedEnd is optional (may be NULL).
1375+{
1376+ bool retval;
1377+
1378+ retval = regex_matcher->match_anywhere(s_in_.c_str()+m_pos, parsed_flags, &m_match_pos, &m_matched_len); // m_match_pos comes back relative to s_in_.c_str()+m_pos
1379+ if(retval)
1380+ {
1381+ m_match_pos += m_pos; // rebase to an offset from the start of s_in_
1382+ m_pos = m_match_pos + m_matched_len; // the next search resumes just past this match
1383+ }
1384+ else
1385+ {
1386+ m_pos = s_in_.length(); // no match: move the cursor to the end of the string
1387+ m_match_pos = 0;
1388+ m_matched_len = 0;
1389+ }
1390+ if(reachedEnd)
1391+ *reachedEnd = regex_matcher->get_reachedEnd(); // forwarded from the matcher whether or not a match was found
1392+ return retval;
1393+}
1394+
1395+int regex::get_group_count() // Returns the matcher's get_indexed_regex_count(), converted to int.
1396+{
1397+ return (int)regex_matcher->get_indexed_regex_count();
1398+}
1399+
1400+bool regex::get_match_start_end_bytes( int groupId, int *start, int *end ) // Reports byte offsets into the set_string() buffer for the last match (groupId 0) or one of its groups.
1401+{
1402+ *start = -1; // both outputs stay -1 whenever false is returned
1403+ *end = -1;
1404+ if(groupId == 0)
1405+ {
1406+ *start = m_match_pos; // already an offset from the start of s_in_
1407+ *end = m_match_pos + m_matched_len;
1408+ return true;
1409+ }
1410+ if(groupId < 0 || groupId > (int)regex_matcher->get_indexed_regex_count()) // fix: negative ids are rejected too instead of being handed to get_indexed_match()
1411+ return false;
1412+ const char *submatched_source;
1413+ int submatched_len;
1414+ if(!regex_matcher->get_indexed_match(groupId, &submatched_source, &submatched_len))
1415+ return false;
1416+ *start = submatched_source - s_in_.c_str(); // presumably submatched_source points into s_in_, the buffer the last next_match(bool*) searched - TODO confirm
1417+ *end = *start + submatched_len;
1418+ return true;
1419+}
1420+
1421+///////////////////////////////////////////////////////////////////////////////
1422+
1423+} // namespace unicode
1424+} // namespace zorba
1425+
1426+/* vim:set et sw=2 ts=2: */
1427
1428=== added file 'src/util/zorba_regex.h'
1429--- src/util/zorba_regex.h 1970-01-01 00:00:00 +0000
1430+++ src/util/zorba_regex.h 2013-04-12 05:29:35 +0000
1431@@ -0,0 +1,406 @@
1432+/*
1433+ * Copyright 2006-2008 The FLWOR Foundation.
1434+ *
1435+ * Licensed under the Apache License, Version 2.0 (the "License");
1436+ * you may not use this file except in compliance with the License.
1437+ * You may obtain a copy of the License at
1438+ *
1439+ * http://www.apache.org/licenses/LICENSE-2.0
1440+ *
1441+ * Unless required by applicable law or agreed to in writing, software
1442+ * distributed under the License is distributed on an "AS IS" BASIS,
1443+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1444+ * See the License for the specific language governing permissions and
1445+ * limitations under the License.
1446+ */
1447+#pragma once
1448+#ifndef ZORBA_ZORBA_REGEX_H
1449+#define ZORBA_ZORBA_REGEX_H
1450+
1451+#include <zorba/config.h>
1452+
1453+#ifdef ZORBA_NO_ICU
1454+
1455+#include "cxx_util.h"
1456+#include "unicode_util.h"
1457+#include "zorba_regex_engine.h"
1458+#include "zorbatypes/zstring.h"
1459+
1460+namespace zorba {
1461+
1462+/**
1463+ * Converts an XQuery regular expression to the form used by the regular
1464+ * expression library Zorba is using (here regex_engine).
1465+ *
1466+ * @param xq_re The XQuery regular expression.
1467+ * @param lib_re A pointer to the resulting library regular expression.
1468+ * @param flags The flags to use, if any.
1469+ */
1470+inline void convert_xquery_re( zstring const &xq_re, zstring *lib_re,
1471+ char const *flags = "" ) {
1472+ *lib_re = xq_re; // no conversion needed: the pattern is copied verbatim and flags is not consulted
1473+}
1474+
1475+////////// classes ////////////////////////////////////////////////////////////
1476+
1477+namespace unicode {
1478+
1479+/**
1480+ * The %regex class wraps the underlying Unicode regular expression library.
1481+ */
1482+class regex {
1483+public:
1484+ /**
1485+ * Constructs a %regex.
1486+ */
1487+ regex() : regex_matcher( nullptr ) { }
1488+
1489+ /**
1490+ * Destroys a %regex.
1491+ */
1492+ ~regex() {
1493+ delete regex_matcher;
1494+ }
1495+
1496+ ////////// compile pattern //////////////////////////////////////////////////
1497+
1498+ /**
1499+ * Compiles a regular expression. One of the compile functions must be
1500+ * called prior to calling one of the match functions.
1501+ *
1502+ * @param pattern The regular expression pattern to compile.
1503+ * @param flags The regular expression flags, if any.
1504+ * @throws err:FORX0002 if the regular expression is invalid.
1505+ */
1506+ void compile( char const *pattern, char const *flags = "" ) ;
1507+
1508+ /**
1509+ * Compiles a regular expression. One of the compile functions must be
1510+ * called prior to calling one of the match functions.
1511+ *
1512+ * @tparam StringType The pattern string type.
1513+ * @param pattern The regular expression pattern to compile.
1514+ * @param flags The regular expression flags, if any.
1515+ * @throws err:FORX0002 if the regular expression is invalid.
1516+ */
1517+ template<class StringType>
1518+ void compile( StringType const &pattern, char const *flags = "" ) {
1519+ compile( pattern.c_str(), flags );
1520+ }
1521+
1522+ /**
1523+ * Compiles a regular expression. One of the compile functions must be
1524+ * called prior to calling one of the match functions.
1525+ *
1526+ * @tparam PatternStringType The pattern string type.
1527+ * @tparam FlagsStringType The flags string type.
1528+ * @param pattern The regular expression pattern to compile.
1529+ * @param flags The regular expression flags, if any.
1530+ * @throws err:FORX0002 if the regular expression is invalid.
1531+ */
1532+ template<class PatternStringType,class FlagsStringType>
1533+ void compile( PatternStringType const &pattern,
1534+ FlagsStringType const &flags ) {
1535+ compile( pattern.c_str(), flags.c_str() );
1536+ }
1537+
1538+ ////////// partial match ////////////////////////////////////////////////////
1539+
1540+ /**
1541+ * Checks whether the given string partially matches the previously compiled
1542+ * regular expression. A "partial match" means that at least part of the
1543+ * string matches, e.g., "aba" partially matches the regular expression "b".
1544+ *
1545+ * @param s The null-terminated UTF-8 C string to attempt to match.
1546+ * @return Returns \c true only if the string partially matches.
1547+ */
1548+ bool match_part( char const *s );
1549+
1550+ /**
1551+ * Checks whether the given string partially matches the previously compiled
1552+ * regular expression. A "partial match" means that at least part of the
1553+ * string matches, e.g., "aba" partially matches the regular expression "b".
1554+ *
1555+ * @param s The UTF-8 C string to attempt to match.
1556+ * @param s_len The length of the string in bytes.
1557+ * @return Returns \c true only if the string partially matches.
1558+ */
1559+ bool match_part( char const *s, size_type s_len )
1560+ {
1561+ zstring scut(s, s_len);
1562+ return match_part(scut.c_str());
1563+ }
1564+
1565+ /**
1566+ * Checks whether the given string partially matches the previously compiled
1567+ * regular expression. A "partial match" means that at least part of the
1568+ * string matches, e.g., "aba" partially matches the regular expression "b".
1569+ *
1570+ * @tparam StringType The string type.
1571+ * @param s The UTF-8 string to attempt to match.
1572+ * @return Returns \c true only if the string partially matches.
1573+ */
1574+ template<class StringType>
1575+ bool match_part( StringType const &s ) {
1576+ return match_part(s.c_str());
1577+ }
1578+
1579+ ////////// partial match with substrings/tokenization ///////////////////////
1580+
1581+ /**
1582+ * Finds the next substring matching the pattern this %regex was compiled
1583+ * with.
1584+ *
1585+ * @param s The C string to attempt to match.
1586+ * @param pos A pointer to the position to start looking for a match. On
1587+ * successful return, the position is updated to be one past the last
1588+ * character of the match.
1589+ * @param match A pointer to the string that is to be set to the substring
1590+ * matching the pattern or NULL if the substring is not needed.
1591+ * @return Returns \c true only if there is a match.
1592+ */
1593+ bool next_match( char const *s, size_type *pos, zstring *match );
1594+
1595+ /**
1596+ * Finds the next substring matching the pattern this %regex was compiled
1597+ * with.
1598+ *
1599+ * @param s The C string to attempt to match.
1600+ * @param s_len The length of the C string.
1601+ * @param pos A pointer to the position to start looking for a match. On
1602+ * successful return, the position is updated to be one past the last
1603+ * character of the match.
1604+ * @param match A pointer to the string that is to be set to the substring
1605+ * matching the pattern or NULL if the substring is not needed.
1606+ * @return Returns \c true only if there is a match.
1607+ */
1608+ bool next_match( char const *s, size_type s_len, size_type *pos,
1609+ zstring *match )
1610+ {
1611+ zstring scut(s, s_len);
1612+ return next_match(scut.c_str(), pos, match);
1613+ }
1614+
1615+ /**
1616+ * Finds the next substring matching the pattern this %regex was compiled
1617+ * with.
1618+ *
1619+ * @tparam StringType The string type.
1620+ * @param s The string to attempt to match.
1621+ * @param pos A pointer to the position to start looking for a match. On
1622+ * successful return, the position is updated to be one past the last
1623+ * character of the match.
1624+ * @param match A pointer to the string that is to be set to the substring
1625+ * matching the pattern or NULL if the substring is not needed.
1626+ * @return Returns \c true only if there is a match.
1627+ */
1628+ template<class StringType>
1629+ bool next_match( StringType const &s, size_type *pos, zstring *match ) {
1630+ return next_match(s.c_str(), pos, match);
1631+ }
1632+
1633+
1634+ /**
1635+ * Finds the next substring separated by the pattern this %regex was compiled
1636+ * with (similar to <code>strtok</code>(3)).
1637+ *
1638+ * @param s The C string to attempt to find a token in.
1639+ * @param pos A pointer to the position to start looking for a token. On
1640+ * successful return, the position is updated to be one past the last
1641+ * character of the token.
1642+ * @param token A pointer to the string that is to be set to the substring
1643+ * separated by the pattern or \c NULL if the substring is not needed.
1644+ * @param matched A pointer to a \c bool to indicate whether the pattern
1645+ * matched for the token or \c NULL if this is not needed. If not \c NULL,
1646+ * it is set to \c false either if there is no token or the token is the
1647+ * final token after the last separator; it is set to \c true only for
1648+ * non-last tokens.
1649+ * @return Returns \c true only if there is a token.
1650+ */
1651+ bool next_token( char const *s, size_type *pos, zstring *token,
1652+ bool *matched = NULL );
1653+
1654+ /**
1655+ * Finds the next substring separated by the pattern this %regex was compiled
1656+ * with (similar to <code>strtok</code>(3)).
1657+ *
1658+ * @param s The C string to attempt to find a token in.
1659+ * @param s_len The length of the C string.
1660+ * @param pos A pointer to the position to start looking for a token. On
1661+ * successful return, the position is updated to be one past the last
1662+ * character of the token.
1663+ * @param token A pointer to the string that is to be set to the substring
1664+ * separated by the pattern or \c NULL if the substring is not needed.
1665+ * @param matched A pointer to a \c bool to indicate whether the pattern
1666+ * matched for the token or \c NULL if this is not needed. If not \c NULL,
1667+ * it is set to \c false either if there is no token or the token is the
1668+ * final token after the last separator; it is set to \c true only for
1669+ * non-last tokens.
1670+ * @return Returns \c true only if there is a token.
1671+ */
1672+ bool next_token( char const *s, size_type s_len, size_type *pos,
1673+ zstring *token, bool *matched = NULL )
1674+ {
1675+ zstring scut(s, s_len);
1676+ return next_token(scut.c_str(), pos, token, matched);
1677+ }
1678+
1679+ /**
1680+ * Finds the next substring separated by the pattern this %regex was compiled
1681+ * with (similar to <code>strtok</code>(3)).
1682+ *
1683+ * @tparam StringType The string type.
1684+ * @param s The string to attempt to find a token in.
1685+ * @param pos A pointer to the position to start looking for a token. On
1686+ * successful return, the position is updated to be one past the last
1687+ * character of the token.
1688+ * @param token A pointer to the string that is to be set to the substring
1689+ * separated by the pattern or \c NULL if the substring is not needed.
1690+ * @return Returns \c true only if there is a token.
1691+ */
1692+ template<class StringType>
1693+ bool next_token( StringType const &s, size_type *pos, zstring *token,
1694+ bool *matched = NULL ) {
1695+ return next_token(s.c_str(), pos, token, matched);
1696+ }
1697+
1698+ ////////// whole match //////////////////////////////////////////////////////
1699+
1700+ /**
1701+ * Checks whether the given string completely matches the previously compiled
1702+ * regular expression. A "complete match" means that the entire string must
1703+ * match the regular expression as if the pattern were "^pattern$".
1704+ *
1705+ * @param s The null-terminated UTF-8 C string to attempt to match.
1706+ * @return Returns \c true only if the string completely matches.
1707+ */
1708+ bool match_whole( char const *s );
1709+
1710+ /**
1711+ * Checks whether the given string completely matches the previously compiled
1712+ * regular expression. A "complete match" means that the entire string must
1713+ * match the regular expression as if the pattern were "^pattern$".
1714+ *
1715+ * @param s The UTF-8 C string to attempt to match.
1716+ * @param s_len The length of the string in bytes.
1717+ * @return Returns \c true only if the string completely matches.
1718+ */
1719+ bool match_whole( char const *s, size_type s_len )
1720+ {
1721+ zstring scut(s, s_len);
1722+ return match_whole(scut.c_str());
1723+ }
1724+
1725+ /**
1726+ * Checks whether the given string completely matches the previously compiled
1727+ * regular expression. A "complete match" means that the entire string must
1728+ * match the regular expression as if the pattern were "^pattern$".
1729+ *
1730+ * @tparam StringType The string type.
1731+ * @param s The UTF-8 string to attempt to match.
1732+ * @return Returns \c true only if the string completely matches.
1733+ */
1734+ template<class StringType>
1735+ bool match_whole( StringType const &s ) {
1736+ return match_whole(s.c_str());
1737+ }
1738+
1739+ ////////// replacement //////////////////////////////////////////////////////
1740+
1741+ /**
1742+ * Replaces all occurrences of substrings that match the pattern this %regex
1743+ * was compiled with.
1744+ *
1745+ * @param in The UTF-8 input string.
1746+ * @param replacement The replacement string.
1747+ * @param out The output string.
1748+ * @return Returns \c true only if at least one replacement was made.
1749+ */
1750+ bool replace_all( char const *in, char const *replacement, zstring *out );
1751+
1752+ /**
1753+ * Replaces all occurrences of substrings that match the pattern this %regex
1754+ * was compiled with.
1755+ *
1756+ * @tparam InputStringType The input string type.
1757+ * @tparam ReplacementStringType The replacement string type.
1758+ * @param in The input string.
1759+ * @param replacement The replacement string.
1760+ * @param out The output string.
1761+ * @return Returns \c true only if at least one replacement was made.
1762+ */
1763+ template<class InputStringType,class ReplacementStringType>
1764+ bool replace_all( InputStringType const &in,
1765+ ReplacementStringType const &replacement,
1766+ zstring *out ) {
1767+ return replace_all( in.c_str(), replacement.c_str(), out );
1768+ }
1769+
1770+
1771+ /**
1772+ * Set the string to work on, without doing matching yet.
1773+ *
1774+ * @param in The UTF-8 input string.
1775+ * @param len the size in bytes.
1776+ */
1777+ void set_string( const char* in, size_type len );
1778+
1779+ /**
1780+ * Find the next match in string set by set_string().
1781+ * After finding a match, call get_match_start_end_bytes() to get the position in the string.
1782+ *
1783+ * @param reachedEnd returns true if the end of string was reached while doing the match.
1784+ */
1785+ bool next_match( bool *reachedEnd );
1786+
1787+ /**
1788+ * Get the number of parenthesized groups in the regular expression.
1789+ * This number depends only on regular expression, and not on the working string.
1790+ *
1791+ * @return the number of parenthesized groups in the regular expression
1792+ */
1793+ int get_group_count();
1794+
1795+ /**
1796+ * Get the start and end positions (in bytes) of the matched group.
1797+ * If groupId is zero, then the start and end position of the whole match is returned.
1798+ * If groupId is non-zero, then the start and end position of that group is returned.
1799+ * If that group has not been matched, false is returned.
1800+ *
1801+ * @param groupId the id of the group, either zero for the entire regex,
1802+ * or [1 .. group_count] for that specific group
1803+ * @param start to return start position in bytes
1804+ * @param end to return end position in bytes
1805+ * @return true if that group exists and has been matched
1806+ */
1807+ bool get_match_start_end_bytes( int groupId, int *start, int *end );
1808+
1809+private:
1810+ regex_engine::CRegexXQuery_regex *regex_matcher;
1811+ uint32_t parsed_flags;
1812+
1813+ zstring s_in_;
1814+ int m_pos;
1815+ int m_match_pos;
1816+ int m_matched_len;
1817+
1818+ uint32_t parse_regex_flags(const char* flag_cstr);
1819+
1820+ // forbid
1821+ regex( regex const& );
1822+ regex& operator=( regex const& );
1823+};
1824+
1825+///////////////////////////////////////////////////////////////////////////////
1826+
1827+} // namespace unicode
1828+} // namespace zorba
1829+
1830+#endif /* ZORBA_NO_ICU */
1831+#endif /* ZORBA_ZORBA_REGEX_H */
1832+/*
1833+ * Local variables:
1834+ * mode: c++
1835+ * End:
1836+ */
1837+/* vim:set et sw=2 ts=2: */
1838
1839=== renamed file 'src/util/regex_xquery.cpp' => 'src/util/zorba_regex_engine.cpp'
1840--- src/util/regex_xquery.cpp 2013-02-28 11:15:32 +0000
1841+++ src/util/zorba_regex_engine.cpp 2013-04-12 05:29:35 +0000
1842@@ -16,19 +16,18 @@
1843
1844 #include "stdafx.h"
1845
1846-#include <ctype.h>
1847-#include <string.h>
1848+#include <cctype>
1849+#include <cstring>
1850
1851+#include "ascii_util.h"
1852 #include "diagnostics/xquery_diagnostics.h"
1853-#include "util/ascii_util.h"
1854-#include "util/unicode_util.h"
1855-#include "util/utf8_string.h"
1856+#include "unicode_util.h"
1857+#include "utf8_string.h"
1858+#include "zorba_regex_engine.h"
1859 #include "zorbatypes/chartype.h"
1860
1861-#include "regex_xquery.h"
1862-
1863 namespace zorba {
1864- namespace regex_xquery{
1865+ namespace regex_engine{
1866 //ascii regular expression matching
1867
1868 /*http://www.w3.org/TR/xmlschema-2/#regexs
1869@@ -2525,6 +2524,6 @@
1870 return false;
1871 }
1872
1873- }//end namespace regex_xquery
1874+ }//end namespace regex_engine
1875 }//end namespace zorba
1876 /* vim:set et sw=2 ts=2: */
1877
1878=== renamed file 'src/util/regex_xquery.h' => 'src/util/zorba_regex_engine.h'
1879--- src/util/regex_xquery.h 2013-02-07 17:24:36 +0000
1880+++ src/util/zorba_regex_engine.h 2013-04-12 05:29:35 +0000
1881@@ -21,7 +21,7 @@
1882 #include <vector>
1883 #include <util/unicode_util.h>
1884 namespace zorba {
1885- namespace regex_xquery{
1886+ namespace regex_engine{
1887
1888 //matching flags
1889 #define REGEX_ASCII_CASE_INSENSITIVE 1 //i
1890@@ -456,7 +456,7 @@
1891 };
1892
1893 }
1894-}//end namespace zorba::regex_xquery
1895+}//end namespace zorba::regex_engine
1896
1897 #endif
1898 /* vim:set et sw=2 ts=2: */

Subscribers

People subscribed via source and target branches