fix for regexp bug

From: Brian Denheyer <briand_at_deldotd.com>
Date: Thu, 10 Jun 1999 16:38:59 -0700 (PDT)

The problem was in STk. Apparently it was doing a simple loop over the
match results and quitting as soon as it reached the first NULL
entry (which indicates that the subexpression didn't match). Since
sub-expressions can be optional, this means that it did not check for
possible matches AFTER the first one that failed.

There is test code and a patch at the end of the e-mail.

Unfortunately my change makes matching less efficient. Since there
is no way to tell how many subexpressions there are, ALL 20 of the
subexpression slots must be checked for possible matches. Theoretically,
the first 19 could have failed to match while the 20th matched.

The pcre library is superior in this regard since it gives you precise
information about all of the matches.


Brian

------------------------------------------------------------

;; Test script for optional sub-expressions in STk regexps.
;; The pattern has an optional first group "(xx)?" followed by "(A*)".
;; The expected results noted below are taken from the output
;; transcript that follows this script in the message.
(define re (string->regexp "^(xx)?(A*)$"))
(display re)                    ; prints the regexp object, e.g. #<regexp ...>
(newline)

(display (re "ZZZ"))            ; reported output: #f
(newline)

(display (re "x"))              ; reported output: #f
(newline)

(display (re "xx"))             ; reported output: ((0 2) (0 2) (2 2))
(newline)

(display (re "xxAAA"))          ; reported output: ((0 5) (0 2) (2 5))
(newline)

;; The optional first group does not take part in this match, but the
;; second group does.
(display (re "A"))              ; reported output: ((0 1) (0 0) (0 1))
(newline)
(exit)

------------------------------------------------------------

.stkrc
#<regexp 808d16c>
#f
#f
((0 2) (0 2) (2 2))
((0 5) (0 2) (2 5))
((0 1) (0 0) (0 1))

------------------------------------------------------------

--- sregexp.c.old Thu Jun 10 16:01:30 1999
+++ sregexp.c Thu Jun 10 16:16:07 1999
@@ -69,9 +69,16 @@
 }
 
 /*
- * Try to match string against regular expression. Returns sub-match
- * object, or #f if no match.
+ Try to match string against regular expression. Returns sub-match
+ object, or #f if no match.
+
+ Finding the matching sets is complicated by the fact that
+ parenthesized expressions can be optional, i.e. it is possible to
+ get a result where some of the sub-expressions didn't match but
+ later sub-expressions did match. So ALL of the elements of startp
+ must be checked and a record kept of the last non-zero entry.
  */
+
 static SCM apply_regexp(SCM regexp, SCM l, SCM env)
 {
   SCM string;
@@ -79,25 +86,34 @@
 
   ENTER_SCM("apply-regexp");
 
- if (STk_llength (l) != 1) Serror("bad number of args", l);
+ if (STk_llength (l) != 1)
+ Serror("bad number of args", l);
   string = CAR (l);
 
- if (NSTRINGP (string)) Serror("bad string", string);
+ if (NSTRINGP (string))
+ Serror("bad string", string);
   the_chars = CHARS (string);
   
   if (TclRegExec(REGEXP(regexp), the_chars, the_chars)) {
     struct regexp *r = REGEXP(regexp);
     SCM z = NIL;
- int i;
+ int i, last;
+
+ for (i=0; i < NSUBEXP; i++) {
+ if (r->startp[i] != NULL)
+ last = i;
+ }
     
- /* Find the length of the result */
- for (i=0; r->startp[i]; i++) {/*Nothing*/}
-
     /* Build result */
- for (--i; i >= 0; i--) {
- z = Cons(LIST2(STk_makeinteger(r->startp[i]-the_chars),
- STk_makeinteger(r->endp[i]-the_chars)),
- z);
+ for (i=last; i >= 0; i--) {
+ if (r->startp[i] != NULL)
+ z = Cons(LIST2(STk_makeinteger(r->startp[i]-the_chars),
+ STk_makeinteger(r->endp[i]-the_chars)),
+ z);
+ else
+ z = Cons(LIST2(STk_makeinteger(0),
+ STk_makeinteger(0)),
+ z);
     }
     return z;
   }
Received on Fri Jun 11 1999 - 01:40:17 CEST

This archive was generated by hypermail 2.3.0 : Mon Jul 21 2014 - 19:38:59 CEST