Regular expressions

From: Tom Tromey <tromey_at_busco.lanl.gov>
Date: Sat, 24 Sep 94 15:12:03 MDT

Due to demand, I am just going to mail my regular expression package
to the list. It comes in two parts, a C file that does all the dirty
work, and a Scheme file that supplies some helpful (?) support
functions, plus an example of how bad my Scheme programming skills
actually are.

I've introduced two new types, "regexp" (a compiled regular
expression), and "regmatch" (what is returned by a successful match).

A regular expression is a procedure of one argument. The arg must be
a string. The procedure returns a regmatch (if a match was made), or
#f if no match.

You can turn a string into a regexp with string->regexp.

regexp? will tell you if an object is a regexp. regexp-match? will do
the same for a match object. There are 3 operations you can do on a
regexp-match:

1. Find the number of sub-matches (via match-length). This number is
   always at least 1 (because sub-match 0 is the entire matching
   substring).

2. Find the beginning of a sub-match. (match-beginning match index)

3. Find the end of a sub-match. (match-end match index)

The beginning and the end are both numbers, suitable for use with
"substring" (ie they follow the same boundary conventions)

The auxiliary scheme code in regexp.scm (to follow in next message)
provides some useful support.

regexp-substitute can be used to perform a substitution based on the
results of a regular expression. regexp-substitute-all is similar,
but does all matches. These functions let you do sed-like things.

(Aside: I'm not sure if I like these functions or not. I made up the
interfaces, so it isn't yet clear if they are actually useful. Doing
it this way seemed better than parsing a string though, if you know
what I mean)

regexp-quote quotes a string. That is, given a string S, it will
return T such that (string->regexp T) will match S and nothing else.

I haven't been programming in scheme for very long. I'd like to hear
any comments on style that you might have. In particular in
regexp.scm I couldn't find a good way for regexp-substitute and
regexp-substitute-all to share an environment. What you see is my
hacky way to do it. Is this something that no-one does? I wanted to
do it this way so as to minimize my "namespace footprint".

Tom
---
tromey_at_busco.lanl.gov             Member, League for Programming Freedom
/*
 * sregexp.c -- Regular expressions for STk.
 * tromey Fri Jul 22 1994
 *
 */
/* TO DO:
 * Write regexp-quote.
 */
#include <stk.h>
#include "tclRegexp.h"

/*
 * Regular expression type.  A regular expression is a function that
 * takes one argument.  It returns #f if no match, or a regular
 * expression match object on match.
 */
static void free_regexp (SCM ht);
static SCM apply_regexp (SCM x, SCM args, SCM env);
static int tc_regexp;
static extended_scheme_type regexp_type =
{
  "regexp",			/* name */
  EXT_ISPROC | EXT_EVALPARAM,	/* flags */
  NULL,				/* gc_mark_fct */
  free_regexp,			/* gc_sweep_fct */
  apply_regexp,			/* apply_fct */
  NULL				/* display_fct */
};
#define REGEXP(x) ((struct regexp *) ((x)->storage_as.extension.data))
#define REGEXPP(x) (TYPEP ((x), tc_regexp))

/*
 * Regular expression match object.  It can be queried to get
 * sub-match information.
 */
static void free_regmatch (SCM ht);
static int tc_regmatch;
static extended_scheme_type regmatch_type =
{
  "regmatch",			/* name */
  0,				/* flags */
  NULL,				/* gc_mark_fct */
  free_regmatch,		/* gc_sweep_fct */
  NULL,				/* apply_fct */
  NULL				/* display_fct */
};
struct re_registers
{
  int number;			/* Number of matches. */
  int startp[NSUBEXP];		/* Start indices. */
  int endp[NSUBEXP];		/* End indices. */
};
#define REGMATCH(x) ((struct re_registers *) ((x)->storage_as.extension.data))
#define REGMATCHP(x) (TYPEP ((x), tc_regmatch))

/*
 * Return #t if object is a regexp, #f otherwise.
 */
static PRIMITIVE regexp_p (SCM obj)
{
  return (REGEXPP (obj) ? truth : ntruth);
}
/*
 * Return compiled form of regexp represented by string.  Error if not
 * a string, or if regexp has a syntax error.
 */
static PRIMITIVE string_to_regexp (SCM obj)
{
  SCM result;
  if (NSTRINGP (obj))
    err ("not a string", obj);
  NEWCELL (result, tc_regexp);
  REGEXP (result) = TclRegComp (CHARS (obj));
  if (REGEXP (result) == NULL)
    {
      /* FIXME use Tcl error message. */
      err ("error compiling regexp", obj);
    }
  return (result);
}
/*
 * Try to match string against regular expression.  Returns sub-match
 * object, or #f if no match.
 */
static PRIMITIVE apply_regexp (SCM regexp, SCM l, SCM env)
{
  int match, number, length;
  struct re_registers *matches;
  SCM result, string;
  char *the_chars;
  length = llength (l);
  if (length != 1)
    err ("bad number of args", l);
  string = CAR (l);
  if (!REGEXPP (regexp))
    err ("not a regular expression", regexp);
  if (NSTRINGP (string))
    err ("not a string", string);
  the_chars = CHARS (string);
  match = TclRegExec (REGEXP (regexp), the_chars, CHARS (string));
  if (match)
    {
      matches = must_malloc (sizeof (struct re_registers));
      for (number = 0; REGEXP (regexp)->startp[number] != NULL; ++number)
	{
	  matches->startp[number] = (REGEXP (regexp)->startp[number]
				     - the_chars);
	  matches->endp[number] = (REGEXP (regexp)->endp[number] - the_chars);
	}
      matches->number = number;
      NEWCELL (result, tc_regmatch);
      REGMATCH (result) = matches;
    }
  else
    result = ntruth;
  return (result);
}

/*
 * Is object a regmatch?
 */
static PRIMITIVE regmatch_p (SCM obj)
{
  return (REGMATCHP (obj) ? truth : ntruth);
}
/*
 * Return number of submatches.
 */
static PRIMITIVE regmatch_size (SCM rm)
{
  if (!REGMATCHP (rm))
    err ("not a regular expression match", rm);
  return (makeinteger (REGMATCH (rm)->number));
}
/*
 * Return nth start index of regmatch.
 */
static PRIMITIVE regmatch_start (SCM rm, SCM n)
{
  int val;
  if (!REGMATCHP (rm))
    err ("not a regular expression match", rm);
  if (NINTEGERP (n))
    err ("not an integer", n);
  val = INTEGER (n);
  if (val >= REGMATCH (rm)->number)
    err ("match index out of bounds", n);
  return (makeinteger (REGMATCH (rm)->startp[val]));
}
/*
 * Return nth end index of regmatch.
 */
static PRIMITIVE regmatch_end (SCM rm, SCM n)
{
  int val;
  if (!REGMATCHP (rm))
    err ("not a regular expression match", rm);
  if (NINTEGERP (n))
    err ("not an integer", n);
  val = INTEGER (n);
  if (val >= REGMATCH (rm)->number)
    err ("match index out of bounds", n);
  return (makeinteger (REGMATCH (rm)->endp[val]));
}

/*
 * GC interface.
 */
static void free_regexp (SCM reg)
{
  free (REGEXP (reg));
}
static void free_regmatch (SCM reg)
{
  free (REGMATCH (reg));
}

/*
 * Initialization.
 */
PRIMITIVE init_sregexp (void)
{
  tc_regexp = add_new_type (&regexp_type);
  tc_regmatch = add_new_type (&regmatch_type);
  add_new_primitive ("regexp?", tc_subr_1, regexp_p);
  add_new_primitive ("string->regexp", tc_subr_1, string_to_regexp);
  add_new_primitive ("regmatch?", tc_subr_1, regmatch_p);
  /* These are named oddly for similarity with Emacs. FIXME? */
  add_new_primitive ("match-length", tc_subr_1, regmatch_size);
  add_new_primitive ("match-beginning", tc_subr_2, regmatch_start);
  add_new_primitive ("match-end", tc_subr_2, regmatch_end);
  return (UNDEFINED);
}
Received on Sat Sep 24 1994 - 23:13:34 CEST

This archive was generated by hypermail 2.3.0 : Mon Jul 21 2014 - 19:38:59 CEST