FreeBSD Bugzilla – Attachment 6249 Details for
Bug 14342
[PATCH] Speed ups for regex!
Home
|
New
|
Browse
|
Search
|
[?]
|
Reports
|
Help
|
New Account
|
Log In
Remember
[x]
|
Forgot Password
Login:
[x]
[patch]
file.diff
file.diff (text/plain), 4.49 KB, created by
howardjp
on 1999-10-15 05:50:00 UTC
(
hide
)
Description:
file.diff
Filename:
MIME Type:
Creator:
howardjp
Created:
1999-10-15 05:50:00 UTC
Size:
4.49 KB
patch
obsolete
>diff -ur regex.orig/engine.c regex/engine.c >--- regex.orig/engine.c Mon Sep 13 09:53:57 1999 >+++ regex/engine.c Fri Oct 15 00:30:46 1999 >@@ -144,14 +144,14 @@ > int eflags; > { > register char *endp; >- register int i; >+ register int i, j; >+ int patlen = g->mlen; > struct match mv; > register struct match *m = &mv; >- register char *dp; >+ register char *dp, *k, *s; > register const sopno gf = g->firststate+1; /* +1 for OEND */ > register const sopno gl = g->laststate; >- char *start; >- char *stop; >+ char *start, *stop; > > /* simplify the situation where possible */ > if (g->cflags®_NOSUB) >@@ -167,14 +167,37 @@ > return(REG_INVARG); > > /* prescreening; this does wonders for this rather slow code */ >- if (g->must != NULL) { >- for (dp = start; dp < stop; dp++) >- if (*dp == g->must[0] && stop - dp >= g->mlen && >- memcmp(dp, g->must, (size_t)g->mlen) == 0) >- break; >- if (dp == stop) /* we didn't find g->must */ >- return(REG_NOMATCH); >+ if (g->must != NULL) { >+ for (k = string + patlen - 1; k < stop;) { >+ /* >+ * for a large class of patterns, upwards of 80% of >+ * match time is spent on the next line. we beat >+ * existing microcode (vax 'matchc') this way. >+ */ >+ while ((k += g->delta0[*(unsigned char *) k]) < stop); >+ if (k < string + LARGE) >+ break; >+ k -= LARGE; >+ >+ j = patlen - 1; >+ s = k - 1; >+ while (g->cmap[*((unsigned char *)s--)] == >+ g->must[--j]); >+ >+ /* >+ * delta-less shortcut for literati. short shrift for >+ * genetic engineers? >+ */ >+ >+ if (j < 0) >+ goto bmpassed; >+ >+ k++; /* no match; restart next char */ >+ } >+ return REG_NOMATCH; > } >+ >+bmpassed: > > /* match struct setup */ > m->g = g; >diff -ur regex.orig/regcomp.c regex/regcomp.c >--- regex.orig/regcomp.c Mon Sep 13 09:53:57 1999 >+++ regex/regcomp.c Fri Oct 15 00:32:47 1999 >@@ -245,6 +245,7 @@ > g->categories = &g->catspace[-(CHAR_MIN)]; > (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); > g->backrefs = 0; >+ (void) memset((void *)g->delta0, 0, 256 * sizeof(int)); > > /* do it */ > EMIT(OEND, 0); >@@ -1738,6 +1739,7 @@ > } > assert(cp == g->must + g->mlen); > *cp++ = '\0'; /* just on general principles */ >+ gosper(g->must, g->delta0, g->cmap); > } > > /* >@@ -1774,4 +1776,46 @@ > if (plusnest != 0) > g->iflags |= BAD; > return(maxnest); >+} >+ >+ >+#define NALT 7 /* tied to scanf() size in alternate() */ >+ >+/* >+ * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c] >+ */ >+void >+gosper(pattern, delta0, cmap) >+ char *pattern; /* ... HAKMEM lives ... */ >+ int *delta0; /* Who/what is HAKMEM? */ >+ char *cmap; >+{ >+ char *altpat[NALT]; >+ int altlen[NALT]; >+ register int j, altmin; >+ unsigned char c; >+ >+ /* Make one-string case look like simple alternatives case */ >+ altmin = altlen[0] = strlen(pattern); >+ altpat[0] = pattern; >+ >+ /* For chars that aren't in any string, skip by string length. */ >+ for (j = 0; j < 256; j++) { >+ delta0[j] = altmin; >+ cmap[j] = j; /* Sneak in initialization of cmap */ >+ } >+ >+ /* For chars in a string, skip distance from char to end of string. */ >+ /* (If char appears more than once, skip minimum distance.) */ >+ >+ delta0[0] = 0; >+ for (j = 0; j < altlen[0] - 1; j++) { >+ c = altpat[0][j]; >+ delta0[c] = MIN(delta0[c], altlen[0] - j - 1); >+ } >+ >+ /* For last char of each string, fall out of search loop. */ >+ c = altpat[0][altlen[0] - 1]; >+ delta0[c] = LARGE; >+ > } >diff -ur regex.orig/regex2.h regex/regex2.h >--- regex.orig/regex2.h Mon Sep 13 09:53:57 1999 >+++ regex/regex2.h Fri Oct 15 00:36:14 1999 >@@ -56,6 +56,8 @@ > */ > #define MAGIC1 ((('r'^0200)<<8) | 'e') > >+#define META "\n^$.[]()?+*|\\" /* egrep meta-characters */ >+ > /* > * The internal representation is a *strip*, a sequence of > * operators ending with an endmarker. (Some terminology etc. is a >@@ -164,6 +166,8 @@ > size_t nsub; /* copy of re_nsub */ > int backrefs; /* does it use back references? */ > sopno nplus; /* how deep does it nest +s? */ >+ int delta0[256]; /* Boyer-Moore algorithm core */ >+ char cmap[256]; > /* catspace must be last */ > cat_t catspace[1]; /* actually [NC] */ > }; >@@ -171,3 +175,14 @@ > /* misc utilities */ > #define OUT (CHAR_MAX+1) /* a non-character value */ > #define ISWORD(c) (isalnum((uch)(c)) || (c) == '_') >+ >+/* stuff used by Boyer-Moore */ >+ >+#define MIN(A, B) ((A) > (B) ? (B) : (A)) >+ >+#define BUFSIZE 65536 /* make higher for cray */ >+#define PATSIZE 6000 >+#define LARGE BUFSIZE + PATSIZE >+ >+void gosper(char *, int *, char *); >+int chimaera(char *, char *, struct re_guts *);
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page
.
View Attachment As Diff
View Attachment As Raw
Actions:
View
|
Diff
Attachments on
bug 14342
: 6249