Attachment #255419 for bug #282880




PORTNAME=	amberfish
DISTVERSION=	1.7.1
CATEGORIES=	textproc databases

# The distfile is the GitLab snapshot of the commit below (see distinfo), so
# the former SourceForge/etymon.com MASTER_SITES are no longer listed, and
# PORTREVISION is dropped because the version was bumped.
USE_GITLAB=	yes
GL_COMMIT=	d0b6e49d750e2c445a4c6526422d4ff43bc668d7

MAINTAINER=	nrn@etymon.com
COMMENT=	Full-text search engine with command-line interface
WWW=		https://gitlab.com/amberfish/amberfish

LICENSE=	MIT
LICENSE_FILE=	${WRKSRC}/LICENSE

LIB_DEPENDS=	libxerces-c.so:textproc/xerces-c3



OPTIONS_DEFINE=	DOCS

DOCS_BUILD_DEPENDS=	asciidoctor:textproc/rubygem-asciidoctor
DOCS_PLIST_FILES=	share/man/man1/af.1.gz

# Write the version stamp into the files the build reads it from: a C header
# defining AF_VERSION and an AsciiDoc include used by the documentation.
post-configure:
	echo "#define AF_VERSION \"v${PORTVERSION}\"" > ${WRKSRC}/src/backend/version.h
	echo v${PORTVERSION} > ${WRKSRC}/doc/version.adoc

# The upstream Makefile edits formerly done with REINPLACE_CMD in post-patch
# are now carried as files/patch-Makefile and
# files/patch-src_backend_Makefile.in, so no post-patch target is needed.

# With DOCS enabled, stage the generated HTML manual into DOCSDIR.
post-install-DOCS-on:
	@${MKDIR} ${STAGEDIR}${DOCSDIR}
	${INSTALL_DATA} ${WRKSRC}/doc/amberfish.html ${STAGEDIR}${DOCSDIR}

.include <bsd.port.mk>

Lines 1-2 Link Here

(-)b/textproc/amberfish/distinfo (-2 / +2 lines)
1	SHA256 (amberfish-1.6.4.tar.gz) = 155ac6e6b9b76fb7cbd94952548f718ab6add72c3b4fd2482d89abb39d96ce76	1	SHA256 (amberfish-amberfish-d0b6e49d750e2c445a4c6526422d4ff43bc668d7_GL0.tar.gz) = 76b878255f85e13e0716bfa7f54023cac09e0352ead631c8cc429d0d850438d9
2	SIZE (amberfish-1.6.4.tar.gz) = 127198	2	SIZE (amberfish-amberfish-d0b6e49d750e2c445a4c6526422d4ff43bc668d7_GL0.tar.gz) = 137011




--- Makefile.orig	2024-11-23 13:45:47 UTC
+++ Makefile
@@ -16,18 +16,18 @@ strip:
 	cd doc ; ${MAKE} html
 
 strip:
-	cd src/backend ; ${MAKE} ${MAKEFLAGS} strip
+	cd src/backend ; ${MAKE} strip
 #	cd src/interface ; ${MAKE} ${MAKEFLAGS} strip
 
 install:
-	cd src/backend ; ${MAKE} ${MAKEFLAGS} install
+	cd src/backend ; ${MAKE} install
 #	cd src/interface ; ${MAKE} ${MAKEFLAGS} install
-	cd doc ; ${MAKE} ${MAKEFLAGS} install
+	cd doc ; ${MAKE} install
 
 uninstall:
-	cd src/backend ; ${MAKE} ${MAKEFLAGS} uninstall
+	cd src/backend ; ${MAKE} uninstall
 #	cd src/interface ; ${MAKE} ${MAKEFLAGS} uninstall
-	cd doc ; ${MAKE} ${MAKEFLAGS} uninstall
+	cd doc ; ${MAKE} uninstall
 
 clean:
 	rm -fr autom4te.cache

Added Link Here

(-)b/textproc/amberfish/files/patch-src_backend_Makefile.in (+10 lines)
1	--- src/backend/Makefile.in.orig 2024-11-23 13:47:04 UTC
2	+++ src/backend/Makefile.in
3	@@ -62,7 +62,6 @@ install: all
4	strip ${AF}
5
6	install: all
7	- make strip
8	mkdir -p ${PREFIXBIN}
9	cp ${BIN} ${PREFIXBIN}/.
10





/* This is the Porter stemming algorithm, coded up in ANSI C by the
   author. It may be regarded as canonical, in that it follows the
   algorithm presented in

   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
   no. 3, pp 130-137,

   only differing from it at the points marked --DEPARTURE-- below.

   See also http://www.tartarus.org/~martin/PorterStemmer

The algorithm as described in the paper could be exactly replicated
by adjusting the points of DEPARTURE, but this is barely necessary,
because (a) the points of DEPARTURE are definitely improvements, and
(b) no encoding of the Porter stemmer I have seen is anything like
as exact as this version, even with the points of DEPARTURE!

You can compile it on Unix with 'gcc -O3 -o stem stem.c' after which
'stem' takes a list of inputs and sends the stemmed equivalent to
stdout.

The algorithm as encoded here is particularly fast.

Release 1
*/

#include <string.h>                               /* for memmove */

#define TRUE 1
#define FALSE 0

/* The main part of the stemming algorithm starts here. b is a buffer
   holding a word to be stemmed. The letters are in b[k0], b[k0+1] ...
   ending at b[k]. In fact k0 = 0 in this demo program. k is readjusted
   downwards as the stemming progresses. Zero termination is not in fact
   used in the algorithm.

   Note that only lower case sequences are stemmed. Forcing to lower case
   should be done before stem(...) is called.
*/

/* Shared stemmer state. stem() loads b, k and k0 from its arguments; ends()
   sets j on a successful match; every helper below reads these directly. */
static char * b;                                  /* buffer for word to be stemmed */
static int k,k0,j;                                /* j is a general offset into the string */

/* cons(i) reports whether b[i] is a consonant (TRUE) or a vowel (FALSE).
   a, e, i, o, u are vowels. 'y' is a consonant at the start of the word
   (i == k0); elsewhere it is a consonant only when the preceding letter
   is not one. Every other character counts as a consonant. */

static int cons(int i)
{
    char letter = b[i];

    if (letter == 'a' || letter == 'e' || letter == 'i' ||
        letter == 'o' || letter == 'u')
        return FALSE;

    if (letter == 'y')
    {
        if (i == k0) return TRUE;
        return !cons(i-1);
    }

    return TRUE;
}


/* m() returns the "measure" of b[k0..j]: the number of vowel-run followed
   by consonant-run pairs. Writing c for a consonant sequence, v for a vowel
   sequence and <..> for an optional part:

      <c><v>       gives 0
      <c>vc<v>     gives 1
      <c>vcvc<v>   gives 2
      <c>vcvcvc<v> gives 3
      ....
*/

static int m()
{
    int count = 0;
    int pos = k0;

    /* skip the optional leading consonant run */
    while (pos <= j && cons(pos)) pos++;
    if (pos > j) return count;
    pos++;

    for (;;)
    {
        /* rest of the current vowel run */
        while (pos <= j && !cons(pos)) pos++;
        if (pos > j) return count;
        pos++;

        /* a consonant followed the vowels: one more vc pair */
        count++;

        /* rest of the current consonant run */
        while (pos <= j && cons(pos)) pos++;
        if (pos > j) return count;
        pos++;
    }
}


/* vowelinstem() returns TRUE when any position in b[k0..j] is a vowel
   (i.e. cons() is false for it), FALSE otherwise. */

static int vowelinstem()
{
    int pos = k0;

    while (pos <= j)
    {
        if (!cons(pos)) return TRUE;
        pos++;
    }
    return FALSE;
}


/* doublec(j) returns TRUE when b[j-1] and b[j] are the same letter and that
   letter is a consonant. Positions with no predecessor inside the word
   (j < k0+1) give FALSE. Note the parameter shadows the global j. */

static int doublec(int j)
{
    if (j >= k0+1 && b[j] == b[j-1]) return cons(j);
    return FALSE;
}


/* cvc(i) returns TRUE when b[i-2], b[i-1], b[i] read consonant - vowel -
   consonant and the final consonant is none of w, x, y. It is used when
   deciding whether to restore an e at the end of a short word, e.g.

      cav(e), lov(e), hop(e), crim(e), but
      snow, box, tray.

*/

static int cvc(int i)
{
    if (i < k0+2) return FALSE;                   /* fewer than three letters available */
    if (!cons(i)) return FALSE;
    if (cons(i-1)) return FALSE;
    if (!cons(i-2)) return FALSE;

    switch (b[i])
    {
        case 'w': case 'x': case 'y': return FALSE;
        default: return TRUE;
    }
}


/* ends(s) is TRUE <=> k0,...k ends with the string s.

   s is a length-prefixed string: s[0] holds the suffix length and the
   suffix characters follow in s[1..length]. On a match the global j is set
   to the offset of the last character before the suffix; on failure j is
   left untouched.

   s is only read, and every caller passes a string literal, so the
   parameter is const-qualified. */

static int ends(const char * s)
{
    int length = s[0];
    if (s[length] != b[k]) return FALSE;          /* tiny speed-up */
    if (length > k-k0+1) return FALSE;            /* suffix longer than the word */
    if (memcmp(b+k-length+1,s+1,length) != 0) return FALSE;
    j = k-length;
    return TRUE;
}


/* setto(s) sets (j+1),...k to the characters in the string s, readjusting
   k. s is length-prefixed: s[0] is the number of characters that follow.
   s is only read and callers pass string literals, so it is const. */

static void setto(const char * s)
{
    int length = s[0];
    memmove(b+j+1,s+1,length);
    k = j+length;
}


/* r(s) replaces the suffix after j with s, but only when the stem
   b[k0..j] has measure m() > 0. It is used further down. */

static void r(const char * s) { if (m() > 0) setto(s); }

/* step1ab() removes plural endings and then -ed / -ing, e.g.

	caresses  ->  caress
	ponies    ->  poni
	ties      ->  ti
	caress    ->  caress
	cats      ->  cat

	feed      ->  feed
	agreed    ->  agree
	disabled  ->  disable

	matting   ->  mat
	mating    ->  mate
	meeting   ->  meet
	milling   ->  mill
	messing   ->  mess

	meetings  ->  meet

*/

static void step1ab()
{
    /* plurals: -sses -> -ss, -ies -> -i, -s -> (nothing) unless -ss */
    if (b[k] == 's')
    {
        if (ends("\04" "sses"))
            k -= 2;
        else if (ends("\03" "ies"))
            setto("\01" "i");
        else if (b[k-1] != 's')
            k--;
    }

    /* -eed -> -ee when the stem has measure > 0; no further work either way */
    if (ends("\03" "eed"))
    {
        if (m() > 0) k--;
        return;
    }

    /* -ed / -ing are only removed when the remaining stem has a vowel */
    if (!(ends("\02" "ed") || ends("\03" "ing"))) return;
    if (!vowelinstem()) return;

    k = j;

    /* tidy up the stem left behind */
    if (ends("\02" "at")) { setto("\03" "ate"); return; }
    if (ends("\02" "bl")) { setto("\03" "ble"); return; }
    if (ends("\02" "iz")) { setto("\03" "ize"); return; }

    if (doublec(k))
    {
        /* drop one of a doubled consonant, except for ll, ss, zz */
        int last;
        k--;
        last = b[k];
        if (last == 'l' || last == 's' || last == 'z') k++;
        return;
    }

    /* short consonant-vowel-consonant stem: put the e back */
    if (m() == 1 && cvc(k)) setto("\01" "e");
}


/* step1c() rewrites a final y as i, provided the part of the word before
   that y contains a vowel. */

static void step1c()
{
    if (!ends("\01" "y")) return;
    if (vowelinstem()) b[k] = 'i';
}

/* step2() maps double suffixes to single ones, so -ization ( = -ize plus
   -ation) maps to -ize etc. The replacement is applied through r(), i.e.
   only when the string before the suffix gives m() > 0.

   Rules are grouped by the penultimate letter of the word, b[k-1]; only
   the rules of the matching group are tried, in table order, and the first
   suffix that matches ends the step whether or not r() replaced anything.

   Two entries are marked DEPARTURE. To match the published algorithm,
   replace the "bli" -> "ble" entry with "\04" "abli" -> "\04" "able", and
   delete the "logi" -> "log" entry. */

static void step2()
{
    static const struct { char penult; char * suffix; char * repl; } rules[] =
    {
        { 'a', "\07" "ational", "\03" "ate"  },
        { 'a', "\06" "tional",  "\04" "tion" },
        { 'c', "\04" "enci",    "\04" "ence" },
        { 'c', "\04" "anci",    "\04" "ance" },
        { 'e', "\04" "izer",    "\03" "ize"  },
        { 'l', "\03" "bli",     "\03" "ble"  },   /*-DEPARTURE-*/
        { 'l', "\04" "alli",    "\02" "al"   },
        { 'l', "\05" "entli",   "\03" "ent"  },
        { 'l', "\03" "eli",     "\01" "e"    },
        { 'l', "\05" "ousli",   "\03" "ous"  },
        { 'o', "\07" "ization", "\03" "ize"  },
        { 'o', "\05" "ation",   "\03" "ate"  },
        { 'o', "\04" "ator",    "\03" "ate"  },
        { 's', "\05" "alism",   "\02" "al"   },
        { 's', "\07" "iveness", "\03" "ive"  },
        { 's', "\07" "fulness", "\03" "ful"  },
        { 's', "\07" "ousness", "\03" "ous"  },
        { 't', "\05" "aliti",   "\02" "al"   },
        { 't', "\05" "iviti",   "\03" "ive"  },
        { 't', "\06" "biliti",  "\03" "ble"  },
        { 'g', "\04" "logi",    "\03" "log"  }    /*-DEPARTURE-*/
    };
    const char key = b[k-1];
    unsigned n;

    for (n = 0; n < sizeof rules / sizeof rules[0]; n++)
    {
        if (rules[n].penult != key) continue;
        if (!ends(rules[n].suffix)) continue;
        r(rules[n].repl);
        return;
    }
}


/* step3() deals with -ic-, -full, -ness etc., using the same strategy as
   step2: rules are grouped by a key letter (here the last letter, b[k]),
   the first matching suffix of the group wins, and r() applies the
   replacement only when the preceding stem has m() > 0. An empty
   replacement removes the suffix. */

static void step3()
{
    static const struct { char last; char * suffix; char * repl; } rules[] =
    {
        { 'e', "\05" "icate", "\02" "ic" },
        { 'e', "\05" "ative", "\00" ""   },
        { 'e', "\05" "alize", "\02" "al" },
        { 'i', "\05" "iciti", "\02" "ic" },
        { 'l', "\04" "ical",  "\02" "ic" },
        { 'l', "\03" "ful",   "\00" ""   },
        { 's', "\04" "ness",  "\00" ""   }
    };
    const char key = b[k];
    unsigned n;

    for (n = 0; n < sizeof rules / sizeof rules[0]; n++)
    {
        if (rules[n].last != key) continue;
        if (!ends(rules[n].suffix)) continue;
        r(rules[n].repl);
        return;
    }
}


/* step4() takes off -ant, -ence etc., in context <c>vcvc<v>. */

static void step4()
{
    switch (b[k-1])
    {
        case 'a': if (ends("\02" "al")) break; return;
        case 'c': if (ends("\04" "ance")) break;
        if (ends("\04" "ence")) break; return;
        case 'e': if (ends("\02" "er")) break; return;
        case 'i': if (ends("\02" "ic")) break; return;
        case 'l': if (ends("\04" "able")) break;
        if (ends("\04" "ible")) break; return;
        case 'n': if (ends("\03" "ant")) break;
        if (ends("\05" "ement")) break;
        if (ends("\04" "ment")) break;
        if (ends("\03" "ent")) break; return;
        case 'o': if (ends("\03" "ion") && (b[j] == 's' || b[j] == 't')) break;
        if (ends("\02" "ou")) break; return;
/* takes care of -ous */
        case 's': if (ends("\03" "ism")) break; return;
        case 't': if (ends("\03" "ate")) break;
        if (ends("\03" "iti")) break; return;
        case 'u': if (ends("\03" "ous")) break; return;
        case 'v': if (ends("\03" "ive")) break; return;
        case 'z': if (ends("\03" "ize")) break; return;
        default: return;
    }
    if (m() > 1) k = j;
}


/* step5() removes a final -e when m() > 1, or when m() == 1 and the
   letters before the e do not form a cvc() pattern; it then changes -ll to
   -l if m() > 1. The measure is taken over the whole word (j = k). */

static void step5()
{
    j = k;

    if (b[k] == 'e')
    {
        int measure = m();
        int drop = (measure > 1);
        if (!drop && measure == 1) drop = !cvc(k-1);
        if (drop) k--;
    }

    if (b[k] == 'l' && doublec(k) && m() > 1) k--;
}


/* In stem(p,i,j), p is a char pointer, and the string to be stemmed is from
   p[i] to p[j] inclusive. Typically i is zero and j is the offset to the last
   character of a string, (p[j+1] == '\0'). The stemmer adjusts the
   characters p[i] ... p[j] and returns the new end-point of the string, k.
   Stemming never increases word length, so i <= k <= j. To turn the stemmer
   into a module, declare 'stem' as extern, and delete the remainder of this
   file.

   The parameters are copied into the file-level statics b, k and k0, so the
   function is not reentrant.
*/

int stem(char * p, int i, int j)
{                                                 /* copy the parameters into statics */
    b = p; k = j; k0 = i;
    if (k <= k0+1) return k;                      /*-DEPARTURE-*/

/* With this line, strings of length 1 or 2 don't go through the
   stemming process, although no mention is made of this in the
   published algorithm. Remove the line to match the published
   algorithm. */

    step1ab();

/* step1ab can leave a one-letter result (ies -> i, aing -> a etc). step2
   and step4 switch on b[k-1], which would then be the byte before the
   first letter, so the remaining steps only run while k > k0. */

    if (k > k0)
    {
        step1c(); step2(); step3(); step4(); step5();
    }
    return k;
}


/*--------------------stemmer definition ends here------------------------*/

#include <stdio.h>
#include <stdlib.h>                               /* for malloc, free */
#include <ctype.h>                                /* for isupper, islower, tolower */

static char * s;                                  /* a char * (=string) pointer; passed into b above */

#define INC 50                                    /* size units in which s is increased */
static int i_max = INC;                           /* maximum offset in s */

/* increase_s() grows the buffer s by INC bytes, keeping its contents, and
   raises i_max to match. The buffer always holds i_max+1 bytes so that
   offset i_max is available for a terminating zero.

   realloc is used so that only bytes that exist in the old buffer are
   copied; copying the new, larger i_max bytes out of the old allocation
   would read past its end. If memory cannot be obtained the program
   reports the failure and exits instead of dereferencing a null pointer. */

void increase_s()
{
    int new_max = i_max + INC;
    char * new_s = (char *) realloc(s, new_max+1);
    if (new_s == NULL)
    {
        fprintf(stderr,"Out of memory\n");
        exit(1);
    }
    s = new_s;
    i_max = new_max;
}


#define LETTER(ch) (isupper(ch) || islower(ch))

/* stemfile(f) copies f to stdout. Each maximal run of letters is collected
   in s, forced to lower case, stemmed with stem() and printed; every other
   character is passed through unchanged. s is grown with increase_s()
   whenever a word reaches i_max letters. */

static void stemfile(FILE * f)
{
    int ch;

    for (ch = getc(f); ch != EOF; ch = getc(f))
    {
        int len;

        if (!LETTER(ch)) { putchar(ch); continue; }

        /* gather one word, lower-casing as we go */
        len = 0;
        do
        {
            if (len == i_max) increase_s();
            s[len++] = tolower(ch);
            ch = getc(f);
        } while (LETTER(ch));

        /* hand the terminating non-letter back for the outer loop */
        ungetc(ch,f);

        /* stem() returns the new last offset; zero-terminate just past it */
        s[stem(s,0,len-1)+1] = 0;
        printf("%s",s);
    }
}

/*
 * Commented out as required by amberfish's INSTALL file
 *
	int main(int argc, char * argv[])
	{
	    int i;
	    s = (char *) malloc(i_max+1);
	    for (i = 1; i < argc; i++)
	    {
	        FILE * f = fopen(argv[i],"r");
	        if (f == 0) { fprintf(stderr,"File %s not found\n",argv[i]); exit(1); }
	        stemfile(f);
	    }
	    free(s);
	    return 0;
	}
*/




Amberfish is a full-text search engine with a command-line interface.
Its features include free-text and Boolean queries, relevance-ranked
results, wildcard search, phrase search, field search and structured
field path queries for XML, multiple documents per file and nested
documents, searching across multiple indexes, incremental update of
indexes, and low memory requirements for building indexes.
Additional features include hierarchical result sets (XML only),
automatic searching across multiple databases (allowing modular
indexing), TREC format results, efficient indexing, and relatively low
memory requirements during indexing (and the ability to index documents
larger than available memory). Z39.50 support is available. Other
features include Boolean queries, right truncation, phrase searching,
relevance ranking, support for multiple documents per file, incremental
indexing, and easy integration with other UNIX tools. The architecture
is also designed to permit proximity queries; however, they are not
fully implemented at present.

This port also includes the Porter stemming algorithm for suffix
stripping, available at:
     http://www.tartarus.org/~martin/PorterStemmer
- 

Return to bug 282880

Lines 1-16 Link Here

(-)b/textproc/amberfish/Makefile (-28 / +14 lines)
1	PORTNAME= amberfish	1	PORTNAME= amberfish
2	PORTVERSION= 1.6.4	2	DISTVERSION= 1.7.1
3	PORTREVISION= 3
4	CATEGORIES= textproc databases	3	CATEGORIES= textproc databases
5	MASTER_SITES= SF/${PORTNAME}/Amberfish%20source%20-%20stable/${PORTVERSION} \
6	http://etymon.com/software/amberfish/stable/
7		4
8	MAINTAINER= ports@FreeBSD.org	5	USE_GITLAB= yes
9	COMMENT= General purpose text retrieval Software	6	GL_COMMIT= d0b6e49d750e2c445a4c6526422d4ff43bc668d7
10	WWW= https://web.archive.org/web/20100419215307/http://www.etymon.com/tr.html
11		7
12	LICENSE= GPLv2	8	MAINTAINER= nrn@etymon.com
13	LICENSE_FILE= ${WRKSRC}/COPYING	9	COMMENT= Full-text search engine with command-line interface
		10	WWW= https://gitlab.com/amberfish/amberfish
		11
		12	LICENSE= MIT
		13	LICENSE_FILE= ${WRKSRC}/LICENSE
14		14
15	LIB_DEPENDS= libxerces-c.so:textproc/xerces-c3	15	LIB_DEPENDS= libxerces-c.so:textproc/xerces-c3
16		16
Lines 26-54 PORTDOCS= * Link Here
26		26
27	OPTIONS_DEFINE= DOCS	27	OPTIONS_DEFINE= DOCS
28		28
29	DOCS_USES= makeinfo	29	DOCS_BUILD_DEPENDS= asciidoctor:textproc/rubygem-asciidoctor
30	DOCS_PLIST_FILES= share/man/man1/af.1.gz	30	DOCS_PLIST_FILES= share/man/man1/af.1.gz
31		31
32	post-extract:	32	post-configure:
33	${CP} ${FILESDIR}/porter.cc ${WRKSRC}/src	33	echo "#define AF_VERSION \"v${PORTVERSION}\"" > ${WRKSRC}/src/backend/version.h
34		34	echo v${PORTVERSION} > ${WRKSRC}/doc/version.adoc
35	post-patch:
36	@${REINPLACE_CMD} -e \
37	's\|$${MAKEFLAGS} \|\|' ${WRKSRC}/Makefile
38	@${REINPLACE_CMD} -e \
39	's\|cp \|$${BSD_INSTALL_MAN} \|' ${WRKSRC}/doc/Makefile.in
40	@${REINPLACE_CMD} -e \
41	's\|-O3 \|@CFLAGS@ \| ; \
42	s\|make strip\|\| ; \
43	s\|cp \|$${BSD_INSTALL_PROGRAM} \|' ${WRKSRC}/src/Makefile.in
44
45	post-patch-DOCS-off:
46	@${REINPLACE_CMD} -e \
47	'/cd doc/d' ${WRKSRC}/Makefile
48		35
49	post-install-DOCS-on:	36	post-install-DOCS-on:
50	@${MKDIR} ${STAGEDIR}${DOCSDIR}	37	@${MKDIR} ${STAGEDIR}${DOCSDIR}
51	${INSTALL_DATA} ${WRKSRC}/amberfish.png ${STAGEDIR}${DOCSDIR}	38	${INSTALL_DATA} ${WRKSRC}/doc/amberfish.html ${STAGEDIR}${DOCSDIR}
52	${INSTALL_DATA} ${WRKSRC}/doc/html/*.html ${STAGEDIR}${DOCSDIR}
53		39
54	.include <bsd.port.mk>	40	.include <bsd.port.mk>

Added Link Here

(-)b/textproc/amberfish/files/patch-Makefile (+26 lines)
1	--- Makefile.orig 2024-11-23 13:45:47 UTC
2	+++ Makefile
3	@@ -16,18 +16,18 @@ strip:
4	cd doc ; ${MAKE} html
5
6	strip:
7	- cd src/backend ; ${MAKE} ${MAKEFLAGS} strip
8	+ cd src/backend ; ${MAKE} strip
9	# cd src/interface ; ${MAKE} ${MAKEFLAGS} strip
10
11	install:
12	- cd src/backend ; ${MAKE} ${MAKEFLAGS} install
13	+ cd src/backend ; ${MAKE} install
14	# cd src/interface ; ${MAKE} ${MAKEFLAGS} install
15	- cd doc ; ${MAKE} ${MAKEFLAGS} install
16	+ cd doc ; ${MAKE} install
17
18	uninstall:
19	- cd src/backend ; ${MAKE} ${MAKEFLAGS} uninstall
20	+ cd src/backend ; ${MAKE} uninstall
21	# cd src/interface ; ${MAKE} ${MAKEFLAGS} uninstall
22	- cd doc ; ${MAKE} ${MAKEFLAGS} uninstall
23	+ cd doc ; ${MAKE} uninstall
24
25	clean:
26	rm -fr autom4te.cache

Removed Link Here

(-)a/textproc/amberfish/files/porter.cc (-438 lines)
1
2	/* This is the Porter stemming algorithm, coded up in ANSI C by the
3	author. It may be be regarded as cononical, in that it follows the
4	algorithm presented in
5
6	Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
7	no. 3, pp 130-137,
8
9	only differing from it at the points maked --DEPARTURE-- below.
10
11	See also http://www.tartarus.org/~martin/PorterStemmer
12
13	The algorithm as described in the paper could be exactly replicated
14	by adjusting the points of DEPARTURE, but this is barely necessary,
15	because (a) the points of DEPARTURE are definitely improvements, and
16	(b) no encoding of the Porter stemmer I have seen is anything like
17	as exact as this version, even with the points of DEPARTURE!
18
19	You can compile it on Unix with 'gcc -O3 -o stem stem.c' after which
20	'stem' takes a list of inputs and sends the stemmed equivalent to
21	stdout.
22
23	The algorithm as encoded here is particularly fast.
24
25	Release 1
26	*/
27
28	#include <string.h> /* for memmove */
29
30	#define TRUE 1
31	#define FALSE 0
32
33	/* The main part of the stemming algorithm starts here. b is a buffer
34	holding a word to be stemmed. The letters are in b[k0], b[k0+1] ...
35	ending at b[k]. In fact k0 = 0 in this demo program. k is readjusted
36	downwards as the stemming progresses. Zero termination is not in fact
37	used in the algorithm.
38
39	Note that only lower case sequences are stemmed. Forcing to lower case
40	should be done before stem(...) is called.
41	*/
42
43	static char * b; /* buffer for word to be stemmed */
44	static int k,k0,j; /* j is a general offset into the string */
45
46	/* cons(i) is TRUE <=> b[i] is a consonant. */
47
48	static int cons(int i)
49	{
50	switch (b[i])
51	{
52	case 'a': case 'e': case 'i': case 'o': case 'u': return FALSE;
53	case 'y': return (i==k0) ? TRUE : !cons(i-1);
54	default: return TRUE;
55	}
56	}
57
58
59	/* m() measures the number of consonant sequences between k0 and j. if c is
60	a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
61	presence,
62
63	<c><v> gives 0
64	<c>vc<v> gives 1
65	<c>vcvc<v> gives 2
66	<c>vcvcvc<v> gives 3
67	....
68	*/
69
70	static int m()
71	{
72	int n = 0;
73	int i = k0;
74	while(TRUE)
75	{
76	if (i > j) return n;
77	if (! cons(i)) break; i++;
78	}
79	i++;
80	while(TRUE)
81	{
82	while(TRUE)
83	{
84	if (i > j) return n;
85	if (cons(i)) break;
86	i++;
87	}
88	i++;
89	n++;
90	while(TRUE)
91	{
92	if (i > j) return n;
93	if (! cons(i)) break;
94	i++;
95	}
96	i++;
97	}
98	}
99
100
101	/* vowelinstem() is TRUE <=> k0,...j contains a vowel */
102
103	static int vowelinstem()
104	{
105	int i; for (i = k0; i <= j; i++) if (! cons(i)) return TRUE;
106	return FALSE;
107	}
108
109
110	/* doublec(j) is TRUE <=> j,(j-1) contain a double consonant. */
111
112	static int doublec(int j)
113	{
114	if (j < k0+1) return FALSE;
115	if (b[j] != b[j-1]) return FALSE;
116	return cons(j);
117	}
118
119
120	/* cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
121	and also if the second c is not w,x or y. this is used when trying to
122	restore an e at the end of a short word. e.g.
123
124	cav(e), lov(e), hop(e), crim(e), but
125	snow, box, tray.
126
127	*/
128
129	static int cvc(int i)
130	{
131	if (i < k0+2 \|\| !cons(i) \|\| cons(i-1) \|\| !cons(i-2)) return FALSE;
132	{
133	int ch = b[i];
134	if (ch == 'w' \|\| ch == 'x' \|\| ch == 'y') return FALSE;
135	}
136	return TRUE;
137	}
138
139
140	/* ends(s) is TRUE <=> k0,...k ends with the string s. */
141
142	static int ends(char * s)
143	{
144	int length = s[0];
145	if (s[length] != b[k]) return FALSE; /* tiny speed-up */
146	if (length > k-k0+1) return FALSE;
147	if (memcmp(b+k-length+1,s+1,length) != 0) return FALSE;
148	j = k-length;
149	return TRUE;
150	}
151
152
153	/* setto(s) sets (j+1),...k to the characters in the string s, readjusting
154	k. */
155
156	static void setto(char * s)
157	{
158	int length = s[0];
159	memmove(b+j+1,s+1,length);
160	k = j+length;
161	}
162
163
164	/* r(s) is used further down. */
165
166	static void r(char * s) { if (m() > 0) setto(s); }
167
168	/* step1ab() gets rid of plurals and -ed or -ing. e.g.
169
170	caresses -> caress
171	ponies -> poni
172	ties -> ti
173	caress -> caress
174	cats -> cat
175
176	feed -> feed
177	agreed -> agree
178	disabled -> disable
179
180	matting -> mat
181	mating -> mate
182	meeting -> meet
183	milling -> mill
184	messing -> mess
185
186	meetings -> meet
187
188	*/
189
190	static void step1ab()
191	{
192	if (b[k] == 's')
193	{
194	if (ends("\04" "sses")) k -= 2; else
195	if (ends("\03" "ies")) setto("\01" "i"); else
196	if (b[k-1] != 's') k--;
197	}
198	if (ends("\03" "eed")) { if (m() > 0) k--; }
199	else
200	if ((ends("\02" "ed") \|\| ends("\03" "ing")) && vowelinstem())
201	{
202	k = j;
203	if (ends("\02" "at")) setto("\03" "ate"); else
204	if (ends("\02" "bl")) setto("\03" "ble"); else
205	if (ends("\02" "iz")) setto("\03" "ize"); else
206	if (doublec(k))
207	{
208	k--;
209	{
210	int ch = b[k];
211	if (ch == 'l' \|\| ch == 's' \|\| ch == 'z') k++;
212	}
213	}
214	else if (m() == 1 && cvc(k)) setto("\01" "e");
215	}
216	}
217
218
219	/* step1c() turns terminal y to i when there is another vowel in the stem. */
220
221	static void step1c() { if (ends("\01" "y") && vowelinstem()) b[k] = 'i'; }
222
223	/* step2() maps double suffices to single ones. so -ization ( = -ize plus
224	-ation) maps to -ize etc. note that the string before the suffix must give
225	m() > 0. */
226
227	static void step2()
228	{
229	switch (b[k-1])
230	{
231	case 'a': if (ends("\07" "ational")) { r("\03" "ate"); break; }
232	if (ends("\06" "tional")) { r("\04" "tion"); break; }
233	break;
234	case 'c': if (ends("\04" "enci")) { r("\04" "ence"); break; }
235	if (ends("\04" "anci")) { r("\04" "ance"); break; }
236	break;
237	case 'e': if (ends("\04" "izer")) { r("\03" "ize"); break; }
238	break;
239	case 'l': if (ends("\03" "bli")) /-DEPARTURE-/
240	{
241	r("\03" "ble"); break;
242	}
243
244	/* To match the published algorithm, replace this line with
245	case 'l': if (ends("\04" "abli")) { r("\04" "able"); break; } */
246
247	if (ends("\04" "alli")) { r("\02" "al"); break; }
248	if (ends("\05" "entli")) { r("\03" "ent"); break; }
249	if (ends("\03" "eli")) { r("\01" "e"); break; }
250	if (ends("\05" "ousli")) { r("\03" "ous"); break; }
251	break;
252	case 'o': if (ends("\07" "ization")) { r("\03" "ize"); break; }
253	if (ends("\05" "ation")) { r("\03" "ate"); break; }
254	if (ends("\04" "ator")) { r("\03" "ate"); break; }
255	break;
256	case 's': if (ends("\05" "alism")) { r("\02" "al"); break; }
257	if (ends("\07" "iveness")) { r("\03" "ive"); break; }
258	if (ends("\07" "fulness")) { r("\03" "ful"); break; }
259	if (ends("\07" "ousness")) { r("\03" "ous"); break; }
260	break;
261	case 't': if (ends("\05" "aliti")) { r("\02" "al"); break; }
262	if (ends("\05" "iviti")) { r("\03" "ive"); break; }
263	if (ends("\06" "biliti")) { r("\03" "ble"); break; }
264	break;
265	case 'g': if (ends("\04" "logi")) /-DEPARTURE-/
266	{
267	r("\03" "log"); break;
268	}
269
270	/* To match the published algorithm, delete this line */
271
272	}
273	}
274
275
276	/* step3() deals with -ic-, -full, -ness etc. similar strategy to step2. */
277
278	static void step3()
279	{
280	switch (b[k])
281	{
282	case 'e': if (ends("\05" "icate")) { r("\02" "ic"); break; }
283	if (ends("\05" "ative")) { r("\00" ""); break; }
284	if (ends("\05" "alize")) { r("\02" "al"); break; }
285	break;
286	case 'i': if (ends("\05" "iciti")) { r("\02" "ic"); break; }
287	break;
288	case 'l': if (ends("\04" "ical")) { r("\02" "ic"); break; }
289	if (ends("\03" "ful")) { r("\00" ""); break; }
290	break;
291	case 's': if (ends("\04" "ness")) { r("\00" ""); break; }
292	break;
293	}
294	}
295
296
297	/* step4() takes off -ant, -ence etc., in context <c>vcvc<v>. */
298
299	static void step4()
300	{
301	switch (b[k-1])
302	{
303	case 'a': if (ends("\02" "al")) break; return;
304	case 'c': if (ends("\04" "ance")) break;
305	if (ends("\04" "ence")) break; return;
306	case 'e': if (ends("\02" "er")) break; return;
307	case 'i': if (ends("\02" "ic")) break; return;
308	case 'l': if (ends("\04" "able")) break;
309	if (ends("\04" "ible")) break; return;
310	case 'n': if (ends("\03" "ant")) break;
311	if (ends("\05" "ement")) break;
312	if (ends("\04" "ment")) break;
313	if (ends("\03" "ent")) break; return;
314	case 'o': if (ends("\03" "ion") && (b[j] == 's' \|\| b[j] == 't')) break;
315	if (ends("\02" "ou")) break; return;
316	/* takes care of -ous */
317	case 's': if (ends("\03" "ism")) break; return;
318	case 't': if (ends("\03" "ate")) break;
319	if (ends("\03" "iti")) break; return;
320	case 'u': if (ends("\03" "ous")) break; return;
321	case 'v': if (ends("\03" "ive")) break; return;
322	case 'z': if (ends("\03" "ize")) break; return;
323	default: return;
324	}
325	if (m() > 1) k = j;
326	}
327
328
329	/* step5() removes a final -e if m() > 1, and changes -ll to -l if
330	m() > 1. */
331
332	static void step5()
333	{
334	j = k;
335	if (b[k] == 'e')
336	{
337	int a = m();
338	if (a > 1 \|\| a == 1 && !cvc(k-1)) k--;
339	}
340	if (b[k] == 'l' && doublec(k) && m() > 1) k--;
341	}
342
343
344	/* In stem(p,i,j), p is a char pointer, and the string to be stemmed is from
345	p[i] to p[j] inclusive. Typically i is zero and j is the offset to the last
346	character of a string, (p[j+1] == '\0'). The stemmer adjusts the
347	characters p[i] ... p[j] and returns the new end-point of the string, k.
348	Stemming never increases word length, so i <= k <= j. To turn the stemmer
349	into a module, declare 'stem' as extern, and delete the remainder of this
350	file.
351	*/
352
353	int stem(char * p, int i, int j)
354	{ /* copy the parameters into statics */
355	b = p; k = j; k0 = i;
356	if (k <= k0+1) return k; /-DEPARTURE-/
357
358	/* With this line, strings of length 1 or 2 don't go through the
359	stemming process, although no mention is made of this in the
360	published algorithm. Remove the line to match the published
361	algorithm. */
362
363	step1ab(); step1c(); step2(); step3(); step4(); step5();
364	return k;
365	}
366
367
368	/--------------------stemmer definition ends here------------------------/
369
370	#include <stdio.h>
371	#include <stdlib.h> /* for malloc, free */
372	#include <ctype.h> /* for isupper, islower, tolower */
373
374	static char * s; /* a char * (=string) pointer; passed into b above */
375
376	#define INC 50 /* size units in which s is increased */
377	static int i_max = INC; /* maximum offset in s */
378
379	void increase_s()
380	{
381	i_max += INC;
382	{
383	char * new_s = (char *) malloc(i_max+1);
384	{ /* copy across */
385	int i; for (i = 0; i < i_max; i++) new_s[i] = s[i];
386	}
387	free(s); s = new_s;
388	}
389	}
390
391
392	#define LETTER(ch) (isupper(ch) \|\| islower(ch))
393
394	static void stemfile(FILE * f)
395	{
396	while(TRUE)
397	{
398	int ch = getc(f);
399	if (ch == EOF) return;
400	if (LETTER(ch))
401	{
402	int i = 0;
403	while(TRUE)
404	{
405	if (i == i_max) increase_s();
406
407	ch = tolower(ch); /* forces lower case */
408
409	s[i] = ch; i++;
410	ch = getc(f);
411	if (!LETTER(ch)) { ungetc(ch,f); break; }
412	}
413	s[stem(s,0,i-1)+1] = 0;
414	/* the previous line calls the stemmer and uses its result to
415	zero-terminate the string in s */
416	printf("%s",s);
417	}
418	else putchar(ch);
419	}
420	}
421
422	/*
423	* Commented out as required by amberfish's INSTALL file
424	*
425	int main(int argc, char * argv[])
426	{
427	int i;
428	s = (char *) malloc(i_max+1);
429	for (i = 1; i < argc; i++)
430	{
431	FILE * f = fopen(argv[i],"r");
432	if (f == 0) { fprintf(stderr,"File %s not found\n",argv[i]); exit(1); }
433	stemfile(f);
434	}
435	free(s);
436	return 0;
437	}
438	*/

Lines 1-19 Link Here

(-)b/textproc/amberfish/pkg-descr (-20 / +6 lines)
1	Amberfish is general purpose text retrieval software, developed at Etymon	1	Amberfish is a full-text search engine with a command-line interface.
2	by Nassib Nassar and distributed as open source software under the terms	2	Its features include free-text and Boolean queries, relevance-ranked
3	of version 2 of the GNU General Public License (GPL). Its distinguishing	3	results, wildcard search, phrase search, field search and structured
4	features are indexing/search of semi-structured text (i.e. both free tex	4	field path queries for XML, multiple documents per file and nested
5	and multiply nested fields), built-in support for XML documents using the	5	documents, searching across multiple indexes, incremental update of
6	Xerces library, structured queries allowing generalized field/tag paths,	6	indexes, and low memory requirements for building indexes.
7	hierarchical result sets (XML only), automatic searching across multiple
8	databases (allowing modular indexing), TREC format results, efficient
9	indexing, and relatively low memory requirements during indexing (and the
10	ability to index documents larger than available memory). Z39.50 support
11	is available. Other features include Boolean queries, right truncation,
12	phrase searching, relevance ranking, support for multiple documents per
13	file, incremental indexing, and easy integration with other UNIX tools,
14	The architecture is also designed to permit proximity queries; however,
15	they are not fully implemented at present.
16
17	This port also includes the Porter stemming algorithm for suffix
18	stripping, available at:
19	http://www.tartarus.org/~martin/PorterStemmer
20	-