Removed
Link Here
|
1 |
|
2 |
/* This is the Porter stemming algorithm, coded up in ANSI C by the |
3 |
author. It may be be regarded as cononical, in that it follows the |
4 |
algorithm presented in |
5 |
|
6 |
Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, |
7 |
no. 3, pp 130-137, |
8 |
|
9 |
only differing from it at the points maked --DEPARTURE-- below. |
10 |
|
11 |
See also http://www.tartarus.org/~martin/PorterStemmer |
12 |
|
13 |
The algorithm as described in the paper could be exactly replicated |
14 |
by adjusting the points of DEPARTURE, but this is barely necessary, |
15 |
because (a) the points of DEPARTURE are definitely improvements, and |
16 |
(b) no encoding of the Porter stemmer I have seen is anything like |
17 |
as exact as this version, even with the points of DEPARTURE! |
18 |
|
19 |
You can compile it on Unix with 'gcc -O3 -o stem stem.c' after which |
20 |
'stem' takes a list of inputs and sends the stemmed equivalent to |
21 |
stdout. |
22 |
|
23 |
The algorithm as encoded here is particularly fast. |
24 |
|
25 |
Release 1 |
26 |
*/ |
27 |
|
28 |
#include <string.h> /* for memmove */ |
29 |
|
30 |
#define TRUE 1 |
31 |
#define FALSE 0 |
32 |
|
33 |
/* The main part of the stemming algorithm starts here. b is a buffer |
34 |
holding a word to be stemmed. The letters are in b[k0], b[k0+1] ... |
35 |
ending at b[k]. In fact k0 = 0 in this demo program. k is readjusted |
36 |
downwards as the stemming progresses. Zero termination is not in fact |
37 |
used in the algorithm. |
38 |
|
39 |
Note that only lower case sequences are stemmed. Forcing to lower case |
40 |
should be done before stem(...) is called. |
41 |
*/ |
42 |
|
43 |
static char * b; /* buffer for word to be stemmed */ |
44 |
static int k,k0,j; /* j is a general offset into the string */ |
45 |
|
46 |
/* cons(i) is TRUE <=> b[i] is a consonant. */ |
47 |
|
48 |
static int cons(int i) |
49 |
{ |
50 |
switch (b[i]) |
51 |
{ |
52 |
case 'a': case 'e': case 'i': case 'o': case 'u': return FALSE; |
53 |
case 'y': return (i==k0) ? TRUE : !cons(i-1); |
54 |
default: return TRUE; |
55 |
} |
56 |
} |
57 |
|
58 |
|
59 |
/* m() measures the number of consonant sequences between k0 and j. if c is |
60 |
a consonant sequence and v a vowel sequence, and <..> indicates arbitrary |
61 |
presence, |
62 |
|
63 |
<c><v> gives 0 |
64 |
<c>vc<v> gives 1 |
65 |
<c>vcvc<v> gives 2 |
66 |
<c>vcvcvc<v> gives 3 |
67 |
.... |
68 |
*/ |
69 |
|
70 |
static int m() |
71 |
{ |
72 |
int n = 0; |
73 |
int i = k0; |
74 |
while(TRUE) |
75 |
{ |
76 |
if (i > j) return n; |
77 |
if (! cons(i)) break; i++; |
78 |
} |
79 |
i++; |
80 |
while(TRUE) |
81 |
{ |
82 |
while(TRUE) |
83 |
{ |
84 |
if (i > j) return n; |
85 |
if (cons(i)) break; |
86 |
i++; |
87 |
} |
88 |
i++; |
89 |
n++; |
90 |
while(TRUE) |
91 |
{ |
92 |
if (i > j) return n; |
93 |
if (! cons(i)) break; |
94 |
i++; |
95 |
} |
96 |
i++; |
97 |
} |
98 |
} |
99 |
|
100 |
|
101 |
/* vowelinstem() is TRUE <=> k0,...j contains a vowel */ |
102 |
|
103 |
static int vowelinstem() |
104 |
{ |
105 |
int i; for (i = k0; i <= j; i++) if (! cons(i)) return TRUE; |
106 |
return FALSE; |
107 |
} |
108 |
|
109 |
|
110 |
/* doublec(j) is TRUE <=> j,(j-1) contain a double consonant. */ |
111 |
|
112 |
static int doublec(int j) |
113 |
{ |
114 |
if (j < k0+1) return FALSE; |
115 |
if (b[j] != b[j-1]) return FALSE; |
116 |
return cons(j); |
117 |
} |
118 |
|
119 |
|
120 |
/* cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant |
121 |
and also if the second c is not w,x or y. this is used when trying to |
122 |
restore an e at the end of a short word. e.g. |
123 |
|
124 |
cav(e), lov(e), hop(e), crim(e), but |
125 |
snow, box, tray. |
126 |
|
127 |
*/ |
128 |
|
129 |
static int cvc(int i) |
130 |
{ |
131 |
if (i < k0+2 || !cons(i) || cons(i-1) || !cons(i-2)) return FALSE; |
132 |
{ |
133 |
int ch = b[i]; |
134 |
if (ch == 'w' || ch == 'x' || ch == 'y') return FALSE; |
135 |
} |
136 |
return TRUE; |
137 |
} |
138 |
|
139 |
|
140 |
/* ends(s) is TRUE <=> k0,...k ends with the string s. */ |
141 |
|
142 |
static int ends(char * s) |
143 |
{ |
144 |
int length = s[0]; |
145 |
if (s[length] != b[k]) return FALSE; /* tiny speed-up */ |
146 |
if (length > k-k0+1) return FALSE; |
147 |
if (memcmp(b+k-length+1,s+1,length) != 0) return FALSE; |
148 |
j = k-length; |
149 |
return TRUE; |
150 |
} |
151 |
|
152 |
|
153 |
/* setto(s) sets (j+1),...k to the characters in the string s, readjusting |
154 |
k. */ |
155 |
|
156 |
static void setto(char * s) |
157 |
{ |
158 |
int length = s[0]; |
159 |
memmove(b+j+1,s+1,length); |
160 |
k = j+length; |
161 |
} |
162 |
|
163 |
|
164 |
/* r(s) is used further down. */ |
165 |
|
166 |
static void r(char * s) { if (m() > 0) setto(s); } |
167 |
|
168 |
/* step1ab() gets rid of plurals and -ed or -ing. e.g. |
169 |
|
170 |
caresses -> caress |
171 |
ponies -> poni |
172 |
ties -> ti |
173 |
caress -> caress |
174 |
cats -> cat |
175 |
|
176 |
feed -> feed |
177 |
agreed -> agree |
178 |
disabled -> disable |
179 |
|
180 |
matting -> mat |
181 |
mating -> mate |
182 |
meeting -> meet |
183 |
milling -> mill |
184 |
messing -> mess |
185 |
|
186 |
meetings -> meet |
187 |
|
188 |
*/ |
189 |
|
190 |
static void step1ab() |
191 |
{ |
192 |
if (b[k] == 's') |
193 |
{ |
194 |
if (ends("\04" "sses")) k -= 2; else |
195 |
if (ends("\03" "ies")) setto("\01" "i"); else |
196 |
if (b[k-1] != 's') k--; |
197 |
} |
198 |
if (ends("\03" "eed")) { if (m() > 0) k--; } |
199 |
else |
200 |
if ((ends("\02" "ed") || ends("\03" "ing")) && vowelinstem()) |
201 |
{ |
202 |
k = j; |
203 |
if (ends("\02" "at")) setto("\03" "ate"); else |
204 |
if (ends("\02" "bl")) setto("\03" "ble"); else |
205 |
if (ends("\02" "iz")) setto("\03" "ize"); else |
206 |
if (doublec(k)) |
207 |
{ |
208 |
k--; |
209 |
{ |
210 |
int ch = b[k]; |
211 |
if (ch == 'l' || ch == 's' || ch == 'z') k++; |
212 |
} |
213 |
} |
214 |
else if (m() == 1 && cvc(k)) setto("\01" "e"); |
215 |
} |
216 |
} |
217 |
|
218 |
|
219 |
/* step1c() turns terminal y to i when there is another vowel in the stem. */ |
220 |
|
221 |
static void step1c() { if (ends("\01" "y") && vowelinstem()) b[k] = 'i'; } |
222 |
|
223 |
/* step2() maps double suffices to single ones. so -ization ( = -ize plus |
224 |
-ation) maps to -ize etc. note that the string before the suffix must give |
225 |
m() > 0. */ |
226 |
|
227 |
static void step2() |
228 |
{ |
229 |
switch (b[k-1]) |
230 |
{ |
231 |
case 'a': if (ends("\07" "ational")) { r("\03" "ate"); break; } |
232 |
if (ends("\06" "tional")) { r("\04" "tion"); break; } |
233 |
break; |
234 |
case 'c': if (ends("\04" "enci")) { r("\04" "ence"); break; } |
235 |
if (ends("\04" "anci")) { r("\04" "ance"); break; } |
236 |
break; |
237 |
case 'e': if (ends("\04" "izer")) { r("\03" "ize"); break; } |
238 |
break; |
239 |
case 'l': if (ends("\03" "bli")) /*-DEPARTURE-*/ |
240 |
{ |
241 |
r("\03" "ble"); break; |
242 |
} |
243 |
|
244 |
/* To match the published algorithm, replace this line with |
245 |
case 'l': if (ends("\04" "abli")) { r("\04" "able"); break; } */ |
246 |
|
247 |
if (ends("\04" "alli")) { r("\02" "al"); break; } |
248 |
if (ends("\05" "entli")) { r("\03" "ent"); break; } |
249 |
if (ends("\03" "eli")) { r("\01" "e"); break; } |
250 |
if (ends("\05" "ousli")) { r("\03" "ous"); break; } |
251 |
break; |
252 |
case 'o': if (ends("\07" "ization")) { r("\03" "ize"); break; } |
253 |
if (ends("\05" "ation")) { r("\03" "ate"); break; } |
254 |
if (ends("\04" "ator")) { r("\03" "ate"); break; } |
255 |
break; |
256 |
case 's': if (ends("\05" "alism")) { r("\02" "al"); break; } |
257 |
if (ends("\07" "iveness")) { r("\03" "ive"); break; } |
258 |
if (ends("\07" "fulness")) { r("\03" "ful"); break; } |
259 |
if (ends("\07" "ousness")) { r("\03" "ous"); break; } |
260 |
break; |
261 |
case 't': if (ends("\05" "aliti")) { r("\02" "al"); break; } |
262 |
if (ends("\05" "iviti")) { r("\03" "ive"); break; } |
263 |
if (ends("\06" "biliti")) { r("\03" "ble"); break; } |
264 |
break; |
265 |
case 'g': if (ends("\04" "logi")) /*-DEPARTURE-*/ |
266 |
{ |
267 |
r("\03" "log"); break; |
268 |
} |
269 |
|
270 |
/* To match the published algorithm, delete this line */ |
271 |
|
272 |
} |
273 |
} |
274 |
|
275 |
|
276 |
/* step3() deals with -ic-, -full, -ness etc. similar strategy to step2. */ |
277 |
|
278 |
static void step3() |
279 |
{ |
280 |
switch (b[k]) |
281 |
{ |
282 |
case 'e': if (ends("\05" "icate")) { r("\02" "ic"); break; } |
283 |
if (ends("\05" "ative")) { r("\00" ""); break; } |
284 |
if (ends("\05" "alize")) { r("\02" "al"); break; } |
285 |
break; |
286 |
case 'i': if (ends("\05" "iciti")) { r("\02" "ic"); break; } |
287 |
break; |
288 |
case 'l': if (ends("\04" "ical")) { r("\02" "ic"); break; } |
289 |
if (ends("\03" "ful")) { r("\00" ""); break; } |
290 |
break; |
291 |
case 's': if (ends("\04" "ness")) { r("\00" ""); break; } |
292 |
break; |
293 |
} |
294 |
} |
295 |
|
296 |
|
297 |
/* step4() takes off -ant, -ence etc., in context <c>vcvc<v>. */ |
298 |
|
299 |
static void step4() |
300 |
{ |
301 |
switch (b[k-1]) |
302 |
{ |
303 |
case 'a': if (ends("\02" "al")) break; return; |
304 |
case 'c': if (ends("\04" "ance")) break; |
305 |
if (ends("\04" "ence")) break; return; |
306 |
case 'e': if (ends("\02" "er")) break; return; |
307 |
case 'i': if (ends("\02" "ic")) break; return; |
308 |
case 'l': if (ends("\04" "able")) break; |
309 |
if (ends("\04" "ible")) break; return; |
310 |
case 'n': if (ends("\03" "ant")) break; |
311 |
if (ends("\05" "ement")) break; |
312 |
if (ends("\04" "ment")) break; |
313 |
if (ends("\03" "ent")) break; return; |
314 |
case 'o': if (ends("\03" "ion") && (b[j] == 's' || b[j] == 't')) break; |
315 |
if (ends("\02" "ou")) break; return; |
316 |
/* takes care of -ous */ |
317 |
case 's': if (ends("\03" "ism")) break; return; |
318 |
case 't': if (ends("\03" "ate")) break; |
319 |
if (ends("\03" "iti")) break; return; |
320 |
case 'u': if (ends("\03" "ous")) break; return; |
321 |
case 'v': if (ends("\03" "ive")) break; return; |
322 |
case 'z': if (ends("\03" "ize")) break; return; |
323 |
default: return; |
324 |
} |
325 |
if (m() > 1) k = j; |
326 |
} |
327 |
|
328 |
|
329 |
/* step5() removes a final -e if m() > 1, and changes -ll to -l if |
330 |
m() > 1. */ |
331 |
|
332 |
static void step5() |
333 |
{ |
334 |
j = k; |
335 |
if (b[k] == 'e') |
336 |
{ |
337 |
int a = m(); |
338 |
if (a > 1 || a == 1 && !cvc(k-1)) k--; |
339 |
} |
340 |
if (b[k] == 'l' && doublec(k) && m() > 1) k--; |
341 |
} |
342 |
|
343 |
|
344 |
/* In stem(p,i,j), p is a char pointer, and the string to be stemmed is from |
345 |
p[i] to p[j] inclusive. Typically i is zero and j is the offset to the last |
346 |
character of a string, (p[j+1] == '\0'). The stemmer adjusts the |
347 |
characters p[i] ... p[j] and returns the new end-point of the string, k. |
348 |
Stemming never increases word length, so i <= k <= j. To turn the stemmer |
349 |
into a module, declare 'stem' as extern, and delete the remainder of this |
350 |
file. |
351 |
*/ |
352 |
|
353 |
int stem(char * p, int i, int j) |
354 |
{ /* copy the parameters into statics */ |
355 |
b = p; k = j; k0 = i; |
356 |
if (k <= k0+1) return k; /*-DEPARTURE-*/ |
357 |
|
358 |
/* With this line, strings of length 1 or 2 don't go through the |
359 |
stemming process, although no mention is made of this in the |
360 |
published algorithm. Remove the line to match the published |
361 |
algorithm. */ |
362 |
|
363 |
step1ab(); step1c(); step2(); step3(); step4(); step5(); |
364 |
return k; |
365 |
} |
366 |
|
367 |
|
368 |
/*--------------------stemmer definition ends here------------------------*/ |
369 |
|
370 |
#include <stdio.h> |
371 |
#include <stdlib.h> /* for malloc, free */ |
372 |
#include <ctype.h> /* for isupper, islower, tolower */ |
373 |
|
374 |
static char * s; /* a char * (=string) pointer; passed into b above */ |
375 |
|
376 |
#define INC 50 /* size units in which s is increased */ |
377 |
static int i_max = INC; /* maximum offset in s */ |
378 |
|
379 |
void increase_s() |
380 |
{ |
381 |
i_max += INC; |
382 |
{ |
383 |
char * new_s = (char *) malloc(i_max+1); |
384 |
{ /* copy across */ |
385 |
int i; for (i = 0; i < i_max; i++) new_s[i] = s[i]; |
386 |
} |
387 |
free(s); s = new_s; |
388 |
} |
389 |
} |
390 |
|
391 |
|
392 |
#define LETTER(ch) (isupper(ch) || islower(ch)) |
393 |
|
394 |
static void stemfile(FILE * f) |
395 |
{ |
396 |
while(TRUE) |
397 |
{ |
398 |
int ch = getc(f); |
399 |
if (ch == EOF) return; |
400 |
if (LETTER(ch)) |
401 |
{ |
402 |
int i = 0; |
403 |
while(TRUE) |
404 |
{ |
405 |
if (i == i_max) increase_s(); |
406 |
|
407 |
ch = tolower(ch); /* forces lower case */ |
408 |
|
409 |
s[i] = ch; i++; |
410 |
ch = getc(f); |
411 |
if (!LETTER(ch)) { ungetc(ch,f); break; } |
412 |
} |
413 |
s[stem(s,0,i-1)+1] = 0; |
414 |
/* the previous line calls the stemmer and uses its result to |
415 |
zero-terminate the string in s */ |
416 |
printf("%s",s); |
417 |
} |
418 |
else putchar(ch); |
419 |
} |
420 |
} |
421 |
|
422 |
/* |
423 |
* Commented out as required by amberfish's INSTALL file |
424 |
* |
425 |
int main(int argc, char * argv[]) |
426 |
{ |
427 |
int i; |
428 |
s = (char *) malloc(i_max+1); |
429 |
for (i = 1; i < argc; i++) |
430 |
{ |
431 |
FILE * f = fopen(argv[i],"r"); |
432 |
if (f == 0) { fprintf(stderr,"File %s not found\n",argv[i]); exit(1); } |
433 |
stemfile(f); |
434 |
} |
435 |
free(s); |
436 |
return 0; |
437 |
} |
438 |
*/ |