Link Here
|
|
|
1 |
--- webalizer.c.a-urasim Wed Apr 17 07:11:31 2002 |
2 |
+++ webalizer.c Tue Dec 23 23:26:23 2003 |
3 |
@@ -39,6 +39,7 @@ |
4 |
#include <sys/utsname.h> |
5 |
#include <sys/times.h> |
6 |
#include <zlib.h> |
7 |
+#include <iconv.h> |
8 |
|
9 |
/* ensure getopt */ |
10 |
#ifdef HAVE_GETOPT_H |
11 |
@@ -224,6 +225,8 @@ |
12 |
char *f_cp=f_buf+GZ_BUFSIZE; /* pointer into the buffer */ |
13 |
int f_end; /* count to end of buffer */ |
14 |
|
15 |
+iconv_t cd_from_sjis, cd_from_utf8; |
16 |
+ |
17 |
/*********************************************/ |
18 |
/* MAIN - start here */ |
19 |
/*********************************************/ |
20 |
@@ -526,6 +529,9 @@ |
21 |
|
22 |
start_time = times(&mytms); |
23 |
|
24 |
+ cd_from_sjis = iconv_open("EUC-JP", "Shift_JIS"); |
25 |
+ cd_from_utf8 = iconv_open("EUC-JP", "UTF-8"); |
26 |
+ |
27 |
/*********************************************/ |
28 |
/* MAIN PROCESS LOOP - read through log file */ |
29 |
/*********************************************/ |
30 |
@@ -1345,6 +1351,9 @@ |
31 |
if (dns_db) close_cache(); |
32 |
#endif |
33 |
|
34 |
+ iconv_close(cd_from_sjis); |
35 |
+ iconv_close(cd_from_utf8); |
36 |
+ |
37 |
/* Whew, all done! Exit with completion status (0) */ |
38 |
exit(0); |
39 |
} |
40 |
@@ -1773,6 +1782,23 @@ |
41 |
|
42 |
if (!str) return NULL; /* make sure strings valid */ |
43 |
|
44 |
+ while(*cp1){ /* for apache log's escape code. */ |
45 |
+ if(*cp1 == '\\' && *(cp1+1) == 'x' && |
46 |
+ isxdigit(*(cp1+2)) && isxdigit(*(cp1+3))){ |
47 |
+ *cp2 = from_hex(*(cp1+2))*16 + from_hex(*(cp1+3)); |
48 |
+ if ((*cp2<32)||(*cp2==127)) *cp2='_'; |
49 |
+ cp1+=4; cp2++; |
50 |
+ |
51 |
+ } |
52 |
+ else if(*cp1 == '\\' && *(cp1+1) == '\\'){ |
53 |
+ *cp2++='\\'; |
54 |
+ cp1+=2; |
55 |
+ } |
56 |
+ else *cp2++ = *cp1++; |
57 |
+ } |
58 |
+ *cp2=*cp1; |
59 |
+ |
60 |
+ cp1=cp2=str; |
61 |
while (*cp1) |
62 |
{ |
63 |
if (*cp1=='%') /* Found an escape? */ |
64 |
@@ -1783,7 +1809,7 @@ |
65 |
if (*cp1) *cp2=from_hex(*cp1++)*16; /* convert hex to an ascii */ |
66 |
if (*cp1) *cp2+=from_hex(*cp1); /* (hopefully) character */ |
67 |
if ((*cp2<32)||(*cp2==127)) *cp2='_'; /* make '_' if its bad */ |
68 |
- if (*cp1) cp2++; cp1++; |
69 |
+ if (*cp1){ cp2++; cp1++;} /* bug? */ |
70 |
} |
71 |
else *cp2++='%'; |
72 |
} |
73 |
@@ -1793,6 +1819,116 @@ |
74 |
return str; /* return the string */ |
75 |
} |
76 |
|
77 |
/*********************************************/
/* SCORE_EUCJ - rate a string as EUC-JP      */
/*********************************************/

/*
 * Scans the NUL-terminated byte string 'str' and rates how plausible it
 * is as EUC-JP text.
 *
 * Returns -1 when 'str' is NULL, when a byte does not fit the layout
 * checked below, or when the string ends in the middle of a multibyte
 * character.  Otherwise returns a score:
 *   +1 for each printable ASCII byte (0x20-0x7e),
 *   +2 for each completed two-byte character (both bytes 0xa1-0xfe),
 *      including the pair that follows a 0x8f prefix,
 *    0 for half-width kana (0x8e + 0xa1-0xdf) and for bytes below 0x20.
 */
int score_eucj(unsigned char *str)
{
  enum { EUCJ_IDLE, EUCJ_KANJI_TRAIL, EUCJ_KANA_TRAIL, EUCJ_HOJO_LEAD };
  int state = EUCJ_IDLE;
  int score = 0;

  if (str == NULL) return -1;

  for (; *str != 0; str++) {
    const unsigned char c = *str;

    switch (state) {
    case EUCJ_IDLE:
      if (c >= 0x20 && c <= 0x7e) score++;                       /* ASCII */
      else if (c >= 0xa1 && c <= 0xfe) state = EUCJ_KANJI_TRAIL; /* kanji, 1st byte */
      else if (c == 0x8f) state = EUCJ_HOJO_LEAD;                /* supplementary kanji prefix */
      else if (c == 0x8e) state = EUCJ_KANA_TRAIL;               /* half-width kana prefix */
      else if (c >= 0x20) return -1;                             /* 0x7f, 0x80-0x8d, 0x90-0xa0, 0xff */
      /* bytes below 0x20 are tolerated and score nothing */
      break;

    case EUCJ_HOJO_LEAD:
      /* the 0x8f prefix must be followed by a full two-byte character */
      if (c < 0xa1 || c > 0xfe) return -1;
      state = EUCJ_KANJI_TRAIL;
      break;

    case EUCJ_KANJI_TRAIL:
      if (c < 0xa1 || c > 0xfe) return -1;
      score += 2;
      state = EUCJ_IDLE;
      break;

    case EUCJ_KANA_TRAIL:
      if (c < 0xa1 || c > 0xdf) return -1;
      state = EUCJ_IDLE;
      break;

    default:
      return -1;
    }
  }

  /* a pending trail byte means the string was cut inside a character */
  return (state == EUCJ_IDLE) ? score : -1;
}
109 |
+ |
110 |
/*********************************************/
/* SCORE_SJIS - rate a string as Shift_JIS   */
/*********************************************/

/*
 * Scans the NUL-terminated byte string 'str' and rates how plausible it
 * is as Shift_JIS text.
 *
 * Returns -1 when 'str' is NULL, when a byte does not fit the layout
 * checked below, or when the string ends right after a lead byte.
 * Otherwise returns a score:
 *   +1 for each printable ASCII byte (0x20-0x7e),
 *   +2 for each completed two-byte character
 *      (lead 0x81-0x9f or 0xe0-0xfc, trail 0x40-0x7e or 0x80-0xfc),
 *    0 for single-byte kana (0xa1-0xdf) and for bytes below 0x20.
 */
int score_sjis(unsigned char *str)
{
  int want_trail = 0;   /* non-zero while a lead byte awaits its trail byte */
  int score = 0;

  if (str == NULL) return -1;

  for (; *str != 0; str++) {
    const unsigned char c = *str;

    if (want_trail) {
      if (!((c >= 0x40 && c <= 0x7e) || (c >= 0x80 && c <= 0xfc)))
        return -1;
      score += 2;
      want_trail = 0;
      continue;
    }

    if (c >= 0x20 && c <= 0x7e)
      score++;                                  /* ASCII */
    else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xfc))
      want_trail = 1;                           /* two-byte lead */
    else if (c >= 0xa1 && c <= 0xdf)
      ;                                         /* single-byte kana: accepted, unscored */
    else if (c < 0x20)
      ;                                         /* control byte: accepted, unscored */
    else
      return -1;                                /* 0x7f, 0x80, 0xa0, 0xfd-0xff */
  }

  /* a dangling lead byte means the string was cut inside a character */
  return want_trail ? -1 : score;
}
138 |
+ |
139 |
/*********************************************/
/* SCORE_UTF8 - rate a string as UTF-8       */
/*********************************************/

/*
 * Scans the NUL-terminated byte string 'str' and rates how plausible it
 * is as UTF-8 text.
 *
 * Returns -1 when 'str' is NULL, when a byte does not fit the layout
 * checked below, or when the string ends in the middle of a multibyte
 * sequence.  Otherwise returns a score:
 *   +1 for each printable ASCII byte (0x20-0x7e),
 *   +1 for each completed 2-byte sequence (lead 0xc0-0xdf),
 *   +3 for each completed 3-byte sequence (lead 0xe0-0xef),
 *   +4 for each completed 4-byte sequence (lead 0xf0-0xf7),
 *    0 for bytes below 0x20.
 * Every continuation byte must lie in 0x80-0xbf.
 */
int score_utf8(unsigned char *str)
{
  int pending = 0;   /* continuation bytes still expected */
  int award = 0;     /* points granted once the sequence completes */
  int score = 0;

  if (str == NULL) return -1;

  for (; *str != 0; str++) {
    const unsigned char c = *str;

    if (pending > 0) {
      if (c < 0x80 || c > 0xbf) return -1;
      if (--pending == 0) score += award;
      continue;
    }

    if (c >= 0x20 && c <= 0x7e) {
      score++;                                  /* ASCII */
    } else if (c >= 0xc0 && c <= 0xdf) {
      pending = 1; award = 1;                   /* 2-byte sequence */
    } else if (c >= 0xe0 && c <= 0xef) {
      pending = 2; award = 3;                   /* 3-byte sequence */
    } else if (c >= 0xf0 && c <= 0xf7) {
      pending = 3; award = 4;                   /* 4-byte sequence */
    } else if (c >= 0x20) {
      return -1;                                /* 0x7f, stray 0x80-0xbf, 0xf8-0xff */
    }
    /* bytes below 0x20 are tolerated and score nothing */
  }

  /* leftover continuation bytes mean the string was cut inside a sequence */
  return (pending == 0) ? score : -1;
}
185 |
+ |
186 |
+ |
187 |
/*********************************************/ |
188 |
/* SRCH_STRING - get search strings from ref */ |
189 |
/*********************************************/ |
190 |
@@ -1804,6 +1940,10 @@ |
191 |
char srch[80]=""; |
192 |
unsigned char *cp1, *cp2, *cps; |
193 |
int sp_flg=0; |
194 |
+ int sjis, eucj, utf8; |
195 |
+ char tmpbuf2[BUFSIZE]; |
196 |
+ size_t inlen, outlen; |
197 |
+ unsigned char *cp3; |
198 |
|
199 |
/* Check if search engine referrer or return */ |
200 |
if ( (cps=isinglist(search_list,log_rec.refer))==NULL) return; |
201 |
@@ -1839,9 +1978,39 @@ |
202 |
cp1=cp2+strlen(cp2)-1; |
203 |
while (cp1!=cp2) if (isspace(*cp1)) *cp1--='\0'; else break; |
204 |
|
205 |
+ utf8=score_utf8(cp2); |
206 |
+ sjis=score_sjis(cp2); |
207 |
+ eucj=score_eucj(cp2); |
208 |
+ if(utf8 >= sjis && utf8 >= eucj){ |
209 |
+ iconv(cd_from_utf8, NULL, 0, NULL, 0); |
210 |
+ cp3 = cp2; |
211 |
+ inlen = strlen(cp2)+1; |
212 |
+ cp1 = tmpbuf2; |
213 |
+ outlen = sizeof(tmpbuf2); |
214 |
+ if(iconv(cd_from_utf8, (char **)&cp3, &inlen, (char**)&cp1, &outlen) >= 0 && |
215 |
+ inlen == 0){ |
216 |
+ cp2 = tmpbuf2; |
217 |
+ } |
218 |
+ } |
219 |
+ else if(sjis > utf8 && sjis > eucj){ |
220 |
+ iconv(cd_from_sjis, NULL, 0, NULL, 0); |
221 |
+ cp3 = cp2; |
222 |
+ inlen = strlen(cp2)+1; |
223 |
+ cp1 = tmpbuf2; |
224 |
+ outlen = sizeof(tmpbuf2); |
225 |
+ if(iconv(cd_from_sjis, (char **)&cp3, &inlen, (char**)&cp1, &outlen) >= 0 && |
226 |
+ inlen == 0){ |
227 |
+ cp2 = tmpbuf2; |
228 |
+ } |
229 |
+ } |
230 |
+ |
231 |
/* strip invalid chars */ |
232 |
cp1=cp2; |
233 |
- while (*cp1!=0) { if ((*cp1<32)||(*cp1==127)) *cp1='_'; cp1++; } |
234 |
+ while (*cp1!=0) { |
235 |
+ if ((*cp1<32)||(*cp1==127)) *cp1='_'; |
236 |
+ *cp1=tolower(*cp1); |
237 |
+ cp1++; |
238 |
+ } |
239 |
|
240 |
if (put_snode(cp2,(u_long)1,sr_htab)) |
241 |
{ |