Lines 1-1148
Link Here
|
1 |
diff -uNr lib/Mail/SpamAssassin/HTML.pm lib/Mail/SpamAssassin/HTML.pm |
|
|
2 |
--- lib/Mail/SpamAssassin/HTML.pm 2011-06-07 08:59:17.000000000 +0900 |
3 |
+++ lib/Mail/SpamAssassin/HTML.pm 2011-07-14 22:35:46.000000000 +0900 |
4 |
@@ -84,7 +84,7 @@ |
5 |
$ok_attributes{span}{$_} = 1 for qw( style ); |
6 |
|
7 |
sub new { |
8 |
- my ($class) = @_; |
9 |
+ my ($class, $opts) = @_; |
10 |
my $self = $class->SUPER::new( |
11 |
api_version => 3, |
12 |
handlers => [ |
13 |
@@ -97,6 +97,7 @@ |
14 |
declaration => ["html_declaration", "self,text"], |
15 |
], |
16 |
marked_sections => 1); |
17 |
+ $self->{normalize} = $opts->{'normalize'} || 0; |
18 |
|
19 |
$self; |
20 |
} |
21 |
@@ -672,7 +673,14 @@ |
22 |
} |
23 |
} |
24 |
else { |
25 |
- $text =~ s/[ \t\n\r\f\x0b\xa0]+/ /g; |
26 |
+ if ($self->{normalize}) { |
27 |
+ $text =~ s/\xc2\xa0/ /g; # no-break space |
28 |
+ $text =~ s/\xe3\x80\x80/ /g; # ideographicspace |
29 |
+ $text =~ s/[ \t\n\r\f\x0b]+/ /g; |
30 |
+ } |
31 |
+ else { |
32 |
+ $text =~ s/[ \t\n\r\f\x0b\xa0]+/ /g; |
33 |
+ } |
34 |
# trim leading whitespace if previous element was whitespace |
35 |
# and current element is not invisible |
36 |
if (@{ $self->{text} } && !$display{invisible} && |
37 |
diff -uNr lib/Mail/SpamAssassin/Message/Node.pm lib/Mail/SpamAssassin/Message/Node.pm |
38 |
--- lib/Mail/SpamAssassin/Message/Node.pm 2011-06-07 08:59:17.000000000 +0900 |
39 |
+++ lib/Mail/SpamAssassin/Message/Node.pm 2011-07-14 22:35:46.000000000 +0900 |
40 |
@@ -42,6 +42,7 @@ |
41 |
use Mail::SpamAssassin::Constants qw(:sa); |
42 |
use Mail::SpamAssassin::HTML; |
43 |
use Mail::SpamAssassin::Logger; |
44 |
+use Mail::SpamAssassin::Util::Charset; |
45 |
|
46 |
=item new() |
47 |
|
48 |
@@ -387,27 +388,10 @@ |
49 |
|
50 |
sub _normalize { |
51 |
my ($self, $data, $charset) = @_; |
52 |
- return $data unless $self->{normalize}; |
53 |
+ return wantarray ? ($data, $charset) : $data unless $self->{normalize}; |
54 |
|
55 |
- my $detected = Encode::Detect::Detector::detect($data); |
56 |
- |
57 |
- my $converter; |
58 |
- |
59 |
- if ($charset && $charset !~ /^us-ascii$/i && |
60 |
- ($detected || 'none') !~ /^(?:UTF|EUC|ISO-2022|Shift_JIS|Big5|GB)/i) { |
61 |
- dbg("message: Using labeled charset $charset"); |
62 |
- $converter = Encode::find_encoding($charset); |
63 |
- } |
64 |
- |
65 |
- $converter = Encode::find_encoding($detected) unless $converter || !defined($detected); |
66 |
- |
67 |
- return $data unless $converter; |
68 |
- |
69 |
- dbg("message: Converting..."); |
70 |
- |
71 |
- my $rv = $converter->decode($data, 0); |
72 |
- utf8::downgrade($rv, 1); |
73 |
- return $rv |
74 |
+ my ($decoded_data, $detected_charset) = normalize_charset($data, $charset); |
75 |
+ return wantarray ? ($decoded_data, $detected_charset) : $decoded_data; |
76 |
} |
77 |
|
78 |
=item rendered() |
79 |
@@ -430,8 +414,12 @@ |
80 |
# text/x-aol is ignored here, but looks like text/html ... |
81 |
return(undef,undef) unless ( $self->{'type'} =~ /^text\/(?:plain|html)$/i ); |
82 |
|
83 |
- my $text = $self->_normalize($self->decode(), $self->{charset}); |
84 |
+ my ($text, $charset) = $self->_normalize($self->decode(), $self->{charset}); |
85 |
my $raw = length($text); |
86 |
+ if ($self->{normalize}) { |
87 |
+ $self->{charset} = $charset; |
88 |
+ $self->{language} = get_language($text, $charset); |
89 |
+ } |
90 |
|
91 |
# render text/html always, or any other text|text/plain part as text/html |
92 |
# based on a heuristic which simulates a certain common mail client |
93 |
@@ -441,7 +429,7 @@ |
94 |
{ |
95 |
$self->{rendered_type} = 'text/html'; |
96 |
|
97 |
- my $html = Mail::SpamAssassin::HTML->new(); # object |
98 |
+ my $html = Mail::SpamAssassin::HTML->new({normalize=>$self->{normalize}}); # object |
99 |
$html->parse($text); # parse+render text |
100 |
$self->{rendered} = $html->get_rendered_text(); |
101 |
$self->{visible_rendered} = $html->get_rendered_text(invisible => 0); |
102 |
diff -uNr lib/Mail/SpamAssassin/Message.pm lib/Mail/SpamAssassin/Message.pm |
103 |
--- lib/Mail/SpamAssassin/Message.pm 2011-06-07 08:59:17.000000000 +0900 |
104 |
+++ lib/Mail/SpamAssassin/Message.pm 2011-07-14 22:35:46.000000000 +0900 |
105 |
@@ -559,6 +559,8 @@ |
106 |
delete $self->{'pristine_headers'}; |
107 |
delete $self->{'line_ending'}; |
108 |
delete $self->{'missing_head_body_separator'}; |
109 |
+ delete $self->{'charset'}; |
110 |
+ delete $self->{'language'}; |
111 |
|
112 |
my @toclean = ( $self ); |
113 |
|
114 |
@@ -585,6 +587,8 @@ |
115 |
delete $part->{'invisible_rendered'}; |
116 |
delete $part->{'type'}; |
117 |
delete $part->{'rendered_type'}; |
118 |
+ delete $self->{'charset'}; |
119 |
+ delete $self->{'language'}; |
120 |
|
121 |
# if there are children nodes, add them to the queue of nodes to clean up |
122 |
if (exists $part->{'body_parts'}) { |
123 |
@@ -1014,7 +1018,14 @@ |
124 |
|
125 |
# whitespace handling (warning: small changes have large effects!) |
126 |
$text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed |
127 |
- $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space |
128 |
+ if ($self->{normalize}) { |
129 |
+ $text =~ s/\xc2\xa0/ /g; # no-break space => space |
130 |
+ $text =~ s/\xe3\x80\x80/ /g; # ideographicspace => space |
131 |
+ $text =~ tr/ \t\n\r\x0b/ /s; # whitespace => space |
132 |
+ } |
133 |
+ else { |
134 |
+ $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space |
135 |
+ } |
136 |
$text =~ tr/\f/\n/; # form feeds => newline |
137 |
|
138 |
# warn "message: $text"; |
139 |
@@ -1071,7 +1082,14 @@ |
140 |
|
141 |
# whitespace handling (warning: small changes have large effects!) |
142 |
$text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed |
143 |
- $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space |
144 |
+ if ($self->{normalize}) { |
145 |
+ $text =~ s/\xc2\xa0/ /g; # no-break space => space |
146 |
+ $text =~ s/\xe3\x80\x80/ /g; # ideographicspace => space |
147 |
+ $text =~ tr/ \t\n\r\x0b/ /s; # whitespace => space |
148 |
+ } |
149 |
+ else { |
150 |
+ $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space |
151 |
+ } |
152 |
$text =~ tr/\f/\n/; # form feeds => newline |
153 |
|
154 |
my @textary = split_into_array_of_short_lines ($text); |
155 |
@@ -1122,7 +1140,14 @@ |
156 |
|
157 |
# whitespace handling (warning: small changes have large effects!) |
158 |
$text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed |
159 |
- $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space |
160 |
+ if ($self->{normalize}) { |
161 |
+ $text =~ s/\xc2\xa0/ /g; # no-break space => space |
162 |
+ $text =~ s/\xe3\x80\x80/ /g; # ideographicspace => space |
163 |
+ $text =~ tr/ \t\n\r\x0b/ /s; # whitespace => space |
164 |
+ } |
165 |
+ else { |
166 |
+ $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space |
167 |
+ } |
168 |
$text =~ tr/\f/\n/; # form feeds => newline |
169 |
|
170 |
my @textary = split_into_array_of_short_lines ($text); |
171 |
@@ -1198,6 +1223,28 @@ |
172 |
|
173 |
# --------------------------------------------------------------------------- |
174 |
|
175 |
+sub get_language { |
176 |
+ my ($self) = @_; |
177 |
+ |
178 |
+ if (defined $self->{language}) { return $self->{language}; } |
179 |
+ my @parts = $self->find_parts(qr/^(?:text|message)\b/i,1); |
180 |
+ return '' unless @parts; |
181 |
+ |
182 |
+ # Go through each part |
183 |
+ my @langs; |
184 |
+ for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) { |
185 |
+ my $p = $parts[$pt]; |
186 |
+ my $lang = $p->{language}; |
187 |
+ next unless ($lang); |
188 |
+ push(@langs, $lang) unless (grep(/^$lang$/, @langs)) |
189 |
+ } |
190 |
+ $self->{language} = scalar(@langs) ? join(' ', @langs) : ''; |
191 |
+ return $self->{language}; |
192 |
+} |
193 |
+ |
194 |
+# --------------------------------------------------------------------------- |
195 |
+ |
196 |
+ |
197 |
1; |
198 |
|
199 |
=back |
200 |
diff -uNr lib/Mail/SpamAssassin/PerMsgStatus.pm lib/Mail/SpamAssassin/PerMsgStatus.pm |
201 |
--- lib/Mail/SpamAssassin/PerMsgStatus.pm 2011-06-07 08:59:17.000000000 +0900 |
202 |
+++ lib/Mail/SpamAssassin/PerMsgStatus.pm 2011-07-14 22:35:46.000000000 +0900 |
203 |
@@ -53,6 +53,7 @@ |
204 |
use warnings; |
205 |
use re 'taint'; |
206 |
|
207 |
+use Encode; |
208 |
use Time::HiRes qw(time); |
209 |
|
210 |
use Mail::SpamAssassin::Constants qw(:sa); |
211 |
@@ -733,19 +734,41 @@ |
212 |
|
213 |
# the report charset |
214 |
my $report_charset = "; charset=iso-8859-1"; |
215 |
- if ($self->{conf}->{report_charset}) { |
216 |
- $report_charset = "; charset=" . $self->{conf}->{report_charset}; |
217 |
- } |
218 |
|
219 |
# the SpamAssassin report |
220 |
my $report = $self->get_report(); |
221 |
+ if ($self->{conf}->{report_charset}) { |
222 |
+ $report_charset = "; charset=" . $self->{conf}->{report_charset}; |
223 |
+ } |
224 |
|
225 |
# If there are any wide characters, need to MIME-encode in UTF-8 |
226 |
# TODO: If $report_charset is something other than iso-8859-1/us-ascii, then |
227 |
# we could try converting to that charset if possible |
228 |
- unless ($] < 5.008 || utf8::downgrade($report, 1)) { |
229 |
+ my $is_utf8 = 0; |
230 |
+ if ($self->{conf}->{normalize_charset}) { |
231 |
+ $report = Encode::decode_utf8($report); |
232 |
+ $is_utf8 = 1; |
233 |
+ } |
234 |
+ else { |
235 |
+ if ($self->{msg}->{charset}) { |
236 |
+ eval { |
237 |
+ my $scratch = $report; |
238 |
+ $report = Encode::decode($self->{msg}->{charset},$scratch,Encode::FB_CROAK); |
239 |
+ $is_utf8 = 1; |
240 |
+ }; |
241 |
+ } |
242 |
+ } |
243 |
+ if ($is_utf8) { |
244 |
+ $is_utf8 = 1; |
245 |
+ eval { |
246 |
+ my $scratch = $report; |
247 |
+ $report = Encode::encode($self->{conf}->{report_charset},$scratch,Encode::FB_CROAK); |
248 |
+ $is_utf8 = 0; |
249 |
+ }; |
250 |
+ if ($is_utf8) { |
251 |
+ $report = Encode::encode_utf8($report); |
252 |
$report_charset = "; charset=utf-8"; |
253 |
- utf8::encode($report); |
254 |
+ } |
255 |
} |
256 |
|
257 |
# get original headers, "pristine" if we can do it |
258 |
diff -uNr lib/Mail/SpamAssassin/Plugin/Bayes.pm lib/Mail/SpamAssassin/Plugin/Bayes.pm |
259 |
--- lib/Mail/SpamAssassin/Plugin/Bayes.pm 2011-06-07 08:59:17.000000000 +0900 |
260 |
+++ lib/Mail/SpamAssassin/Plugin/Bayes.pm 2011-07-14 22:35:46.000000000 +0900 |
261 |
@@ -223,6 +223,15 @@ |
262 |
# will require a longer token than English ones.) |
263 |
use constant MAX_TOKEN_LENGTH => 15; |
264 |
|
265 |
+# Skip if a token is too short. |
266 |
+our $SKIP_UTF8_SHORT_TOKENS_RE = qr{(?: |
267 |
+ [\x00-\x7F] # 1 byte |
268 |
+ | [\xC0-\xDF][\x80-\xBF] # 2 bytes |
269 |
+ | [\xE0-\xEF][\x80-\xBF]{2} # 3 bytes |
270 |
+ | [\xF0-\xF7][\x80-\xBF]{3} # 4 bytes |
271 |
+ | (?:\xE3[\x81-\x83][\x80-\xBF]){2} # 2 characters of Hiragana and Katakana |
272 |
+)}x; |
273 |
+ |
274 |
########################################################################### |
275 |
|
276 |
sub new { |
277 |
@@ -983,9 +992,28 @@ |
278 |
$msgdata->{bayes_token_body} = $msg->{msg}->get_visible_rendered_body_text_array(); |
279 |
$msgdata->{bayes_token_inviz} = $msg->{msg}->get_invisible_rendered_body_text_array(); |
280 |
@{$msgdata->{bayes_token_uris}} = $msg->get_uri_list(); |
281 |
+ if ($self->{conf}->{normalize_charset}) { |
282 |
+ my $tokenizer = $self->get_tokenizer($msg); |
283 |
+ if (ref($tokenizer)) { |
284 |
+ $msgdata->{bayes_token_body} = $tokenizer->tokenize($msgdata->{bayes_token_body}); |
285 |
+ $msgdata->{bayes_token_inviz} = $tokenizer->tokenize($msgdata->{bayes_token_inviz}); |
286 |
+ } |
287 |
+ } |
288 |
return $msgdata; |
289 |
} |
290 |
|
291 |
+sub get_tokenizer { |
292 |
+ my ($self, $msg) = @_; |
293 |
+ |
294 |
+ my $tokenizer; |
295 |
+ my @languages = split(/\s+/, $msg->{msg}->get_language()); |
296 |
+ foreach my $lang (@languages) { |
297 |
+ $tokenizer = $self->{'conf'}->{'tokenizer'}->{$lang}; |
298 |
+ last if (ref($tokenizer)); |
299 |
+ } |
300 |
+ return $tokenizer; |
301 |
+} |
302 |
+ |
303 |
########################################################################### |
304 |
|
305 |
# The calling functions expect a uniq'ed array of tokens ... |
306 |
@@ -1039,7 +1067,7 @@ |
307 |
# include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings, |
308 |
# and ISO-8859-15 alphas. Do not split on @'s; better results keeping it. |
309 |
# Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!" |
310 |
- tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs; |
311 |
+ tr/-A-Za-z0-9,\@\*\!_'"\$.\200-\377 / /cs; |
312 |
|
313 |
# DO split on "..." or "--" or "---"; common formatting error resulting in |
314 |
# hapaxes. Keep the separator itself as a token, though, as long ones can |
315 |
@@ -1068,6 +1096,11 @@ |
316 |
# |
317 |
next if ( defined $magic_re && $token =~ /$magic_re/ ); |
318 |
|
319 |
+ # Skip short UTF-8 tokens. |
320 |
+ if ($self->{conf}->{normalize_charset}) { |
321 |
+ next if ($token =~ /^$SKIP_UTF8_SHORT_TOKENS_RE$/o); |
322 |
+ } |
323 |
+ |
324 |
# *do* keep 3-byte tokens; there's some solid signs in there |
325 |
my $len = length($token); |
326 |
|
327 |
@@ -1096,14 +1129,16 @@ |
328 |
# the domain ".net" appeared in the To header. |
329 |
# |
330 |
if ($len > MAX_TOKEN_LENGTH && $token !~ /\*/) { |
331 |
- if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) { |
332 |
- # Matt sez: "Could be asian? Autrijus suggested doing character ngrams, |
333 |
- # but I'm doing tuples to keep the dbs small(er)." Sounds like a plan |
334 |
- # to me! (jm) |
335 |
- while ($token =~ s/^(..?)//) { |
336 |
- push (@rettokens, "8:$1"); |
337 |
- } |
338 |
- next; |
339 |
+ unless ($self->{conf}->{normalize_charset}) { |
340 |
+ if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) { |
341 |
+ # Matt sez: "Could be asian? Autrijus suggested doing character ngrams, |
342 |
+ # but I'm doing tuples to keep the dbs small(er)." Sounds like a plan |
343 |
+ # to me! (jm) |
344 |
+ while ($token =~ s/^(..?)//) { |
345 |
+ push (@rettokens, "8:$1"); |
346 |
+ } |
347 |
+ next; |
348 |
+ } |
349 |
} |
350 |
|
351 |
if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS) |
352 |
diff -uNr lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm |
353 |
--- lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm 1970-01-01 09:00:00.000000000 +0900 |
354 |
+++ lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm 2011-07-14 22:29:19.000000000 +0900 |
355 |
@@ -0,0 +1,84 @@ |
356 |
+# <@LICENSE> |
357 |
+# Copyright 2004 Apache Software Foundation |
358 |
+# |
359 |
+# Licensed under the Apache License, Version 2.0 (the "License"); |
360 |
+# you may not use this file except in compliance with the License. |
361 |
+# You may obtain a copy of the License at |
362 |
+# |
363 |
+# http://www.apache.org/licenses/LICENSE-2.0 |
364 |
+# |
365 |
+# Unless required by applicable law or agreed to in writing, software |
366 |
+# distributed under the License is distributed on an "AS IS" BASIS, |
367 |
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
368 |
+# See the License for the specific language governing permissions and |
369 |
+# limitations under the License. |
370 |
+# </@LICENSE> |
371 |
+ |
372 |
+=head1 NAME |
373 |
+ |
374 |
+Tokenizer::MeCab - Japanese tokenizer with MeCab |
375 |
+ |
376 |
+=head1 SYNOPSIS |
377 |
+ |
378 |
+loadplugin Mail::SpamAssassin::Plugin::Tokenizer::MeCab |
379 |
+ |
380 |
+=head1 DESCRIPTION |
381 |
+ |
382 |
+This plugin tokenizes a Japanese string with MeCab that is |
383 |
+the morphological analysis engine. |
384 |
+ |
385 |
+Text::MeCab 0.12 or over is required. |
386 |
+ |
387 |
+=cut |
388 |
+ |
389 |
+package Mail::SpamAssassin::Plugin::Tokenizer::MeCab; |
390 |
+ |
391 |
+use strict; |
392 |
+use warnings; |
393 |
+use Mail::SpamAssassin::Plugin::Tokenizer; |
394 |
+ |
395 |
+use vars qw(@ISA); |
396 |
+@ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer); |
397 |
+ |
398 |
+# Have to do this so that RPM doesn't find these as required perl modules |
399 |
+BEGIN { require MeCab; } |
400 |
+our $language = 'ja'; |
401 |
+our $mecab = new MeCab::Tagger(-Ochasen); |
402 |
+ |
403 |
+sub new { |
404 |
+ my $class = shift; |
405 |
+ my $mailsaobject = shift; |
406 |
+ |
407 |
+ $class = ref($class) || $class; |
408 |
+ my $self = $class->SUPER::new($mailsaobject, $language); |
409 |
+ bless ($self, $class); |
410 |
+ |
411 |
+ return $self; |
412 |
+} |
413 |
+ |
414 |
+sub tokenize { |
415 |
+ my $self = shift; |
416 |
+ my $text_array = shift; |
417 |
+ |
418 |
+ my @tokenized_array; |
419 |
+ foreach my $text (@$text_array) { |
420 |
+ next unless ($text); |
421 |
+ $text =~ s/([\x80-\xFF]{3,})/&_tokenize($1)/eg; |
422 |
+ push(@tokenized_array, $text); |
423 |
+ } |
424 |
+ return \@tokenized_array; |
425 |
+} |
426 |
+ |
427 |
+sub _tokenize { |
428 |
+ my $text = shift; |
429 |
+ |
430 |
+ my @buf; |
431 |
+ for (my $node = $mecab->parseToNode($text); $node->{next}; $node = $node->{next}) { |
432 |
+ push(@buf, $node->{surface}); |
433 |
+ } |
434 |
+ my $tokenized = join(' ', @buf) . ' '; |
435 |
+ return $tokenized; |
436 |
+} |
437 |
+ |
438 |
+1; |
439 |
+ |
440 |
diff -uNr lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm |
441 |
--- lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm 1970-01-01 09:00:00.000000000 +0900 |
442 |
+++ lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm 2011-07-14 22:29:19.000000000 +0900 |
443 |
@@ -0,0 +1,111 @@ |
444 |
+# <@LICENSE> |
445 |
+# Copyright 2004 Apache Software Foundation |
446 |
+# |
447 |
+# Licensed under the Apache License, Version 2.0 (the "License"); |
448 |
+# you may not use this file except in compliance with the License. |
449 |
+# You may obtain a copy of the License at |
450 |
+# |
451 |
+# http://www.apache.org/licenses/LICENSE-2.0 |
452 |
+# |
453 |
+# Unless required by applicable law or agreed to in writing, software |
454 |
+# distributed under the License is distributed on an "AS IS" BASIS, |
455 |
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
456 |
+# See the License for the specific language governing permissions and |
457 |
+# limitations under the License. |
458 |
+# </@LICENSE> |
459 |
+ |
460 |
+=head1 NAME |
461 |
+ |
462 |
+Tokenizer::SimpleJA - simple Japanese tokenizer |
463 |
+ |
464 |
+=head1 SYNOPSIS |
465 |
+ |
466 |
+loadplugin Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA |
467 |
+ |
468 |
+=head1 DESCRIPTION |
469 |
+ |
470 |
+This plugin simply tokenizes a Japanese string by characters other than |
471 |
+the alphabet, the Chinese character, and the katakana. |
472 |
+ |
473 |
+=cut |
474 |
+ |
475 |
+package Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA; |
476 |
+ |
477 |
+use strict; |
478 |
+use warnings; |
479 |
+use Mail::SpamAssassin::Plugin::Tokenizer; |
480 |
+ |
481 |
+use vars qw(@ISA); |
482 |
+@ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer); |
483 |
+ |
484 |
+our $language = 'ja'; |
485 |
+ |
486 |
+our $RE = qr{( |
487 |
+ # Hiragana |
488 |
+ (?: |
489 |
+ \xE3\x81[\x80-\xBF] |
490 |
+ | \xE3\x82[\x80-\x9F] |
491 |
+ )+ |
492 |
+ # Katakana |
493 |
+ | (?: |
494 |
+ \xE3\x82[\xA0-\xBF] |
495 |
+ | \xE3\x83[\x80-\xBF] |
496 |
+ )+ |
497 |
+ # Kanji |
498 |
+ | (?: |
499 |
+ \xE3[\x90-\xBF][\x80-\xBF] |
500 |
+ | [\xE4-\xE9][\x80-\xBF]{2} |
501 |
+ | \xEF[\xA4-\xAB][\x80-\xBF] |
502 |
+ )+ |
503 |
+ # Fullwidth |
504 |
+ | (?: |
505 |
+ \xEF\xBC[\x80-\xBF] |
506 |
+ | \xEF\xBD[\x80-\x9F] |
507 |
+ )+ |
508 |
+ # Others |
509 |
+ | [\xC0-\xDF][\x80-\xBF] |
510 |
+ | [\xE0-\xE2][\x80-\xBF]{2} |
511 |
+ | \xE3\x80[\x80-\xBF] |
512 |
+ | \xE3[\x84-\x8F][\x80-\xBF] |
513 |
+ | [\xEA-\xEE][\x80-\xBF]{2} |
514 |
+ | \xEF[\x80-\xA3][\x80-\xBF] |
515 |
+ | \xEF[\xAC-\xBB][\x80-\xBF] |
516 |
+ | \xEF\xBD[\xA0-\xBF] |
517 |
+ | \xEF[\xBE-\xBF][\x80-\xBF] |
518 |
+ | [\xF0-\xF7][\x80-\xBF]{3} |
519 |
+)}x; |
520 |
+ |
521 |
+sub new { |
522 |
+ my $class = shift; |
523 |
+ my $mailsaobject = shift; |
524 |
+ |
525 |
+ $class = ref($class) || $class; |
526 |
+ my $self = $class->SUPER::new($mailsaobject, $language); |
527 |
+ bless ($self, $class); |
528 |
+ |
529 |
+ return $self; |
530 |
+} |
531 |
+ |
532 |
+sub tokenize { |
533 |
+ my $self = shift; |
534 |
+ my $text_array = shift; |
535 |
+ |
536 |
+ my @tokenized_array; |
537 |
+ foreach my $text (@$text_array) { |
538 |
+ next unless ($text); |
539 |
+ $text =~ s/([\x80-\xFF]{3,})/&_tokenize($1)/eg; |
540 |
+ push(@tokenized_array, $text); |
541 |
+ } |
542 |
+ return \@tokenized_array; |
543 |
+} |
544 |
+ |
545 |
+sub _tokenize { |
546 |
+ my $text = shift; |
547 |
+ |
548 |
+ $text =~ s/$RE/$1 /og; |
549 |
+ $text = ' ' . $text; |
550 |
+ return $text; |
551 |
+} |
552 |
+ |
553 |
+1; |
554 |
+ |
555 |
diff -uNr lib/Mail/SpamAssassin/Plugin/Tokenizer.pm lib/Mail/SpamAssassin/Plugin/Tokenizer.pm |
556 |
--- lib/Mail/SpamAssassin/Plugin/Tokenizer.pm 1970-01-01 09:00:00.000000000 +0900 |
557 |
+++ lib/Mail/SpamAssassin/Plugin/Tokenizer.pm 2011-07-14 22:35:46.000000000 +0900 |
558 |
@@ -0,0 +1,115 @@ |
559 |
+# <@LICENSE> |
560 |
+# Copyright 2004 Apache Software Foundation |
561 |
+# |
562 |
+# Licensed under the Apache License, Version 2.0 (the "License"); |
563 |
+# you may not use this file except in compliance with the License. |
564 |
+# You may obtain a copy of the License at |
565 |
+# |
566 |
+# http://www.apache.org/licenses/LICENSE-2.0 |
567 |
+# |
568 |
+# Unless required by applicable law or agreed to in writing, software |
569 |
+# distributed under the License is distributed on an "AS IS" BASIS, |
570 |
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
571 |
+# See the License for the specific language governing permissions and |
572 |
+# limitations under the License. |
573 |
+# </@LICENSE> |
574 |
+ |
575 |
+=head1 NAME |
576 |
+ |
577 |
+Mail::SpamAssassin::Plugin::Tokenizer - Tokenizer plugin base class |
578 |
+ |
579 |
+=head1 SYNOPSIS |
580 |
+ |
581 |
+=head2 SpamAssassin configuration: |
582 |
+ |
583 |
+ loadplugin MyTokenizerPlugin /path/to/MyTokenizerPlugin.pm |
584 |
+ |
585 |
+=head2 Perl code: |
586 |
+ |
587 |
+ use Mail::SpamAssassin::Plugin::Tokenizer; |
588 |
+ use vars qw(@ISA); |
589 |
+ @ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer); |
590 |
+ # language to use this plugin |
591 |
+ our $language = 'ja'; |
592 |
+ |
593 |
+ # constructor: register language |
594 |
+ sub new { |
595 |
+ my $class = shift; |
596 |
+ my $mailsaobject = shift; |
597 |
+ |
598 |
+ # some boilerplate... |
599 |
+ $class = ref($class) || $class; |
600 |
+ my $self = $class->SUPER::new($mailsaobject, $language); |
601 |
+ bless ($self, $class); |
602 |
+ |
603 |
+ return $self; |
604 |
+ } |
605 |
+ |
606 |
+ # tokenize function |
607 |
+ sub tokenize { |
608 |
+ my $self = shift; |
609 |
+ my $text_array_ref = shift; |
610 |
+ |
611 |
+ ...... |
612 |
+ |
613 |
+ return $tokenized_array_ref; |
614 |
+ } |
615 |
+ |
616 |
+ |
617 |
+=head1 DESCRIPTION |
618 |
+ |
619 |
+This plugin is the base class of tokenizer plugin. |
620 |
+You must define tokenize() and $language |
621 |
+ |
622 |
+=head1 INTERFACE |
623 |
+ |
624 |
+ sub tokenize { |
625 |
+ my $self = shift; |
626 |
+ my $text_array_ref = shift; |
627 |
+ |
628 |
+ ...... |
629 |
+ |
630 |
+ return $tokenized_array_ref; |
631 |
+ } |
632 |
+ |
633 |
+=cut |
634 |
+ |
635 |
+package Mail::SpamAssassin::Plugin::Tokenizer; |
636 |
+ |
637 |
+use Mail::SpamAssassin::Plugin; |
638 |
+use Mail::SpamAssassin::Logger; |
639 |
+use strict; |
640 |
+use warnings; |
641 |
+use bytes; |
642 |
+ |
643 |
+use vars qw(@ISA); |
644 |
+@ISA = qw(Mail::SpamAssassin::Plugin); |
645 |
+ |
646 |
+sub new { |
647 |
+ my $class = shift; |
648 |
+ my $mailsaobject = shift; |
649 |
+ my $language = shift; |
650 |
+ |
651 |
+ # some boilerplate... |
652 |
+ $class = ref($class) || $class; |
653 |
+ my $self = $class->SUPER::new($mailsaobject); |
654 |
+ bless ($self, $class); |
655 |
+ |
656 |
+ if ($language) { |
657 |
+ $self->{main}->{conf}->{tokenizer}->{$language} = $self; |
658 |
+ } |
659 |
+ else { |
660 |
+ dbg("plugin: $self: \$language is not defined"); |
661 |
+ } |
662 |
+ |
663 |
+ return $self; |
664 |
+} |
665 |
+ |
666 |
+sub tokenize { |
667 |
+ my ($self, $ref) = @_; |
668 |
+ |
669 |
+ return $ref; |
670 |
+} |
671 |
+ |
672 |
+1; |
673 |
+ |
674 |
diff -uNr lib/Mail/SpamAssassin/Util/Charset.pm lib/Mail/SpamAssassin/Util/Charset.pm |
675 |
--- lib/Mail/SpamAssassin/Util/Charset.pm 1970-01-01 09:00:00.000000000 +0900 |
676 |
+++ lib/Mail/SpamAssassin/Util/Charset.pm 2011-07-14 22:29:19.000000000 +0900 |
677 |
@@ -0,0 +1,471 @@ |
678 |
+# <@LICENSE> |
679 |
+# Copyright 2006 Apache Software Foundation |
680 |
+# |
681 |
+# Licensed under the Apache License, Version 2.0 (the "License"); |
682 |
+# you may not use this file except in compliance with the License. |
683 |
+# You may obtain a copy of the License at |
684 |
+# |
685 |
+# http://www.apache.org/licenses/LICENSE-2.0 |
686 |
+# |
687 |
+# Unless required by applicable law or agreed to in writing, software |
688 |
+# distributed under the License is distributed on an "AS IS" BASIS, |
689 |
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
690 |
+# See the License for the specific language governing permissions and |
691 |
+# limitations under the License. |
692 |
+# </@LICENSE> |
693 |
+ |
694 |
+ |
695 |
+=head1 NAME |
696 |
+ |
697 |
+ Mail::SpamAssassin::Util::Charset.pm - Utility for charset and language |
698 |
+ |
699 |
+=head1 SYNOPSIS |
700 |
+ |
701 |
+ my ($decoded, $detected) = Mail::SpamAssassin::Util::Charset::normalize_charset($str, $charset); |
702 |
+ my $language = Mail::SpamAssassin::Util::Charset::get_language($str, $charset); |
703 |
+ |
704 |
+=head1 DESCRIPTION |
705 |
+ |
706 |
+This module implements utility methods for charset and language. |
707 |
+ |
708 |
+=cut |
709 |
+ |
710 |
+package Mail::SpamAssassin::Util::Charset; |
711 |
+ |
712 |
+use strict; |
713 |
+use warnings; |
714 |
+use Encode; |
715 |
+use Encode::Guess; |
716 |
+use Encode::Alias; |
717 |
+ |
718 |
+use vars qw ( |
719 |
+ @ISA @EXPORT |
720 |
+); |
721 |
+ |
722 |
+require Exporter; |
723 |
+ |
724 |
+@ISA = qw(Exporter); |
725 |
+@EXPORT = qw(normalize_charset get_language); |
726 |
+ |
727 |
+########################################################################### |
728 |
+ |
729 |
+use constant HAS_ENCODE_DETECT => eval { require Encode::Detect::Detector; }; |
730 |
+use constant HAS_ENCODE_HANEXTRA => eval { require Encode::HanExtra; }; |
731 |
+use constant HAS_ENCODE_EUCJPMS => eval { require Encode::EUCJPMS; }; |
732 |
+ |
733 |
+########################################################################### |
734 |
+ |
735 |
+our $KANA_HAN_RE = qr{ |
736 |
+ # Hiragana and Katakana |
737 |
+ \xE3[\x81-\x83][\x80-\xBF] |
738 |
+ # Han |
739 |
+ | \xE3[\x90-\xBF][\x80-\xBF] |
740 |
+ | [\xE4-\xE9][\x80-\xBF]{2} |
741 |
+ | \xEF[\xA4-\xAB][\x80-\xBF] |
742 |
+}x; |
743 |
+ |
744 |
+our %enc2lang; |
745 |
+our %lang2enc; |
746 |
+our %scr2lang; |
747 |
+our %cjkscr2lang; |
748 |
+our @scrorder; |
749 |
+ |
750 |
+BEGIN { |
751 |
+ |
752 |
+ # See the following URL about this map: |
753 |
+ # http://czyborra.com/charsets/iso8859.html |
754 |
+ # http://czyborra.com/charsets/codepages.html |
755 |
+ # http://czyborra.com/charsets/cyrillic.html |
756 |
+ # http://en.wikipedia.org/wiki/ISO_8859 |
757 |
+ # http://www.w3.org/International/O-charset-lang.html |
758 |
+ %enc2lang = ( |
759 |
+ # buint-in Encodings and Encode::Byte |
760 |
+ # N. America |
761 |
+ 'ascii' => 'en', |
762 |
+ 'cp437' => 'en', |
763 |
+ 'cp863' => 'weurope', |
764 |
+ |
765 |
+ # W. Europe (Latin1, Latin9) |
766 |
+ # fr es ca eu pt it sq rm nl de da sv no fi fo is ga gd en af |
767 |
+ 'iso-8859-1' => 'weurope', |
768 |
+ 'iso-8859-15' => 'weurope', |
769 |
+ 'cp850' => 'weurope', |
770 |
+ 'cp860' => 'weurope', |
771 |
+ 'cp1252' => 'weurope', |
772 |
+ 'MacRoman' => 'weurope', |
773 |
+ |
774 |
+ # Cntrl. Europe / Latin2 / Latin10 |
775 |
+ # hr cs hu pl sr sk sl |
776 |
+ 'iso-8859-2' => 'ceurope', |
777 |
+ 'cp852' => 'ceurope', |
778 |
+ 'cp1250' => 'ceurope', |
779 |
+ 'MacCentralEurRoman' => 'ceurope', |
780 |
+ 'MacCroatian' => 'ceurope', |
781 |
+ 'iso-8859-16' => 'ceurope', |
782 |
+ 'MacRomanian' => 'ceurope', |
783 |
+ |
784 |
+ # Latin3 (Esperanto, Maltese, and Turkish. Turkish is now on 8859-9.) |
785 |
+ # eo mt |
786 |
+ 'iso-8859-3' => 'seurope', |
787 |
+ |
788 |
+ # Baltics (Latin4, Latin7) |
789 |
+ # lv lt |
790 |
+ 'iso-8859-4' => 'neurope', |
791 |
+ 'iso-8859-13' => 'baltic', |
792 |
+ 'cp1257' => 'baltic', |
793 |
+ |
794 |
+ # Nordics (Latin6) |
795 |
+ # et kl iu se |
796 |
+ 'iso-8859-10' => 'nordic', |
797 |
+ |
798 |
+ # Cyrillics |
799 |
+ # bg be uk sr mk ru |
800 |
+ 'iso-8859-5' => 'ru', |
801 |
+ 'cp855' => 'ru', |
802 |
+ 'cp1251' => 'ru', |
803 |
+ 'cp866' => 'ru', |
804 |
+ 'MacCyrillic' => 'ru', |
805 |
+ 'koi8-r' => 'ru', |
806 |
+ 'MacUkrainian' => 'uk', |
807 |
+ 'koi8-u' => 'uk', |
808 |
+ |
809 |
+ # Arabic |
810 |
+ 'iso-8859-6' => 'ar', |
811 |
+ 'cp864' => 'ar', |
812 |
+ 'cp1256' => 'ar', |
813 |
+ 'MacArabic' => 'ar', |
814 |
+ 'cp1006' => 'fa', |
815 |
+ 'MacFarsi' => 'fa', |
816 |
+ |
817 |
+ # Greek |
818 |
+ 'iso-8859-7' => 'el', |
819 |
+ 'cp1253' => 'el', |
820 |
+ 'MacGreek' => 'el', |
821 |
+ |
822 |
+ # Hebrew |
823 |
+ # he yi |
824 |
+ 'iso-8859-8' => 'he', |
825 |
+ 'cp862' => 'he', |
826 |
+ 'cp1255' => 'he', |
827 |
+ 'MacHebrew' => 'he', |
828 |
+ |
829 |
+ # Turkish |
830 |
+ 'iso-8859-9' => 'tr', |
831 |
+ 'cp857' => 'tr', |
832 |
+ 'cp1254' => 'tr', |
833 |
+ 'MacTurkish' => 'tr', |
834 |
+ |
835 |
+ # Thai |
836 |
+ 'iso-8859-11' => 'th', |
837 |
+ 'cp874' => 'th', |
838 |
+ |
839 |
+ # Celtics (Latin8) |
840 |
+ # gd cy br |
841 |
+ 'iso-8859-14' => 'celtic', |
842 |
+ |
843 |
+ # Vietnamese |
844 |
+ 'viscii' => 'vi', |
845 |
+ 'cp1258' => 'vi', |
846 |
+ |
847 |
+ # Encode::CN |
848 |
+ 'euc-cn' => 'zh', |
849 |
+ 'cp936' => 'zh', |
850 |
+ 'hz' => 'zh', |
851 |
+ |
852 |
+ # Encode::TW |
853 |
+ 'big5-eten' => 'zh', |
854 |
+ 'big5-hkscs' => 'zh', |
855 |
+ 'cp950' => 'zh', |
856 |
+ |
857 |
+ # Encode::JP |
858 |
+ 'euc-jp' => 'ja', |
859 |
+ 'shiftjis' => 'ja', |
860 |
+ '7bit-jis' => 'ja', |
861 |
+ 'iso-2022-jp' => 'ja', |
862 |
+ 'iso-2022-jp-1' => 'ja', |
863 |
+ 'cp932' => 'ja', |
864 |
+ |
865 |
+ # Encode::KR |
866 |
+ 'euc-kr' => 'ko', |
867 |
+ 'cp949' => 'ko', |
868 |
+ 'johab' => 'ko', |
869 |
+ 'iso-2022-kr' => 'ko', |
870 |
+ |
871 |
+ # Encode::HanExtra |
872 |
+ 'euc-tw' => 'zh', |
873 |
+ 'gb18030' => 'zh', |
874 |
+ |
875 |
+ # Encode::JIS2K |
876 |
+ 'euc-jisx0213' => 'ja', |
877 |
+ 'shiftjisx0123' => 'ja', |
878 |
+ 'iso-2022-jp-3' => 'ja', |
879 |
+ |
880 |
+ # Encode::EUCJPMS |
881 |
+ 'eucJP-ms' => 'ja', |
882 |
+ 'cp51932' => 'ja', |
883 |
+ 'cp50220' => 'ja', |
884 |
+ 'cp50221' => 'ja', |
885 |
+ |
886 |
+ ); |
887 |
+ |
888 |
+ %lang2enc = ( |
889 |
+ # Latin1 |
890 |
+ 'en' => ['ascii'], |
891 |
+ 'weurope' => ['cp1252'], |
892 |
+ |
893 |
+ # Latin2 |
894 |
+ 'ceurope' => ['cp1250'], |
895 |
+ |
896 |
+ # Latin3 |
897 |
+ 'seurope' => ['iso-8859-3'], |
898 |
+ |
899 |
+ # Latin4 |
900 |
+ 'neurope' => ['iso-8859-4'], |
901 |
+ |
902 |
+ # Latin5 |
903 |
+ 'tr' => ['cp1254'], |
904 |
+ |
905 |
+ # Latin6 |
906 |
+ 'nordic' => ['iso-8859-10'], |
907 |
+ |
908 |
+ # Latin7 |
909 |
+ 'baltic' => ['cp1257'], |
910 |
+ |
911 |
+ # Latin8 |
912 |
+ 'celtic' => ['iso-8859-14'], |
913 |
+ |
914 |
+ # Non Latin |
915 |
+ 'ru' => ['koi8-r', 'cp1251'], |
916 |
+ 'uk' => ['koi8-u'], |
917 |
+ |
918 |
+ 'ar' => ['cp1256'], |
919 |
+ 'el' => ['cp1253'], |
920 |
+ 'he' => ['cp1255'], |
921 |
+ 'th' => ['cp874'], |
922 |
+ 'vi' => ['viscii', 'cp1258'], |
923 |
+ 'zh' => ['euc-cn', 'cp950'], |
924 |
+ 'ja' => ['euc-jp', 'cp932'], |
925 |
+ 'ko' => ['euc-kr', 'cp949'], |
926 |
+ |
927 |
+ ); |
928 |
+ |
929 |
+ %scr2lang = ( |
930 |
+ 'InLatin1Supplement' => ['weurope'], |
931 |
+ 'InLatinExtendedA' => [ |
932 |
+ 'ceurope', |
933 |
+ 'seurope', |
934 |
+ 'tr', |
935 |
+ 'vi' |
936 |
+ ], |
937 |
+ 'InLatinExtendedB' => [ |
938 |
+ 'nordic', |
939 |
+ 'baltic', |
940 |
+ 'celtic' |
941 |
+ ], |
942 |
+ 'Thai' => ['th'], |
943 |
+ 'Cyrillic' => ['ru', 'uk'], |
944 |
+ 'Arabic' => ['ar'], |
945 |
+ 'Greek' => ['el'], |
946 |
+ 'Hebrew' => ['he'], |
947 |
+ ); |
948 |
+ |
949 |
+ # better detection for CJK |
950 |
+ @scrorder = ('Hiragana','Katakana','Hangul','Han',keys(%scr2lang)); |
951 |
+ %cjkscr2lang = ( |
952 |
+ 'Hiragana' => ['ja'], |
953 |
+ 'Katakana' => ['ja'], |
954 |
+ 'Hangul' => ['ko'], |
955 |
+ 'Han' => ['zh', 'ja', 'ko'], |
956 |
+ ); |
957 |
+ |
958 |
+ unless (HAS_ENCODE_HANEXTRA) { |
959 |
+ Encode::Alias::define_alias( qr/^gb18030$/i => ' "euc-cn"' ); |
960 |
+ } |
961 |
+ Encode::Alias::define_alias( qr/^unicode-1-1-(.+)$/i => ' "$1"' ); |
962 |
+ Encode::Alias::define_alias( qr/^TIS-620$/i => ' "iso-8859-11"' ); |
963 |
+ Encode::Alias::define_alias( qr/^x-mac-(.+)$/i => ' "Mac$1"' ); |
964 |
+ Encode::Alias::define_alias( qr/^Shift_JIS$/i => ' "cp932"' ); |
965 |
+ if (HAS_ENCODE_EUCJPMS) { |
966 |
+ Encode::Alias::define_alias( qr/^iso-2022-jp$/i => ' "cp50221"' ); |
967 |
+ } |
968 |
+} |
969 |
+ |
970 |
+sub get_language { |
971 |
+ my $str = shift; # $str must be UTF-8 encoding |
972 |
+ my $charset = shift; |
973 |
+ |
974 |
+ return 'en' unless $charset; |
975 |
+ if ($charset !~ /^utf/i) { |
976 |
+ return $enc2lang{$charset}; |
977 |
+ } elsif (defined($str)) { |
978 |
+ $str =~ s/[\x00-\x7F]//g; # remove ASCII characters |
979 |
+ return 'en' if ($str eq ''); |
980 |
+ |
981 |
+ my %handled; |
982 |
+ $str = Encode::decode_utf8($str) unless (Encode::is_utf8($str)); |
983 |
+ foreach my $scr (@scrorder) { |
984 |
+ next if ($str !~ /\p{$scr}/); |
985 |
+ my $scrlangs = exists($cjkscr2lang{$scr}) ? $cjkscr2lang{$scr} : $scr2lang{$scr}; |
986 |
+ foreach my $lang (@$scrlangs) { |
987 |
+ next if (exists($handled{$lang})); |
988 |
+ foreach my $enc (@{$lang2enc{$lang}}) { |
989 |
+ my $scratch = $str; |
990 |
+ Encode::encode($enc, $scratch, Encode::FB_QUIET); |
991 |
+ return $lang if ($scratch eq ''); |
992 |
+ } |
993 |
+ $handled{$lang} = 1; |
994 |
+ } |
995 |
+ } |
996 |
+ } |
997 |
+ return 'en'; |
998 |
+} |
999 |
+ |
1000 |
+# TEST 1: try conversion to use the specified charset. |
1001 |
+# TEST 2: try conversion to use Encode::Detect. |
1002 |
+# TEST 3: try conversion to use Encode::Guess. |
1003 |
+sub normalize_charset { |
1004 |
+ my $str = shift; |
1005 |
+ my $charset = shift; |
1006 |
+ |
1007 |
+ return wantarray ? ($str, 'ascii') : $str unless ($str); |
1008 |
+ |
1009 |
+ my $decoded; |
1010 |
+ my $detected; |
1011 |
+ |
1012 |
+ if ($charset) { |
1013 |
+ ($decoded, $detected) = _specified_encoding($str, $charset); |
1014 |
+ } |
1015 |
+ unless ($detected) { |
1016 |
+ ($decoded, $detected) = _encode_detect($str); |
1017 |
+ } |
1018 |
+ unless ($detected) { |
1019 |
+ ($decoded, $detected) = _encode_guess($str); |
1020 |
+ } |
1021 |
+ unless ($detected) { |
1022 |
+ return ($str, undef); |
1023 |
+ } |
1024 |
+ $decoded =~ s/^\x{feff}//g; |
1025 |
+ $decoded = Encode::encode_utf8($decoded); |
1026 |
+ |
1027 |
+ # unfold hiragana, katakana and han |
1028 |
+ if ($detected =~ /^(?:UTF|EUC|BIG5|GB|SHIFTJIS|ISO-2022|CP969$|CP932$|CP949|CP50221$)/i) { |
1029 |
+ $decoded =~ s/($KANA_HAN_RE)\012($KANA_HAN_RE)/$1$2/og; |
1030 |
+ } |
1031 |
+ return wantarray ? ($decoded, $detected) : $decoded; |
1032 |
+} |
1033 |
+ |
1034 |
+sub _specified_encoding { |
1035 |
+ my $str = shift; |
1036 |
+ my $encoding = shift; |
1037 |
+ |
1038 |
+ my $detected; |
1039 |
+ my $decoded; |
1040 |
+ |
1041 |
+ return (undef, undef) unless ($encoding); |
1042 |
+ |
1043 |
+ # note: ISO-2022-* is not deistinguish from US-ASCII |
1044 |
+ return (undef, undef) if ($str =~ /\e/ and $encoding !~ /^ISO-2022/i); |
1045 |
+ |
1046 |
+ # UTF-16|32 encoding without BOM cannot be trusted. |
1047 |
+ return (undef, undef) if ($encoding =~ /^UTF-32$/i and $str !~ /^(?:\xFF\xFE\x00\x00|\x00\x00\xFE\xFF)/); |
1048 |
+ return (undef, undef) if ($encoding =~ /^UTF-16$/i and $str !~ /^(?:\xFF\xFE|\xFE\xFF)/); |
1049 |
+ |
1050 |
+ #$encoding = _get_alias($encoding); |
1051 |
+ my $encoder = Encode::find_encoding($encoding); |
1052 |
+ if (ref($encoder)) { |
1053 |
+ $decoded = $encoder->decode($str,Encode::FB_QUIET); |
1054 |
+ $detected = $encoder->name if ($str eq ''); |
1055 |
+ } |
1056 |
+ return ($decoded, $detected); |
1057 |
+} |
1058 |
+ |
1059 |
+sub _encode_detect { |
1060 |
+ return undef unless HAS_ENCODE_DETECT; |
1061 |
+ my $str = shift; |
1062 |
+ |
1063 |
+ # UTF-16|32 encoding without BOM cannot be trusted. |
1064 |
+ return (undef, undef) if ($str =~ /\x00\x00/ and $str !~ /^(?:\xFF\xFE\x00\x00|\x00\x00\xFE\xFF)/); |
1065 |
+ return (undef, undef) if ($str =~ /\x00/ and $str !~ /^(?:\xFF\xFE|\xFE\xFF)/); |
1066 |
+ |
1067 |
+ my $decoded; |
1068 |
+ my $detected = Encode::Detect::Detector::detect($str); |
1069 |
+ if ($detected) { |
1070 |
+ $detected = _get_alias($detected); |
1071 |
+ my $encoder = Encode::find_encoding($detected); |
1072 |
+ if (ref($encoder)) { |
1073 |
+ $decoded = $encoder->decode($str); |
1074 |
+ $detected = $decoded ? $encoder->name : undef; |
1075 |
+ } |
1076 |
+ else { |
1077 |
+ $detected = undef; |
1078 |
+ } |
1079 |
+ } |
1080 |
+ return ($decoded, $detected); |
1081 |
+} |
1082 |
+ |
1083 |
+sub _encode_guess { |
1084 |
+ my $str = shift; |
1085 |
+ |
1086 |
+ my $detected; |
1087 |
+ my $decoded; |
1088 |
+ my $encoder; |
1089 |
+ |
1090 |
+ # Step 1: Examine ISO-2022-*. |
1091 |
+ if ($str =~ /\e/) { |
1092 |
+ $Encode::Guess::NoUTFAutoGuess = 1; |
1093 |
+ $encoder = Encode::Guess::guess_encoding($str, |
1094 |
+ qw/cp50221 7bit-jis iso-2022-kr/); |
1095 |
+ $Encode::Guess::NoUTFAutoGuess = 0; |
1096 |
+ } |
1097 |
+ |
1098 |
+ # Step 2: Examine US-ASCII/UTF-(8|16|32) |
1099 |
+ unless (ref($encoder)) { |
1100 |
+ $Encode::Guess::NoUTFAutoGuess = 0; |
1101 |
+ $encoder = Encode::Guess::guess_encoding($str); |
1102 |
+ } |
1103 |
+ |
1104 |
+ # Step 3: Examine other encodings |
1105 |
+ unless (ref($encoder)) { |
1106 |
+ $Encode::Guess::NoUTFAutoGuess = 1; |
1107 |
+ eval { |
1108 |
+ if ($str =~ /[\x80-\xFF]{4}/) { |
1109 |
+ $encoder = Encode::Guess::guess_encoding($str, |
1110 |
+ qw/euc-cn big5-eten euc-jp cp932 euc-kr cp949/); |
1111 |
+ } |
1112 |
+ else { |
1113 |
+ $encoder = Encode::Guess::guess_encoding($str, |
1114 |
+ qw/iso-8859-1 cp1252/); |
1115 |
+ } |
1116 |
+ }; |
1117 |
+ $Encode::Guess::NoUTFAutoGuess = 0; |
1118 |
+ } |
1119 |
+ if (ref($encoder)) { |
1120 |
+ $detected = $encoder->name; |
1121 |
+ if ($detected) { |
1122 |
+ $decoded = $encoder->decode($str); |
1123 |
+ } |
1124 |
+ } |
1125 |
+ return ($decoded, $detected); |
1126 |
+} |
1127 |
+ |
1128 |
+sub _get_alias { |
1129 |
+ my $encoding = shift; |
1130 |
+ |
1131 |
+ unless (HAS_ENCODE_HANEXTRA) { |
1132 |
+ $encoding =~ s/^gb18030$/euc-cn/i; |
1133 |
+ } |
1134 |
+ $encoding =~ s/^unicode-1-1-(.+)$/$1/i; |
1135 |
+ $encoding =~ s/^TIS-620$/iso-8859-11/i; |
1136 |
+ $encoding =~ s/x-mac-(.+)$/Mac$1/i; |
1137 |
+ $encoding =~ s/^Shift_JIS$/cp932/i; |
1138 |
+ if (HAS_ENCODE_EUCJPMS) { |
1139 |
+ $encoding =~ s/^iso-2022-jp$/cp50221/i; |
1140 |
+ $encoding =~ s/^euc-jp$/cp51932/i; |
1141 |
+ } |
1142 |
+ |
1143 |
+ return $encoding; |
1144 |
+} |
1145 |
+ |
1146 |
+ |
1147 |
+1; |
1148 |
+ |