|
Lines 1-1148
Link Here
|
| 1 |
diff -uNr lib/Mail/SpamAssassin/HTML.pm lib/Mail/SpamAssassin/HTML.pm |
|
|
| 2 |
--- lib/Mail/SpamAssassin/HTML.pm 2011-06-07 08:59:17.000000000 +0900 |
| 3 |
+++ lib/Mail/SpamAssassin/HTML.pm 2011-07-14 22:35:46.000000000 +0900 |
| 4 |
@@ -84,7 +84,7 @@ |
| 5 |
$ok_attributes{span}{$_} = 1 for qw( style ); |
| 6 |
|
| 7 |
sub new { |
| 8 |
- my ($class) = @_; |
| 9 |
+ my ($class, $opts) = @_; |
| 10 |
my $self = $class->SUPER::new( |
| 11 |
api_version => 3, |
| 12 |
handlers => [ |
| 13 |
@@ -97,6 +97,7 @@ |
| 14 |
declaration => ["html_declaration", "self,text"], |
| 15 |
], |
| 16 |
marked_sections => 1); |
| 17 |
+ $self->{normalize} = $opts->{'normalize'} || 0; |
| 18 |
|
| 19 |
$self; |
| 20 |
} |
| 21 |
@@ -672,7 +673,14 @@ |
| 22 |
} |
| 23 |
} |
| 24 |
else { |
| 25 |
- $text =~ s/[ \t\n\r\f\x0b\xa0]+/ /g; |
| 26 |
+ if ($self->{normalize}) { |
| 27 |
+ $text =~ s/\xc2\xa0/ /g; # no-break space |
| 28 |
+ $text =~ s/\xe3\x80\x80/ /g; # ideographicspace |
| 29 |
+ $text =~ s/[ \t\n\r\f\x0b]+/ /g; |
| 30 |
+ } |
| 31 |
+ else { |
| 32 |
+ $text =~ s/[ \t\n\r\f\x0b\xa0]+/ /g; |
| 33 |
+ } |
| 34 |
# trim leading whitespace if previous element was whitespace |
| 35 |
# and current element is not invisible |
| 36 |
if (@{ $self->{text} } && !$display{invisible} && |
| 37 |
diff -uNr lib/Mail/SpamAssassin/Message/Node.pm lib/Mail/SpamAssassin/Message/Node.pm |
| 38 |
--- lib/Mail/SpamAssassin/Message/Node.pm 2011-06-07 08:59:17.000000000 +0900 |
| 39 |
+++ lib/Mail/SpamAssassin/Message/Node.pm 2011-07-14 22:35:46.000000000 +0900 |
| 40 |
@@ -42,6 +42,7 @@ |
| 41 |
use Mail::SpamAssassin::Constants qw(:sa); |
| 42 |
use Mail::SpamAssassin::HTML; |
| 43 |
use Mail::SpamAssassin::Logger; |
| 44 |
+use Mail::SpamAssassin::Util::Charset; |
| 45 |
|
| 46 |
=item new() |
| 47 |
|
| 48 |
@@ -387,27 +388,10 @@ |
| 49 |
|
| 50 |
sub _normalize { |
| 51 |
my ($self, $data, $charset) = @_; |
| 52 |
- return $data unless $self->{normalize}; |
| 53 |
+ return wantarray ? ($data, $charset) : $data unless $self->{normalize}; |
| 54 |
|
| 55 |
- my $detected = Encode::Detect::Detector::detect($data); |
| 56 |
- |
| 57 |
- my $converter; |
| 58 |
- |
| 59 |
- if ($charset && $charset !~ /^us-ascii$/i && |
| 60 |
- ($detected || 'none') !~ /^(?:UTF|EUC|ISO-2022|Shift_JIS|Big5|GB)/i) { |
| 61 |
- dbg("message: Using labeled charset $charset"); |
| 62 |
- $converter = Encode::find_encoding($charset); |
| 63 |
- } |
| 64 |
- |
| 65 |
- $converter = Encode::find_encoding($detected) unless $converter || !defined($detected); |
| 66 |
- |
| 67 |
- return $data unless $converter; |
| 68 |
- |
| 69 |
- dbg("message: Converting..."); |
| 70 |
- |
| 71 |
- my $rv = $converter->decode($data, 0); |
| 72 |
- utf8::downgrade($rv, 1); |
| 73 |
- return $rv |
| 74 |
+ my ($decoded_data, $detected_charset) = normalize_charset($data, $charset); |
| 75 |
+ return wantarray ? ($decoded_data, $detected_charset) : $decoded_data; |
| 76 |
} |
| 77 |
|
| 78 |
=item rendered() |
| 79 |
@@ -430,8 +414,12 @@ |
| 80 |
# text/x-aol is ignored here, but looks like text/html ... |
| 81 |
return(undef,undef) unless ( $self->{'type'} =~ /^text\/(?:plain|html)$/i ); |
| 82 |
|
| 83 |
- my $text = $self->_normalize($self->decode(), $self->{charset}); |
| 84 |
+ my ($text, $charset) = $self->_normalize($self->decode(), $self->{charset}); |
| 85 |
my $raw = length($text); |
| 86 |
+ if ($self->{normalize}) { |
| 87 |
+ $self->{charset} = $charset; |
| 88 |
+ $self->{language} = get_language($text, $charset); |
| 89 |
+ } |
| 90 |
|
| 91 |
# render text/html always, or any other text|text/plain part as text/html |
| 92 |
# based on a heuristic which simulates a certain common mail client |
| 93 |
@@ -441,7 +429,7 @@ |
| 94 |
{ |
| 95 |
$self->{rendered_type} = 'text/html'; |
| 96 |
|
| 97 |
- my $html = Mail::SpamAssassin::HTML->new(); # object |
| 98 |
+ my $html = Mail::SpamAssassin::HTML->new({normalize=>$self->{normalize}}); # object |
| 99 |
$html->parse($text); # parse+render text |
| 100 |
$self->{rendered} = $html->get_rendered_text(); |
| 101 |
$self->{visible_rendered} = $html->get_rendered_text(invisible => 0); |
| 102 |
diff -uNr lib/Mail/SpamAssassin/Message.pm lib/Mail/SpamAssassin/Message.pm |
| 103 |
--- lib/Mail/SpamAssassin/Message.pm 2011-06-07 08:59:17.000000000 +0900 |
| 104 |
+++ lib/Mail/SpamAssassin/Message.pm 2011-07-14 22:35:46.000000000 +0900 |
| 105 |
@@ -559,6 +559,8 @@ |
| 106 |
delete $self->{'pristine_headers'}; |
| 107 |
delete $self->{'line_ending'}; |
| 108 |
delete $self->{'missing_head_body_separator'}; |
| 109 |
+ delete $self->{'charset'}; |
| 110 |
+ delete $self->{'language'}; |
| 111 |
|
| 112 |
my @toclean = ( $self ); |
| 113 |
|
| 114 |
@@ -585,6 +587,8 @@ |
| 115 |
delete $part->{'invisible_rendered'}; |
| 116 |
delete $part->{'type'}; |
| 117 |
delete $part->{'rendered_type'}; |
| 118 |
+ delete $self->{'charset'}; |
| 119 |
+ delete $self->{'language'}; |
| 120 |
|
| 121 |
# if there are children nodes, add them to the queue of nodes to clean up |
| 122 |
if (exists $part->{'body_parts'}) { |
| 123 |
@@ -1014,7 +1018,14 @@ |
| 124 |
|
| 125 |
# whitespace handling (warning: small changes have large effects!) |
| 126 |
$text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed |
| 127 |
- $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space |
| 128 |
+ if ($self->{normalize}) { |
| 129 |
+ $text =~ s/\xc2\xa0/ /g; # no-break space => space |
| 130 |
+ $text =~ s/\xe3\x80\x80/ /g; # ideographicspace => space |
| 131 |
+ $text =~ tr/ \t\n\r\x0b/ /s; # whitespace => space |
| 132 |
+ } |
| 133 |
+ else { |
| 134 |
+ $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space |
| 135 |
+ } |
| 136 |
$text =~ tr/\f/\n/; # form feeds => newline |
| 137 |
|
| 138 |
# warn "message: $text"; |
| 139 |
@@ -1071,7 +1082,14 @@ |
| 140 |
|
| 141 |
# whitespace handling (warning: small changes have large effects!) |
| 142 |
$text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed |
| 143 |
- $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space |
| 144 |
+ if ($self->{normalize}) { |
| 145 |
+ $text =~ s/\xc2\xa0/ /g; # no-break space => space |
| 146 |
+ $text =~ s/\xe3\x80\x80/ /g; # ideographicspace => space |
| 147 |
+ $text =~ tr/ \t\n\r\x0b/ /s; # whitespace => space |
| 148 |
+ } |
| 149 |
+ else { |
| 150 |
+ $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space |
| 151 |
+ } |
| 152 |
$text =~ tr/\f/\n/; # form feeds => newline |
| 153 |
|
| 154 |
my @textary = split_into_array_of_short_lines ($text); |
| 155 |
@@ -1122,7 +1140,14 @@ |
| 156 |
|
| 157 |
# whitespace handling (warning: small changes have large effects!) |
| 158 |
$text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed |
| 159 |
- $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space |
| 160 |
+ if ($self->{normalize}) { |
| 161 |
+ $text =~ s/\xc2\xa0/ /g; # no-break space => space |
| 162 |
+ $text =~ s/\xe3\x80\x80/ /g; # ideographicspace => space |
| 163 |
+ $text =~ tr/ \t\n\r\x0b/ /s; # whitespace => space |
| 164 |
+ } |
| 165 |
+ else { |
| 166 |
+ $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space |
| 167 |
+ } |
| 168 |
$text =~ tr/\f/\n/; # form feeds => newline |
| 169 |
|
| 170 |
my @textary = split_into_array_of_short_lines ($text); |
| 171 |
@@ -1198,6 +1223,28 @@ |
| 172 |
|
| 173 |
# --------------------------------------------------------------------------- |
| 174 |
|
| 175 |
+sub get_language { |
| 176 |
+ my ($self) = @_; |
| 177 |
+ |
| 178 |
+ if (defined $self->{language}) { return $self->{language}; } |
| 179 |
+ my @parts = $self->find_parts(qr/^(?:text|message)\b/i,1); |
| 180 |
+ return '' unless @parts; |
| 181 |
+ |
| 182 |
+ # Go through each part |
| 183 |
+ my @langs; |
| 184 |
+ for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) { |
| 185 |
+ my $p = $parts[$pt]; |
| 186 |
+ my $lang = $p->{language}; |
| 187 |
+ next unless ($lang); |
| 188 |
+ push(@langs, $lang) unless (grep(/^$lang$/, @langs)) |
| 189 |
+ } |
| 190 |
+ $self->{language} = scalar(@langs) ? join(' ', @langs) : ''; |
| 191 |
+ return $self->{language}; |
| 192 |
+} |
| 193 |
+ |
| 194 |
+# --------------------------------------------------------------------------- |
| 195 |
+ |
| 196 |
+ |
| 197 |
1; |
| 198 |
|
| 199 |
=back |
| 200 |
diff -uNr lib/Mail/SpamAssassin/PerMsgStatus.pm lib/Mail/SpamAssassin/PerMsgStatus.pm |
| 201 |
--- lib/Mail/SpamAssassin/PerMsgStatus.pm 2011-06-07 08:59:17.000000000 +0900 |
| 202 |
+++ lib/Mail/SpamAssassin/PerMsgStatus.pm 2011-07-14 22:35:46.000000000 +0900 |
| 203 |
@@ -53,6 +53,7 @@ |
| 204 |
use warnings; |
| 205 |
use re 'taint'; |
| 206 |
|
| 207 |
+use Encode; |
| 208 |
use Time::HiRes qw(time); |
| 209 |
|
| 210 |
use Mail::SpamAssassin::Constants qw(:sa); |
| 211 |
@@ -733,19 +734,41 @@ |
| 212 |
|
| 213 |
# the report charset |
| 214 |
my $report_charset = "; charset=iso-8859-1"; |
| 215 |
- if ($self->{conf}->{report_charset}) { |
| 216 |
- $report_charset = "; charset=" . $self->{conf}->{report_charset}; |
| 217 |
- } |
| 218 |
|
| 219 |
# the SpamAssassin report |
| 220 |
my $report = $self->get_report(); |
| 221 |
+ if ($self->{conf}->{report_charset}) { |
| 222 |
+ $report_charset = "; charset=" . $self->{conf}->{report_charset}; |
| 223 |
+ } |
| 224 |
|
| 225 |
# If there are any wide characters, need to MIME-encode in UTF-8 |
| 226 |
# TODO: If $report_charset is something other than iso-8859-1/us-ascii, then |
| 227 |
# we could try converting to that charset if possible |
| 228 |
- unless ($] < 5.008 || utf8::downgrade($report, 1)) { |
| 229 |
+ my $is_utf8 = 0; |
| 230 |
+ if ($self->{conf}->{normalize_charset}) { |
| 231 |
+ $report = Encode::decode_utf8($report); |
| 232 |
+ $is_utf8 = 1; |
| 233 |
+ } |
| 234 |
+ else { |
| 235 |
+ if ($self->{msg}->{charset}) { |
| 236 |
+ eval { |
| 237 |
+ my $scratch = $report; |
| 238 |
+ $report = Encode::decode($self->{msg}->{charset},$scratch,Encode::FB_CROAK); |
| 239 |
+ $is_utf8 = 1; |
| 240 |
+ }; |
| 241 |
+ } |
| 242 |
+ } |
| 243 |
+ if ($is_utf8) { |
| 244 |
+ $is_utf8 = 1; |
| 245 |
+ eval { |
| 246 |
+ my $scratch = $report; |
| 247 |
+ $report = Encode::encode($self->{conf}->{report_charset},$scratch,Encode::FB_CROAK); |
| 248 |
+ $is_utf8 = 0; |
| 249 |
+ }; |
| 250 |
+ if ($is_utf8) { |
| 251 |
+ $report = Encode::encode_utf8($report); |
| 252 |
$report_charset = "; charset=utf-8"; |
| 253 |
- utf8::encode($report); |
| 254 |
+ } |
| 255 |
} |
| 256 |
|
| 257 |
# get original headers, "pristine" if we can do it |
| 258 |
diff -uNr lib/Mail/SpamAssassin/Plugin/Bayes.pm lib/Mail/SpamAssassin/Plugin/Bayes.pm |
| 259 |
--- lib/Mail/SpamAssassin/Plugin/Bayes.pm 2011-06-07 08:59:17.000000000 +0900 |
| 260 |
+++ lib/Mail/SpamAssassin/Plugin/Bayes.pm 2011-07-14 22:35:46.000000000 +0900 |
| 261 |
@@ -223,6 +223,15 @@ |
| 262 |
# will require a longer token than English ones.) |
| 263 |
use constant MAX_TOKEN_LENGTH => 15; |
| 264 |
|
| 265 |
+# Skip if a token is too short. |
| 266 |
+our $SKIP_UTF8_SHORT_TOKENS_RE = qr{(?: |
| 267 |
+ [\x00-\x7F] # 1 byte |
| 268 |
+ | [\xC0-\xDF][\x80-\xBF] # 2 bytes |
| 269 |
+ | [\xE0-\xEF][\x80-\xBF]{2} # 3 bytes |
| 270 |
+ | [\xF0-\xF7][\x80-\xBF]{3} # 4 bytes |
| 271 |
+ | (?:\xE3[\x81-\x83][\x80-\xBF]){2} # 2 characters of Hiragana and Katakana |
| 272 |
+)}x; |
| 273 |
+ |
| 274 |
########################################################################### |
| 275 |
|
| 276 |
sub new { |
| 277 |
@@ -983,9 +992,28 @@ |
| 278 |
$msgdata->{bayes_token_body} = $msg->{msg}->get_visible_rendered_body_text_array(); |
| 279 |
$msgdata->{bayes_token_inviz} = $msg->{msg}->get_invisible_rendered_body_text_array(); |
| 280 |
@{$msgdata->{bayes_token_uris}} = $msg->get_uri_list(); |
| 281 |
+ if ($self->{conf}->{normalize_charset}) { |
| 282 |
+ my $tokenizer = $self->get_tokenizer($msg); |
| 283 |
+ if (ref($tokenizer)) { |
| 284 |
+ $msgdata->{bayes_token_body} = $tokenizer->tokenize($msgdata->{bayes_token_body}); |
| 285 |
+ $msgdata->{bayes_token_inviz} = $tokenizer->tokenize($msgdata->{bayes_token_inviz}); |
| 286 |
+ } |
| 287 |
+ } |
| 288 |
return $msgdata; |
| 289 |
} |
| 290 |
|
| 291 |
+sub get_tokenizer { |
| 292 |
+ my ($self, $msg) = @_; |
| 293 |
+ |
| 294 |
+ my $tokenizer; |
| 295 |
+ my @languages = split(/\s+/, $msg->{msg}->get_language()); |
| 296 |
+ foreach my $lang (@languages) { |
| 297 |
+ $tokenizer = $self->{'conf'}->{'tokenizer'}->{$lang}; |
| 298 |
+ last if (ref($tokenizer)); |
| 299 |
+ } |
| 300 |
+ return $tokenizer; |
| 301 |
+} |
| 302 |
+ |
| 303 |
########################################################################### |
| 304 |
|
| 305 |
# The calling functions expect a uniq'ed array of tokens ... |
| 306 |
@@ -1039,7 +1067,7 @@ |
| 307 |
# include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings, |
| 308 |
# and ISO-8859-15 alphas. Do not split on @'s; better results keeping it. |
| 309 |
# Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!" |
| 310 |
- tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs; |
| 311 |
+ tr/-A-Za-z0-9,\@\*\!_'"\$.\200-\377 / /cs; |
| 312 |
|
| 313 |
# DO split on "..." or "--" or "---"; common formatting error resulting in |
| 314 |
# hapaxes. Keep the separator itself as a token, though, as long ones can |
| 315 |
@@ -1068,6 +1096,11 @@ |
| 316 |
# |
| 317 |
next if ( defined $magic_re && $token =~ /$magic_re/ ); |
| 318 |
|
| 319 |
+ # Skip short UTF-8 tokens. |
| 320 |
+ if ($self->{conf}->{normalize_charset}) { |
| 321 |
+ next if ($token =~ /^$SKIP_UTF8_SHORT_TOKENS_RE$/o); |
| 322 |
+ } |
| 323 |
+ |
| 324 |
# *do* keep 3-byte tokens; there's some solid signs in there |
| 325 |
my $len = length($token); |
| 326 |
|
| 327 |
@@ -1096,14 +1129,16 @@ |
| 328 |
# the domain ".net" appeared in the To header. |
| 329 |
# |
| 330 |
if ($len > MAX_TOKEN_LENGTH && $token !~ /\*/) { |
| 331 |
- if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) { |
| 332 |
- # Matt sez: "Could be asian? Autrijus suggested doing character ngrams, |
| 333 |
- # but I'm doing tuples to keep the dbs small(er)." Sounds like a plan |
| 334 |
- # to me! (jm) |
| 335 |
- while ($token =~ s/^(..?)//) { |
| 336 |
- push (@rettokens, "8:$1"); |
| 337 |
- } |
| 338 |
- next; |
| 339 |
+ unless ($self->{conf}->{normalize_charset}) { |
| 340 |
+ if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) { |
| 341 |
+ # Matt sez: "Could be asian? Autrijus suggested doing character ngrams, |
| 342 |
+ # but I'm doing tuples to keep the dbs small(er)." Sounds like a plan |
| 343 |
+ # to me! (jm) |
| 344 |
+ while ($token =~ s/^(..?)//) { |
| 345 |
+ push (@rettokens, "8:$1"); |
| 346 |
+ } |
| 347 |
+ next; |
| 348 |
+ } |
| 349 |
} |
| 350 |
|
| 351 |
if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS) |
| 352 |
diff -uNr lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm |
| 353 |
--- lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm 1970-01-01 09:00:00.000000000 +0900 |
| 354 |
+++ lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm 2011-07-14 22:29:19.000000000 +0900 |
| 355 |
@@ -0,0 +1,84 @@ |
| 356 |
+# <@LICENSE> |
| 357 |
+# Copyright 2004 Apache Software Foundation |
| 358 |
+# |
| 359 |
+# Licensed under the Apache License, Version 2.0 (the "License"); |
| 360 |
+# you may not use this file except in compliance with the License. |
| 361 |
+# You may obtain a copy of the License at |
| 362 |
+# |
| 363 |
+# http://www.apache.org/licenses/LICENSE-2.0 |
| 364 |
+# |
| 365 |
+# Unless required by applicable law or agreed to in writing, software |
| 366 |
+# distributed under the License is distributed on an "AS IS" BASIS, |
| 367 |
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 368 |
+# See the License for the specific language governing permissions and |
| 369 |
+# limitations under the License. |
| 370 |
+# </@LICENSE> |
| 371 |
+ |
| 372 |
+=head1 NAME |
| 373 |
+ |
| 374 |
+Tokenizer::MeCab - Japanese tokenizer with MeCab |
| 375 |
+ |
| 376 |
+=head1 SYNOPSIS |
| 377 |
+ |
| 378 |
+loadplugin Mail::SpamAssassin::Plugin::Tokenizer::MeCab |
| 379 |
+ |
| 380 |
+=head1 DESCRIPTION |
| 381 |
+ |
| 382 |
+This plugin tokenizes a Japanese string with MeCab that is |
| 383 |
+the morphological analysis engine. |
| 384 |
+ |
| 385 |
+Text::MeCab 0.12 or over is required. |
| 386 |
+ |
| 387 |
+=cut |
| 388 |
+ |
| 389 |
+package Mail::SpamAssassin::Plugin::Tokenizer::MeCab; |
| 390 |
+ |
| 391 |
+use strict; |
| 392 |
+use warnings; |
| 393 |
+use Mail::SpamAssassin::Plugin::Tokenizer; |
| 394 |
+ |
| 395 |
+use vars qw(@ISA); |
| 396 |
+@ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer); |
| 397 |
+ |
| 398 |
+# Have to do this so that RPM doesn't find these as required perl modules |
| 399 |
+BEGIN { require MeCab; } |
| 400 |
+our $language = 'ja'; |
| 401 |
+our $mecab = new MeCab::Tagger(-Ochasen); |
| 402 |
+ |
| 403 |
+sub new { |
| 404 |
+ my $class = shift; |
| 405 |
+ my $mailsaobject = shift; |
| 406 |
+ |
| 407 |
+ $class = ref($class) || $class; |
| 408 |
+ my $self = $class->SUPER::new($mailsaobject, $language); |
| 409 |
+ bless ($self, $class); |
| 410 |
+ |
| 411 |
+ return $self; |
| 412 |
+} |
| 413 |
+ |
| 414 |
+sub tokenize { |
| 415 |
+ my $self = shift; |
| 416 |
+ my $text_array = shift; |
| 417 |
+ |
| 418 |
+ my @tokenized_array; |
| 419 |
+ foreach my $text (@$text_array) { |
| 420 |
+ next unless ($text); |
| 421 |
+ $text =~ s/([\x80-\xFF]{3,})/&_tokenize($1)/eg; |
| 422 |
+ push(@tokenized_array, $text); |
| 423 |
+ } |
| 424 |
+ return \@tokenized_array; |
| 425 |
+} |
| 426 |
+ |
| 427 |
+sub _tokenize { |
| 428 |
+ my $text = shift; |
| 429 |
+ |
| 430 |
+ my @buf; |
| 431 |
+ for (my $node = $mecab->parseToNode($text); $node->{next}; $node = $node->{next}) { |
| 432 |
+ push(@buf, $node->{surface}); |
| 433 |
+ } |
| 434 |
+ my $tokenized = join(' ', @buf) . ' '; |
| 435 |
+ return $tokenized; |
| 436 |
+} |
| 437 |
+ |
| 438 |
+1; |
| 439 |
+ |
| 440 |
diff -uNr lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm |
| 441 |
--- lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm 1970-01-01 09:00:00.000000000 +0900 |
| 442 |
+++ lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm 2011-07-14 22:29:19.000000000 +0900 |
| 443 |
@@ -0,0 +1,111 @@ |
| 444 |
+# <@LICENSE> |
| 445 |
+# Copyright 2004 Apache Software Foundation |
| 446 |
+# |
| 447 |
+# Licensed under the Apache License, Version 2.0 (the "License"); |
| 448 |
+# you may not use this file except in compliance with the License. |
| 449 |
+# You may obtain a copy of the License at |
| 450 |
+# |
| 451 |
+# http://www.apache.org/licenses/LICENSE-2.0 |
| 452 |
+# |
| 453 |
+# Unless required by applicable law or agreed to in writing, software |
| 454 |
+# distributed under the License is distributed on an "AS IS" BASIS, |
| 455 |
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 456 |
+# See the License for the specific language governing permissions and |
| 457 |
+# limitations under the License. |
| 458 |
+# </@LICENSE> |
| 459 |
+ |
| 460 |
+=head1 NAME |
| 461 |
+ |
| 462 |
+Tokenizer::SimpleJA - simple Japanese tokenizer |
| 463 |
+ |
| 464 |
+=head1 SYNOPSIS |
| 465 |
+ |
| 466 |
+loadplugin Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA |
| 467 |
+ |
| 468 |
+=head1 DESCRIPTION |
| 469 |
+ |
| 470 |
+This plugin simply tokenizes a Japanese string by characters other than |
| 471 |
+the alphabet, the Chinese character, and the katakana. |
| 472 |
+ |
| 473 |
+=cut |
| 474 |
+ |
| 475 |
+package Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA; |
| 476 |
+ |
| 477 |
+use strict; |
| 478 |
+use warnings; |
| 479 |
+use Mail::SpamAssassin::Plugin::Tokenizer; |
| 480 |
+ |
| 481 |
+use vars qw(@ISA); |
| 482 |
+@ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer); |
| 483 |
+ |
| 484 |
+our $language = 'ja'; |
| 485 |
+ |
| 486 |
+our $RE = qr{( |
| 487 |
+ # Hiragana |
| 488 |
+ (?: |
| 489 |
+ \xE3\x81[\x80-\xBF] |
| 490 |
+ | \xE3\x82[\x80-\x9F] |
| 491 |
+ )+ |
| 492 |
+ # Katakana |
| 493 |
+ | (?: |
| 494 |
+ \xE3\x82[\xA0-\xBF] |
| 495 |
+ | \xE3\x83[\x80-\xBF] |
| 496 |
+ )+ |
| 497 |
+ # Kanji |
| 498 |
+ | (?: |
| 499 |
+ \xE3[\x90-\xBF][\x80-\xBF] |
| 500 |
+ | [\xE4-\xE9][\x80-\xBF]{2} |
| 501 |
+ | \xEF[\xA4-\xAB][\x80-\xBF] |
| 502 |
+ )+ |
| 503 |
+ # Fullwidth |
| 504 |
+ | (?: |
| 505 |
+ \xEF\xBC[\x80-\xBF] |
| 506 |
+ | \xEF\xBD[\x80-\x9F] |
| 507 |
+ )+ |
| 508 |
+ # Others |
| 509 |
+ | [\xC0-\xDF][\x80-\xBF] |
| 510 |
+ | [\xE0-\xE2][\x80-\xBF]{2} |
| 511 |
+ | \xE3\x80[\x80-\xBF] |
| 512 |
+ | \xE3[\x84-\x8F][\x80-\xBF] |
| 513 |
+ | [\xEA-\xEE][\x80-\xBF]{2} |
| 514 |
+ | \xEF[\x80-\xA3][\x80-\xBF] |
| 515 |
+ | \xEF[\xAC-\xBB][\x80-\xBF] |
| 516 |
+ | \xEF\xBD[\xA0-\xBF] |
| 517 |
+ | \xEF[\xBE-\xBF][\x80-\xBF] |
| 518 |
+ | [\xF0-\xF7][\x80-\xBF]{3} |
| 519 |
+)}x; |
| 520 |
+ |
| 521 |
+sub new { |
| 522 |
+ my $class = shift; |
| 523 |
+ my $mailsaobject = shift; |
| 524 |
+ |
| 525 |
+ $class = ref($class) || $class; |
| 526 |
+ my $self = $class->SUPER::new($mailsaobject, $language); |
| 527 |
+ bless ($self, $class); |
| 528 |
+ |
| 529 |
+ return $self; |
| 530 |
+} |
| 531 |
+ |
| 532 |
+sub tokenize { |
| 533 |
+ my $self = shift; |
| 534 |
+ my $text_array = shift; |
| 535 |
+ |
| 536 |
+ my @tokenized_array; |
| 537 |
+ foreach my $text (@$text_array) { |
| 538 |
+ next unless ($text); |
| 539 |
+ $text =~ s/([\x80-\xFF]{3,})/&_tokenize($1)/eg; |
| 540 |
+ push(@tokenized_array, $text); |
| 541 |
+ } |
| 542 |
+ return \@tokenized_array; |
| 543 |
+} |
| 544 |
+ |
| 545 |
+sub _tokenize { |
| 546 |
+ my $text = shift; |
| 547 |
+ |
| 548 |
+ $text =~ s/$RE/$1 /og; |
| 549 |
+ $text = ' ' . $text; |
| 550 |
+ return $text; |
| 551 |
+} |
| 552 |
+ |
| 553 |
+1; |
| 554 |
+ |
| 555 |
diff -uNr lib/Mail/SpamAssassin/Plugin/Tokenizer.pm lib/Mail/SpamAssassin/Plugin/Tokenizer.pm |
| 556 |
--- lib/Mail/SpamAssassin/Plugin/Tokenizer.pm 1970-01-01 09:00:00.000000000 +0900 |
| 557 |
+++ lib/Mail/SpamAssassin/Plugin/Tokenizer.pm 2011-07-14 22:35:46.000000000 +0900 |
| 558 |
@@ -0,0 +1,115 @@ |
| 559 |
+# <@LICENSE> |
| 560 |
+# Copyright 2004 Apache Software Foundation |
| 561 |
+# |
| 562 |
+# Licensed under the Apache License, Version 2.0 (the "License"); |
| 563 |
+# you may not use this file except in compliance with the License. |
| 564 |
+# You may obtain a copy of the License at |
| 565 |
+# |
| 566 |
+# http://www.apache.org/licenses/LICENSE-2.0 |
| 567 |
+# |
| 568 |
+# Unless required by applicable law or agreed to in writing, software |
| 569 |
+# distributed under the License is distributed on an "AS IS" BASIS, |
| 570 |
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 571 |
+# See the License for the specific language governing permissions and |
| 572 |
+# limitations under the License. |
| 573 |
+# </@LICENSE> |
| 574 |
+ |
| 575 |
+=head1 NAME |
| 576 |
+ |
| 577 |
+Mail::SpamAssassin::Plugin::Tokenizer - Tokenizer plugin base class |
| 578 |
+ |
| 579 |
+=head1 SYNOPSIS |
| 580 |
+ |
| 581 |
+=head2 SpamAssassin configuration: |
| 582 |
+ |
| 583 |
+ loadplugin MyTokenizerPlugin /path/to/MyTokenizerPlugin.pm |
| 584 |
+ |
| 585 |
+=head2 Perl code: |
| 586 |
+ |
| 587 |
+ use Mail::SpamAssassin::Plugin::Tokenizer; |
| 588 |
+ use vars qw(@ISA); |
| 589 |
+ @ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer); |
| 590 |
+ # language to use this plugin |
| 591 |
+ our $language = 'ja'; |
| 592 |
+ |
| 593 |
+ # constructor: register language |
| 594 |
+ sub new { |
| 595 |
+ my $class = shift; |
| 596 |
+ my $mailsaobject = shift; |
| 597 |
+ |
| 598 |
+ # some boilerplate... |
| 599 |
+ $class = ref($class) || $class; |
| 600 |
+ my $self = $class->SUPER::new($mailsaobject, $language); |
| 601 |
+ bless ($self, $class); |
| 602 |
+ |
| 603 |
+ return $self; |
| 604 |
+ } |
| 605 |
+ |
| 606 |
+ # tokenize function |
| 607 |
+ sub tokenize { |
| 608 |
+ my $self = shift; |
| 609 |
+ my $text_array_ref = shift; |
| 610 |
+ |
| 611 |
+ ...... |
| 612 |
+ |
| 613 |
+ return $tokenized_array_ref; |
| 614 |
+ } |
| 615 |
+ |
| 616 |
+ |
| 617 |
+=head1 DESCRIPTION |
| 618 |
+ |
| 619 |
+This plugin is the base class of tokenizer plugin. |
| 620 |
+You must define tokenize() and $language |
| 621 |
+ |
| 622 |
+=head1 INTERFACE |
| 623 |
+ |
| 624 |
+ sub tokenize { |
| 625 |
+ my $self = shift; |
| 626 |
+ my $text_array_ref = shift; |
| 627 |
+ |
| 628 |
+ ...... |
| 629 |
+ |
| 630 |
+ return $tokenized_array_ref; |
| 631 |
+ } |
| 632 |
+ |
| 633 |
+=cut |
| 634 |
+ |
| 635 |
+package Mail::SpamAssassin::Plugin::Tokenizer; |
| 636 |
+ |
| 637 |
+use Mail::SpamAssassin::Plugin; |
| 638 |
+use Mail::SpamAssassin::Logger; |
| 639 |
+use strict; |
| 640 |
+use warnings; |
| 641 |
+use bytes; |
| 642 |
+ |
| 643 |
+use vars qw(@ISA); |
| 644 |
+@ISA = qw(Mail::SpamAssassin::Plugin); |
| 645 |
+ |
| 646 |
+sub new { |
| 647 |
+ my $class = shift; |
| 648 |
+ my $mailsaobject = shift; |
| 649 |
+ my $language = shift; |
| 650 |
+ |
| 651 |
+ # some boilerplate... |
| 652 |
+ $class = ref($class) || $class; |
| 653 |
+ my $self = $class->SUPER::new($mailsaobject); |
| 654 |
+ bless ($self, $class); |
| 655 |
+ |
| 656 |
+ if ($language) { |
| 657 |
+ $self->{main}->{conf}->{tokenizer}->{$language} = $self; |
| 658 |
+ } |
| 659 |
+ else { |
| 660 |
+ dbg("plugin: $self: \$language is not defined"); |
| 661 |
+ } |
| 662 |
+ |
| 663 |
+ return $self; |
| 664 |
+} |
| 665 |
+ |
| 666 |
+sub tokenize { |
| 667 |
+ my ($self, $ref) = @_; |
| 668 |
+ |
| 669 |
+ return $ref; |
| 670 |
+} |
| 671 |
+ |
| 672 |
+1; |
| 673 |
+ |
| 674 |
diff -uNr lib/Mail/SpamAssassin/Util/Charset.pm lib/Mail/SpamAssassin/Util/Charset.pm |
| 675 |
--- lib/Mail/SpamAssassin/Util/Charset.pm 1970-01-01 09:00:00.000000000 +0900 |
| 676 |
+++ lib/Mail/SpamAssassin/Util/Charset.pm 2011-07-14 22:29:19.000000000 +0900 |
| 677 |
@@ -0,0 +1,471 @@ |
| 678 |
+# <@LICENSE> |
| 679 |
+# Copyright 2006 Apache Software Foundation |
| 680 |
+# |
| 681 |
+# Licensed under the Apache License, Version 2.0 (the "License"); |
| 682 |
+# you may not use this file except in compliance with the License. |
| 683 |
+# You may obtain a copy of the License at |
| 684 |
+# |
| 685 |
+# http://www.apache.org/licenses/LICENSE-2.0 |
| 686 |
+# |
| 687 |
+# Unless required by applicable law or agreed to in writing, software |
| 688 |
+# distributed under the License is distributed on an "AS IS" BASIS, |
| 689 |
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 690 |
+# See the License for the specific language governing permissions and |
| 691 |
+# limitations under the License. |
| 692 |
+# </@LICENSE> |
| 693 |
+ |
| 694 |
+ |
| 695 |
+=head1 NAME |
| 696 |
+ |
| 697 |
+ Mail::SpamAssassin::Util::Charset.pm - Utility for charset and language |
| 698 |
+ |
| 699 |
+=head1 SYNOPSIS |
| 700 |
+ |
| 701 |
+ my ($decoded, $detected) = Mail::SpamAssassin::Util::Charset::normalize_charset($str, $charset); |
| 702 |
+ my $language = Mail::SpamAssassin::Util::Charset::get_language($str, $charset); |
| 703 |
+ |
| 704 |
+=head1 DESCRIPTION |
| 705 |
+ |
| 706 |
+This module implements utility methods for charset and language. |
| 707 |
+ |
| 708 |
+=cut |
| 709 |
+ |
| 710 |
+package Mail::SpamAssassin::Util::Charset; |
| 711 |
+ |
| 712 |
+use strict; |
| 713 |
+use warnings; |
| 714 |
+use Encode; |
| 715 |
+use Encode::Guess; |
| 716 |
+use Encode::Alias; |
| 717 |
+ |
| 718 |
+use vars qw ( |
| 719 |
+ @ISA @EXPORT |
| 720 |
+); |
| 721 |
+ |
| 722 |
+require Exporter; |
| 723 |
+ |
| 724 |
+@ISA = qw(Exporter); |
| 725 |
+@EXPORT = qw(normalize_charset get_language); |
| 726 |
+ |
| 727 |
+########################################################################### |
| 728 |
+ |
| 729 |
+use constant HAS_ENCODE_DETECT => eval { require Encode::Detect::Detector; }; |
| 730 |
+use constant HAS_ENCODE_HANEXTRA => eval { require Encode::HanExtra; }; |
| 731 |
+use constant HAS_ENCODE_EUCJPMS => eval { require Encode::EUCJPMS; }; |
| 732 |
+ |
| 733 |
+########################################################################### |
| 734 |
+ |
| 735 |
+our $KANA_HAN_RE = qr{ |
| 736 |
+ # Hiragana and Katakana |
| 737 |
+ \xE3[\x81-\x83][\x80-\xBF] |
| 738 |
+ # Han |
| 739 |
+ | \xE3[\x90-\xBF][\x80-\xBF] |
| 740 |
+ | [\xE4-\xE9][\x80-\xBF]{2} |
| 741 |
+ | \xEF[\xA4-\xAB][\x80-\xBF] |
| 742 |
+}x; |
| 743 |
+ |
| 744 |
+our %enc2lang; |
| 745 |
+our %lang2enc; |
| 746 |
+our %scr2lang; |
| 747 |
+our %cjkscr2lang; |
| 748 |
+our @scrorder; |
| 749 |
+ |
| 750 |
+BEGIN { |
| 751 |
+ |
| 752 |
+ # See the following URL about this map: |
| 753 |
+ # http://czyborra.com/charsets/iso8859.html |
| 754 |
+ # http://czyborra.com/charsets/codepages.html |
| 755 |
+ # http://czyborra.com/charsets/cyrillic.html |
| 756 |
+ # http://en.wikipedia.org/wiki/ISO_8859 |
| 757 |
+ # http://www.w3.org/International/O-charset-lang.html |
| 758 |
+ %enc2lang = ( |
| 759 |
+ # buint-in Encodings and Encode::Byte |
| 760 |
+ # N. America |
| 761 |
+ 'ascii' => 'en', |
| 762 |
+ 'cp437' => 'en', |
| 763 |
+ 'cp863' => 'weurope', |
| 764 |
+ |
| 765 |
+ # W. Europe (Latin1, Latin9) |
| 766 |
+ # fr es ca eu pt it sq rm nl de da sv no fi fo is ga gd en af |
| 767 |
+ 'iso-8859-1' => 'weurope', |
| 768 |
+ 'iso-8859-15' => 'weurope', |
| 769 |
+ 'cp850' => 'weurope', |
| 770 |
+ 'cp860' => 'weurope', |
| 771 |
+ 'cp1252' => 'weurope', |
| 772 |
+ 'MacRoman' => 'weurope', |
| 773 |
+ |
| 774 |
+ # Cntrl. Europe / Latin2 / Latin10 |
| 775 |
+ # hr cs hu pl sr sk sl |
| 776 |
+ 'iso-8859-2' => 'ceurope', |
| 777 |
+ 'cp852' => 'ceurope', |
| 778 |
+ 'cp1250' => 'ceurope', |
| 779 |
+ 'MacCentralEurRoman' => 'ceurope', |
| 780 |
+ 'MacCroatian' => 'ceurope', |
| 781 |
+ 'iso-8859-16' => 'ceurope', |
| 782 |
+ 'MacRomanian' => 'ceurope', |
| 783 |
+ |
| 784 |
+ # Latin3 (Esperanto, Maltese, and Turkish. Turkish is now on 8859-9.) |
| 785 |
+ # eo mt |
| 786 |
+ 'iso-8859-3' => 'seurope', |
| 787 |
+ |
| 788 |
+ # Baltics (Latin4, Latin7) |
| 789 |
+ # lv lt |
| 790 |
+ 'iso-8859-4' => 'neurope', |
| 791 |
+ 'iso-8859-13' => 'baltic', |
| 792 |
+ 'cp1257' => 'baltic', |
| 793 |
+ |
| 794 |
+ # Nordics (Latin6) |
| 795 |
+ # et kl iu se |
| 796 |
+ 'iso-8859-10' => 'nordic', |
| 797 |
+ |
| 798 |
+ # Cyrillics |
| 799 |
+ # bg be uk sr mk ru |
| 800 |
+ 'iso-8859-5' => 'ru', |
| 801 |
+ 'cp855' => 'ru', |
| 802 |
+ 'cp1251' => 'ru', |
| 803 |
+ 'cp866' => 'ru', |
| 804 |
+ 'MacCyrillic' => 'ru', |
| 805 |
+ 'koi8-r' => 'ru', |
| 806 |
+ 'MacUkrainian' => 'uk', |
| 807 |
+ 'koi8-u' => 'uk', |
| 808 |
+ |
| 809 |
+ # Arabic |
| 810 |
+ 'iso-8859-6' => 'ar', |
| 811 |
+ 'cp864' => 'ar', |
| 812 |
+ 'cp1256' => 'ar', |
| 813 |
+ 'MacArabic' => 'ar', |
| 814 |
+ 'cp1006' => 'fa', |
| 815 |
+ 'MacFarsi' => 'fa', |
| 816 |
+ |
| 817 |
+ # Greek |
| 818 |
+ 'iso-8859-7' => 'el', |
| 819 |
+ 'cp1253' => 'el', |
| 820 |
+ 'MacGreek' => 'el', |
| 821 |
+ |
| 822 |
+ # Hebrew |
| 823 |
+ # he yi |
| 824 |
+ 'iso-8859-8' => 'he', |
| 825 |
+ 'cp862' => 'he', |
| 826 |
+ 'cp1255' => 'he', |
| 827 |
+ 'MacHebrew' => 'he', |
| 828 |
+ |
| 829 |
+ # Turkish |
| 830 |
+ 'iso-8859-9' => 'tr', |
| 831 |
+ 'cp857' => 'tr', |
| 832 |
+ 'cp1254' => 'tr', |
| 833 |
+ 'MacTurkish' => 'tr', |
| 834 |
+ |
| 835 |
+ # Thai |
| 836 |
+ 'iso-8859-11' => 'th', |
| 837 |
+ 'cp874' => 'th', |
| 838 |
+ |
| 839 |
+ # Celtics (Latin8) |
| 840 |
+ # gd cy br |
| 841 |
+ 'iso-8859-14' => 'celtic', |
| 842 |
+ |
| 843 |
+ # Vietnamese |
| 844 |
+ 'viscii' => 'vi', |
| 845 |
+ 'cp1258' => 'vi', |
| 846 |
+ |
| 847 |
+ # Encode::CN |
| 848 |
+ 'euc-cn' => 'zh', |
| 849 |
+ 'cp936' => 'zh', |
| 850 |
+ 'hz' => 'zh', |
| 851 |
+ |
| 852 |
+ # Encode::TW |
| 853 |
+ 'big5-eten' => 'zh', |
| 854 |
+ 'big5-hkscs' => 'zh', |
| 855 |
+ 'cp950' => 'zh', |
| 856 |
+ |
| 857 |
+ # Encode::JP |
| 858 |
+ 'euc-jp' => 'ja', |
| 859 |
+ 'shiftjis' => 'ja', |
| 860 |
+ '7bit-jis' => 'ja', |
| 861 |
+ 'iso-2022-jp' => 'ja', |
| 862 |
+ 'iso-2022-jp-1' => 'ja', |
| 863 |
+ 'cp932' => 'ja', |
| 864 |
+ |
| 865 |
+ # Encode::KR |
| 866 |
+ 'euc-kr' => 'ko', |
| 867 |
+ 'cp949' => 'ko', |
| 868 |
+ 'johab' => 'ko', |
| 869 |
+ 'iso-2022-kr' => 'ko', |
| 870 |
+ |
| 871 |
+ # Encode::HanExtra |
| 872 |
+ 'euc-tw' => 'zh', |
| 873 |
+ 'gb18030' => 'zh', |
| 874 |
+ |
| 875 |
+ # Encode::JIS2K |
| 876 |
+ 'euc-jisx0213' => 'ja', |
| 877 |
+ 'shiftjisx0123' => 'ja', |
| 878 |
+ 'iso-2022-jp-3' => 'ja', |
| 879 |
+ |
| 880 |
+ # Encode::EUCJPMS |
| 881 |
+ 'eucJP-ms' => 'ja', |
| 882 |
+ 'cp51932' => 'ja', |
| 883 |
+ 'cp50220' => 'ja', |
| 884 |
+ 'cp50221' => 'ja', |
| 885 |
+ |
| 886 |
+ ); |
| 887 |
+ |
| 888 |
+ %lang2enc = ( |
| 889 |
+ # Latin1 |
| 890 |
+ 'en' => ['ascii'], |
| 891 |
+ 'weurope' => ['cp1252'], |
| 892 |
+ |
| 893 |
+ # Latin2 |
| 894 |
+ 'ceurope' => ['cp1250'], |
| 895 |
+ |
| 896 |
+ # Latin3 |
| 897 |
+ 'seurope' => ['iso-8859-3'], |
| 898 |
+ |
| 899 |
+ # Latin4 |
| 900 |
+ 'neurope' => ['iso-8859-4'], |
| 901 |
+ |
| 902 |
+ # Latin5 |
| 903 |
+ 'tr' => ['cp1254'], |
| 904 |
+ |
| 905 |
+ # Latin6 |
| 906 |
+ 'nordic' => ['iso-8859-10'], |
| 907 |
+ |
| 908 |
+ # Latin7 |
| 909 |
+ 'baltic' => ['cp1257'], |
| 910 |
+ |
| 911 |
+ # Latin8 |
| 912 |
+ 'celtic' => ['iso-8859-14'], |
| 913 |
+ |
| 914 |
+ # Non Latin |
| 915 |
+ 'ru' => ['koi8-r', 'cp1251'], |
| 916 |
+ 'uk' => ['koi8-u'], |
| 917 |
+ |
| 918 |
+ 'ar' => ['cp1256'], |
| 919 |
+ 'el' => ['cp1253'], |
| 920 |
+ 'he' => ['cp1255'], |
| 921 |
+ 'th' => ['cp874'], |
| 922 |
+ 'vi' => ['viscii', 'cp1258'], |
| 923 |
+ 'zh' => ['euc-cn', 'cp950'], |
| 924 |
+ 'ja' => ['euc-jp', 'cp932'], |
| 925 |
+ 'ko' => ['euc-kr', 'cp949'], |
| 926 |
+ |
| 927 |
+ ); |
| 928 |
+ |
| 929 |
+ %scr2lang = ( |
| 930 |
+ 'InLatin1Supplement' => ['weurope'], |
| 931 |
+ 'InLatinExtendedA' => [ |
| 932 |
+ 'ceurope', |
| 933 |
+ 'seurope', |
| 934 |
+ 'tr', |
| 935 |
+ 'vi' |
| 936 |
+ ], |
| 937 |
+ 'InLatinExtendedB' => [ |
| 938 |
+ 'nordic', |
| 939 |
+ 'baltic', |
| 940 |
+ 'celtic' |
| 941 |
+ ], |
| 942 |
+ 'Thai' => ['th'], |
| 943 |
+ 'Cyrillic' => ['ru', 'uk'], |
| 944 |
+ 'Arabic' => ['ar'], |
| 945 |
+ 'Greek' => ['el'], |
| 946 |
+ 'Hebrew' => ['he'], |
| 947 |
+ ); |
| 948 |
+ |
| 949 |
+ # better detection for CJK |
| 950 |
+ @scrorder = ('Hiragana','Katakana','Hangul','Han',keys(%scr2lang)); |
| 951 |
+ %cjkscr2lang = ( |
| 952 |
+ 'Hiragana' => ['ja'], |
| 953 |
+ 'Katakana' => ['ja'], |
| 954 |
+ 'Hangul' => ['ko'], |
| 955 |
+ 'Han' => ['zh', 'ja', 'ko'], |
| 956 |
+ ); |
| 957 |
+ |
| 958 |
+ unless (HAS_ENCODE_HANEXTRA) { |
| 959 |
+ Encode::Alias::define_alias( qr/^gb18030$/i => ' "euc-cn"' ); |
| 960 |
+ } |
| 961 |
+ Encode::Alias::define_alias( qr/^unicode-1-1-(.+)$/i => ' "$1"' ); |
| 962 |
+ Encode::Alias::define_alias( qr/^TIS-620$/i => ' "iso-8859-11"' ); |
| 963 |
+ Encode::Alias::define_alias( qr/^x-mac-(.+)$/i => ' "Mac$1"' ); |
| 964 |
+ Encode::Alias::define_alias( qr/^Shift_JIS$/i => ' "cp932"' ); |
| 965 |
+ if (HAS_ENCODE_EUCJPMS) { |
| 966 |
+ Encode::Alias::define_alias( qr/^iso-2022-jp$/i => ' "cp50221"' ); |
| 967 |
+ } |
| 968 |
+} |
| 969 |
+ |
| 970 |
+sub get_language { |
| 971 |
+ my $str = shift; # $str must be UTF-8 encoding |
| 972 |
+ my $charset = shift; |
| 973 |
+ |
| 974 |
+ return 'en' unless $charset; |
| 975 |
+ if ($charset !~ /^utf/i) { |
| 976 |
+ return $enc2lang{$charset}; |
| 977 |
+ } elsif (defined($str)) { |
| 978 |
+ $str =~ s/[\x00-\x7F]//g; # remove ASCII characters |
| 979 |
+ return 'en' if ($str eq ''); |
| 980 |
+ |
| 981 |
+ my %handled; |
| 982 |
+ $str = Encode::decode_utf8($str) unless (Encode::is_utf8($str)); |
| 983 |
+ foreach my $scr (@scrorder) { |
| 984 |
+ next if ($str !~ /\p{$scr}/); |
| 985 |
+ my $scrlangs = exists($cjkscr2lang{$scr}) ? $cjkscr2lang{$scr} : $scr2lang{$scr}; |
| 986 |
+ foreach my $lang (@$scrlangs) { |
| 987 |
+ next if (exists($handled{$lang})); |
| 988 |
+ foreach my $enc (@{$lang2enc{$lang}}) { |
| 989 |
+ my $scratch = $str; |
| 990 |
+ Encode::encode($enc, $scratch, Encode::FB_QUIET); |
| 991 |
+ return $lang if ($scratch eq ''); |
| 992 |
+ } |
| 993 |
+ $handled{$lang} = 1; |
| 994 |
+ } |
| 995 |
+ } |
| 996 |
+ } |
| 997 |
+ return 'en'; |
| 998 |
+} |
| 999 |
+ |
| 1000 |
+# TEST 1: try conversion to use the specified charset. |
| 1001 |
+# TEST 2: try conversion to use Encode::Detect. |
| 1002 |
+# TEST 3: try conversion to use Encode::Guess. |
| 1003 |
+sub normalize_charset { |
| 1004 |
+ my $str = shift; |
| 1005 |
+ my $charset = shift; |
| 1006 |
+ |
| 1007 |
+ return wantarray ? ($str, 'ascii') : $str unless ($str); |
| 1008 |
+ |
| 1009 |
+ my $decoded; |
| 1010 |
+ my $detected; |
| 1011 |
+ |
| 1012 |
+ if ($charset) { |
| 1013 |
+ ($decoded, $detected) = _specified_encoding($str, $charset); |
| 1014 |
+ } |
| 1015 |
+ unless ($detected) { |
| 1016 |
+ ($decoded, $detected) = _encode_detect($str); |
| 1017 |
+ } |
| 1018 |
+ unless ($detected) { |
| 1019 |
+ ($decoded, $detected) = _encode_guess($str); |
| 1020 |
+ } |
| 1021 |
+ unless ($detected) { |
| 1022 |
+ return ($str, undef); |
| 1023 |
+ } |
| 1024 |
+ $decoded =~ s/^\x{feff}//g; |
| 1025 |
+ $decoded = Encode::encode_utf8($decoded); |
| 1026 |
+ |
| 1027 |
+ # unfold hiragana, katakana and han |
| 1028 |
+ if ($detected =~ /^(?:UTF|EUC|BIG5|GB|SHIFTJIS|ISO-2022|CP969$|CP932$|CP949|CP50221$)/i) { |
| 1029 |
+ $decoded =~ s/($KANA_HAN_RE)\012($KANA_HAN_RE)/$1$2/og; |
| 1030 |
+ } |
| 1031 |
+ return wantarray ? ($decoded, $detected) : $decoded; |
| 1032 |
+} |
| 1033 |
+ |
| 1034 |
+sub _specified_encoding { |
| 1035 |
+ my $str = shift; |
| 1036 |
+ my $encoding = shift; |
| 1037 |
+ |
| 1038 |
+ my $detected; |
| 1039 |
+ my $decoded; |
| 1040 |
+ |
| 1041 |
+ return (undef, undef) unless ($encoding); |
| 1042 |
+ |
| 1043 |
+ # note: ISO-2022-* is not deistinguish from US-ASCII |
| 1044 |
+ return (undef, undef) if ($str =~ /\e/ and $encoding !~ /^ISO-2022/i); |
| 1045 |
+ |
| 1046 |
+ # UTF-16|32 encoding without BOM cannot be trusted. |
| 1047 |
+ return (undef, undef) if ($encoding =~ /^UTF-32$/i and $str !~ /^(?:\xFF\xFE\x00\x00|\x00\x00\xFE\xFF)/); |
| 1048 |
+ return (undef, undef) if ($encoding =~ /^UTF-16$/i and $str !~ /^(?:\xFF\xFE|\xFE\xFF)/); |
| 1049 |
+ |
| 1050 |
+ #$encoding = _get_alias($encoding); |
| 1051 |
+ my $encoder = Encode::find_encoding($encoding); |
| 1052 |
+ if (ref($encoder)) { |
| 1053 |
+ $decoded = $encoder->decode($str,Encode::FB_QUIET); |
| 1054 |
+ $detected = $encoder->name if ($str eq ''); |
| 1055 |
+ } |
| 1056 |
+ return ($decoded, $detected); |
| 1057 |
+} |
| 1058 |
+ |
| 1059 |
+sub _encode_detect { |
| 1060 |
+ return undef unless HAS_ENCODE_DETECT; |
| 1061 |
+ my $str = shift; |
| 1062 |
+ |
| 1063 |
+ # UTF-16|32 encoding without BOM cannot be trusted. |
| 1064 |
+ return (undef, undef) if ($str =~ /\x00\x00/ and $str !~ /^(?:\xFF\xFE\x00\x00|\x00\x00\xFE\xFF)/); |
| 1065 |
+ return (undef, undef) if ($str =~ /\x00/ and $str !~ /^(?:\xFF\xFE|\xFE\xFF)/); |
| 1066 |
+ |
| 1067 |
+ my $decoded; |
| 1068 |
+ my $detected = Encode::Detect::Detector::detect($str); |
| 1069 |
+ if ($detected) { |
| 1070 |
+ $detected = _get_alias($detected); |
| 1071 |
+ my $encoder = Encode::find_encoding($detected); |
| 1072 |
+ if (ref($encoder)) { |
| 1073 |
+ $decoded = $encoder->decode($str); |
| 1074 |
+ $detected = $decoded ? $encoder->name : undef; |
| 1075 |
+ } |
| 1076 |
+ else { |
| 1077 |
+ $detected = undef; |
| 1078 |
+ } |
| 1079 |
+ } |
| 1080 |
+ return ($decoded, $detected); |
| 1081 |
+} |
| 1082 |
+ |
| 1083 |
+sub _encode_guess { |
| 1084 |
+ my $str = shift; |
| 1085 |
+ |
| 1086 |
+ my $detected; |
| 1087 |
+ my $decoded; |
| 1088 |
+ my $encoder; |
| 1089 |
+ |
| 1090 |
+ # Step 1: Examine ISO-2022-*. |
| 1091 |
+ if ($str =~ /\e/) { |
| 1092 |
+ $Encode::Guess::NoUTFAutoGuess = 1; |
| 1093 |
+ $encoder = Encode::Guess::guess_encoding($str, |
| 1094 |
+ qw/cp50221 7bit-jis iso-2022-kr/); |
| 1095 |
+ $Encode::Guess::NoUTFAutoGuess = 0; |
| 1096 |
+ } |
| 1097 |
+ |
| 1098 |
+ # Step 2: Examine US-ASCII/UTF-(8|16|32) |
| 1099 |
+ unless (ref($encoder)) { |
| 1100 |
+ $Encode::Guess::NoUTFAutoGuess = 0; |
| 1101 |
+ $encoder = Encode::Guess::guess_encoding($str); |
| 1102 |
+ } |
| 1103 |
+ |
| 1104 |
+ # Step 3: Examine other encodings |
| 1105 |
+ unless (ref($encoder)) { |
| 1106 |
+ $Encode::Guess::NoUTFAutoGuess = 1; |
| 1107 |
+ eval { |
| 1108 |
+ if ($str =~ /[\x80-\xFF]{4}/) { |
| 1109 |
+ $encoder = Encode::Guess::guess_encoding($str, |
| 1110 |
+ qw/euc-cn big5-eten euc-jp cp932 euc-kr cp949/); |
| 1111 |
+ } |
| 1112 |
+ else { |
| 1113 |
+ $encoder = Encode::Guess::guess_encoding($str, |
| 1114 |
+ qw/iso-8859-1 cp1252/); |
| 1115 |
+ } |
| 1116 |
+ }; |
| 1117 |
+ $Encode::Guess::NoUTFAutoGuess = 0; |
| 1118 |
+ } |
| 1119 |
+ if (ref($encoder)) { |
| 1120 |
+ $detected = $encoder->name; |
| 1121 |
+ if ($detected) { |
| 1122 |
+ $decoded = $encoder->decode($str); |
| 1123 |
+ } |
| 1124 |
+ } |
| 1125 |
+ return ($decoded, $detected); |
| 1126 |
+} |
| 1127 |
+ |
| 1128 |
+sub _get_alias { |
| 1129 |
+ my $encoding = shift; |
| 1130 |
+ |
| 1131 |
+ unless (HAS_ENCODE_HANEXTRA) { |
| 1132 |
+ $encoding =~ s/^gb18030$/euc-cn/i; |
| 1133 |
+ } |
| 1134 |
+ $encoding =~ s/^unicode-1-1-(.+)$/$1/i; |
| 1135 |
+ $encoding =~ s/^TIS-620$/iso-8859-11/i; |
| 1136 |
+ $encoding =~ s/x-mac-(.+)$/Mac$1/i; |
| 1137 |
+ $encoding =~ s/^Shift_JIS$/cp932/i; |
| 1138 |
+ if (HAS_ENCODE_EUCJPMS) { |
| 1139 |
+ $encoding =~ s/^iso-2022-jp$/cp50221/i; |
| 1140 |
+ $encoding =~ s/^euc-jp$/cp51932/i; |
| 1141 |
+ } |
| 1142 |
+ |
| 1143 |
+ return $encoding; |
| 1144 |
+} |
| 1145 |
+ |
| 1146 |
+ |
| 1147 |
+1; |
| 1148 |
+ |