View | Details | Raw Unified | Return to bug 186756 | Differences between
and this patch

Collapse All | Expand All

(-)./Makefile (-3 / +3 lines)
Lines 1-7 Link Here
1
# Created by: TAOKA Fumiyoshi
1
# Created by: TAOKA Fumiyoshi
2
# $FreeBSD: head/japanese/p5-Mail-SpamAssassin/Makefile 346964 2014-03-03 20:48:15Z antoine $
2
# $FreeBSD: head/japanese/p5-Mail-SpamAssassin/Makefile 346964 2014-03-03 20:48:15Z antoine $
3
3
4
PORTREVISION=	5
4
PORTREVISION=	0
5
CATEGORIES=	japanese mail perl5
5
CATEGORIES=	japanese mail perl5
6
PKGNAMEPREFIX=	ja-p5-
6
PKGNAMEPREFIX=	ja-p5-
7
7
Lines 12-22 Link Here
12
12
13
MASTERDIR=	${.CURDIR}/../../mail/p5-Mail-SpamAssassin
13
MASTERDIR=	${.CURDIR}/../../mail/p5-Mail-SpamAssassin
14
14
15
RUN_DEPENDS+=	ja-p5-MeCab>=0.98:${PORTSDIR}/japanese/p5-MeCab
15
BUILD_DEPENDS=	ja-p5-MeCab>=0.98:${PORTSDIR}/japanese/p5-MeCab
16
16
17
CONFLICTS=	p5-Mail-SpamAssassin-[0-9]*
17
CONFLICTS=	p5-Mail-SpamAssassin-[0-9]*
18
18
19
EXTRA_PATCHES=	${.CURDIR}/files/spamassassin-3.3.2-ja-1.patch
19
EXTRA_PATCHES=	${.CURDIR}/files/spamassassin-3.4.0-ja.patch
20
20
21
PKGMESSAGE=	${.CURDIR}/pkg-message
21
PKGMESSAGE=	${.CURDIR}/pkg-message
22
PLIST=		${WRKDIR}/pkg-plist
22
PLIST=		${WRKDIR}/pkg-plist
(-)./files/spamassassin-3.3.2-ja-1.patch (-1148 lines)
Lines 1-1148 Link Here
1
diff -uNr lib/Mail/SpamAssassin/HTML.pm lib/Mail/SpamAssassin/HTML.pm
2
--- lib/Mail/SpamAssassin/HTML.pm	2011-06-07 08:59:17.000000000 +0900
3
+++ lib/Mail/SpamAssassin/HTML.pm	2011-07-14 22:35:46.000000000 +0900
4
@@ -84,7 +84,7 @@
5
 $ok_attributes{span}{$_} = 1 for qw( style );
6
 
7
 sub new {
8
-  my ($class) = @_;
9
+  my ($class, $opts) = @_;
10
   my $self = $class->SUPER::new(
11
 		api_version => 3,
12
 		handlers => [
13
@@ -97,6 +97,7 @@
14
 			declaration => ["html_declaration", "self,text"],
15
 		],
16
 		marked_sections => 1);
17
+  $self->{normalize} = $opts->{'normalize'} || 0;
18
 
19
   $self;
20
 }
21
@@ -672,7 +673,14 @@
22
     }
23
   }
24
   else {
25
-    $text =~ s/[ \t\n\r\f\x0b\xa0]+/ /g;
26
+    if ($self->{normalize}) {
27
+      $text =~ s/\xc2\xa0/ /g;           # no-break space
28
+      $text =~ s/\xe3\x80\x80/ /g;       # ideographicspace
29
+      $text =~ s/[ \t\n\r\f\x0b]+/ /g;
30
+    }
31
+    else {
32
+      $text =~ s/[ \t\n\r\f\x0b\xa0]+/ /g;
33
+    }
34
     # trim leading whitespace if previous element was whitespace 
35
     # and current element is not invisible
36
     if (@{ $self->{text} } && !$display{invisible} &&
37
diff -uNr lib/Mail/SpamAssassin/Message/Node.pm lib/Mail/SpamAssassin/Message/Node.pm
38
--- lib/Mail/SpamAssassin/Message/Node.pm	2011-06-07 08:59:17.000000000 +0900
39
+++ lib/Mail/SpamAssassin/Message/Node.pm	2011-07-14 22:35:46.000000000 +0900
40
@@ -42,6 +42,7 @@
41
 use Mail::SpamAssassin::Constants qw(:sa);
42
 use Mail::SpamAssassin::HTML;
43
 use Mail::SpamAssassin::Logger;
44
+use Mail::SpamAssassin::Util::Charset;
45
 
46
 =item new()
47
 
48
@@ -387,27 +388,10 @@
49
 
50
 sub _normalize {
51
   my ($self, $data, $charset) = @_;
52
-  return $data unless $self->{normalize};
53
+  return wantarray ? ($data, $charset) : $data unless $self->{normalize};
54
 
55
-  my $detected = Encode::Detect::Detector::detect($data);
56
-
57
-  my $converter;
58
-
59
-  if ($charset && $charset !~ /^us-ascii$/i &&
60
-      ($detected || 'none') !~ /^(?:UTF|EUC|ISO-2022|Shift_JIS|Big5|GB)/i) {
61
-      dbg("message: Using labeled charset $charset");
62
-      $converter = Encode::find_encoding($charset);
63
-  }
64
-
65
-  $converter = Encode::find_encoding($detected) unless $converter || !defined($detected);
66
-
67
-  return $data unless $converter;
68
-
69
-  dbg("message: Converting...");
70
-
71
-  my $rv = $converter->decode($data, 0);
72
-  utf8::downgrade($rv, 1);
73
-  return $rv
74
+  my ($decoded_data, $detected_charset) = normalize_charset($data, $charset);
75
+  return wantarray ? ($decoded_data, $detected_charset) : $decoded_data;
76
 }
77
 
78
 =item rendered()
79
@@ -430,8 +414,12 @@
80
     # text/x-aol is ignored here, but looks like text/html ...
81
     return(undef,undef) unless ( $self->{'type'} =~ /^text\/(?:plain|html)$/i );
82
 
83
-    my $text = $self->_normalize($self->decode(), $self->{charset});
84
+    my ($text, $charset) = $self->_normalize($self->decode(), $self->{charset});
85
     my $raw = length($text);
86
+    if ($self->{normalize}) {
87
+      $self->{charset} = $charset;
88
+      $self->{language} = get_language($text, $charset);
89
+    }
90
 
91
     # render text/html always, or any other text|text/plain part as text/html
92
     # based on a heuristic which simulates a certain common mail client
93
@@ -441,7 +429,7 @@
94
     {
95
       $self->{rendered_type} = 'text/html';
96
 
97
-      my $html = Mail::SpamAssassin::HTML->new();	# object
98
+      my $html = Mail::SpamAssassin::HTML->new({normalize=>$self->{normalize}});       # object
99
       $html->parse($text);				# parse+render text
100
       $self->{rendered} = $html->get_rendered_text();
101
       $self->{visible_rendered} = $html->get_rendered_text(invisible => 0);
102
diff -uNr lib/Mail/SpamAssassin/Message.pm lib/Mail/SpamAssassin/Message.pm
103
--- lib/Mail/SpamAssassin/Message.pm	2011-06-07 08:59:17.000000000 +0900
104
+++ lib/Mail/SpamAssassin/Message.pm	2011-07-14 22:35:46.000000000 +0900
105
@@ -559,6 +559,8 @@
106
   delete $self->{'pristine_headers'};
107
   delete $self->{'line_ending'};
108
   delete $self->{'missing_head_body_separator'};
109
+  delete $self->{'charset'};
110
+  delete $self->{'language'};
111
 
112
   my @toclean = ( $self );
113
 
114
@@ -585,6 +587,8 @@
115
     delete $part->{'invisible_rendered'};
116
     delete $part->{'type'};
117
     delete $part->{'rendered_type'};
118
+    delete $self->{'charset'};
119
+    delete $self->{'language'};
120
 
121
     # if there are children nodes, add them to the queue of nodes to clean up
122
     if (exists $part->{'body_parts'}) {
123
@@ -1014,7 +1018,14 @@
124
 
125
   # whitespace handling (warning: small changes have large effects!)
126
   $text =~ s/\n+\s*\n+/\f/gs;		# double newlines => form feed
127
-  $text =~ tr/ \t\n\r\x0b\xa0/ /s;	# whitespace => space
128
+  if ($self->{normalize}) {
129
+    $text =~ s/\xc2\xa0/ /g;		# no-break space => space
130
+    $text =~ s/\xe3\x80\x80/ /g;	# ideographicspace => space
131
+    $text =~ tr/ \t\n\r\x0b/ /s;	# whitespace => space
132
+  }
133
+  else {
134
+    $text =~ tr/ \t\n\r\x0b\xa0/ /s;	# whitespace => space
135
+  }
136
   $text =~ tr/\f/\n/;			# form feeds => newline
137
   
138
   # warn "message: $text";
139
@@ -1071,7 +1082,14 @@
140
 
141
   # whitespace handling (warning: small changes have large effects!)
142
   $text =~ s/\n+\s*\n+/\f/gs;		# double newlines => form feed
143
-  $text =~ tr/ \t\n\r\x0b\xa0/ /s;	# whitespace => space
144
+  if ($self->{normalize}) {
145
+    $text =~ s/\xc2\xa0/ /g;		# no-break space => space
146
+    $text =~ s/\xe3\x80\x80/ /g;	# ideographicspace => space
147
+    $text =~ tr/ \t\n\r\x0b/ /s;	# whitespace => space
148
+  }
149
+  else {
150
+    $text =~ tr/ \t\n\r\x0b\xa0/ /s;	# whitespace => space
151
+  }
152
   $text =~ tr/\f/\n/;			# form feeds => newline
153
 
154
   my @textary = split_into_array_of_short_lines ($text);
155
@@ -1122,7 +1140,14 @@
156
 
157
   # whitespace handling (warning: small changes have large effects!)
158
   $text =~ s/\n+\s*\n+/\f/gs;		# double newlines => form feed
159
-  $text =~ tr/ \t\n\r\x0b\xa0/ /s;	# whitespace => space
160
+  if ($self->{normalize}) {
161
+    $text =~ s/\xc2\xa0/ /g;		# no-break space => space
162
+    $text =~ s/\xe3\x80\x80/ /g;	# ideographicspace => space
163
+    $text =~ tr/ \t\n\r\x0b/ /s;	# whitespace => space
164
+  }
165
+  else {
166
+    $text =~ tr/ \t\n\r\x0b\xa0/ /s;	# whitespace => space
167
+  }
168
   $text =~ tr/\f/\n/;			# form feeds => newline
169
 
170
   my @textary = split_into_array_of_short_lines ($text);
171
@@ -1198,6 +1223,28 @@
172
 
173
 # ---------------------------------------------------------------------------
174
 
175
+sub get_language {
176
+  my ($self) = @_;
177
+
178
+  if (defined $self->{language}) { return $self->{language}; }
179
+  my @parts = $self->find_parts(qr/^(?:text|message)\b/i,1);
180
+  return '' unless @parts;
181
+
182
+  # Go through each part
183
+  my @langs;
184
+  for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) {
185
+    my $p = $parts[$pt];
186
+    my $lang = $p->{language};
187
+    next unless ($lang);
188
+    push(@langs, $lang) unless (grep(/^$lang$/, @langs))
189
+  }
190
+  $self->{language} = scalar(@langs) ? join(' ', @langs) : '';
191
+  return $self->{language};
192
+}
193
+
194
+# ---------------------------------------------------------------------------
195
+
196
+
197
 1;
198
 
199
 =back
200
diff -uNr lib/Mail/SpamAssassin/PerMsgStatus.pm lib/Mail/SpamAssassin/PerMsgStatus.pm
201
--- lib/Mail/SpamAssassin/PerMsgStatus.pm	2011-06-07 08:59:17.000000000 +0900
202
+++ lib/Mail/SpamAssassin/PerMsgStatus.pm	2011-07-14 22:35:46.000000000 +0900
203
@@ -53,6 +53,7 @@
204
 use warnings;
205
 use re 'taint';
206
 
207
+use Encode;
208
 use Time::HiRes qw(time);
209
 
210
 use Mail::SpamAssassin::Constants qw(:sa);
211
@@ -733,19 +734,41 @@
212
 
213
   # the report charset
214
   my $report_charset = "; charset=iso-8859-1";
215
-  if ($self->{conf}->{report_charset}) {
216
-    $report_charset = "; charset=" . $self->{conf}->{report_charset};
217
-  }
218
 
219
   # the SpamAssassin report
220
   my $report = $self->get_report();
221
+  if ($self->{conf}->{report_charset}) {
222
+    $report_charset = "; charset=" . $self->{conf}->{report_charset};
223
+  }
224
 
225
   # If there are any wide characters, need to MIME-encode in UTF-8
226
   # TODO: If $report_charset is something other than iso-8859-1/us-ascii, then
227
   # we could try converting to that charset if possible
228
-  unless ($] < 5.008 || utf8::downgrade($report, 1)) {
229
+  my $is_utf8 = 0;
230
+  if ($self->{conf}->{normalize_charset}) {
231
+    $report = Encode::decode_utf8($report);
232
+    $is_utf8 = 1;
233
+  }
234
+  else {
235
+    if ($self->{msg}->{charset}) {
236
+      eval {
237
+        my $scratch = $report;
238
+        $report = Encode::decode($self->{msg}->{charset},$scratch,Encode::FB_CROAK);
239
+        $is_utf8 = 1;
240
+      };
241
+    }
242
+  }
243
+  if ($is_utf8) {
244
+    $is_utf8 = 1;
245
+    eval {
246
+      my $scratch = $report;
247
+      $report = Encode::encode($self->{conf}->{report_charset},$scratch,Encode::FB_CROAK);
248
+      $is_utf8 = 0;
249
+    };
250
+    if ($is_utf8) {
251
+      $report = Encode::encode_utf8($report);
252
       $report_charset = "; charset=utf-8";
253
-      utf8::encode($report);
254
+    }
255
   }
256
 
257
   # get original headers, "pristine" if we can do it
258
diff -uNr lib/Mail/SpamAssassin/Plugin/Bayes.pm lib/Mail/SpamAssassin/Plugin/Bayes.pm
259
--- lib/Mail/SpamAssassin/Plugin/Bayes.pm	2011-06-07 08:59:17.000000000 +0900
260
+++ lib/Mail/SpamAssassin/Plugin/Bayes.pm	2011-07-14 22:35:46.000000000 +0900
261
@@ -223,6 +223,15 @@
262
 # will require a longer token than English ones.)
263
 use constant MAX_TOKEN_LENGTH => 15;
264
 
265
+# Skip if a token is too short.
266
+our $SKIP_UTF8_SHORT_TOKENS_RE = qr{(?:
267
+    [\x00-\x7F]                # 1 byte
268
+  | [\xC0-\xDF][\x80-\xBF]     # 2 bytes
269
+  | [\xE0-\xEF][\x80-\xBF]{2}  # 3 bytes
270
+  | [\xF0-\xF7][\x80-\xBF]{3}  # 4 bytes
271
+  | (?:\xE3[\x81-\x83][\x80-\xBF]){2} # 2 characters of Hiragana and Katakana
272
+)}x;
273
+
274
 ###########################################################################
275
 
276
 sub new {
277
@@ -983,9 +992,28 @@
278
   $msgdata->{bayes_token_body} = $msg->{msg}->get_visible_rendered_body_text_array();
279
   $msgdata->{bayes_token_inviz} = $msg->{msg}->get_invisible_rendered_body_text_array();
280
   @{$msgdata->{bayes_token_uris}} = $msg->get_uri_list();
281
+  if ($self->{conf}->{normalize_charset}) {
282
+    my $tokenizer = $self->get_tokenizer($msg);
283
+    if (ref($tokenizer)) {
284
+      $msgdata->{bayes_token_body} = $tokenizer->tokenize($msgdata->{bayes_token_body});
285
+      $msgdata->{bayes_token_inviz} = $tokenizer->tokenize($msgdata->{bayes_token_inviz});
286
+    }
287
+  }
288
   return $msgdata;
289
 }
290
 
291
+sub get_tokenizer {
292
+  my ($self, $msg) = @_;
293
+
294
+  my $tokenizer;
295
+  my @languages = split(/\s+/, $msg->{msg}->get_language());
296
+  foreach my $lang (@languages) {
297
+    $tokenizer = $self->{'conf'}->{'tokenizer'}->{$lang};
298
+    last if (ref($tokenizer));
299
+  }
300
+  return $tokenizer;
301
+}
302
+
303
 ###########################################################################
304
 
305
 # The calling functions expect a uniq'ed array of tokens ...
306
@@ -1039,7 +1067,7 @@
307
   # include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings,
308
   # and ISO-8859-15 alphas.  Do not split on @'s; better results keeping it.
309
   # Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!"
310
-  tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs;
311
+  tr/-A-Za-z0-9,\@\*\!_'"\$.\200-\377 / /cs;
312
 
313
   # DO split on "..." or "--" or "---"; common formatting error resulting in
314
   # hapaxes.  Keep the separator itself as a token, though, as long ones can
315
@@ -1068,6 +1096,11 @@
316
     #
317
     next if ( defined $magic_re && $token =~ /$magic_re/ );
318
 
319
+    # Skip short UTF-8 tokens.
320
+    if ($self->{conf}->{normalize_charset}) {
321
+      next if ($token =~ /^$SKIP_UTF8_SHORT_TOKENS_RE$/o);
322
+    }
323
+
324
     # *do* keep 3-byte tokens; there's some solid signs in there
325
     my $len = length($token);
326
 
327
@@ -1096,14 +1129,16 @@
328
     # the domain ".net" appeared in the To header.
329
     #
330
     if ($len > MAX_TOKEN_LENGTH && $token !~ /\*/) {
331
-      if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) {
332
-	# Matt sez: "Could be asian? Autrijus suggested doing character ngrams,
333
-	# but I'm doing tuples to keep the dbs small(er)."  Sounds like a plan
334
-	# to me! (jm)
335
-	while ($token =~ s/^(..?)//) {
336
-	  push (@rettokens, "8:$1");
337
-	}
338
-	next;
339
+      unless ($self->{conf}->{normalize_charset}) {
340
+        if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) {
341
+	  # Matt sez: "Could be asian? Autrijus suggested doing character ngrams,
342
+	  # but I'm doing tuples to keep the dbs small(er)."  Sounds like a plan
343
+	  # to me! (jm)
344
+	  while ($token =~ s/^(..?)//) {
345
+	    push (@rettokens, "8:$1");
346
+	  }
347
+	  next;
348
+        }
349
       }
350
 
351
       if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS)
352
diff -uNr lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm
353
--- lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm	1970-01-01 09:00:00.000000000 +0900
354
+++ lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm	2011-07-14 22:29:19.000000000 +0900
355
@@ -0,0 +1,84 @@
356
+# <@LICENSE>
357
+# Copyright 2004 Apache Software Foundation
358
+# 
359
+# Licensed under the Apache License, Version 2.0 (the "License");
360
+# you may not use this file except in compliance with the License.
361
+# You may obtain a copy of the License at
362
+# 
363
+#     http://www.apache.org/licenses/LICENSE-2.0
364
+# 
365
+# Unless required by applicable law or agreed to in writing, software
366
+# distributed under the License is distributed on an "AS IS" BASIS,
367
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
368
+# See the License for the specific language governing permissions and
369
+# limitations under the License.
370
+# </@LICENSE>
371
+
372
+=head1 NAME
373
+
374
+Tokenizer::MeCab - Japanese tokenizer with MeCab
375
+
376
+=head1 SYNOPSIS
377
+
378
+loadplugin     Mail::SpamAssassin::Plugin::Tokenizer::MeCab
379
+
380
+=head1 DESCRIPTION
381
+
382
+This plugin tokenizes a Japanese string with MeCab that is 
383
+the morphological analysis engine. 
384
+
385
+Text::MeCab 0.12 or over is required.
386
+
387
+=cut
388
+
389
+package Mail::SpamAssassin::Plugin::Tokenizer::MeCab;
390
+
391
+use strict;
392
+use warnings;
393
+use Mail::SpamAssassin::Plugin::Tokenizer;
394
+
395
+use vars qw(@ISA);
396
+@ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer);
397
+
398
+# Have to do this so that RPM doesn't find these as required perl modules
399
+BEGIN { require MeCab; }
400
+our $language = 'ja';
401
+our $mecab = new MeCab::Tagger(-Ochasen);
402
+
403
+sub new {
404
+  my $class = shift;
405
+  my $mailsaobject = shift;
406
+
407
+  $class = ref($class) || $class;
408
+  my $self = $class->SUPER::new($mailsaobject, $language);
409
+  bless ($self, $class);
410
+
411
+  return $self;
412
+}
413
+
414
+sub tokenize {
415
+  my $self = shift;
416
+  my $text_array = shift;
417
+
418
+  my @tokenized_array;
419
+  foreach my $text (@$text_array) {
420
+    next unless ($text);
421
+    $text =~ s/([\x80-\xFF]{3,})/&_tokenize($1)/eg;
422
+    push(@tokenized_array, $text);
423
+  }
424
+  return \@tokenized_array;
425
+}
426
+
427
+sub _tokenize {
428
+  my $text = shift;
429
+
430
+  my @buf;
431
+  for (my $node = $mecab->parseToNode($text); $node->{next}; $node = $node->{next}) {
432
+    push(@buf, $node->{surface});
433
+  }
434
+  my $tokenized = join(' ', @buf) . ' ';
435
+  return $tokenized;
436
+}
437
+
438
+1;
439
+
440
diff -uNr lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm
441
--- lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm	1970-01-01 09:00:00.000000000 +0900
442
+++ lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm	2011-07-14 22:29:19.000000000 +0900
443
@@ -0,0 +1,111 @@
444
+# <@LICENSE>
445
+# Copyright 2004 Apache Software Foundation
446
+# 
447
+# Licensed under the Apache License, Version 2.0 (the "License");
448
+# you may not use this file except in compliance with the License.
449
+# You may obtain a copy of the License at
450
+# 
451
+#     http://www.apache.org/licenses/LICENSE-2.0
452
+# 
453
+# Unless required by applicable law or agreed to in writing, software
454
+# distributed under the License is distributed on an "AS IS" BASIS,
455
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
456
+# See the License for the specific language governing permissions and
457
+# limitations under the License.
458
+# </@LICENSE>
459
+
460
+=head1 NAME
461
+
462
+Tokenizer::SimpleJA - simple Japanese tokenizer
463
+
464
+=head1 SYNOPSIS
465
+
466
+loadplugin     Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA
467
+
468
+=head1 DESCRIPTION
469
+
470
+This plugin simply tokenizes a Japanese string by characters other than 
471
+the alphabet, the Chinese character, and the katakana. 
472
+
473
+=cut
474
+
475
+package Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA;
476
+
477
+use strict;
478
+use warnings;
479
+use Mail::SpamAssassin::Plugin::Tokenizer;
480
+
481
+use vars qw(@ISA);
482
+@ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer);
483
+
484
+our $language = 'ja';
485
+
486
+our $RE = qr{(
487
+  # Hiragana
488
+    (?:
489
+        \xE3\x81[\x80-\xBF]
490
+      | \xE3\x82[\x80-\x9F]
491
+    )+
492
+  # Katakana
493
+  | (?:
494
+        \xE3\x82[\xA0-\xBF]
495
+      | \xE3\x83[\x80-\xBF]
496
+    )+
497
+  # Kanji
498
+  | (?:
499
+        \xE3[\x90-\xBF][\x80-\xBF]
500
+      | [\xE4-\xE9][\x80-\xBF]{2}
501
+      | \xEF[\xA4-\xAB][\x80-\xBF]
502
+    )+
503
+  # Fullwidth
504
+  | (?:
505
+        \xEF\xBC[\x80-\xBF]
506
+      | \xEF\xBD[\x80-\x9F]
507
+    )+
508
+  # Others
509
+  | [\xC0-\xDF][\x80-\xBF]
510
+  | [\xE0-\xE2][\x80-\xBF]{2}
511
+  | \xE3\x80[\x80-\xBF]
512
+  | \xE3[\x84-\x8F][\x80-\xBF]
513
+  | [\xEA-\xEE][\x80-\xBF]{2}
514
+  | \xEF[\x80-\xA3][\x80-\xBF]
515
+  | \xEF[\xAC-\xBB][\x80-\xBF]
516
+  | \xEF\xBD[\xA0-\xBF]
517
+  | \xEF[\xBE-\xBF][\x80-\xBF]
518
+  | [\xF0-\xF7][\x80-\xBF]{3}
519
+)}x;
520
+
521
+sub new {
522
+  my $class = shift;
523
+  my $mailsaobject = shift;
524
+
525
+  $class = ref($class) || $class;
526
+  my $self = $class->SUPER::new($mailsaobject, $language);
527
+  bless ($self, $class);
528
+
529
+  return $self;
530
+}
531
+
532
+sub tokenize {
533
+  my $self = shift;
534
+  my $text_array = shift;
535
+
536
+  my @tokenized_array;
537
+  foreach my $text (@$text_array) {
538
+    next unless ($text);
539
+    $text =~ s/([\x80-\xFF]{3,})/&_tokenize($1)/eg;
540
+    push(@tokenized_array, $text);
541
+  }
542
+  return \@tokenized_array;
543
+}
544
+
545
+sub _tokenize {
546
+  my $text = shift;
547
+
548
+  $text =~ s/$RE/$1 /og;
549
+  $text = ' ' . $text;
550
+  return $text;
551
+}
552
+
553
+1;
554
+
555
diff -uNr lib/Mail/SpamAssassin/Plugin/Tokenizer.pm lib/Mail/SpamAssassin/Plugin/Tokenizer.pm
556
--- lib/Mail/SpamAssassin/Plugin/Tokenizer.pm	1970-01-01 09:00:00.000000000 +0900
557
+++ lib/Mail/SpamAssassin/Plugin/Tokenizer.pm	2011-07-14 22:35:46.000000000 +0900
558
@@ -0,0 +1,115 @@
559
+# <@LICENSE>
560
+# Copyright 2004 Apache Software Foundation
561
+# 
562
+# Licensed under the Apache License, Version 2.0 (the "License");
563
+# you may not use this file except in compliance with the License.
564
+# You may obtain a copy of the License at
565
+# 
566
+#     http://www.apache.org/licenses/LICENSE-2.0
567
+# 
568
+# Unless required by applicable law or agreed to in writing, software
569
+# distributed under the License is distributed on an "AS IS" BASIS,
570
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
571
+# See the License for the specific language governing permissions and
572
+# limitations under the License.
573
+# </@LICENSE>
574
+
575
+=head1 NAME
576
+
577
+Mail::SpamAssassin::Plugin::Tokenizer - Tokenizer plugin base class
578
+
579
+=head1 SYNOPSIS
580
+
581
+=head2 SpamAssassin configuration:
582
+
583
+  loadplugin MyTokenizerPlugin /path/to/MyTokenizerPlugin.pm
584
+
585
+=head2 Perl code:
586
+
587
+  use Mail::SpamAssassin::Plugin::Tokenizer;
588
+  use vars qw(@ISA);
589
+  @ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer);
590
+  # language to use this plugin
591
+  our $language = 'ja';
592
+
593
+  # constructor: register language
594
+  sub new {
595
+    my $class = shift;
596
+    my $mailsaobject = shift;
597
+
598
+    # some boilerplate...
599
+    $class = ref($class) || $class;
600
+    my $self = $class->SUPER::new($mailsaobject, $language);
601
+    bless ($self, $class);
602
+
603
+    return $self;
604
+  }
605
+
606
+  # tokenize function
607
+  sub tokenize {
608
+    my $self = shift;
609
+    my $text_array_ref = shift;
610
+
611
+    ......
612
+
613
+    return $tokenized_array_ref;
614
+  }
615
+
616
+
617
+=head1 DESCRIPTION
618
+
619
+This plugin is the base class of tokenizer plugin.
620
+You must define tokenize() and $language
621
+
622
+=head1 INTERFACE
623
+
624
+  sub tokenize {
625
+    my $self = shift;
626
+    my $text_array_ref = shift;
627
+
628
+    ......
629
+ 
630
+    return $tokenized_array_ref;
631
+  }
632
+
633
+=cut
634
+
635
+package Mail::SpamAssassin::Plugin::Tokenizer;
636
+
637
+use Mail::SpamAssassin::Plugin;
638
+use Mail::SpamAssassin::Logger;
639
+use strict;
640
+use warnings;
641
+use bytes;
642
+
643
+use vars qw(@ISA);
644
+@ISA = qw(Mail::SpamAssassin::Plugin);
645
+
646
+sub new {
647
+  my $class = shift;
648
+  my $mailsaobject = shift;
649
+  my $language = shift;
650
+
651
+  # some boilerplate...
652
+  $class = ref($class) || $class;
653
+  my $self = $class->SUPER::new($mailsaobject);
654
+  bless ($self, $class);
655
+
656
+  if ($language) {
657
+    $self->{main}->{conf}->{tokenizer}->{$language} = $self;
658
+  }
659
+  else {
660
+    dbg("plugin: $self: \$language is not defined");
661
+  }
662
+
663
+  return $self;
664
+}
665
+
666
+sub tokenize {
667
+  my ($self, $ref) = @_;
668
+
669
+  return $ref;
670
+}
671
+
672
+1;
673
+
674
diff -uNr lib/Mail/SpamAssassin/Util/Charset.pm lib/Mail/SpamAssassin/Util/Charset.pm
675
--- lib/Mail/SpamAssassin/Util/Charset.pm	1970-01-01 09:00:00.000000000 +0900
676
+++ lib/Mail/SpamAssassin/Util/Charset.pm	2011-07-14 22:29:19.000000000 +0900
677
@@ -0,0 +1,471 @@
678
+# <@LICENSE>
679
+# Copyright 2006 Apache Software Foundation
680
+#
681
+# Licensed under the Apache License, Version 2.0 (the "License");
682
+# you may not use this file except in compliance with the License.
683
+# You may obtain a copy of the License at
684
+#
685
+#     http://www.apache.org/licenses/LICENSE-2.0
686
+#
687
+# Unless required by applicable law or agreed to in writing, software
688
+# distributed under the License is distributed on an "AS IS" BASIS,
689
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
690
+# See the License for the specific language governing permissions and
691
+# limitations under the License.
692
+# </@LICENSE>
693
+
694
+
695
+=head1 NAME
696
+
697
+  Mail::SpamAssassin::Util::Charset.pm - Utility for charset and language
698
+
699
+=head1 SYNOPSIS
700
+
701
+  my ($decoded, $detected) = Mail::SpamAssassin::Util::Charset::normalize_charset($str, $charset);
702
+  my $language = Mail::SpamAssassin::Util::Charset::get_language($str, $charset);
703
+
704
+=head1 DESCRIPTION
705
+
706
+This module implements utility methods for charset and language.
707
+
708
+=cut
709
+
710
+package Mail::SpamAssassin::Util::Charset;
711
+
712
+use strict;
713
+use warnings;
714
+use Encode;
715
+use Encode::Guess;
716
+use Encode::Alias;
717
+
718
+use vars qw (
719
+  @ISA @EXPORT
720
+);
721
+
722
+require Exporter;
723
+
724
+@ISA = qw(Exporter);
725
+@EXPORT = qw(normalize_charset get_language);
726
+
727
+###########################################################################
728
+
729
+use constant HAS_ENCODE_DETECT => eval { require Encode::Detect::Detector; };
730
+use constant HAS_ENCODE_HANEXTRA => eval { require Encode::HanExtra; };
731
+use constant HAS_ENCODE_EUCJPMS => eval { require Encode::EUCJPMS; };
732
+
733
+###########################################################################
734
+
735
+our $KANA_HAN_RE = qr{
736
+  # Hiragana and Katakana
737
+    \xE3[\x81-\x83][\x80-\xBF]
738
+  # Han
739
+  | \xE3[\x90-\xBF][\x80-\xBF]
740
+  | [\xE4-\xE9][\x80-\xBF]{2}
741
+  | \xEF[\xA4-\xAB][\x80-\xBF]
742
+}x;
743
+
744
+our %enc2lang;
745
+our %lang2enc;
746
+our %scr2lang;
747
+our %cjkscr2lang;
748
+our @scrorder;
749
+
750
+BEGIN {
751
+
752
+  # See the following URL about this map:
753
+  #   http://czyborra.com/charsets/iso8859.html
754
+  #   http://czyborra.com/charsets/codepages.html
755
+  #   http://czyborra.com/charsets/cyrillic.html
756
+  #   http://en.wikipedia.org/wiki/ISO_8859
757
+  #   http://www.w3.org/International/O-charset-lang.html
758
+  %enc2lang = (
759
+    # buint-in Encodings and Encode::Byte
760
+    #   N. America
761
+    'ascii'         => 'en',
762
+    'cp437'         => 'en',
763
+    'cp863'         => 'weurope',
764
+
765
+    #   W. Europe (Latin1, Latin9)
766
+    #       fr es ca eu pt it sq rm nl de da sv no fi fo is ga gd en af
767
+    'iso-8859-1'    => 'weurope',
768
+    'iso-8859-15'   => 'weurope',
769
+    'cp850'         => 'weurope',
770
+    'cp860'         => 'weurope',
771
+    'cp1252'        => 'weurope',
772
+    'MacRoman'      => 'weurope',
773
+
774
+    #   Cntrl. Europe / Latin2 / Latin10
775
+    #       hr cs hu pl sr sk sl
776
+    'iso-8859-2'    => 'ceurope',
777
+    'cp852'         => 'ceurope',
778
+    'cp1250'        => 'ceurope',
779
+    'MacCentralEurRoman' => 'ceurope',
780
+    'MacCroatian'   => 'ceurope',
781
+    'iso-8859-16'   => 'ceurope',
782
+    'MacRomanian'   => 'ceurope',
783
+
784
+    #   Latin3 (Esperanto, Maltese, and Turkish. Turkish is now on 8859-9.)
785
+    #       eo mt
786
+    'iso-8859-3'    => 'seurope',
787
+
788
+    #   Baltics (Latin4, Latin7)
789
+    #       lv lt
790
+    'iso-8859-4'    => 'neurope',
791
+    'iso-8859-13'   => 'baltic',
792
+    'cp1257'        => 'baltic',
793
+
794
+    #   Nordics (Latin6)
795
+    #       et kl iu se
796
+    'iso-8859-10'   => 'nordic',
797
+
798
+    #   Cyrillics
799
+    #       bg be uk sr mk ru
800
+    'iso-8859-5'    => 'ru',
801
+    'cp855'         => 'ru',
802
+    'cp1251'        => 'ru',
803
+    'cp866'         => 'ru',
804
+    'MacCyrillic'   => 'ru',
805
+    'koi8-r'        => 'ru',
806
+    'MacUkrainian'  => 'uk',
807
+    'koi8-u'        => 'uk',
808
+
809
+    #   Arabic
810
+    'iso-8859-6'    => 'ar',
811
+    'cp864'         => 'ar',
812
+    'cp1256'        => 'ar',
813
+    'MacArabic'     => 'ar',
814
+    'cp1006'        => 'fa',
815
+    'MacFarsi'      => 'fa',
816
+
817
+    #   Greek
818
+    'iso-8859-7'    => 'el',
819
+    'cp1253'        => 'el',
820
+    'MacGreek'      => 'el',
821
+
822
+    #   Hebrew
823
+    #       he yi
824
+    'iso-8859-8'    => 'he',
825
+    'cp862'         => 'he',
826
+    'cp1255'        => 'he',
827
+    'MacHebrew'     => 'he',
828
+
829
+    #   Turkish
830
+    'iso-8859-9'    => 'tr',
831
+    'cp857'         => 'tr',
832
+    'cp1254'        => 'tr',
833
+    'MacTurkish'    => 'tr',
834
+
835
+    #   Thai
836
+    'iso-8859-11'   => 'th',
837
+    'cp874'         => 'th',
838
+
839
+    #   Celtics (Latin8)
840
+    #       gd cy br
841
+    'iso-8859-14'   => 'celtic',
842
+
843
+    #   Vietnamese
844
+    'viscii'        => 'vi',
845
+    'cp1258'        => 'vi',
846
+
847
+    # Encode::CN
848
+    'euc-cn'        => 'zh',
849
+    'cp936'         => 'zh',
850
+    'hz'            => 'zh',
851
+
852
+    # Encode::TW
853
+    'big5-eten'     => 'zh',
854
+    'big5-hkscs'    => 'zh',
855
+    'cp950'         => 'zh',
856
+
857
+    # Encode::JP
858
+    'euc-jp'        => 'ja',
859
+    'shiftjis'      => 'ja',
860
+    '7bit-jis'      => 'ja',
861
+    'iso-2022-jp'   => 'ja',
862
+    'iso-2022-jp-1' => 'ja',
863
+    'cp932'         => 'ja',
864
+
865
+    # Encode::KR
866
+    'euc-kr'        => 'ko',
867
+    'cp949'         => 'ko',
868
+    'johab'         => 'ko',
869
+    'iso-2022-kr'   => 'ko',
870
+
871
+    # Encode::HanExtra
872
+    'euc-tw'        => 'zh',
873
+    'gb18030'       => 'zh',
874
+
875
+    # Encode::JIS2K
876
+    'euc-jisx0213'  => 'ja',
877
+    'shiftjisx0123' => 'ja',
878
+    'iso-2022-jp-3' => 'ja',
879
+
880
+    # Encode::EUCJPMS
881
+    'eucJP-ms'      => 'ja',
882
+    'cp51932'       => 'ja',
883
+    'cp50220'       => 'ja',
884
+    'cp50221'       => 'ja',
885
+
886
+  );
887
+
888
+  %lang2enc = (
889
+    # Latin1
890
+    'en' => ['ascii'],
891
+    'weurope' => ['cp1252'],
892
+
893
+    # Latin2
894
+    'ceurope' => ['cp1250'],
895
+
896
+    # Latin3
897
+    'seurope' => ['iso-8859-3'],
898
+
899
+    # Latin4
900
+    'neurope' => ['iso-8859-4'],
901
+
902
+    # Latin5
903
+    'tr' => ['cp1254'],
904
+
905
+    # Latin6
906
+    'nordic' => ['iso-8859-10'],
907
+
908
+    # Latin7
909
+    'baltic' => ['cp1257'],
910
+
911
+    # Latin8
912
+    'celtic' => ['iso-8859-14'],
913
+
914
+    # Non Latin
915
+    'ru' => ['koi8-r', 'cp1251'],
916
+    'uk' => ['koi8-u'],
917
+
918
+    'ar' => ['cp1256'],
919
+    'el' => ['cp1253'],
920
+    'he' => ['cp1255'],
921
+    'th' => ['cp874'],
922
+    'vi' => ['viscii', 'cp1258'],
923
+    'zh' => ['euc-cn', 'cp950'],
924
+    'ja' => ['euc-jp', 'cp932'],
925
+    'ko' => ['euc-kr', 'cp949'],
926
+
927
+  );
928
+
929
+  %scr2lang = (
930
+    'InLatin1Supplement' => ['weurope'],
931
+    'InLatinExtendedA' => [
932
+      'ceurope',
933
+      'seurope',
934
+      'tr',
935
+      'vi'
936
+    ],
937
+    'InLatinExtendedB' => [
938
+      'nordic',
939
+      'baltic',
940
+      'celtic'
941
+    ],
942
+    'Thai'   => ['th'],
943
+    'Cyrillic' => ['ru', 'uk'],
944
+    'Arabic' => ['ar'],
945
+    'Greek'  => ['el'],
946
+    'Hebrew' => ['he'],
947
+  );
948
+
949
+  # better detection for CJK
950
+  @scrorder = ('Hiragana','Katakana','Hangul','Han',keys(%scr2lang));
951
+  %cjkscr2lang = (
952
+    'Hiragana' => ['ja'],
953
+    'Katakana' => ['ja'],
954
+    'Hangul' => ['ko'],
955
+    'Han'    => ['zh', 'ja', 'ko'],
956
+  );
957
+
958
+  unless (HAS_ENCODE_HANEXTRA) {
959
+    Encode::Alias::define_alias( qr/^gb18030$/i => ' "euc-cn"' );
960
+  }
961
+  Encode::Alias::define_alias( qr/^unicode-1-1-(.+)$/i => ' "$1"' );
962
+  Encode::Alias::define_alias( qr/^TIS-620$/i => ' "iso-8859-11"' );
963
+  Encode::Alias::define_alias( qr/^x-mac-(.+)$/i => ' "Mac$1"' );
964
+  Encode::Alias::define_alias( qr/^Shift_JIS$/i => ' "cp932"' );
965
+  if (HAS_ENCODE_EUCJPMS) {
966
+    Encode::Alias::define_alias( qr/^iso-2022-jp$/i => ' "cp50221"' );
967
+  }
968
+}
969
+
970
+sub get_language {
971
+  my $str = shift; # $str must be UTF-8 encoding
972
+  my $charset = shift;
973
+
974
+  return 'en' unless $charset;
975
+  if ($charset !~ /^utf/i) {
976
+    return $enc2lang{$charset};
977
+  } elsif (defined($str)) {
978
+    $str =~ s/[\x00-\x7F]//g; # remove ASCII characters
979
+    return 'en' if ($str eq '');
980
+
981
+    my %handled;
982
+    $str = Encode::decode_utf8($str) unless (Encode::is_utf8($str));
983
+    foreach my $scr (@scrorder) {
984
+      next if ($str !~ /\p{$scr}/);
985
+      my $scrlangs = exists($cjkscr2lang{$scr}) ? $cjkscr2lang{$scr} : $scr2lang{$scr};
986
+      foreach my $lang (@$scrlangs) {
987
+        next if (exists($handled{$lang}));
988
+        foreach my $enc (@{$lang2enc{$lang}}) {
989
+          my $scratch = $str;
990
+          Encode::encode($enc, $scratch, Encode::FB_QUIET);
991
+          return $lang if ($scratch eq '');
992
+        }
993
+        $handled{$lang} = 1;
994
+      }
995
+    }
996
+  } 
997
+  return 'en';
998
+}
999
+
1000
+# TEST 1: try conversion to use the specified charset. 
1001
+# TEST 2: try conversion to use Encode::Detect.
1002
+# TEST 3: try conversion to use Encode::Guess.
1003
+sub normalize_charset {
1004
+  my $str = shift;
1005
+  my $charset = shift;
1006
+
1007
+  return wantarray ? ($str, 'ascii')  : $str unless ($str);
1008
+
1009
+  my $decoded;
1010
+  my $detected;
1011
+
1012
+  if ($charset) {
1013
+    ($decoded, $detected) = _specified_encoding($str, $charset);
1014
+  }
1015
+  unless ($detected) {
1016
+    ($decoded, $detected) = _encode_detect($str);
1017
+  }
1018
+  unless ($detected) {
1019
+    ($decoded, $detected) = _encode_guess($str);
1020
+  }
1021
+  unless ($detected) {
1022
+    return ($str, undef);
1023
+  }
1024
+  $decoded =~ s/^\x{feff}//g;
1025
+  $decoded = Encode::encode_utf8($decoded);
1026
+
1027
+  # unfold hiragana, katakana and han
1028
+  if ($detected =~ /^(?:UTF|EUC|BIG5|GB|SHIFTJIS|ISO-2022|CP969$|CP932$|CP949|CP50221$)/i) {
1029
+    $decoded =~ s/($KANA_HAN_RE)\012($KANA_HAN_RE)/$1$2/og;
1030
+  }
1031
+  return wantarray ? ($decoded, $detected) : $decoded;
1032
+}
1033
+
1034
+sub _specified_encoding {
1035
+  my $str = shift;
1036
+  my $encoding = shift;
1037
+
1038
+  my $detected;
1039
+  my $decoded;
1040
+
1041
+  return (undef, undef) unless ($encoding);
1042
+
1043
+  # note: ISO-2022-* is not deistinguish from US-ASCII
1044
+  return (undef, undef) if ($str =~ /\e/ and $encoding !~ /^ISO-2022/i);
1045
+
1046
+  # UTF-16|32 encoding without BOM cannot be trusted.
1047
+  return (undef, undef) if ($encoding =~ /^UTF-32$/i and $str !~ /^(?:\xFF\xFE\x00\x00|\x00\x00\xFE\xFF)/);
1048
+  return (undef, undef) if ($encoding =~ /^UTF-16$/i and $str !~ /^(?:\xFF\xFE|\xFE\xFF)/);
1049
+
1050
+  #$encoding = _get_alias($encoding);
1051
+  my $encoder = Encode::find_encoding($encoding);
1052
+  if (ref($encoder)) {
1053
+    $decoded = $encoder->decode($str,Encode::FB_QUIET);
1054
+    $detected = $encoder->name if ($str eq '');
1055
+  }
1056
+  return ($decoded, $detected);
1057
+}
1058
+
1059
+sub _encode_detect {
1060
+  return undef unless HAS_ENCODE_DETECT;
1061
+  my $str = shift;
1062
+
1063
+  # UTF-16|32 encoding without BOM cannot be trusted.
1064
+  return (undef, undef) if ($str =~ /\x00\x00/ and $str !~ /^(?:\xFF\xFE\x00\x00|\x00\x00\xFE\xFF)/);
1065
+  return (undef, undef) if ($str =~ /\x00/ and $str !~ /^(?:\xFF\xFE|\xFE\xFF)/);
1066
+
1067
+  my $decoded;
1068
+  my $detected = Encode::Detect::Detector::detect($str);
1069
+  if ($detected) {
1070
+    $detected = _get_alias($detected);
1071
+    my $encoder = Encode::find_encoding($detected);
1072
+    if (ref($encoder)) {
1073
+      $decoded = $encoder->decode($str);
1074
+      $detected = $decoded ? $encoder->name : undef;
1075
+    }
1076
+    else {
1077
+      $detected = undef;
1078
+    }
1079
+  }
1080
+  return ($decoded, $detected);
1081
+}
1082
+
1083
+sub _encode_guess {
1084
+  my $str = shift;
1085
+
1086
+  my $detected;
1087
+  my $decoded;
1088
+  my $encoder;
1089
+
1090
+  # Step 1: Examine ISO-2022-*.
1091
+  if ($str =~ /\e/) {
1092
+    $Encode::Guess::NoUTFAutoGuess = 1;
1093
+    $encoder = Encode::Guess::guess_encoding($str,
1094
+        qw/cp50221 7bit-jis iso-2022-kr/);
1095
+    $Encode::Guess::NoUTFAutoGuess = 0;
1096
+  }
1097
+
1098
+  # Step 2: Examine US-ASCII/UTF-(8|16|32)
1099
+  unless (ref($encoder)) {
1100
+    $Encode::Guess::NoUTFAutoGuess = 0;
1101
+    $encoder = Encode::Guess::guess_encoding($str);
1102
+  }
1103
+
1104
+  # Step 3: Examine other encodings
1105
+  unless (ref($encoder)) {
1106
+    $Encode::Guess::NoUTFAutoGuess = 1;
1107
+    eval {
1108
+      if ($str =~ /[\x80-\xFF]{4}/) {
1109
+        $encoder = Encode::Guess::guess_encoding($str,
1110
+          qw/euc-cn big5-eten euc-jp cp932 euc-kr cp949/);
1111
+      }
1112
+      else {
1113
+        $encoder = Encode::Guess::guess_encoding($str,
1114
+          qw/iso-8859-1 cp1252/);
1115
+      }
1116
+    };
1117
+    $Encode::Guess::NoUTFAutoGuess = 0;
1118
+  }
1119
+  if (ref($encoder)) {
1120
+    $detected = $encoder->name;
1121
+    if ($detected) {
1122
+      $decoded = $encoder->decode($str);
1123
+    }
1124
+  }
1125
+  return ($decoded, $detected);
1126
+}
1127
+
1128
+sub _get_alias {
1129
+  my $encoding = shift;
1130
+
1131
+  unless (HAS_ENCODE_HANEXTRA) {
1132
+    $encoding =~ s/^gb18030$/euc-cn/i;
1133
+  }
1134
+  $encoding =~ s/^unicode-1-1-(.+)$/$1/i;
1135
+  $encoding =~ s/^TIS-620$/iso-8859-11/i;
1136
+  $encoding =~ s/x-mac-(.+)$/Mac$1/i;
1137
+  $encoding =~ s/^Shift_JIS$/cp932/i;
1138
+  if (HAS_ENCODE_EUCJPMS) {
1139
+    $encoding =~ s/^iso-2022-jp$/cp50221/i;
1140
+    $encoding =~ s/^euc-jp$/cp51932/i;
1141
+  }
1142
+
1143
+  return $encoding;
1144
+}
1145
+
1146
+
1147
+1;
1148
+
(-)./files/spamassassin-3.3.2-ja-1.plist (-12 lines)
Lines 1-12 Link Here
1
%%SITE_PERL%%/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm
2
%%SITE_PERL%%/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm
3
@dirrm %%SITE_PERL%%/Mail/SpamAssassin/Plugin/Tokenizer
4
%%SITE_PERL%%/Mail/SpamAssassin/Plugin/Tokenizer.pm
5
%%SITE_PERL%%/Mail/SpamAssassin/Util/Charset.pm
6
%%PERL5_MAN3%%/Mail::SpamAssassin::Plugin::Tokenizer.3.gz
7
%%PERL5_MAN3%%/Mail::SpamAssassin::Plugin::Tokenizer::MeCab.3.gz
8
%%PERL5_MAN3%%/Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA.3.gz
9
%%PERL5_MAN3%%/Mail::SpamAssassin::Util::Charset.3.gz
10
@unexec if cmp -s %D/%%ETCDIR%%/%%TOKENIZER_PRE%%.sample %D/%%ETCDIR%%/%%TOKENIZER_PRE%%; then rm -f %D/%%ETCDIR%%/%%TOKENIZER_PRE%%; fi
11
%%ETCDIR%%/%%TOKENIZER_PRE%%.sample
12
@exec if [ ! -f %B/%%TOKENIZER_PRE%% ]; then cp -p %B/%f %B/%%TOKENIZER_PRE%%; fi
(-)./files/spamassassin-3.4.0-ja.patch (+1143 lines)
Line 0 Link Here
1
--- lib/Mail/SpamAssassin/HTML.pm.orig	2014-02-07 17:36:28.000000000 +0900
2
+++ lib/Mail/SpamAssassin/HTML.pm	2014-03-04 11:18:44.000000000 +0900
3
@@ -86,7 +86,7 @@
4
 $ok_attributes{div}{$_} = 1 for qw( style );
5
 
6
 sub new {
7
-  my ($class) = @_;
8
+  my ($class, $opts) = @_;
9
   my $self = $class->SUPER::new(
10
 		api_version => 3,
11
 		handlers => [
12
@@ -99,6 +99,7 @@
13
 			declaration => ["html_declaration", "self,text"],
14
 		],
15
 		marked_sections => 1);
16
+  $self->{normalize} = $opts->{'normalize'} || 0;
17
 
18
   $self;
19
 }
20
@@ -681,7 +682,14 @@
21
     }
22
   }
23
   else {
24
-    $text =~ s/[ \t\n\r\f\x0b\xa0]+/ /g;
25
+    if ($self->{normalize}) {
26
+      $text =~ s/\xc2\xa0/ /g;           # no-break space
27
+      $text =~ s/\xe3\x80\x80/ /g;       # ideographicspace
28
+      $text =~ s/[ \t\n\r\f\x0b]+/ /g;
29
+    }
30
+    else {
31
+      $text =~ s/[ \t\n\r\f\x0b\xa0]+/ /g;
32
+    }
33
     # trim leading whitespace if previous element was whitespace 
34
     # and current element is not invisible
35
     if (@{ $self->{text} } && !$display{invisible} &&
36
--- lib/Mail/SpamAssassin/Message/Node.pm.orig	2014-02-07 17:36:23.000000000 +0900
37
+++ lib/Mail/SpamAssassin/Message/Node.pm	2014-03-04 11:22:38.000000000 +0900
38
@@ -42,6 +42,7 @@
39
 use Mail::SpamAssassin::Constants qw(:sa);
40
 use Mail::SpamAssassin::HTML;
41
 use Mail::SpamAssassin::Logger;
42
+use Mail::SpamAssassin::Util::Charset;
43
 
44
 =item new()
45
 
46
@@ -385,27 +386,10 @@
47
 
48
 sub _normalize {
49
   my ($self, $data, $charset) = @_;
50
-  return $data unless $self->{normalize};
51
+  return wantarray ? ($data, $charset) : $data unless $self->{normalize};
52
 
53
-  my $detected = Encode::Detect::Detector::detect($data);
54
-
55
-  my $converter;
56
-
57
-  if ($charset && $charset !~ /^us-ascii$/i &&
58
-      ($detected || 'none') !~ /^(?:UTF|EUC|ISO-2022|Shift_JIS|Big5|GB)/i) {
59
-      dbg("message: Using labeled charset $charset");
60
-      $converter = Encode::find_encoding($charset);
61
-  }
62
-
63
-  $converter = Encode::find_encoding($detected) unless $converter || !defined($detected);
64
-
65
-  return $data unless $converter;
66
-
67
-  dbg("message: Converting...");
68
-
69
-  my $rv = $converter->decode($data, 0);
70
-  utf8::downgrade($rv, 1);
71
-  return $rv
72
+  my ($decoded_data, $detected_charset) = normalize_charset($data, $charset);
73
+  return wantarray ? ($decoded_data, $detected_charset) : $decoded_data;
74
 }
75
 
76
 =item rendered()
77
@@ -428,8 +412,12 @@
78
     # text/x-aol is ignored here, but looks like text/html ...
79
     return(undef,undef) unless ( $self->{'type'} =~ /^text\/(?:plain|html)$/i );
80
 
81
-    my $text = $self->_normalize($self->decode(), $self->{charset});
82
+    my ($text, $charset) = $self->_normalize($self->decode(), $self->{charset});
83
     my $raw = length($text);
84
+    if ($self->{normalize}) {
85
+      $self->{charset} = $charset;
86
+      $self->{language} = get_language($text, $charset);
87
+    }
88
 
89
     # render text/html always, or any other text|text/plain part as text/html
90
     # based on a heuristic which simulates a certain common mail client
91
@@ -439,7 +427,7 @@
92
     {
93
       $self->{rendered_type} = 'text/html';
94
 
95
-      my $html = Mail::SpamAssassin::HTML->new();	# object
96
+      my $html = Mail::SpamAssassin::HTML->new({normalize=>$self->{normalize}});       # object
97
       $html->parse($text);				# parse+render text
98
       $self->{rendered} = $html->get_rendered_text();
99
       $self->{visible_rendered} = $html->get_rendered_text(invisible => 0);
100
--- lib/Mail/SpamAssassin/Message.pm.orig	2014-02-07 17:36:28.000000000 +0900
101
+++ lib/Mail/SpamAssassin/Message.pm	2014-03-04 11:27:31.000000000 +0900
102
@@ -604,6 +604,8 @@
103
   delete $self->{'pristine_headers'};
104
   delete $self->{'line_ending'};
105
   delete $self->{'missing_head_body_separator'};
106
+  delete $self->{'charset'};
107
+  delete $self->{'language'};
108
 
109
   my @toclean = ( $self );
110
 
111
@@ -630,6 +632,8 @@
112
     delete $part->{'invisible_rendered'};
113
     delete $part->{'type'};
114
     delete $part->{'rendered_type'};
115
+    delete $self->{'charset'};
116
+    delete $self->{'language'};
117
 
118
     # if there are children nodes, add them to the queue of nodes to clean up
119
     if (exists $part->{'body_parts'}) {
120
@@ -1085,7 +1089,14 @@
121
 
122
   # whitespace handling (warning: small changes have large effects!)
123
   $text =~ s/\n+\s*\n+/\f/gs;		# double newlines => form feed
124
-  $text =~ tr/ \t\n\r\x0b\xa0/ /s;	# whitespace => space
125
+  if ($self->{normalize}) {
126
+    $text =~ s/\xc2\xa0/ /g;		# no-break space => space
127
+    $text =~ s/\xe3\x80\x80/ /g;	# ideographicspace => space
128
+    $text =~ tr/ \t\n\r\x0b/ /s;	# whitespace => space
129
+  }
130
+  else {
131
+    $text =~ tr/ \t\n\r\x0b\xa0/ /s;	# whitespace => space
132
+  }
133
   $text =~ tr/\f/\n/;			# form feeds => newline
134
   
135
   # warn "message: $text";
136
@@ -1142,7 +1153,14 @@
137
 
138
   # whitespace handling (warning: small changes have large effects!)
139
   $text =~ s/\n+\s*\n+/\f/gs;		# double newlines => form feed
140
-  $text =~ tr/ \t\n\r\x0b\xa0/ /s;	# whitespace => space
141
+  if ($self->{normalize}) {
142
+    $text =~ s/\xc2\xa0/ /g;		# no-break space => space
143
+    $text =~ s/\xe3\x80\x80/ /g;	# ideographicspace => space
144
+    $text =~ tr/ \t\n\r\x0b/ /s;	# whitespace => space
145
+  }
146
+  else {
147
+    $text =~ tr/ \t\n\r\x0b\xa0/ /s;	# whitespace => space
148
+  }
149
   $text =~ tr/\f/\n/;			# form feeds => newline
150
 
151
   my @textary = split_into_array_of_short_lines ($text);
152
@@ -1193,7 +1211,14 @@
153
 
154
   # whitespace handling (warning: small changes have large effects!)
155
   $text =~ s/\n+\s*\n+/\f/gs;		# double newlines => form feed
156
-  $text =~ tr/ \t\n\r\x0b\xa0/ /s;	# whitespace => space
157
+  if ($self->{normalize}) {
158
+    $text =~ s/\xc2\xa0/ /g;		# no-break space => space
159
+    $text =~ s/\xe3\x80\x80/ /g;	# ideographicspace => space
160
+    $text =~ tr/ \t\n\r\x0b/ /s;	# whitespace => space
161
+  }
162
+  else {
163
+    $text =~ tr/ \t\n\r\x0b\xa0/ /s;	# whitespace => space
164
+  }
165
   $text =~ tr/\f/\n/;			# form feeds => newline
166
 
167
   my @textary = split_into_array_of_short_lines ($text);
168
@@ -1269,6 +1294,28 @@
169
 
170
 # ---------------------------------------------------------------------------
171
 
172
+sub get_language {
173
+  my ($self) = @_;
174
+
175
+  if (defined $self->{language}) { return $self->{language}; }
176
+  my @parts = $self->find_parts(qr/^(?:text|message)\b/i,1);
177
+  return '' unless @parts;
178
+
179
+  # Go through each part
180
+  my @langs;
181
+  for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) {
182
+    my $p = $parts[$pt];
183
+    my $lang = $p->{language};
184
+    next unless ($lang);
185
+    push(@langs, $lang) unless (grep(/^$lang$/, @langs))
186
+  }
187
+  $self->{language} = scalar(@langs) ? join(' ', @langs) : '';
188
+  return $self->{language};
189
+}
190
+
191
+# ---------------------------------------------------------------------------
192
+
193
+
194
 1;
195
 
196
 =back
197
--- lib/Mail/SpamAssassin/PerMsgStatus.pm.orig	2014-02-07 17:36:28.000000000 +0900
198
+++ lib/Mail/SpamAssassin/PerMsgStatus.pm	2014-03-04 11:30:25.000000000 +0900
199
@@ -53,6 +53,7 @@
200
 use warnings;
201
 use re 'taint';
202
 
203
+use Encode;
204
 use Errno qw(ENOENT);
205
 use Time::HiRes qw(time);
206
 
207
@@ -996,19 +997,41 @@
208
 
209
   # the report charset
210
   my $report_charset = "; charset=iso-8859-1";
211
-  if ($self->{conf}->{report_charset}) {
212
-    $report_charset = "; charset=" . $self->{conf}->{report_charset};
213
-  }
214
 
215
   # the SpamAssassin report
216
   my $report = $self->get_report();
217
+  if ($self->{conf}->{report_charset}) {
218
+    $report_charset = "; charset=" . $self->{conf}->{report_charset};
219
+  }
220
 
221
   # If there are any wide characters, need to MIME-encode in UTF-8
222
   # TODO: If $report_charset is something other than iso-8859-1/us-ascii, then
223
   # we could try converting to that charset if possible
224
-  unless ($] < 5.008 || utf8::downgrade($report, 1)) {
225
+  my $is_utf8 = 0;
226
+  if ($self->{conf}->{normalize_charset}) {
227
+    $report = Encode::decode_utf8($report);
228
+    $is_utf8 = 1;
229
+  }
230
+  else {
231
+    if ($self->{msg}->{charset}) {
232
+      eval {
233
+        my $scratch = $report;
234
+        $report = Encode::decode($self->{msg}->{charset},$scratch,Encode::FB_CROAK);
235
+        $is_utf8 = 1;
236
+      };
237
+    }
238
+  }
239
+  if ($is_utf8) {
240
+    $is_utf8 = 1;
241
+    eval {
242
+      my $scratch = $report;
243
+      $report = Encode::encode($self->{conf}->{report_charset},$scratch,Encode::FB_CROAK);
244
+      $is_utf8 = 0;
245
+    };
246
+    if ($is_utf8) {
247
+      $report = Encode::encode_utf8($report);
248
       $report_charset = "; charset=utf-8";
249
-      utf8::encode($report);
250
+    }
251
   }
252
 
253
   # get original headers, "pristine" if we can do it
254
--- lib/Mail/SpamAssassin/Plugin/Bayes.pm.orig	2014-02-07 17:36:27.000000000 +0900
255
+++ lib/Mail/SpamAssassin/Plugin/Bayes.pm	2014-03-04 11:34:46.000000000 +0900
256
@@ -223,6 +223,15 @@
257
 # will require a longer token than English ones.)
258
 use constant MAX_TOKEN_LENGTH => 15;
259
 
260
+# Skip if a token is too short.
261
+our $SKIP_UTF8_SHORT_TOKENS_RE = qr{(?:
262
+    [\x00-\x7F]                # 1 byte
263
+  | [\xC0-\xDF][\x80-\xBF]     # 2 bytes
264
+  | [\xE0-\xEF][\x80-\xBF]{2}  # 3 bytes
265
+  | [\xF0-\xF7][\x80-\xBF]{3}  # 4 bytes
266
+  | (?:\xE3[\x81-\x83][\x80-\xBF]){2} # 2 characters of Hiragana and Katakana
267
+)}x;
268
+
269
 ###########################################################################
270
 
271
 sub new {
272
@@ -1039,9 +1048,28 @@
273
   $msgdata->{bayes_token_body} = $msg->{msg}->get_visible_rendered_body_text_array();
274
   $msgdata->{bayes_token_inviz} = $msg->{msg}->get_invisible_rendered_body_text_array();
275
   @{$msgdata->{bayes_token_uris}} = $msg->get_uri_list();
276
+  if ($self->{conf}->{normalize_charset}) {
277
+    my $tokenizer = $self->get_tokenizer($msg);
278
+    if (ref($tokenizer)) {
279
+      $msgdata->{bayes_token_body} = $tokenizer->tokenize($msgdata->{bayes_token_body});
280
+      $msgdata->{bayes_token_inviz} = $tokenizer->tokenize($msgdata->{bayes_token_inviz});
281
+    }
282
+  }
283
   return $msgdata;
284
 }
285
 
286
+sub get_tokenizer {
287
+  my ($self, $msg) = @_;
288
+
289
+  my $tokenizer;
290
+  my @languages = split(/\s+/, $msg->{msg}->get_language());
291
+  foreach my $lang (@languages) {
292
+    $tokenizer = $self->{'conf'}->{'tokenizer'}->{$lang};
293
+    last if (ref($tokenizer));
294
+  }
295
+  return $tokenizer;
296
+}
297
+
298
 ###########################################################################
299
 
300
 # The calling functions expect a uniq'ed array of tokens ...
301
@@ -1095,7 +1123,7 @@
302
   # include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings,
303
   # and ISO-8859-15 alphas.  Do not split on @'s; better results keeping it.
304
   # Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!"
305
-  tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs;
306
+  tr/-A-Za-z0-9,\@\*\!_'"\$.\200-\377 / /cs;
307
 
308
   # DO split on "..." or "--" or "---"; common formatting error resulting in
309
   # hapaxes.  Keep the separator itself as a token, though, as long ones can
310
@@ -1124,6 +1152,11 @@
311
     #
312
     next if ( defined $magic_re && $token =~ /$magic_re/ );
313
 
314
+    # Skip short UTF-8 tokens.
315
+    if ($self->{conf}->{normalize_charset}) {
316
+      next if ($token =~ /^$SKIP_UTF8_SHORT_TOKENS_RE$/o);
317
+    }
318
+
319
     # *do* keep 3-byte tokens; there's some solid signs in there
320
     my $len = length($token);
321
 
322
@@ -1152,14 +1185,16 @@
323
     # the domain ".net" appeared in the To header.
324
     #
325
     if ($len > MAX_TOKEN_LENGTH && $token !~ /\*/) {
326
-      if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) {
327
-	# Matt sez: "Could be asian? Autrijus suggested doing character ngrams,
328
-	# but I'm doing tuples to keep the dbs small(er)."  Sounds like a plan
329
-	# to me! (jm)
330
-	while ($token =~ s/^(..?)//) {
331
-	  push (@rettokens, "8:$1");
332
-	}
333
-	next;
334
+      unless ($self->{conf}->{normalize_charset}) {
335
+        if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) {
336
+	  # Matt sez: "Could be asian? Autrijus suggested doing character ngrams,
337
+	  # but I'm doing tuples to keep the dbs small(er)."  Sounds like a plan
338
+	  # to me! (jm)
339
+	  while ($token =~ s/^(..?)//) {
340
+	    push (@rettokens, "8:$1");
341
+	  }
342
+	  next;
343
+        }
344
       }
345
 
346
       if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS)
347
diff -uNr /dev/null lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm
348
--- /dev/null	1970-01-01 09:00:00.000000000 +0900
349
+++ lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm	2011-07-14 22:29:19.000000000 +0900
350
@@ -0,0 +1,84 @@
351
+# <@LICENSE>
352
+# Copyright 2004 Apache Software Foundation
353
+# 
354
+# Licensed under the Apache License, Version 2.0 (the "License");
355
+# you may not use this file except in compliance with the License.
356
+# You may obtain a copy of the License at
357
+# 
358
+#     http://www.apache.org/licenses/LICENSE-2.0
359
+# 
360
+# Unless required by applicable law or agreed to in writing, software
361
+# distributed under the License is distributed on an "AS IS" BASIS,
362
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
363
+# See the License for the specific language governing permissions and
364
+# limitations under the License.
365
+# </@LICENSE>
366
+
367
+=head1 NAME
368
+
369
+Tokenizer::MeCab - Japanese tokenizer with MeCab
370
+
371
+=head1 SYNOPSIS
372
+
373
+loadplugin     Mail::SpamAssassin::Plugin::Tokenizer::MeCab
374
+
375
+=head1 DESCRIPTION
376
+
377
+This plugin tokenizes a Japanese string with MeCab that is 
378
+the morphological analysis engine. 
379
+
380
+Text::MeCab 0.12 or over is required.
381
+
382
+=cut
383
+
384
+package Mail::SpamAssassin::Plugin::Tokenizer::MeCab;
385
+
386
+use strict;
387
+use warnings;
388
+use Mail::SpamAssassin::Plugin::Tokenizer;
389
+
390
+use vars qw(@ISA);
391
+@ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer);
392
+
393
+# Have to do this so that RPM doesn't find these as required perl modules
394
+BEGIN { require MeCab; }
395
+our $language = 'ja';
396
+our $mecab = new MeCab::Tagger(-Ochasen);
397
+
398
+sub new {
399
+  my $class = shift;
400
+  my $mailsaobject = shift;
401
+
402
+  $class = ref($class) || $class;
403
+  my $self = $class->SUPER::new($mailsaobject, $language);
404
+  bless ($self, $class);
405
+
406
+  return $self;
407
+}
408
+
409
+sub tokenize {
410
+  my $self = shift;
411
+  my $text_array = shift;
412
+
413
+  my @tokenized_array;
414
+  foreach my $text (@$text_array) {
415
+    next unless ($text);
416
+    $text =~ s/([\x80-\xFF]{3,})/&_tokenize($1)/eg;
417
+    push(@tokenized_array, $text);
418
+  }
419
+  return \@tokenized_array;
420
+}
421
+
422
+sub _tokenize {
423
+  my $text = shift;
424
+
425
+  my @buf;
426
+  for (my $node = $mecab->parseToNode($text); $node->{next}; $node = $node->{next}) {
427
+    push(@buf, $node->{surface});
428
+  }
429
+  my $tokenized = join(' ', @buf) . ' ';
430
+  return $tokenized;
431
+}
432
+
433
+1;
434
+
435
diff -uNr /dev/null lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm
436
--- /dev/null	1970-01-01 09:00:00.000000000 +0900
437
+++ lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm	2011-07-14 22:29:19.000000000 +0900
438
@@ -0,0 +1,111 @@
439
+# <@LICENSE>
440
+# Copyright 2004 Apache Software Foundation
441
+# 
442
+# Licensed under the Apache License, Version 2.0 (the "License");
443
+# you may not use this file except in compliance with the License.
444
+# You may obtain a copy of the License at
445
+# 
446
+#     http://www.apache.org/licenses/LICENSE-2.0
447
+# 
448
+# Unless required by applicable law or agreed to in writing, software
449
+# distributed under the License is distributed on an "AS IS" BASIS,
450
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
451
+# See the License for the specific language governing permissions and
452
+# limitations under the License.
453
+# </@LICENSE>
454
+
455
+=head1 NAME
456
+
457
+Tokenizer::SimpleJA - simple Japanese tokenizer
458
+
459
+=head1 SYNOPSIS
460
+
461
+loadplugin     Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA
462
+
463
+=head1 DESCRIPTION
464
+
465
+This plugin simply tokenizes a Japanese string by characters other than 
466
+the alphabet, the Chinese character, and the katakana. 
467
+
468
+=cut
469
+
470
+package Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA;
471
+
472
+use strict;
473
+use warnings;
474
+use Mail::SpamAssassin::Plugin::Tokenizer;
475
+
476
+use vars qw(@ISA);
477
+@ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer);
478
+
479
+our $language = 'ja';
480
+
481
+our $RE = qr{(
482
+  # Hiragana
483
+    (?:
484
+        \xE3\x81[\x80-\xBF]
485
+      | \xE3\x82[\x80-\x9F]
486
+    )+
487
+  # Katakana
488
+  | (?:
489
+        \xE3\x82[\xA0-\xBF]
490
+      | \xE3\x83[\x80-\xBF]
491
+    )+
492
+  # Kanji
493
+  | (?:
494
+        \xE3[\x90-\xBF][\x80-\xBF]
495
+      | [\xE4-\xE9][\x80-\xBF]{2}
496
+      | \xEF[\xA4-\xAB][\x80-\xBF]
497
+    )+
498
+  # Fullwidth
499
+  | (?:
500
+        \xEF\xBC[\x80-\xBF]
501
+      | \xEF\xBD[\x80-\x9F]
502
+    )+
503
+  # Others
504
+  | [\xC0-\xDF][\x80-\xBF]
505
+  | [\xE0-\xE2][\x80-\xBF]{2}
506
+  | \xE3\x80[\x80-\xBF]
507
+  | \xE3[\x84-\x8F][\x80-\xBF]
508
+  | [\xEA-\xEE][\x80-\xBF]{2}
509
+  | \xEF[\x80-\xA3][\x80-\xBF]
510
+  | \xEF[\xAC-\xBB][\x80-\xBF]
511
+  | \xEF\xBD[\xA0-\xBF]
512
+  | \xEF[\xBE-\xBF][\x80-\xBF]
513
+  | [\xF0-\xF7][\x80-\xBF]{3}
514
+)}x;
515
+
516
+sub new {
517
+  my $class = shift;
518
+  my $mailsaobject = shift;
519
+
520
+  $class = ref($class) || $class;
521
+  my $self = $class->SUPER::new($mailsaobject, $language);
522
+  bless ($self, $class);
523
+
524
+  return $self;
525
+}
526
+
527
+sub tokenize {
528
+  my $self = shift;
529
+  my $text_array = shift;
530
+
531
+  my @tokenized_array;
532
+  foreach my $text (@$text_array) {
533
+    next unless ($text);
534
+    $text =~ s/([\x80-\xFF]{3,})/&_tokenize($1)/eg;
535
+    push(@tokenized_array, $text);
536
+  }
537
+  return \@tokenized_array;
538
+}
539
+
540
+sub _tokenize {
541
+  my $text = shift;
542
+
543
+  $text =~ s/$RE/$1 /og;
544
+  $text = ' ' . $text;
545
+  return $text;
546
+}
547
+
548
+1;
549
+
550
diff -uNr /dev/null lib/Mail/SpamAssassin/Plugin/Tokenizer.pm
551
--- /dev/null	1970-01-01 09:00:00.000000000 +0900
552
+++ lib/Mail/SpamAssassin/Plugin/Tokenizer.pm	2011-07-14 22:35:46.000000000 +0900
553
@@ -0,0 +1,115 @@
554
+# <@LICENSE>
555
+# Copyright 2004 Apache Software Foundation
556
+# 
557
+# Licensed under the Apache License, Version 2.0 (the "License");
558
+# you may not use this file except in compliance with the License.
559
+# You may obtain a copy of the License at
560
+# 
561
+#     http://www.apache.org/licenses/LICENSE-2.0
562
+# 
563
+# Unless required by applicable law or agreed to in writing, software
564
+# distributed under the License is distributed on an "AS IS" BASIS,
565
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
566
+# See the License for the specific language governing permissions and
567
+# limitations under the License.
568
+# </@LICENSE>
569
+
570
+=head1 NAME
571
+
572
+Mail::SpamAssassin::Plugin::Tokenizer - Tokenizer plugin base class
573
+
574
+=head1 SYNOPSIS
575
+
576
+=head2 SpamAssassin configuration:
577
+
578
+  loadplugin MyTokenizerPlugin /path/to/MyTokenizerPlugin.pm
579
+
580
+=head2 Perl code:
581
+
582
+  use Mail::SpamAssassin::Plugin::Tokenizer;
583
+  use vars qw(@ISA);
584
+  @ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer);
585
+  # language to use this plugin
586
+  our $language = 'ja';
587
+
588
+  # constructor: register language
589
+  sub new {
590
+    my $class = shift;
591
+    my $mailsaobject = shift;
592
+
593
+    # some boilerplate...
594
+    $class = ref($class) || $class;
595
+    my $self = $class->SUPER::new($mailsaobject, $language);
596
+    bless ($self, $class);
597
+
598
+    return $self;
599
+  }
600
+
601
+  # tokenize function
602
+  sub tokenize {
603
+    my $self = shift;
604
+    my $text_array_ref = shift;
605
+
606
+    ......
607
+
608
+    return $tokenized_array_ref;
609
+  }
610
+
611
+
612
+=head1 DESCRIPTION
613
+
614
+This plugin is the base class of tokenizer plugin.
615
+You must define tokenize() and $language
616
+
617
+=head1 INTERFACE
618
+
619
+  sub tokenize {
620
+    my $self = shift;
621
+    my $text_array_ref = shift;
622
+
623
+    ......
624
+ 
625
+    return $tokenized_array_ref;
626
+  }
627
+
628
+=cut
629
+
630
+package Mail::SpamAssassin::Plugin::Tokenizer;
631
+
632
+use Mail::SpamAssassin::Plugin;
633
+use Mail::SpamAssassin::Logger;
634
+use strict;
635
+use warnings;
636
+use bytes;
637
+
638
+use vars qw(@ISA);
639
+@ISA = qw(Mail::SpamAssassin::Plugin);
640
+
641
+sub new {
642
+  my $class = shift;
643
+  my $mailsaobject = shift;
644
+  my $language = shift;
645
+
646
+  # some boilerplate...
647
+  $class = ref($class) || $class;
648
+  my $self = $class->SUPER::new($mailsaobject);
649
+  bless ($self, $class);
650
+
651
+  if ($language) {
652
+    $self->{main}->{conf}->{tokenizer}->{$language} = $self;
653
+  }
654
+  else {
655
+    dbg("plugin: $self: \$language is not defined");
656
+  }
657
+
658
+  return $self;
659
+}
660
+
661
+sub tokenize {
662
+  my ($self, $ref) = @_;
663
+
664
+  return $ref;
665
+}
666
+
667
+1;
668
+
669
diff -uNr /dev/null lib/Mail/SpamAssassin/Util/Charset.pm
670
--- /dev/null	1970-01-01 09:00:00.000000000 +0900
671
+++ lib/Mail/SpamAssassin/Util/Charset.pm	2011-07-14 22:29:19.000000000 +0900
672
@@ -0,0 +1,471 @@
673
+# <@LICENSE>
674
+# Copyright 2006 Apache Software Foundation
675
+#
676
+# Licensed under the Apache License, Version 2.0 (the "License");
677
+# you may not use this file except in compliance with the License.
678
+# You may obtain a copy of the License at
679
+#
680
+#     http://www.apache.org/licenses/LICENSE-2.0
681
+#
682
+# Unless required by applicable law or agreed to in writing, software
683
+# distributed under the License is distributed on an "AS IS" BASIS,
684
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
685
+# See the License for the specific language governing permissions and
686
+# limitations under the License.
687
+# </@LICENSE>
688
+
689
+
690
+=head1 NAME
691
+
692
+  Mail::SpamAssassin::Util::Charset.pm - Utility for charset and language
693
+
694
+=head1 SYNOPSIS
695
+
696
+  my ($decoded, $detected) = Mail::SpamAssassin::Util::Charset::normalize_charset($str, $charset);
697
+  my $language = Mail::SpamAssassin::Util::Charset::get_language($str, $charset);
698
+
699
+=head1 DESCRIPTION
700
+
701
+This module implements utility methods for charset and language.
702
+
703
+=cut
704
+
705
+package Mail::SpamAssassin::Util::Charset;
706
+
707
+use strict;
708
+use warnings;
709
+use Encode;
710
+use Encode::Guess;
711
+use Encode::Alias;
712
+
713
+use vars qw (
714
+  @ISA @EXPORT
715
+);
716
+
717
+require Exporter;
718
+
719
+@ISA = qw(Exporter);
720
+@EXPORT = qw(normalize_charset get_language);
721
+
722
+###########################################################################
723
+
724
+use constant HAS_ENCODE_DETECT => eval { require Encode::Detect::Detector; };
725
+use constant HAS_ENCODE_HANEXTRA => eval { require Encode::HanExtra; };
726
+use constant HAS_ENCODE_EUCJPMS => eval { require Encode::EUCJPMS; };
727
+
728
+###########################################################################
729
+
730
+our $KANA_HAN_RE = qr{
731
+  # Hiragana and Katakana
732
+    \xE3[\x81-\x83][\x80-\xBF]
733
+  # Han
734
+  | \xE3[\x90-\xBF][\x80-\xBF]
735
+  | [\xE4-\xE9][\x80-\xBF]{2}
736
+  | \xEF[\xA4-\xAB][\x80-\xBF]
737
+}x;
738
+
739
+our %enc2lang;
740
+our %lang2enc;
741
+our %scr2lang;
742
+our %cjkscr2lang;
743
+our @scrorder;
744
+
745
+BEGIN {
746
+
747
+  # See the following URL about this map:
748
+  #   http://czyborra.com/charsets/iso8859.html
749
+  #   http://czyborra.com/charsets/codepages.html
750
+  #   http://czyborra.com/charsets/cyrillic.html
751
+  #   http://en.wikipedia.org/wiki/ISO_8859
752
+  #   http://www.w3.org/International/O-charset-lang.html
753
+  %enc2lang = (
754
+    # buint-in Encodings and Encode::Byte
755
+    #   N. America
756
+    'ascii'         => 'en',
757
+    'cp437'         => 'en',
758
+    'cp863'         => 'weurope',
759
+
760
+    #   W. Europe (Latin1, Latin9)
761
+    #       fr es ca eu pt it sq rm nl de da sv no fi fo is ga gd en af
762
+    'iso-8859-1'    => 'weurope',
763
+    'iso-8859-15'   => 'weurope',
764
+    'cp850'         => 'weurope',
765
+    'cp860'         => 'weurope',
766
+    'cp1252'        => 'weurope',
767
+    'MacRoman'      => 'weurope',
768
+
769
+    #   Cntrl. Europe / Latin2 / Latin10
770
+    #       hr cs hu pl sr sk sl
771
+    'iso-8859-2'    => 'ceurope',
772
+    'cp852'         => 'ceurope',
773
+    'cp1250'        => 'ceurope',
774
+    'MacCentralEurRoman' => 'ceurope',
775
+    'MacCroatian'   => 'ceurope',
776
+    'iso-8859-16'   => 'ceurope',
777
+    'MacRomanian'   => 'ceurope',
778
+
779
+    #   Latin3 (Esperanto, Maltese, and Turkish. Turkish is now on 8859-9.)
780
+    #       eo mt
781
+    'iso-8859-3'    => 'seurope',
782
+
783
+    #   Baltics (Latin4, Latin7)
784
+    #       lv lt
785
+    'iso-8859-4'    => 'neurope',
786
+    'iso-8859-13'   => 'baltic',
787
+    'cp1257'        => 'baltic',
788
+
789
+    #   Nordics (Latin6)
790
+    #       et kl iu se
791
+    'iso-8859-10'   => 'nordic',
792
+
793
+    #   Cyrillics
794
+    #       bg be uk sr mk ru
795
+    'iso-8859-5'    => 'ru',
796
+    'cp855'         => 'ru',
797
+    'cp1251'        => 'ru',
798
+    'cp866'         => 'ru',
799
+    'MacCyrillic'   => 'ru',
800
+    'koi8-r'        => 'ru',
801
+    'MacUkrainian'  => 'uk',
802
+    'koi8-u'        => 'uk',
803
+
804
+    #   Arabic
805
+    'iso-8859-6'    => 'ar',
806
+    'cp864'         => 'ar',
807
+    'cp1256'        => 'ar',
808
+    'MacArabic'     => 'ar',
809
+    'cp1006'        => 'fa',
810
+    'MacFarsi'      => 'fa',
811
+
812
+    #   Greek
813
+    'iso-8859-7'    => 'el',
814
+    'cp1253'        => 'el',
815
+    'MacGreek'      => 'el',
816
+
817
+    #   Hebrew
818
+    #       he yi
819
+    'iso-8859-8'    => 'he',
820
+    'cp862'         => 'he',
821
+    'cp1255'        => 'he',
822
+    'MacHebrew'     => 'he',
823
+
824
+    #   Turkish
825
+    'iso-8859-9'    => 'tr',
826
+    'cp857'         => 'tr',
827
+    'cp1254'        => 'tr',
828
+    'MacTurkish'    => 'tr',
829
+
830
+    #   Thai
831
+    'iso-8859-11'   => 'th',
832
+    'cp874'         => 'th',
833
+
834
+    #   Celtics (Latin8)
835
+    #       gd cy br
836
+    'iso-8859-14'   => 'celtic',
837
+
838
+    #   Vietnamese
839
+    'viscii'        => 'vi',
840
+    'cp1258'        => 'vi',
841
+
842
+    # Encode::CN
843
+    'euc-cn'        => 'zh',
844
+    'cp936'         => 'zh',
845
+    'hz'            => 'zh',
846
+
847
+    # Encode::TW
848
+    'big5-eten'     => 'zh',
849
+    'big5-hkscs'    => 'zh',
850
+    'cp950'         => 'zh',
851
+
852
+    # Encode::JP
853
+    'euc-jp'        => 'ja',
854
+    'shiftjis'      => 'ja',
855
+    '7bit-jis'      => 'ja',
856
+    'iso-2022-jp'   => 'ja',
857
+    'iso-2022-jp-1' => 'ja',
858
+    'cp932'         => 'ja',
859
+
860
+    # Encode::KR
861
+    'euc-kr'        => 'ko',
862
+    'cp949'         => 'ko',
863
+    'johab'         => 'ko',
864
+    'iso-2022-kr'   => 'ko',
865
+
866
+    # Encode::HanExtra
867
+    'euc-tw'        => 'zh',
868
+    'gb18030'       => 'zh',
869
+
870
+    # Encode::JIS2K
871
+    'euc-jisx0213'  => 'ja',
872
+    'shiftjisx0123' => 'ja',
873
+    'iso-2022-jp-3' => 'ja',
874
+
875
+    # Encode::EUCJPMS
876
+    'eucJP-ms'      => 'ja',
877
+    'cp51932'       => 'ja',
878
+    'cp50220'       => 'ja',
879
+    'cp50221'       => 'ja',
880
+
881
+  );
882
+
883
+  %lang2enc = (
884
+    # Latin1
885
+    'en' => ['ascii'],
886
+    'weurope' => ['cp1252'],
887
+
888
+    # Latin2
889
+    'ceurope' => ['cp1250'],
890
+
891
+    # Latin3
892
+    'seurope' => ['iso-8859-3'],
893
+
894
+    # Latin4
895
+    'neurope' => ['iso-8859-4'],
896
+
897
+    # Latin5
898
+    'tr' => ['cp1254'],
899
+
900
+    # Latin6
901
+    'nordic' => ['iso-8859-10'],
902
+
903
+    # Latin7
904
+    'baltic' => ['cp1257'],
905
+
906
+    # Latin8
907
+    'celtic' => ['iso-8859-14'],
908
+
909
+    # Non Latin
910
+    'ru' => ['koi8-r', 'cp1251'],
911
+    'uk' => ['koi8-u'],
912
+
913
+    'ar' => ['cp1256'],
914
+    'el' => ['cp1253'],
915
+    'he' => ['cp1255'],
916
+    'th' => ['cp874'],
917
+    'vi' => ['viscii', 'cp1258'],
918
+    'zh' => ['euc-cn', 'cp950'],
919
+    'ja' => ['euc-jp', 'cp932'],
920
+    'ko' => ['euc-kr', 'cp949'],
921
+
922
+  );
923
+
924
+  %scr2lang = (
925
+    'InLatin1Supplement' => ['weurope'],
926
+    'InLatinExtendedA' => [
927
+      'ceurope',
928
+      'seurope',
929
+      'tr',
930
+      'vi'
931
+    ],
932
+    'InLatinExtendedB' => [
933
+      'nordic',
934
+      'baltic',
935
+      'celtic'
936
+    ],
937
+    'Thai'   => ['th'],
938
+    'Cyrillic' => ['ru', 'uk'],
939
+    'Arabic' => ['ar'],
940
+    'Greek'  => ['el'],
941
+    'Hebrew' => ['he'],
942
+  );
943
+
944
+  # better detection for CJK
945
+  @scrorder = ('Hiragana','Katakana','Hangul','Han',keys(%scr2lang));
946
+  %cjkscr2lang = (
947
+    'Hiragana' => ['ja'],
948
+    'Katakana' => ['ja'],
949
+    'Hangul' => ['ko'],
950
+    'Han'    => ['zh', 'ja', 'ko'],
951
+  );
952
+
953
+  unless (HAS_ENCODE_HANEXTRA) {
954
+    Encode::Alias::define_alias( qr/^gb18030$/i => ' "euc-cn"' );
955
+  }
956
+  Encode::Alias::define_alias( qr/^unicode-1-1-(.+)$/i => ' "$1"' );
957
+  Encode::Alias::define_alias( qr/^TIS-620$/i => ' "iso-8859-11"' );
958
+  Encode::Alias::define_alias( qr/^x-mac-(.+)$/i => ' "Mac$1"' );
959
+  Encode::Alias::define_alias( qr/^Shift_JIS$/i => ' "cp932"' );
960
+  if (HAS_ENCODE_EUCJPMS) {
961
+    Encode::Alias::define_alias( qr/^iso-2022-jp$/i => ' "cp50221"' );
962
+  }
963
+}
964
+
965
+sub get_language {
966
+  my $str = shift; # $str must be UTF-8 encoding
967
+  my $charset = shift;
968
+
969
+  return 'en' unless $charset;
970
+  if ($charset !~ /^utf/i) {
971
+    return $enc2lang{$charset};
972
+  } elsif (defined($str)) {
973
+    $str =~ s/[\x00-\x7F]//g; # remove ASCII characters
974
+    return 'en' if ($str eq '');
975
+
976
+    my %handled;
977
+    $str = Encode::decode_utf8($str) unless (Encode::is_utf8($str));
978
+    foreach my $scr (@scrorder) {
979
+      next if ($str !~ /\p{$scr}/);
980
+      my $scrlangs = exists($cjkscr2lang{$scr}) ? $cjkscr2lang{$scr} : $scr2lang{$scr};
981
+      foreach my $lang (@$scrlangs) {
982
+        next if (exists($handled{$lang}));
983
+        foreach my $enc (@{$lang2enc{$lang}}) {
984
+          my $scratch = $str;
985
+          Encode::encode($enc, $scratch, Encode::FB_QUIET);
986
+          return $lang if ($scratch eq '');
987
+        }
988
+        $handled{$lang} = 1;
989
+      }
990
+    }
991
+  } 
992
+  return 'en';
993
+}
994
+
995
+# TEST 1: try conversion to use the specified charset. 
996
+# TEST 2: try conversion to use Encode::Detect.
997
+# TEST 3: try conversion to use Encode::Guess.
998
+sub normalize_charset {
999
+  my $str = shift;
1000
+  my $charset = shift;
1001
+
1002
+  return wantarray ? ($str, 'ascii')  : $str unless ($str);
1003
+
1004
+  my $decoded;
1005
+  my $detected;
1006
+
1007
+  if ($charset) {
1008
+    ($decoded, $detected) = _specified_encoding($str, $charset);
1009
+  }
1010
+  unless ($detected) {
1011
+    ($decoded, $detected) = _encode_detect($str);
1012
+  }
1013
+  unless ($detected) {
1014
+    ($decoded, $detected) = _encode_guess($str);
1015
+  }
1016
+  unless ($detected) {
1017
+    return ($str, undef);
1018
+  }
1019
+  $decoded =~ s/^\x{feff}//g;
1020
+  $decoded = Encode::encode_utf8($decoded);
1021
+
1022
+  # unfold hiragana, katakana and han
1023
+  if ($detected =~ /^(?:UTF|EUC|BIG5|GB|SHIFTJIS|ISO-2022|CP969$|CP932$|CP949|CP50221$)/i) {
1024
+    $decoded =~ s/($KANA_HAN_RE)\012($KANA_HAN_RE)/$1$2/og;
1025
+  }
1026
+  return wantarray ? ($decoded, $detected) : $decoded;
1027
+}
1028
+
1029
+sub _specified_encoding {
1030
+  my $str = shift;
1031
+  my $encoding = shift;
1032
+
1033
+  my $detected;
1034
+  my $decoded;
1035
+
1036
+  return (undef, undef) unless ($encoding);
1037
+
1038
+  # note: ISO-2022-* is not deistinguish from US-ASCII
1039
+  return (undef, undef) if ($str =~ /\e/ and $encoding !~ /^ISO-2022/i);
1040
+
1041
+  # UTF-16|32 encoding without BOM cannot be trusted.
1042
+  return (undef, undef) if ($encoding =~ /^UTF-32$/i and $str !~ /^(?:\xFF\xFE\x00\x00|\x00\x00\xFE\xFF)/);
1043
+  return (undef, undef) if ($encoding =~ /^UTF-16$/i and $str !~ /^(?:\xFF\xFE|\xFE\xFF)/);
1044
+
1045
+  #$encoding = _get_alias($encoding);
1046
+  my $encoder = Encode::find_encoding($encoding);
1047
+  if (ref($encoder)) {
1048
+    $decoded = $encoder->decode($str,Encode::FB_QUIET);
1049
+    $detected = $encoder->name if ($str eq '');
1050
+  }
1051
+  return ($decoded, $detected);
1052
+}
1053
+
1054
+sub _encode_detect {
1055
+  return undef unless HAS_ENCODE_DETECT;
1056
+  my $str = shift;
1057
+
1058
+  # UTF-16|32 encoding without BOM cannot be trusted.
1059
+  return (undef, undef) if ($str =~ /\x00\x00/ and $str !~ /^(?:\xFF\xFE\x00\x00|\x00\x00\xFE\xFF)/);
1060
+  return (undef, undef) if ($str =~ /\x00/ and $str !~ /^(?:\xFF\xFE|\xFE\xFF)/);
1061
+
1062
+  my $decoded;
1063
+  my $detected = Encode::Detect::Detector::detect($str);
1064
+  if ($detected) {
1065
+    $detected = _get_alias($detected);
1066
+    my $encoder = Encode::find_encoding($detected);
1067
+    if (ref($encoder)) {
1068
+      $decoded = $encoder->decode($str);
1069
+      $detected = $decoded ? $encoder->name : undef;
1070
+    }
1071
+    else {
1072
+      $detected = undef;
1073
+    }
1074
+  }
1075
+  return ($decoded, $detected);
1076
+}
1077
+
1078
+sub _encode_guess {
1079
+  my $str = shift;
1080
+
1081
+  my $detected;
1082
+  my $decoded;
1083
+  my $encoder;
1084
+
1085
+  # Step 1: Examine ISO-2022-*.
1086
+  if ($str =~ /\e/) {
1087
+    $Encode::Guess::NoUTFAutoGuess = 1;
1088
+    $encoder = Encode::Guess::guess_encoding($str,
1089
+        qw/cp50221 7bit-jis iso-2022-kr/);
1090
+    $Encode::Guess::NoUTFAutoGuess = 0;
1091
+  }
1092
+
1093
+  # Step 2: Examine US-ASCII/UTF-(8|16|32)
1094
+  unless (ref($encoder)) {
1095
+    $Encode::Guess::NoUTFAutoGuess = 0;
1096
+    $encoder = Encode::Guess::guess_encoding($str);
1097
+  }
1098
+
1099
+  # Step 3: Examine other encodings
1100
+  unless (ref($encoder)) {
1101
+    $Encode::Guess::NoUTFAutoGuess = 1;
1102
+    eval {
1103
+      if ($str =~ /[\x80-\xFF]{4}/) {
1104
+        $encoder = Encode::Guess::guess_encoding($str,
1105
+          qw/euc-cn big5-eten euc-jp cp932 euc-kr cp949/);
1106
+      }
1107
+      else {
1108
+        $encoder = Encode::Guess::guess_encoding($str,
1109
+          qw/iso-8859-1 cp1252/);
1110
+      }
1111
+    };
1112
+    $Encode::Guess::NoUTFAutoGuess = 0;
1113
+  }
1114
+  if (ref($encoder)) {
1115
+    $detected = $encoder->name;
1116
+    if ($detected) {
1117
+      $decoded = $encoder->decode($str);
1118
+    }
1119
+  }
1120
+  return ($decoded, $detected);
1121
+}
1122
+
1123
+sub _get_alias {
1124
+  my $encoding = shift;
1125
+
1126
+  unless (HAS_ENCODE_HANEXTRA) {
1127
+    $encoding =~ s/^gb18030$/euc-cn/i;
1128
+  }
1129
+  $encoding =~ s/^unicode-1-1-(.+)$/$1/i;
1130
+  $encoding =~ s/^TIS-620$/iso-8859-11/i;
1131
+  $encoding =~ s/x-mac-(.+)$/Mac$1/i;
1132
+  $encoding =~ s/^Shift_JIS$/cp932/i;
1133
+  if (HAS_ENCODE_EUCJPMS) {
1134
+    $encoding =~ s/^iso-2022-jp$/cp50221/i;
1135
+    $encoding =~ s/^euc-jp$/cp51932/i;
1136
+  }
1137
+
1138
+  return $encoding;
1139
+}
1140
+
1141
+
1142
+1;
1143
+
(-)./files/spamassassin-3.4.0-ja.plist (+12 lines)
Line 0 Link Here
1
%%SITE_PERL%%/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm
2
%%SITE_PERL%%/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm
3
@dirrm %%SITE_PERL%%/Mail/SpamAssassin/Plugin/Tokenizer
4
%%SITE_PERL%%/Mail/SpamAssassin/Plugin/Tokenizer.pm
5
%%SITE_PERL%%/Mail/SpamAssassin/Util/Charset.pm
6
%%PERL5_MAN3%%/Mail::SpamAssassin::Plugin::Tokenizer.3.gz
7
%%PERL5_MAN3%%/Mail::SpamAssassin::Plugin::Tokenizer::MeCab.3.gz
8
%%PERL5_MAN3%%/Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA.3.gz
9
%%PERL5_MAN3%%/Mail::SpamAssassin::Util::Charset.3.gz
10
@unexec if cmp -s %D/%%ETCDIR%%/%%TOKENIZER_PRE%%.sample %D/%%ETCDIR%%/%%TOKENIZER_PRE%%; then rm -f %D/%%ETCDIR%%/%%TOKENIZER_PRE%%; fi
11
%%ETCDIR%%/%%TOKENIZER_PRE%%.sample
12
@exec if [ ! -f %B/%%TOKENIZER_PRE%% ]; then cp -p %B/%f %B/%%TOKENIZER_PRE%%; fi
(-)./pkg-message (-39 lines)
Lines 1-42 Link Here
1
*************************************************************************
2
*	    _  _____ _____ _____ _   _ _____ ___ ___  _   _ 		*
3
*	   / \|_   _|_   _| ____| \ | |_   _|_ _/ _ \| \ | |		*
4
*	  / _ \ | |   | | |  _| |  \| | | |  | | | | |  \| |		*
5
*	 / ___ \| |   | | | |___| |\  | | |  | | |_| | |\  |		*
6
*	/_/   \_\_|   |_| |_____|_| \_| |_| |___\___/|_| \_|		*
7
*									*
8
*	See PREFIX/share/doc/p5-Mail-SpamAssassin/INSTALL,		*
9
*	and PREFIX/share/doc/p5-Mail-SpamAssassin/UPGRADE,		*
10
*	or http://spamassassin.org/dist/INSTALL and			*
11
*	http://spamassassin.org/dist/UPGRADE BEFORE enabling		*
12
*	this version of SpamAssassin for important information		*
13
*	regarding changes in this version.				*
14
*									*
15
*	SpamAssassin may require additional configuration in		*
16
*	PREFIX/etc/mail/spamassassin/init.pre depending on		*
17
*	the options you have installed.	 Otherwise, annoying		*
18
*	(but harmless) error messages may result.  Read the		*
19
*	files listed above.						*
20
*									*
21
*************************************************************************
22
You may wish to run sa-update now to obtain the latest rules.
23
24
NOTE:  FREEBSD users: If you are updating from a version prior to 3.20.
25
sa-update now places state files in /var/db/spamassassin and not 
26
/var/lib/spamassassin.  This is to be consistant with Freebsd file
27
directory conventions.
28
29
If you run sa-compile, you will notice that files are in 
30
/var/db/spamassassin/compiled/<perlversion>/<version> instead of
31
/var/db/spamassassin/compiled/<version>.
32
No attempts have been made to move old versions over. You must recompile.
33
34
If you are running with spamd, you must add the following to rc.conf:
35
spamd_enable="YES"
36
37
Security Note:  If you did NOT deselected AS_ROOT, spamd will be running
38
as root. To change this, also add this to rc.conf:
39
spamd_flags="-u spamd -H /var/spool/spamd"
40
1
41
************************************************************************
2
************************************************************************
42
For Japanese users, see documents in
3
For Japanese users, see documents in

Return to bug 186756