--- Bayes.pm_org	2024-09-23 22:43:53.455018053 +0900
+++ Bayes.pm_patch	2024-12-14 20:26:28.976110365 +0900
@@ -81,6 +81,9 @@ package Mail::SpamAssassin::Plugin::Baye
 
 use strict;
 use warnings;
+use Text::MeCab;
+use Unicode::Japanese;
+
 # use bytes;
 use re 'taint';
 
@@ -1259,7 +1262,9 @@ sub _tokenize_line {
   # cleared, even if the source string has perl characters semantics !!!
   # Is this really still desirable?
 
-TOKEN: foreach my $token (split) {
+TOKEN: foreach my $token (_bunsetsu_wakachi($_)) {
+    next if !defined($token);
+
     $token =~ s/^[-'"\.,]+//;        # trim non-alphanum chars at start or end
     $token =~ s/[-'"\.,]+$//;        # so we don't get loads of '"foo' tokens
 
@@ -1848,4 +1853,102 @@ sub bayes_report_make_list {
   } @{$info}[0..$amt-1];
 }
 
+###########################################################################
+
+# Split one line of text into Bayes tokens with Japanese-aware segmentation.
+# Full-width symbols, digits and letters are folded to half-width first; mail
+# addresses, URLs, postal codes, phone numbers, dates, yen amounts and
+# percentages are lifted out whole; brackets and punctuation become separators
+# (and tokens of their own); printable-ASCII words are kept verbatim and the
+# rest is parsed by MeCab, whose morphemes are glued into larger chunks based
+# on the first field of each morpheme's feature string.
+# Takes the text to split; returns a de-duplicated, string-sorted token list
+# (empty for undefined or empty input).
+sub _bunsetsu_wakachi {
+  my ($textbase) = @_;
+  return () if !defined($textbase) || $textbase eq '';
+
+  # Reuse one tagger per process instead of constructing a new one for every
+  # tokenized line (construction is assumed to be the expensive part).
+  our $bunsetsu_mecab;
+  my $mecab = ($bunsetsu_mecab ||= Text::MeCab->new());
+  my @result = ();
+
+  $textbase = Unicode::Japanese->new($textbase)->z2hSym->z2hNum->z2hAlpha->get;
+
+  # Lift out mail addresses, URLs, postal codes, toll-free / mobile / land-line
+  # phone numbers, dates, yen amounts and percentages as single tokens.  Every
+  # occurrence of a match is blanked out, so the loop is bound to terminate.
+  # NB: '$' must stay escaped in the URL class; a bare "$%" there interpolates
+  # the special variable $% and silently drops '$' and '%' from the class.
+  while ($textbase =~ /([a-zA-Z0-9_+-]+(?:\.[a-zA-Z0-9_+-]+)*@([a-zA-Z0-9-]+)(\.[a-zA-Z]{2,})+)/ ||
+         $textbase =~ /(https?:\/\/[\w!\?\/\+\-_~=;\.,\*&@#\$%\(\)'\[\]]+)/ ||
+         $textbase =~ /(〒[\d]{3}-[\d]{4})/ ||
+         $textbase =~ /(0120-\d{3}-\d{3})/ ||
+         $textbase =~ /(0120-\d{2}-\d{4})/ ||
+         $textbase =~ /(0120-\d{4}-\d{2})/ ||
+         $textbase =~ /(0120-\d{6})/ ||
+         $textbase =~ /(0[5789]0-\d{4}-\d{4})/ ||
+         $textbase =~ /(0\d{1,4}-\d{1,4}-\d{4})/ ||
+         $textbase =~ /([\+-]?(\d+)(年|月|日)(度)?)/ ||
+         $textbase =~ /([\+-]?\d{1,3}(,\d{3})*(千|万|億)?円)/ ||
+         $textbase =~ /([\+-]?(\d+)(\.\d+)?\%)/){
+    my $found = $1;
+    push @result, $found;
+    $textbase =~ s/\Q$found\E/ /g;
+  }
+
+  # Brackets and punctuation separate words but are also kept as tokens.
+  my $replacechar = "　 ( ) [ ] { } . , 、 。 「 」 （ ） 『 』 「 」 ［ ］ 【 】 《 》 ｛ ｝ 〈 〉";
+  foreach my $replaceparts (split(/ /, $replacechar)) {
+    push(@result, $replaceparts) if $textbase =~ s/\Q$replaceparts\E/ /g;
+  }
+
+  foreach my $text (split(/[\s]+/, $textbase)) {
+    next if $text eq '';   # leading separator leaves an empty first field
+
+    # Pure printable-ASCII words need no morphological analysis.
+    if ($text =~ /^[\x21-\x7E]+$/){
+      push @result, $text;
+      next;
+    }
+
+    my $string = '';       # chunk being assembled
+    my $maeFeature = '';   # first feature field of the previous morpheme
+    my $append = 0;        # set when the next morpheme must join the chunk
+
+    for (my $n = $mecab->parse($text); $n; $n = $n->next) {
+      my $token = $n->surface;
+      next if !defined($token);
+
+      my $feature = $n->feature;
+      my ($pos) = split(',', defined($feature) ? $feature : '');
+      $pos = '' if !defined($pos);
+      $pos = '記号' if $pos =~ /記号/;   # lump every symbol class together
+
+      if ($append != 0 || $maeFeature eq $pos){
+        # Forced continuation, or same class as the previous morpheme.
+        $string .= $token;
+        $append = 0;
+      } elsif ($pos =~ /(?:助詞|助動詞|接尾辞)/){
+        # Particles, auxiliary verbs and suffixes attach to what precedes.
+        $string .= $token;
+      } else {
+        # Anything else opens a new chunk; flush the finished one first.
+        push @result, $string if $string ne '';
+        # Prefixes and adnominals pull the following morpheme into the chunk.
+        $append = 1 if $pos =~ /(?:接頭辞|連体詞)/;
+        $string = $token;
+      }
+      $maeFeature = $pos;
+    }
+    push @result, $string if $string ne '';
+  }
+
+  my %seen;
+  @result = (grep { !$seen{$_}++ } @result);
+  @result = sort(@result);
+  return(@result);
+}
+
 1;
