? japanese_tokenize.5.diff
? japanese_tokenize.6.diff
? japanese_tokenize.7.diff
? japanese_tokenize.8.diff
Index: nsBayesianFilter.cpp
===================================================================
RCS file: /cvsroot/mozilla/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.cpp,v
retrieving revision 1.35.6.7
diff -u -r1.35.6.7 nsBayesianFilter.cpp
--- nsBayesianFilter.cpp	19 Oct 2004 21:25:42 -0000	1.35.6.7
+++ nsBayesianFilter.cpp	26 Jan 2005 16:36:05 -0000
@@ -79,6 +79,9 @@
 #include "nsIHTMLToTextSink.h"
 #include "nsIDocumentEncoder.h" 
 
+// needed to decode mime encoded subject
+#include "nsIMimeConverter.h"
+
 #include "nsIncompleteGamma.h"
 #include <math.h>
 
@@ -303,6 +306,108 @@
     return str;
 }
 
+// One subtraction plus one unsigned compare replaces two compares: values below |low| wrap to a large PRUint16.
+#define IN_RANGE(x, low, high)  ((PRUint16)((x)-(low)) <= (high)-(low))
+
+#define IS_JA_HIRAGANA(x)   IN_RANGE(x, 0x3040, 0x309F)
+// XOR with 4 swaps U+30FB (katakana middle dot) with U+30FF, so one range test accepts U+30A0..U+30FF minus U+30FB.
+#define IS_JA_KATAKANA(x)   (IN_RANGE((x)^0x0004, 0x30A0, 0x30FE)||(IN_RANGE(x, 0xFF66, 0xFF9F)))
+#define IS_JA_KANJI(x)      (IN_RANGE(x, 0x2E80, 0x2FDF)||IN_RANGE(x, 0x4E00, 0x9FAF))
+#define IS_JA_KUTEN(x)      (((x)==0x3002)||((x)==0xFF61)||((x)==0xFF0E))  // full stops: ideographic, half-width, full-width
+#define IS_JA_TOUTEN(x)     (((x)==0x3001)||((x)==0xFF64)||((x)==0xFF0C))  // commas: ideographic, half-width, full-width
+#define IS_JA_SPACE(x)      ((x)==0x3000)
+#define IS_JA_FWLATAIN(x)   IN_RANGE(x, 0xFF01, 0xFF5E)
+#define IS_JA_FWNUMERAL(x)  IN_RANGE(x, 0xFF10, 0xFF19)
+
+enum char_class{   // class of a single UTF-16 code unit, as assigned by getCharClass
+    others = 0,    // fallback when no other class matches
+    space,         // not returned by getCharClass in this patch
+    hiragana,      // IS_JA_HIRAGANA
+    katakana,      // IS_JA_KATAKANA
+    kanji,         // IS_JA_KANJI
+    kuten,         // IS_JA_KUTEN
+    touten,        // IS_JA_TOUTEN
+    kigou,         // not returned by getCharClass in this patch
+    fwlatain,      // IS_JA_FWLATAIN (full-width Latin range)
+    ascii          // not returned by getCharClass in this patch
+};
+
+// Maps one UTF-16 code unit to the char_class used to split Japanese text into runs.
+// The first matching test wins; kuten/touten are tested before fwlatain because U+FF0C and
+// U+FF0E also lie inside the full-width Latin range. File-local, hence static (like isJapanese).
+static char_class getCharClass(PRUnichar c)
+{
+  if (IS_JA_HIRAGANA(c))
+    return hiragana;
+  if (IS_JA_KATAKANA(c))
+    return katakana;
+  if (IS_JA_KANJI(c))
+    return kanji;
+  if (IS_JA_KUTEN(c))
+    return kuten;
+  if (IS_JA_TOUTEN(c))
+    return touten;
+  if (IS_JA_FWLATAIN(c))
+    return fwlatain;
+  return others;
+}
+
+// Returns PR_TRUE when every code unit in [p1, p2) is a full-width digit (U+FF10..U+FF19);
+// an empty range also yields PR_TRUE. File-local, hence static (like isJapanese).
+static PRBool isFWNumeral(const PRUnichar* p1, const PRUnichar* p2)
+{
+  for (; p1 < p2; ++p1)
+    if (!IS_JA_FWNUMERAL(*p1)) return PR_FALSE;
+  return PR_TRUE;
+}
+
+// Hiragana and katakana (U+3040..U+30FF) plus the full-width/half-width forms U+FF01..U+FF9F.
+#define IS_JAPANESE_SPECIFIC(x) (IN_RANGE(x, 0x3040, 0x30FF)||IN_RANGE(x, 0xFF01, 0xFF9F))
+
+// Returns PR_TRUE when the UTF-8 |word| contains at least one IS_JAPANESE_SPECIFIC code unit.
+// Kanji alone does not qualify: only the ranges tested by the macro are considered.
+static PRBool isJapanese(const char* word)
+{
+  nsString text = NS_ConvertUTF8toUCS2(word);
+  // walk the NUL-terminated buffer read-only; there is no need to cast away const
+  for (const PRUnichar* p = text.get(); *p; ++p)
+    if (IS_JAPANESE_SPECIFIC(*p))
+      return PR_TRUE;
+
+  return PR_FALSE;
+}
+
+// The japanese tokenizer was added as part of Bug #277354.
+// Splits the UTF-8 |chunk| into runs of one char_class and adds each run as a token prefixed with
+// "JA:" (or "<aTokenPrefix>:JA:"); runs accepted by isDecimalNumber or isFWNumeral are skipped.
+void Tokenizer::tokenize_japanese_word(char* chunk, const char* aTokenPrefix)
+{
+  PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("entering tokenize_japanese_word(%s)", chunk));
+  
+  nsCAutoString prefix;
+  if(aTokenPrefix){
+    prefix = PromiseFlatCString(nsDependentCString(aTokenPrefix) + NS_LITERAL_CSTRING(":JA:"));
+  }else{
+    prefix = PromiseFlatCString(NS_LITERAL_CSTRING("JA:"));
+  }
+  nsString srcStr = NS_ConvertUTF8toUCS2(chunk);
+  const PRUnichar* p1 = srcStr.get();
+  const PRUnichar* p2 = p1;
+  if(!*p2) return;
+  char_class cc = getCharClass(*p2);
+  do {
+    ++p2;
+    // a run ends at a class change or at the terminating NUL, so the last run is emitted as well
+    if (*p2 && cc == getCharClass(*p2))
+      continue;
+    nsCString token = NS_ConvertUCS2toUTF8(p1, p2-p1);
+    if (!isDecimalNumber(token.get()) && !isFWNumeral(p1, p2))
+      add(PromiseFlatCString(prefix + token).get());
+    cc = getCharClass(*p2);
+    p1 = p2;
+  } while (*p2);
+}
+
 void Tokenizer::addTokenForHeader(const char * aTokenPrefix, nsACString& aValue, PRBool aTokenizeValue)
 {
   if (aValue.Length())
@@ -321,6 +426,10 @@
           if (isDecimalNumber(word)) continue;
           if (isASCII(word))
               add(PromiseFlatCString(nsDependentCString(aTokenPrefix) + NS_LITERAL_CSTRING(":") + nsDependentCString(word)).get());
+          else if (isJapanese(word)){
+              tokenize_japanese_word(word, aTokenPrefix);
+              mLanguage = NS_LITERAL_STRING("JA");
+          }
       }
     }
   }
@@ -344,7 +453,8 @@
 void Tokenizer::tokenizeHeaders(nsIUTF8StringEnumerator * aHeaderNames, nsIUTF8StringEnumerator * aHeaderValues)
 {
   nsCOMPtr<nsIMIMEHeaderParam> mimehdrpar = do_GetService(NS_MIMEHEADERPARAM_CONTRACTID);
-
+  nsCOMPtr<nsIMimeConverter> mimeConverter = do_GetService(NS_MIME_CONVERTER_CONTRACTID);
+  
   nsCString headerValue;
   nsCAutoString headerName; // we'll be normalizing all header names to lower case
   PRBool hasMore = PR_TRUE;
@@ -390,16 +500,22 @@
         if (headerName.Equals("subject"))
         { 
           // we want to tokenize the subject
-          addTokenForHeader(headerName.get(), headerValue, PR_TRUE);
+          nsString str; // receives the MIME-decoded subject
+          if (mimeConverter && NS_SUCCEEDED(mimeConverter->DecodeMimeHeader(headerValue.get(), str)))
+            headerValue = NS_ConvertUTF16toUTF8(str); // else keep the raw value: service missing or decode failed
+          addTokenForHeader(headerName.get(), headerValue, PR_TRUE);
         }
 
         // important: leave out sender field. To strong of an indicator
         break;
-    case 'x': // (2) X-Mailer / user-agent works best if it is untokenized, just fold the case and any leading/trailing white space
-    case 'u': 
-        addTokenForHeader(headerName.get(), headerValue); 
-        break;
     default:
+        if (headerName.Equals("x-mozilla-status")) break;   // headers listed here are skipped, not tokenized
+        if (headerName.Equals("x-mozilla-status2")) break;
+        if (headerName.Equals("x-uidl")) break;
+        if (headerName.Equals("message-id")) break;
+        if (headerName.Equals("in-reply-to")) break;
+        if (headerName.Equals("date")) break;
+        if (headerName.Equals("references")) break;         // the header is "References"; names are lower-cased here
         addTokenForHeader(headerName.get(), headerValue); 
         break;
     } // end switch
@@ -484,7 +600,17 @@
     nsString text = NS_ConvertUTF8toUCS2(aText);
     nsString strippedUCS2;
     stripHTML(text, strippedUCS2);
-
+    
+    // Rewrite every U+3000 (full-width space) in the stripped text as U+0020, in place.
+    nsString::iterator fwsIter, fwsEnd;
+    strippedUCS2.BeginWriting(fwsIter);
+    strippedUCS2.EndWriting(fwsEnd);
+    for (; fwsIter != fwsEnd; ++fwsIter)
+    {
+        if (*fwsIter == 0x3000)
+            *fwsIter = 0x0020;
+    }
+    
     nsCString strippedStr = NS_ConvertUCS2toUTF8(strippedUCS2);
     char * strippedText = (char *) strippedStr.get(); // bleh
     PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("tokenize stripped html: %s", strippedText));
@@ -496,6 +622,10 @@
         if (isDecimalNumber(word)) continue;
         if (isASCII(word))
             tokenize_ascii_word(word);
+        else if (isJapanese(word)){
+            tokenize_japanese_word(word);
+            mLanguage = NS_LITERAL_STRING("JA");
+        }
         else {
             nsresult rv;
             // use I18N  scanner to break this word into meaningful semantic units.
@@ -809,7 +939,7 @@
 NS_IMPL_ISUPPORTS2(nsBayesianFilter, nsIMsgFilterPlugin, nsIJunkMailPlugin)
 
 nsBayesianFilter::nsBayesianFilter()
-    :   mGoodCount(0), mBadCount(0),
+    :   mGoodCount(0), mBadCount(0), mGoodCountJ(0), mBadCountJ(0),
         mBatchLevel(0), mTrainingDataDirty(PR_FALSE)
 {
     if (!BayesianFilterLogModule)
@@ -827,8 +957,13 @@
 
     PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("junk probabilty threshold: %f", mJunkProbabilityThreshold));
 
-    PRBool ok = (mGoodTokens && mBadTokens);
+    PRBool ok = (mGoodTokens && mBadTokens && mGoodTokensJ && mBadTokensJ);
     NS_ASSERTION(ok, "error allocating tokenizers");
+    mGoodTokens.mLanguage = NS_LITERAL_STRING("GENERAL");
+    mBadTokens.mLanguage = NS_LITERAL_STRING("GENERAL");
+    mGoodTokensJ.mLanguage = NS_LITERAL_STRING("JA");
+    mBadTokensJ.mLanguage = NS_LITERAL_STRING("JA");
+    
     if (ok)
         readTrainingData();
     else {
@@ -954,6 +1089,11 @@
     Token* tokens = tokenizer.copyTokens();
     if (!tokens) return;
   
+    const PRBool useJaCorpus = (tokenizer.mLanguage == NS_LITERAL_STRING("JA")); // Japanese vs. general corpus
+    PRUint32& goodCount = useJaCorpus ? mGoodCountJ : mGoodCount;
+    PRUint32& badCount = useJaCorpus ? mBadCountJ : mBadCount;
+    Tokenizer& goodTokens = useJaCorpus ? mGoodTokensJ : mGoodTokens;
+    Tokenizer& badTokens = useJaCorpus ? mBadTokensJ : mBadTokens;
     // the algorithm in "A Plan For Spam" assumes that you have a large good
     // corpus and a large junk corpus.
     // that won't be the case with users who first use the junk mail feature
@@ -964,12 +1104,12 @@
     // and if there are no bad tokens, assume the message is not junk
     // this will also "encourage" the user to train
     // see bug #194238
-    if (listener && !mGoodCount && !mGoodTokens.countTokens()) {
+    if (listener && !goodCount && !goodTokens.countTokens()) {
       PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("no good tokens, assume junk"));
       listener->OnMessageClassified(messageURI, nsMsgJunkStatus(nsIJunkMailPlugin::JUNK));
       return;
     }
-    if (listener && !mBadCount && !mBadTokens.countTokens()) {
+    if (listener && !badCount && !badTokens.countTokens()) {
       PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("no bad tokens, assume good"));
       listener->OnMessageClassified(messageURI, nsMsgJunkStatus(nsIJunkMailPlugin::GOOD));
       return;
@@ -977,15 +1117,15 @@
 
     /* this part is similar to the Graham algorithm with some adjustments. */
     PRUint32 i, goodclues=0, count = tokenizer.countTokens();
-    double ngood = mGoodCount, nbad = mBadCount, prob;
+    double ngood = goodCount, nbad = badCount, prob;
 
     for (i = 0; i < count; ++i) 
     {
       Token& token = tokens[i];
       const char* word = token.mWord;
-      Token* t = mGoodTokens.get(word);
+      Token* t = goodTokens.get(word);
       double hamcount = ((t != NULL) ? t->mCount : 0);
-      t = mBadTokens.get(word);
+      t = badTokens.get(word);
       double spamcount = ((t != NULL) ? t->mCount : 0);
 
       // if hamcount and spam count are both 0, we could end up with a divide by 0 error, 
@@ -1182,6 +1322,11 @@
     PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("observeMessage(%s) old=%d new=%d", messageURL, oldClassification, newClassification));
     TokenEnumeration tokens = tokenizer.getTokens();
 
+    const PRBool useJaCorpus = (tokenizer.mLanguage == NS_LITERAL_STRING("JA")); // Japanese vs. general corpus
+    PRUint32& goodCount = useJaCorpus ? mGoodCountJ : mGoodCount;
+    PRUint32& badCount = useJaCorpus ? mBadCountJ : mBadCount;
+    Tokenizer& goodTokens = useJaCorpus ? mGoodTokensJ : mGoodTokens;
+    Tokenizer& badTokens = useJaCorpus ? mBadTokensJ : mBadTokens;
     // Uhoh...if the user is re-training then the message may already be classified and we are classifying it again with the same classification.
     // the old code would have removed the tokens for this message then added them back. But this really hurts the message occurrence
     // count for tokens if you just removed training.dat and are re-training. See Bug #237095 for more details.
@@ -1194,17 +1339,17 @@
     switch (oldClassification) {
     case nsIJunkMailPlugin::JUNK:
         // remove tokens from junk corpus.
-        if (mBadCount > 0) {
-            --mBadCount;
-            forgetTokens(mBadTokens, tokens);
+        if (badCount > 0) {
+            --badCount;
+            forgetTokens(badTokens, tokens);
             mTrainingDataDirty = PR_TRUE;
         }
         break;
     case nsIJunkMailPlugin::GOOD:
         // remove tokens from good corpus.
-        if (mGoodCount > 0) {
-            --mGoodCount;
-            forgetTokens(mGoodTokens, tokens);
+        if (goodCount > 0) {
+            --goodCount;
+            forgetTokens(goodTokens, tokens);
             mTrainingDataDirty = PR_TRUE;
         }
         break;
@@ -1215,14 +1360,14 @@
     switch (newClassification) {
     case nsIJunkMailPlugin::JUNK:
         // put tokens into junk corpus.
-        ++mBadCount;
-        rememberTokens(mBadTokens, tokens);
+        ++badCount;
+        rememberTokens(badTokens, tokens);
         mTrainingDataDirty = PR_TRUE;
         break;
     case nsIJunkMailPlugin::GOOD:
         // put tokens into good corpus.
-        ++mGoodCount;
-        rememberTokens(mGoodTokens, tokens);
+        ++goodCount;
+        rememberTokens(goodTokens, tokens);
         mTrainingDataDirty = PR_TRUE;
         break;
     }
@@ -1241,7 +1386,7 @@
 
     nsresult rv = NS_GetSpecialDirectory(NS_APP_USER_PROFILE_50_DIR, getter_AddRefs(profileDir));
     NS_ENSURE_SUCCESS(rv, rv);
-    rv = profileDir->Append(NS_LITERAL_STRING("training.dat"));
+    rv = profileDir->Append(NS_LITERAL_STRING("training.2.dat"));
     if (NS_FAILED(rv)) return rv;
     
     file = do_QueryInterface(profileDir, &rv);
@@ -1335,7 +1480,7 @@
     return PR_TRUE;
 }
 
-static const char kMagicCookie[] = { '\xFE', '\xED', '\xFA', '\xCE' };
+static const char kMagicCookie[] = { '\xFA', '\xCE', '\x00', '\x00' };
 
 void nsBayesianFilter::writeTrainingData()
 {
@@ -1353,7 +1498,12 @@
           (writeUInt32(stream, mGoodCount) == 1) &&
           (writeUInt32(stream, mBadCount) == 1) &&
            writeTokens(stream, mGoodTokens) &&
-           writeTokens(stream, mBadTokens))) {
+           writeTokens(stream, mBadTokens) &&
+          (writeUInt32(stream, mGoodCountJ) == 1) &&
+          (writeUInt32(stream, mBadCountJ) == 1) &&
+           writeTokens(stream, mGoodTokensJ) &&
+           writeTokens(stream, mBadTokensJ)
+           )) {
         NS_WARNING("failed to write training data.");
         fclose(stream);
         // delete the training data file, since it is potentially corrupt.
@@ -1386,7 +1536,12 @@
           (readUInt32(stream, &mGoodCount) == 1) &&
           (readUInt32(stream, &mBadCount) == 1) &&
            readTokens(stream, mGoodTokens) &&
-           readTokens(stream, mBadTokens))) {
+           readTokens(stream, mBadTokens) &&
+          (readUInt32(stream, &mGoodCountJ) == 1) &&
+          (readUInt32(stream, &mBadCountJ) == 1) &&
+           readTokens(stream, mGoodTokensJ) &&
+           readTokens(stream, mBadTokensJ)
+           )) {
         NS_WARNING("failed to read training data.");
         PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("failed to read training data."));
     }
@@ -1397,7 +1552,9 @@
 NS_IMETHODIMP nsBayesianFilter::GetUserHasClassified(PRBool *aResult)
 {
   *aResult = (mGoodCount && mGoodTokens.countTokens() ||
-              mBadCount && mBadTokens.countTokens());
+              mBadCount && mBadTokens.countTokens() ||
+              mGoodCountJ && mGoodTokensJ.countTokens() ||
+              mBadCountJ && mBadTokensJ.countTokens() );
   return NS_OK;
 }
 
@@ -1430,6 +1587,18 @@
     mBadCount = 0;
   }
 
+  if (mGoodCountJ && mGoodTokensJ.countTokens())
+  {
+    mGoodTokensJ.clearTokens();
+    mGoodCountJ = 0;
+  }
+
+  if (mBadCountJ && mBadTokensJ.countTokens())
+  {
+    mBadTokensJ.clearTokens();
+    mBadCountJ = 0;
+  }
+  
   // now remove training.dat
   nsCOMPtr<nsILocalFile> file;
   nsresult rv = getTrainingFile(file);
Index: nsBayesianFilter.h
===================================================================
RCS file: /cvsroot/mozilla/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.h,v
retrieving revision 1.12.54.1
diff -u -r1.12.54.1 nsBayesianFilter.h
--- nsBayesianFilter.h	17 May 2004 00:57:48 -0000	1.12.54.1
+++ nsBayesianFilter.h	26 Jan 2005 16:36:05 -0000
@@ -43,6 +43,7 @@
 #include "nsIMsgFilterPlugin.h"
 #include "nsISemanticUnitScanner.h"
 #include "pldhash.h"
+#include "nsString.h"
 
 // XXX can't simply byte align arenas, must at least 2-byte align.
 #define PL_ARENA_CONST_ALIGN_MASK 1
@@ -120,9 +121,11 @@
      */
     void visit(PRBool (*f) (Token*, void*), void* data);
 
+    nsString mLanguage; // set to "JA" once a Japanese word is tokenized; nsBayesianFilter compares it with "JA" to pick a corpus
 private:
     char* copyWord(const char* word, PRUint32 len);
     void tokenize_ascii_word(char * word);
+    void tokenize_japanese_word(char* chunk, const char* aTokenPrefix = nsnull);
     inline void addTokenForHeader(const char * aTokenPrefix, nsACString& aValue, PRBool aTokenizeValue = false);
     nsresult stripHTML(const nsAString& inString, nsAString& outString);
 
@@ -150,9 +153,9 @@
     void readTrainingData();
     
 protected:
-    Tokenizer mGoodTokens, mBadTokens;
+    Tokenizer mGoodTokens, mBadTokens, mGoodTokensJ, mBadTokensJ;
     double   mJunkProbabilityThreshold;
-    PRUint32 mGoodCount, mBadCount;
+    PRUint32 mGoodCount, mBadCount, mGoodCountJ, mBadCountJ;
     PRUint32 mBatchLevel;  // allow for nested batches to happen
     PRPackedBool mTrainingDataDirty;
 };