? japanese_tokenize.diff
Index: Makefile.in
===================================================================
RCS file: /cvsroot/mozilla/mailnews/extensions/bayesian-spam-filter/src/Makefile.in,v
retrieving revision 1.11.6.1
diff -u -r1.11.6.1 Makefile.in
--- Makefile.in	17 May 2004 00:57:48 -0000	1.11.6.1
+++ Makefile.in	4 Jan 2005 13:06:34 -0000
@@ -44,6 +44,7 @@
 		  content \
 		  htmlparser \
 		  layout \
+	uconv \
 		  $(NULL)
 
 CPPSRCS		= \
Index: nsBayesianFilter.cpp
===================================================================
RCS file: /cvsroot/mozilla/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.cpp,v
retrieving revision 1.35.6.7
diff -u -r1.35.6.7 nsBayesianFilter.cpp
--- nsBayesianFilter.cpp	19 Oct 2004 21:25:42 -0000	1.35.6.7
+++ nsBayesianFilter.cpp	4 Jan 2005 13:06:35 -0000
@@ -443,6 +443,179 @@
   } 
 }
 
+/*
+    Unicode character range memo
+    
+    Hiragana:
+        Hiragana: 3040-309F
+    
+    Katakana:
+        Katakana: 30A0-30FF
+        Halfwidth and Fullwidth Forms (FF00-FFEF): FF66-FF9F
+        * Katakana Phonetic Extensions: 31F0-31FF
+    
+    Kanji:
+        CJK Radicals Supplement: 2E80-2EFF
+        Kangxi Radicals: 2F00-2FDF
+        CJK Unified Ideographs: 4E00-9FAF
+        * Ideographic Description Characters: 2FF0-2FFF
+        * CJK Unified Ideographs Extension A: 3400-4DBF
+        * CJK Compatibility Ideographs: F900-FAFF
+    
+    Kigou:
+        * Arrows: 2190-21FF
+        * Mathematical Operators: 2200-22FF
+        * Box Drawing: 2500-257F
+        * Geometric Shapes: 25A0-25FF
+        * Miscellaneous Symbols: 2600-26FF
+        * CJK Symbols and Punctuation (3000-303F): 3003-303F
+        * Enclosed CJK Letters and Months: 3200-32FF
+        * CJK Compatibility: 3300-33FF
+        * CJK Compatibility Forms: FE30-FE4F
+        * Halfwidth and Fullwidth Forms (FF00-FFEF): FF5F-FF60, FF62-FF63, FFE0-FFE6, FFE8-FFEE
+        * and others?
+    
+    Space:
+        C0 Controls and Basic Latin (0000-007F): 0020
+        CJK Symbols and Punctuation (3000-303F): 3000
+        
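+    Kuten (name used by IS_JA_KUTEN; these code points are the ideographic comma):
+        CJK Symbols and Punctuation (3000-303F): 3001
+        Halfwidth and Fullwidth Forms (FF00-FFEF): FF64
+        
+    Touten (name used by IS_JA_TOUTEN; these code points are the ideographic full stop):
+        CJK Symbols and Punctuation (3000-303F): 3002
+        Halfwidth and Fullwidth Forms (FF00-FFEF): FF61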
+    
+    Fullwidth forms of Latin characters:
+        Halfwidth and Fullwidth Forms (FF00-FFEF): FF01-FF5E (correspond to 0021-007E)
+    
+    see http://www.unicode.org/charts/
+*/
+
+// Per-character tests used by getCharClass(); x may be evaluated more than once, so pass side-effect-free expressions.
+#define IN_RANGE(x, low, high)  (((low)<=(x))&&((x)<=(high)))  /* inclusive on both ends */
+#define IS_JA_HIRAGANA(x)   IN_RANGE(x, 0x3040, 0x309F)
+#define IS_JA_KATAKANA(x)   (IN_RANGE(x, 0x30A0, 0x30FF)||IN_RANGE(x, 0xFF66, 0xFF9F))
+#define IS_JA_KANJI(x)      (IN_RANGE(x, 0x2E80, 0x2EFF)||IN_RANGE(x, 0x2F00, 0x2FDF)||IN_RANGE(x, 0x4E00, 0x9FAF))
+#define IS_JA_KUTEN(x)      (((x)==0x3001)||((x)==0xFF64))
+#define IS_JA_TOUTEN(x)     (((x)==0x3002)||((x)==0xFF61))
+#define IS_JA_SPACE(x)      (((x)==0x0020)||((x)==0x3000))
+#define IS_JA_FWLATAIN(x)   IN_RANGE(x, 0xFF01, 0xFF5E)  /* fullwidth forms of 0x21-0x7E */
+
+enum char_class {  // character classes returned by getCharClass()
+    space    = 0,  // IS_JA_SPACE
+    others   = 1,  // matched by none of the IS_JA_* tests
+    hiragana = 2,  // IS_JA_HIRAGANA
+    katakana = 3,  // IS_JA_KATAKANA
+    kanji    = 4,  // IS_JA_KANJI
+    kuten    = 5,  // IS_JA_KUTEN
+    touten   = 6,  // IS_JA_TOUTEN
+    fwlatain = 7   // IS_JA_FWLATAIN
+};
+
+
+// Maps one PRUnichar to its char_class. The tests run in a fixed order and a
+// character that matches none of them is reported as `others`.
+char_class getCharClass(PRUnichar c)
+{
+    // Kana and kanji.
+    if (IS_JA_HIRAGANA(c)) return hiragana;
+    if (IS_JA_KATAKANA(c)) return katakana;
+    if (IS_JA_KANJI(c))    return kanji;
+
+    // Punctuation marks and blanks.
+    if (IS_JA_KUTEN(c))    return kuten;
+    if (IS_JA_TOUTEN(c))   return touten;
+    if (IS_JA_SPACE(c))    return space;
+
+    // Fullwidth forms in FF01-FF5E.
+    if (IS_JA_FWLATAIN(c)) return fwlatain;
+
+    return others;
+}
+
+// Returns PR_TRUE when the UTF-8 string |word| contains at least one hiragana
+// or katakana character (kanji alone does not qualify), PR_FALSE otherwise.
+static PRBool isJapanese(const char* word)
+{
+    nsString text = NS_ConvertUTF8toUCS2(word);
+    // Read-only scan to the terminating NUL, so the const pointer from get()
+    // is used as-is instead of being cast to a mutable one.
+    for (const PRUnichar* p = text.get(); *p; ++p) {
+        if (IS_JA_HIRAGANA(*p) || IS_JA_KATAKANA(*p))
+            return PR_TRUE;
+    }
+    return PR_FALSE;
+}
+
+#ifdef DEBUG
+#include "nsIUnicodeEncoder.h"
+#include "nsICharsetConverterManager.h"
+
+static NS_DEFINE_CID(kCharsetConverterManagerCID, NS_ICHARSETCONVERTERMANAGER_CID);
+
+// Debug helper: re-encodes |srcStr| as Shift_JIS and prints it to stdout as
+// "word: <bytes>". Returns without printing if any step fails.
+void debugoutShiftJIS(nsString srcStr)
+{
+    // in reference of nsGlobalWindow::ConvertCharset(const nsAString& aStr, char** aDest)
+    nsresult result;
+    nsCOMPtr<nsIUnicodeEncoder> encoder;
+    nsCOMPtr<nsICharsetConverterManager> ccm(do_GetService(kCharsetConverterManagerCID, &result));
+    if (NS_FAILED(result)) return;
+
+    // Get an encoder for the target character set and put it in its initial state
+    result = ccm->GetUnicodeEncoder("Shift_JIS", getter_AddRefs(encoder));
+    if (NS_FAILED(result)) return;
+    result = encoder->Reset();
+    if (NS_FAILED(result)) return;
+
+    // Size the output buffer; the extra zero-filled byte leaves a terminating NUL for printf
+    PRInt32 srcLen = srcStr.Length();
+    const PRUnichar* src = srcStr.get();
+    PRInt32 maxByteLen;
+    result = encoder->GetMaxLength(src, srcLen, &maxByteLen);
+    if (NS_FAILED(result)) return;
+    char* dst = (char*)nsMemory::Alloc(maxByteLen+1);
+    if (!dst) return;
+    memset(dst, 0, maxByteLen+1);
+
+    PRInt32 dstLen = maxByteLen;
+    result = encoder->Convert(src, &srcLen, dst, &dstLen);
+    if (NS_SUCCEEDED(result)) printf("word: %s\n", dst);
+    nsMemory::Free(dst);  // release the buffer whether or not Convert succeeded
+}
+#endif //DEBUG
+
+// Splits the UTF-8 |chunk| into runs of a single character class and adds each
+// hiragana, katakana or kanji run (the last one included) as "japanese_token:<run>".
+void Tokenizer::tokenize_japanese_words(char* chunk)
+{
+#ifdef DEBUG
+    printf("tokenize_japanese_words!\n");  // trace output, debug builds only
+#endif //DEBUG
+    nsString srcStr = NS_ConvertUTF8toUCS2(chunk);
+    const PRUnichar* p1 = srcStr.get();  // start of the current run
+    const PRUnichar* p2 = p1;            // scan position
+    if(!*p2) return;
+    char_class cc = getCharClass(*p2);
+    for(++p2; ; ++p2){
+        if(*p2 && (cc == getCharClass(*p2))) continue;  // still inside the current run
+        if( (cc==hiragana)||(cc==katakana)||(cc==kanji) ){
+#ifdef DEBUG
+            debugoutShiftJIS(nsString(p1, p2-p1));
+#endif //DEBUG
+            nsCString token = NS_ConvertUCS2toUTF8(p1, p2-p1);
+            nsCString addToken = nsCString("japanese_token:") + token;
+            add( addToken.get());
+        }
+        if(!*p2) break;  // the terminating NUL closed the final run, handled just above
+        cc = getCharClass(*p2);
+        p1 = p2;
+    }
+}
+
 nsresult Tokenizer::stripHTML(const nsAString& inString, nsAString& outString)
 {
   nsresult rv = NS_OK;
@@ -496,6 +669,8 @@
         if (isDecimalNumber(word)) continue;
         if (isASCII(word))
             tokenize_ascii_word(word);
+        else if (isJapanese(word))  // chained with else so ASCII words do not also reach the scanner branch
+            tokenize_japanese_words(word);
         else {
             nsresult rv;
             // use I18N  scanner to break this word into meaningful semantic units.
Index: nsBayesianFilter.h
===================================================================
RCS file: /cvsroot/mozilla/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.h,v
retrieving revision 1.12.54.1
diff -u -r1.12.54.1 nsBayesianFilter.h
--- nsBayesianFilter.h	17 May 2004 00:57:48 -0000	1.12.54.1
+++ nsBayesianFilter.h	4 Jan 2005 13:06:35 -0000
@@ -123,6 +123,7 @@
 private:
     char* copyWord(const char* word, PRUint32 len);
     void tokenize_ascii_word(char * word);
+    void tokenize_japanese_words(char* chunk);
     inline void addTokenForHeader(const char * aTokenPrefix, nsACString& aValue, PRBool aTokenizeValue = false);
     nsresult stripHTML(const nsAString& inString, nsAString& outString);