? japanese_tokenize.diff
Index: Makefile.in
===================================================================
RCS file: /cvsroot/mozilla/mailnews/extensions/bayesian-spam-filter/src/Makefile.in,v
retrieving revision 1.11.6.1
diff -u -r1.11.6.1 Makefile.in
--- Makefile.in 17 May 2004 00:57:48 -0000 1.11.6.1
+++ Makefile.in 4 Jan 2005 13:06:34 -0000
@@ -44,6 +44,7 @@
 		  content \
 		  htmlparser \
 		  layout \
+		  uconv \
 		  $(NULL)
 
 CPPSRCS = \
Index: nsBayesianFilter.cpp
===================================================================
RCS file: /cvsroot/mozilla/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.cpp,v
retrieving revision 1.35.6.7
diff -u -r1.35.6.7 nsBayesianFilter.cpp
--- nsBayesianFilter.cpp 19 Oct 2004 21:25:42 -0000 1.35.6.7
+++ nsBayesianFilter.cpp 4 Jan 2005 13:06:35 -0000
@@ -443,6 +443,194 @@
     }
 }
 
+/*
+   Character unicode memo
+   (ranges marked with "*" are listed for reference only; the macros below
+   do not classify them, so they fall into the "others" class)
+
+   Hiragana:
+     Hiragana: 3040-309F
+
+   Katakana:
+     Katakana: 30A0-30FF
+     Halfwidth and Fullwidth Forms (FF00-FFEF): FF66-FF9F
+   * Katakana Phonetic Extensions: 31F0-31FF
+
+   Kanji:
+     CJK Radicals Supplement: 2E80-2EFF
+     Kangxi Radicals: 2F00-2FDF
+     CJK Unified Ideographs: 4E00-9FFF
+   * Ideographic Description Characters: 2FF0-2FFF
+   * CJK Unified Ideographs Extension A: 3400-4DBF
+   * CJK Compatibility Ideographs: F900-FAFF
+
+   Kigou (symbols):
+   * Arrows: 2190-21FF
+   * Mathematical Operators: 2200-22FF
+   * Box Drawing: 2500-257F
+   * Geometric Shapes: 25A0-25FF
+   * Miscellaneous Symbols: 2600-26FF
+   * CJK Symbols and Punctuation (3000-303F): 3003-303F
+   * Enclosed CJK Letters and Months: 3200-32FF
+   * CJK Compatibility: 3300-33FF
+   * CJK Compatibility Forms: FE30-FE4F
+   * Halfwidth and Fullwidth Forms (FF00-FFEF): FF5F-FF60, FF62-FF63, FFE0-FFE6, FFE8-FFEE
+   * and others?
+
+   Space:
+     C0 Controls and Basic Latin (0000-007F): 0020
+     CJK Symbols and Punctuation (3000-303F): 3000
+
+   Touten (ideographic comma):
+     CJK Symbols and Punctuation (3000-303F): 3001
+     Halfwidth and Fullwidth Forms (FF00-FFEF): FF64
+
+   Kuten (ideographic full stop):
+     CJK Symbols and Punctuation (3000-303F): 3002
+     Halfwidth and Fullwidth Forms (FF00-FFEF): FF61
+
+   Fullwidth characters of Latin:
+     Halfwidth and Fullwidth Forms (FF00-FFEF): FF01-FF5E (correspond to 0021-007E)
+
+   see http://www.unicode.org/charts/
+*/
+
+// Inclusive range test; every argument is parenthesized so expression
+// arguments expand safely.
+#define IN_RANGE(x, low, high)  (((low)<=(x))&&((x)<=(high)))
+
+#define IS_JA_HIRAGANA(x)   IN_RANGE(x, 0x3040, 0x309F)
+#define IS_JA_KATAKANA(x)   (IN_RANGE(x, 0x30A0, 0x30FF)||IN_RANGE(x, 0xFF66, 0xFF9F))
+#define IS_JA_KANJI(x)      (IN_RANGE(x, 0x2E80, 0x2EFF)||IN_RANGE(x, 0x2F00, 0x2FDF)||IN_RANGE(x, 0x4E00, 0x9FFF))
+#define IS_JA_KUTEN(x)      (((x)==0x3002)||((x)==0xFF61))
+#define IS_JA_TOUTEN(x)     (((x)==0x3001)||((x)==0xFF64))
+#define IS_JA_SPACE(x)      (((x)==0x0020)||((x)==0x3000))
+#define IS_JA_FWLATIN(x)    IN_RANGE(x, 0xFF01, 0xFF5E)
+
+// Character classes used to split a Japanese chunk into runs.
+enum char_class{
+    space = 0,
+    others,
+    hiragana,
+    katakana,
+    kanji,
+    kuten,
+    touten,
+    fwlatin
+};
+
+
+// Returns the char_class of a single UTF-16 code unit; anything the IS_JA_*
+// macros do not recognize is reported as "others".
+char_class getCharClass(PRUnichar c)
+{
+    if(IS_JA_HIRAGANA(c)){
+        return hiragana;
+    }else if(IS_JA_KATAKANA(c)){
+        return katakana;
+    }else if(IS_JA_KANJI(c)){
+        return kanji;
+    }else if(IS_JA_KUTEN(c)){
+        return kuten;
+    }else if(IS_JA_TOUTEN(c)){
+        return touten;
+    }else if(IS_JA_SPACE(c)){
+        return space;
+    }else if(IS_JA_FWLATIN(c)){
+        return fwlatin;
+    }
+    return others;
+}
+
+// Returns PR_TRUE when the UTF-8 string contains at least one hiragana or
+// katakana character. Kanji alone does not count.
+static PRBool isJapanese(const char* word)
+{
+    nsString text = NS_ConvertUTF8toUCS2(word);
+    const PRUnichar* p = text.get();
+    PRUnichar c;
+
+    // it is a japanese chunk if it contains any hiragana or katakana.
+    while((c = *p++)){
+        if( IS_JA_HIRAGANA(c)||IS_JA_KATAKANA(c) )
+            return PR_TRUE;
+    }
+    return PR_FALSE;
+}
+
+#ifdef DEBUG
+#include "nsIUnicodeEncoder.h"
+#include "nsICharsetConverterManager.h"
+
+static NS_DEFINE_CID(kCharsetConverterManagerCID, NS_ICHARSETCONVERTERMANAGER_CID);
+
+// Debug-only helper: prints srcStr to stdout re-encoded as Shift_JIS.
+// Returns silently when the encoder is unavailable or conversion fails.
+void debugoutShiftJIS(const nsString& srcStr)
+{
+    // in reference of nsGlobalWindow::ConvertCharset(const nsAString& aStr, char** aDest)
+    nsresult result;
+
+    nsCOMPtr<nsIUnicodeEncoder> encoder;
+    nsCOMPtr<nsICharsetConverterManager> ccm(do_GetService(kCharsetConverterManagerCID, &result));
+    if (NS_FAILED(result)) return;
+
+    // Get an encoder for the character set
+    result = ccm->GetUnicodeEncoder("Shift_JIS", getter_AddRefs(encoder));
+    if (NS_FAILED(result)) return;
+
+    // encode from unicode string into shift_jis string
+    result = encoder->Reset();
+    if (NS_FAILED(result)) return;
+    PRInt32 srcLen = srcStr.Length();
+    const PRUnichar* src = srcStr.get();
+    PRInt32 maxByteLen;
+    result = encoder->GetMaxLength(src, srcLen, &maxByteLen);
+    if (NS_FAILED(result)) return;
+    char* dst = (char*)nsMemory::Alloc(maxByteLen+1);
+    if (!dst) return;
+    memset(dst, 0, maxByteLen+1);
+
+    PRInt32 dstLen = maxByteLen;
+    result = encoder->Convert(src, &srcLen, dst, &dstLen);
+    if (NS_SUCCEEDED(result))
+        printf("word: %s\n", dst);
+
+    // dst is released on both the success and the failure path
+    nsMemory::Free(dst);
+}
+#endif //DEBUG
+
+// Splits a UTF-8 chunk into runs of identical char_class and adds every
+// hiragana, katakana or kanji run as a "japanese_token:"-prefixed token.
+// Runs of any other class only act as separators.
+void Tokenizer::tokenize_japanese_words(char* chunk)
+{
+    nsString srcStr = NS_ConvertUTF8toUCS2(chunk);
+    const PRUnichar* runStart = srcStr.get();
+    if(!*runStart) return;
+
+    char_class cc = getCharClass(*runStart);
+    for(const PRUnichar* cur = runStart + 1; ; ++cur){
+        // The terminating NUL closes the run in progress too; without this
+        // the last run of the chunk would never be added.
+        const PRBool atEnd = (*cur == 0);
+        if(!atEnd && (cc == getCharClass(*cur))) continue;
+
+        if( (cc==hiragana)||(cc==katakana)||(cc==kanji) ){
+#ifdef DEBUG
+            debugoutShiftJIS(nsString(runStart, cur-runStart));
+#endif //DEBUG
+            nsCString token = NS_ConvertUCS2toUTF8(runStart, cur-runStart);
+            nsCString addToken = nsCString("japanese_token:") + token;
+            add( addToken.get());
+        }
+        if(atEnd) break;
+        cc = getCharClass(*cur);
+        runStart = cur;
+    }
+}
+
 nsresult Tokenizer::stripHTML(const nsAString& inString, nsAString& outString)
 {
     nsresult rv = NS_OK;
@@ -496,6 +684,8 @@
         if (isDecimalNumber(word)) continue;
         if (isASCII(word))
             tokenize_ascii_word(word);
+        else if (isJapanese(word))
+            tokenize_japanese_words(word);
         else {
             nsresult rv;
             // use I18N scanner to break this word into meaningful semantic units.
Index: nsBayesianFilter.h
===================================================================
RCS file: /cvsroot/mozilla/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.h,v
retrieving revision 1.12.54.1
diff -u -r1.12.54.1 nsBayesianFilter.h
--- nsBayesianFilter.h 17 May 2004 00:57:48 -0000 1.12.54.1
+++ nsBayesianFilter.h 4 Jan 2005 13:06:35 -0000
@@ -123,6 +123,7 @@
 private:
     char* copyWord(const char* word, PRUint32 len);
     void tokenize_ascii_word(char * word);
+    void tokenize_japanese_words(char* chunk);
     inline void addTokenForHeader(const char * aTokenPrefix, nsACString& aValue, PRBool aTokenizeValue = false);
     nsresult stripHTML(const nsAString& inString, nsAString& outString);
 