? kakasi.diff
# Review notes (text before the first "Index:" line is ignored by patch):
#  - Line breaks and indentation were rebuilt from a whitespace-collapsed
#    copy of this patch; apply with "patch -l" if context does not match.
#  - The <...> include names and nsCOMPtr<...> arguments were missing from
#    that copy and have been restored from context -- please confirm.
#  - Changes relative to the submitted patch: "else if" in tokenize(),
#    decoder->Convert() result is now checked, the decode buffer is sized
#    in PRUnichar units, and the kakasi_do() result is null-checked.
Index: Makefile.in
===================================================================
RCS file: /cvsroot/mozilla/mailnews/extensions/bayesian-spam-filter/src/Makefile.in,v
retrieving revision 1.11.6.1
diff -u -r1.11.6.1 Makefile.in
--- Makefile.in	17 May 2004 00:57:48 -0000	1.11.6.1
+++ Makefile.in	4 Jan 2005 00:03:54 -0000
@@ -44,6 +44,7 @@
 		  content \
 		  htmlparser \
 		  layout \
+		  uconv \
 		  $(NULL)
 
 CPPSRCS = \
Index: nsBayesianFilter.cpp
===================================================================
RCS file: /cvsroot/mozilla/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.cpp,v
retrieving revision 1.35.6.7
diff -u -r1.35.6.7 nsBayesianFilter.cpp
--- nsBayesianFilter.cpp	19 Oct 2004 21:25:42 -0000	1.35.6.7
+++ nsBayesianFilter.cpp	4 Jan 2005 00:03:55 -0000
@@ -82,6 +82,12 @@
 
 #include "nsIncompleteGamma.h"
 #include <math.h>
+// for kakasi (run-time DLL loading and Shift_JIS <-> Unicode conversion)
+#include <windows.h>
+#include "nsIUnicodeEncoder.h"
+#include "nsIUnicodeDecoder.h"
+#include "nsICharsetConverterManager.h"
+
 
 static PRLogModuleInfo *BayesianFilterLogModule = nsnull;
 static NS_DEFINE_CID(kParserCID, NS_PARSER_CID);
@@ -291,6 +297,25 @@
     return PR_TRUE;
 }
 
+/**
+ * Returns PR_TRUE when the UTF-8 encoded word contains at least one
+ * hiragana (U+3040-U+309F) or katakana (U+30A0-U+30FF) character.
+ */
+static PRBool isJapanese(const char* word)
+{
+    nsString text = NS_ConvertUTF8toUCS2(word);
+    const PRUnichar* p = text.get();
+    PRUnichar c;
+
+    // A chunk is treated as Japanese if it contains any kana; the two
+    // blocks are adjacent, so a single range test covers both.
+    while ((c = *p++)) {
+        if ((c >= 0x3040) && (c <= 0x30FF))
+            return PR_TRUE;
+    }
+    return PR_FALSE;
+}
+
 inline PRBool isUpperCase(char c) { return ('A' <= c) && (c <= 'Z'); }
 
 static char* toLowerCase(char* str)
@@ -443,6 +468,149 @@
     }
 }
 
+// Signatures of the entry points resolved at run time from kakasi.dll.
+typedef int (WINAPI *FP_KAKASI_GETOPT_ARGV) (int, char**);
+typedef char* (WINAPI *FP_KAKASI_DO) (char*);
+typedef int (WINAPI *FP_KAKASI_CLOSE_KANWADICT) ();
+static NS_DEFINE_CID(kCharsetConverterManagerCID, NS_ICHARSETCONVERTERMANAGER_CID);
+
+/**
+ * Splits a chunk of Japanese text into words with kakasi ("-w" mode) and
+ * adds every resulting word as a token.
+ *
+ * The UTF-8 chunk is converted to Shift_JIS, passed to kakasi_do(), and the
+ * result is converted back to UTF-8 before being split on
+ * kBayesianFilterTokenDelimiters.  On any failure no token is added.
+ * All buffers and the library handle are released before returning.
+ *
+ * NOTE(review): this uses the Win32 loader API unconditionally and reloads
+ * kakasi.dll on every call; confirm whether it needs a platform guard and
+ * a cached library handle before landing.
+ *
+ * @param chunk  NUL-terminated UTF-8 text containing kana.
+ */
+void Tokenizer::tokenize_japanese_words(char* chunk)
+{
+    HINSTANCE hLib = NULL;
+    char* dst = NULL;
+    char* sepStrSJ = NULL;
+    PRUnichar* sepStr = NULL;
+    FP_KAKASI_GETOPT_ARGV kakasi_getopt_argv = NULL;
+    FP_KAKASI_DO kakasi_do = NULL;
+    FP_KAKASI_CLOSE_KANWADICT kakasi_close_kanwadict = NULL;
+
+    // do { ... } while(0): every "break" jumps to the shared cleanup below.
+    do{
+
+        hLib = LoadLibrary("kakasi.dll");
+        if(!hLib){
+            printf("loading kakasi.dll failed.");
+            break;
+        }
+
+        kakasi_getopt_argv =
+            (FP_KAKASI_GETOPT_ARGV)GetProcAddress(hLib, "kakasi_getopt_argv");
+        if(!kakasi_getopt_argv){
+            printf("getting address of kakasi_getopt_argv failed.");
+            break;
+        }
+        char* v[] = {"kakasi", "-w"};
+        if (kakasi_getopt_argv(2, v) != 0) {
+            printf("initializing kakasi failed.");
+            break;
+        }
+
+        kakasi_do = (FP_KAKASI_DO)GetProcAddress(hLib, "kakasi_do");
+        if(!kakasi_do){
+            printf("getting address of kakasi_do failed.");
+            break;
+        }
+
+        kakasi_close_kanwadict =
+            (FP_KAKASI_CLOSE_KANWADICT)GetProcAddress(hLib, "kakasi_close_kanwadict");
+        if(!kakasi_close_kanwadict){
+            printf("getting address of kakasi_close_kanwadict failed.");
+            break;
+        }
+
+        // modelled on nsGlobalWindow::ConvertCharset(const nsAString& aStr, char** aDest)
+
+        nsString srcStr = NS_ConvertUTF8toUCS2(chunk);
+
+        nsresult result;
+        nsCOMPtr<nsIUnicodeEncoder> encoder;
+        nsCOMPtr<nsIUnicodeDecoder> decoder;
+        nsCOMPtr<nsICharsetConverterManager> ccm(do_GetService(kCharsetConverterManagerCID, &result));
+        if (NS_FAILED(result)) break;
+
+        // Get an encoder and decoder for the character set
+        result = ccm->GetUnicodeEncoder("Shift_JIS", getter_AddRefs(encoder));
+        if (NS_FAILED(result)) break;
+        result = ccm->GetUnicodeDecoder("Shift_JIS", getter_AddRefs(decoder));
+        if (NS_FAILED(result)) break;
+
+        // encode from unicode string into shift_jis string
+        result = encoder->Reset();
+        if (NS_FAILED(result)) break;
+        PRInt32 srcLen = srcStr.Length();
+        const PRUnichar* src = srcStr.get();
+        PRInt32 maxByteLen;
+        result = encoder->GetMaxLength(src, srcLen, &maxByteLen);
+        if (NS_FAILED(result)) break;
+        dst = (char*)nsMemory::Alloc(maxByteLen+1);
+        if (!dst) break;
+        memset(dst, 0, maxByteLen+1);
+
+        PRInt32 dstLen = maxByteLen;
+        result = encoder->Convert(src, &srcLen, dst, &dstLen);
+        if (NS_FAILED(result)) break;
+
+        // execute kakasi
+        printf("before kakasi: %s", dst);
+        sepStrSJ = kakasi_do(dst);
+        // Guard against a null result before it reaches printf("%s")/strlen.
+        if (!sepStrSJ) break;
+        printf("after kakasi: %s", sepStrSJ);
+
+        // decode from shift_jis string into unicode.
+        result = decoder->Reset();
+        if (NS_FAILED(result)) break;
+        srcLen = nsCRT::strlen(sepStrSJ);
+        // The decoder's length estimate is used as a PRUnichar count here.
+        PRInt32 maxUniLen;
+        result = decoder->GetMaxLength(sepStrSJ, srcLen, &maxUniLen);
+        if (NS_FAILED(result)) break;
+        sepStr = (PRUnichar*) nsMemory::Alloc((maxUniLen+1) * sizeof(PRUnichar));
+        if(!sepStr) break;
+        memset(sepStr, 0, (maxUniLen+1) * sizeof(PRUnichar));
+
+        dstLen = maxUniLen; // capacity in PRUnichars; the extra slot stays 0
+        result = decoder->Convert(sepStrSJ, &srcLen, sepStr, &dstLen);
+        if(NS_FAILED(result)) break;
+
+        nsCString sepStrS = NS_ConvertUCS2toUTF8(sepStr);
+        char* word;
+        char* next = (char*)sepStrS.get(); // see tokenize !
+        while ((word = nsCRT::strtok(next, kBayesianFilterTokenDelimiters, &next)) != NULL) {
+            add(word);
+        }
+
+    }while(0);
+//cleanup:
+    if(sepStr) nsMemory::Free(sepStr);
+    if(sepStrSJ) free(sepStrSJ);
+    if(dst) nsMemory::Free(dst);
+
+    if(hLib){
+        if(kakasi_close_kanwadict){
+            kakasi_close_kanwadict();
+        }
+        FreeLibrary(hLib);
+    }
+
+    return;
+}
+
 nsresult Tokenizer::stripHTML(const nsAString& inString, nsAString& outString)
 {
     nsresult rv = NS_OK;
@@ -496,6 +664,8 @@
         if (isDecimalNumber(word)) continue;
         if (isASCII(word))
             tokenize_ascii_word(word);
+        else if (isJapanese(word))
+            tokenize_japanese_words(word);
         else {
             nsresult rv;
             // use I18N  scanner to break this word into meaningful semantic units.
Index: nsBayesianFilter.h
===================================================================
RCS file: /cvsroot/mozilla/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.h,v
retrieving revision 1.12.54.1
diff -u -r1.12.54.1 nsBayesianFilter.h
--- nsBayesianFilter.h	17 May 2004 00:57:48 -0000	1.12.54.1
+++ nsBayesianFilter.h	4 Jan 2005 00:03:55 -0000
@@ -123,6 +123,7 @@
 private:
     char* copyWord(const char* word, PRUint32 len);
     void tokenize_ascii_word(char * word);
+    void tokenize_japanese_words(char* chunk);
     inline void addTokenForHeader(const char * aTokenPrefix, nsACString& aValue, PRBool aTokenizeValue = false);
     nsresult stripHTML(const nsAString& inString, nsAString& outString);
 