? kakasi.2.diff
Japanese word segmentation for the bayesian spam filter, using kakasi.

Summary of this change set:
  * Makefile.in           requires the uconv module (charset converters).
  * nsBayesianFilter.cpp  adds isJapanese() and
                          Tokenizer::tokenize_japanese_words(), maps U+3000
                          to an ASCII space before tokenizing, and routes
                          Japanese chunks to the kakasi based tokenizer.
  * nsBayesianFilter.h    declares Tokenizer::tokenize_japanese_words().

Review notes:
  * kakasi is loaded from kakasi.dll through the Win32 loader, so every
    kakasi specific piece of code is compiled only when XP_WIN is defined.
  * The decoder output buffer is sized in PRUnichar units and the status of
    decoder->Convert() is checked.
  * A NULL result from kakasi_do() abandons the chunk.
  * Leading whitespace of the context lines was reconstructed; use
    "patch -l" if a hunk is rejected because of whitespace.

Index: Makefile.in
===================================================================
RCS file: /cvsroot/mozilla/mailnews/extensions/bayesian-spam-filter/src/Makefile.in,v
retrieving revision 1.11.6.1
diff -u -r1.11.6.1 Makefile.in
--- Makefile.in	17 May 2004 00:57:48 -0000	1.11.6.1
+++ Makefile.in	17 Jan 2005 14:50:01 -0000
@@ -44,6 +44,7 @@
 		  content \
 		  htmlparser \
 		  layout \
+		  uconv \
 		  $(NULL)
 
 CPPSRCS = \
Index: nsBayesianFilter.cpp
===================================================================
RCS file: /cvsroot/mozilla/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.cpp,v
retrieving revision 1.35.6.7
diff -u -r1.35.6.7 nsBayesianFilter.cpp
--- nsBayesianFilter.cpp	19 Oct 2004 21:25:42 -0000	1.35.6.7
+++ nsBayesianFilter.cpp	17 Jan 2005 14:50:02 -0000
@@ -82,6 +82,14 @@
 #include "nsIncompleteGamma.h"
 #include <math.h>
 
+// for kakasi (the segmenter is loaded from kakasi.dll, so Windows only)
+#ifdef XP_WIN
+#include <windows.h>
+#endif
+#include "nsIUnicodeEncoder.h"
+#include "nsIUnicodeDecoder.h"
+#include "nsICharsetConverterManager.h"
+
 static PRLogModuleInfo *BayesianFilterLogModule = nsnull;
 
 static NS_DEFINE_CID(kParserCID, NS_PARSER_CID);
@@ -291,6 +299,32 @@
   return PR_TRUE;
 }
 
+#ifdef XP_WIN
+// True when x lies in [low, high]; the PRUint16 cast folds the lower-bound
+// test into a single unsigned comparison.
+#define IN_RANGE(x, low, high)  ((PRUint16)((x)-(low)) <= (high)-(low))
+// Hiragana/katakana (U+3040-U+30FF) and fullwidth/halfwidth forms (U+FF01-U+FF9F).
+#define IS_JAPANESE_SPECIFIC(x) (IN_RANGE(x, 0x3040, 0x30FF)||IN_RANGE(x, 0xFF01, 0xFF9F))
+
+/**
+ * Returns PR_TRUE if the UTF-8 encoded word contains at least one code unit
+ * matched by IS_JAPANESE_SPECIFIC, PR_FALSE otherwise.
+ */
+static PRBool isJapanese(const char* word)
+{
+  nsString text = NS_ConvertUTF8toUCS2(word);
+  const PRUnichar* p = text.get();
+  PRUnichar c;
+
+  // it is a japanese chunk if it contains any hiragana or katakana.
+  while((c = *p++)){
+    if( IS_JAPANESE_SPECIFIC(c) )
+      return PR_TRUE;
+  }
+  return PR_FALSE;
+}
+#endif // XP_WIN
+
 inline PRBool isUpperCase(char c) { return ('A' <= c) && (c <= 'Z'); }
 
 static char* toLowerCase(char* str)
@@ -443,6 +477,151 @@
   }
 }
 
+#ifdef XP_WIN
+// Signatures of the kakasi.dll entry points resolved at run time.
+typedef int (WINAPI *FP_KAKASI_GETOPT_ARGV) (int, char**);
+typedef char* (WINAPI *FP_KAKASI_DO) (char*);
+typedef int (WINAPI *FP_KAKASI_CLOSE_KANWADICT) ();
+static NS_DEFINE_CID(kCharsetConverterManagerCID, NS_ICHARSETCONVERTERMANAGER_CID);
+
+/**
+ * Splits a chunk of Japanese text into words with kakasi ("kakasi -w") and
+ * adds every resulting word as a token carrying a "KAKASI:" prefix.
+ *
+ * The UTF-8 chunk is encoded to Shift_JIS, passed to kakasi_do(), and the
+ * result is decoded again before being split on
+ * kBayesianFilterTokenDelimiters. Any failure abandons the chunk without
+ * adding tokens.
+ *
+ * @param chunk  NUL-terminated UTF-8 text; it is not modified.
+ */
+void Tokenizer::tokenize_japanese_words(char* chunk)
+{
+  HINSTANCE hLib = NULL;
+  char* dst = NULL;
+  char* sepStrSJ = NULL;
+  PRUnichar* sepStr = NULL;
+  FP_KAKASI_GETOPT_ARGV kakasi_getopt_argv = NULL;
+  FP_KAKASI_DO kakasi_do = NULL;
+  FP_KAKASI_CLOSE_KANWADICT kakasi_close_kanwadict = NULL;
+
+  // do { ... } while(0): every failure breaks out to the shared cleanup below.
+  do{
+
+    hLib = LoadLibrary("kakasi.dll");
+    if(!hLib){
+      printf("loading kakasi.dll failed.");
+      break;
+    }
+
+    kakasi_getopt_argv =
+      (FP_KAKASI_GETOPT_ARGV)GetProcAddress(hLib, "kakasi_getopt_argv");
+    if(!kakasi_getopt_argv){
+      printf("getting address of kakasi_getopt_argv failed.");
+      break;
+    }
+    char* v[] = {"kakasi", "-w"};
+    if (kakasi_getopt_argv(2, v) != 0) {
+      printf("initializing kakasi failed.");
+      break;
+    }
+
+    kakasi_do = (FP_KAKASI_DO)GetProcAddress(hLib, "kakasi_do");
+    if(!kakasi_do){
+      printf("getting address of kakasi_do failed.");
+      break;
+    }
+
+    kakasi_close_kanwadict =
+      (FP_KAKASI_CLOSE_KANWADICT)GetProcAddress(hLib, "kakasi_close_kanwadict");
+    if(!kakasi_close_kanwadict){
+      printf("getting address of kakasi_close_kanwadict failed.");
+      break;
+    }
+
+    // in reference of nsGlobalWindow::ConvertCharset(const nsAString& aStr, char** aDest)
+
+    nsString srcStr = NS_ConvertUTF8toUCS2(chunk);
+
+    nsresult result;
+    nsCOMPtr<nsIUnicodeEncoder> encoder;
+    nsCOMPtr<nsIUnicodeDecoder> decoder;
+    nsCOMPtr<nsICharsetConverterManager> ccm(do_GetService(kCharsetConverterManagerCID, &result));
+    if (NS_FAILED(result)) break;
+
+    // Get an encoder and decoder for the character set
+    result = ccm->GetUnicodeEncoder("Shift_JIS", getter_AddRefs(encoder));
+    if (NS_FAILED(result)) break;
+    result = ccm->GetUnicodeDecoder("Shift_JIS", getter_AddRefs(decoder));
+    if (NS_FAILED(result)) break;
+
+    // encode from unicode string into shift_jis string
+    result = encoder->Reset();
+    if (NS_FAILED(result)) break;
+    PRInt32 srcLen = srcStr.Length();
+    const PRUnichar* src = srcStr.get();
+    PRInt32 maxByteLen;
+    result = encoder->GetMaxLength(src, srcLen, &maxByteLen);
+    if (NS_FAILED(result)) break;
+    dst = (char*)nsMemory::Alloc(maxByteLen+1);
+    if (!dst) break;
+    // zero-fill so the encoded bytes are always NUL terminated
+    memset(dst, 0, maxByteLen+1);
+
+    PRInt32 dstLen = maxByteLen;
+    result = encoder->Convert(src, &srcLen, dst, &dstLen);
+    if (NS_FAILED(result)) break;
+
+    // execute kakasi
+    printf("before kakasi: %s", dst);
+    sepStrSJ = kakasi_do(dst);
+    // a failed kakasi_do() must not reach printf("%s") or strlen() as NULL
+    if (!sepStrSJ) break;
+    printf("after kakasi: %s", sepStrSJ);
+
+    // decode from shift_jis string into unicode.
+    result = decoder->Reset();
+    if (NS_FAILED(result)) break;
+    srcLen = nsCRT::strlen(sepStrSJ);
+    // decoder lengths are counted in PRUnichar units, not bytes
+    PRInt32 maxUnicharLen;
+    result = decoder->GetMaxLength(sepStrSJ, srcLen, &maxUnicharLen);
+    if (NS_FAILED(result)) break;
+    sepStr = (PRUnichar*) nsMemory::Alloc((maxUnicharLen+1) * sizeof(PRUnichar));
+    if(!sepStr) break;
+    memset(sepStr, 0, (maxUnicharLen+1) * sizeof(PRUnichar));
+
+    dstLen = maxUnicharLen;
+    // store the status so the check below tests this conversion
+    result = decoder->Convert(sepStrSJ, &srcLen, sepStr, &dstLen);
+    if(NS_FAILED(result)) break;
+
+    nsCString sepStrS = NS_ConvertUCS2toUTF8(sepStr);
+    char* word;
+    char* next = (char*)sepStrS.get(); // see tokenize !
+    while ((word = nsCRT::strtok(next, kBayesianFilterTokenDelimiters, &next)) != NULL) {
+      nsCString addToken = nsCString("KAKASI:") + nsCString(word);
+      add(addToken.get());
+    }
+
+  }while(0);
+//cleanup:
+  if(sepStr) nsMemory::Free(sepStr);
+  // NOTE(review): assumes kakasi_do() results may be released with the CRT free() - confirm
+  if(sepStrSJ) free(sepStrSJ);
+  if(dst) nsMemory::Free(dst);
+
+  if(hLib){
+    if(kakasi_close_kanwadict){
+      kakasi_close_kanwadict();
+    }
+    FreeLibrary(hLib);
+  }
+
+  return;
+}
+#endif // XP_WIN
+
 nsresult Tokenizer::stripHTML(const nsAString& inString, nsAString& outString)
 {
   nsresult rv = NS_OK;
@@ -485,6 +664,16 @@
     nsString strippedUCS2;
     stripHTML(text, strippedUCS2);
 
+    // convert 0x3000(full width space) into 0x0020
+    nsString::iterator substr_start, substr_end;
+    strippedUCS2.BeginWriting(substr_start);
+    strippedUCS2.EndWriting(substr_end);
+    while (substr_start != substr_end) {
+        if (*substr_start == 0x3000)
+            *substr_start = 0x0020;
+        ++substr_start;
+    }
+
     nsCString strippedStr = NS_ConvertUCS2toUTF8(strippedUCS2);
     char * strippedText = (char *) strippedStr.get(); // bleh
     PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("tokenize stripped html: %s", strippedText));
@@ -496,6 +685,10 @@
         if (isDecimalNumber(word)) continue;
         if (isASCII(word))
             tokenize_ascii_word(word);
+#ifdef XP_WIN
+        else if (isJapanese(word))
+            tokenize_japanese_words(word);
+#endif
         else {
             nsresult rv;
             // use I18N scanner to break this word into meaningful semantic units.
Index: nsBayesianFilter.h
===================================================================
RCS file: /cvsroot/mozilla/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.h,v
retrieving revision 1.12.54.1
diff -u -r1.12.54.1 nsBayesianFilter.h
--- nsBayesianFilter.h	17 May 2004 00:57:48 -0000	1.12.54.1
+++ nsBayesianFilter.h	17 Jan 2005 14:50:02 -0000
@@ -123,6 +123,8 @@
 private:
     char* copyWord(const char* word, PRUint32 len);
     void tokenize_ascii_word(char * word);
+    // kakasi based segmentation; defined and called only in XP_WIN builds.
+    void tokenize_japanese_words(char* chunk);
     inline void addTokenForHeader(const char * aTokenPrefix, nsACString& aValue, PRBool aTokenizeValue = false);
     nsresult stripHTML(const nsAString& inString, nsAString& outString);
 