? japanese_tokenize.5.diff ? japanese_tokenize.6.diff ? japanese_tokenize.7.diff ? japanese_tokenize.8.diff Index: nsBayesianFilter.cpp =================================================================== RCS file: /cvsroot/mozilla/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.cpp,v retrieving revision 1.35.6.7 diff -u -r1.35.6.7 nsBayesianFilter.cpp --- nsBayesianFilter.cpp 19 Oct 2004 21:25:42 -0000 1.35.6.7 +++ nsBayesianFilter.cpp 26 Jan 2005 16:36:05 -0000 @@ -79,6 +79,9 @@ #include "nsIHTMLToTextSink.h" #include "nsIDocumentEncoder.h" +// needed to decode the MIME-encoded subject header +#include "nsIMimeConverter.h" + #include "nsIncompleteGamma.h" #include @@ -303,6 +306,108 @@ return str; } +// One subtraction plus one conditional jump should be faster than two conditional jumps on most recent systems. +#define IN_RANGE(x, low, high) ((PRUint16)((x)-(low)) <= (high)-(low)) + +#define IS_JA_HIRAGANA(x) IN_RANGE(x, 0x3040, 0x309F) +// Swap the range with an XOR operation to reduce the number of conditional jumps. 
+#define IS_JA_KATAKANA(x) (IN_RANGE(x^0x0004, 0x30A0, 0x30FE)||(IN_RANGE(x, 0xFF66, 0xFF9F))) +#define IS_JA_KANJI(x) (IN_RANGE(x, 0x2E80, 0x2FDF)||IN_RANGE(x, 0x4E00, 0x9FAF)) +#define IS_JA_KUTEN(x) (((x)==0x3001)||((x)==0xFF64)||((x)==0xFF0E)) +#define IS_JA_TOUTEN(x) (((x)==0x3002)||((x)==0xFF61)||((x)==0xFF0C)) +#define IS_JA_SPACE(x) ((x)==0x3000) +#define IS_JA_FWLATAIN(x) IN_RANGE(x, 0xFF01, 0xFF5E) +#define IS_JA_FWNUMERAL(x) IN_RANGE(x, 0xFF10, 0xFF19) + +enum char_class{ + others = 0, + space, + hiragana, + katakana, + kanji, + kuten, + touten, + kigou, + fwlatain, + ascii +}; + +char_class getCharClass(PRUnichar c) +{ + char_class charClass = others; + + if(IS_JA_HIRAGANA(c)) + charClass = hiragana; + else if(IS_JA_KATAKANA(c)) + charClass = katakana; + else if(IS_JA_KANJI(c)) + charClass = kanji; + else if(IS_JA_KUTEN(c)) + charClass = kuten; + else if(IS_JA_TOUTEN(c)) + charClass = touten; + else if(IS_JA_FWLATAIN(c)) + charClass = fwlatain; + + return charClass; +} + +PRBool isFWNumeral(const PRUnichar* p1, const PRUnichar* p2) +{ + for(;p1 mimehdrpar = do_GetService(NS_MIMEHEADERPARAM_CONTRACTID); - + nsCOMPtr mimeConverter = do_GetService(NS_MIME_CONVERTER_CONTRACTID); + nsCString headerValue; nsCAutoString headerName; // we'll be normalizing all header names to lower case PRBool hasMore = PR_TRUE; @@ -390,16 +500,22 @@ if (headerName.Equals("subject")) { // we want to tokenize the subject - addTokenForHeader(headerName.get(), headerValue, PR_TRUE); + nsString str; + mimeConverter->DecodeMimeHeader(headerValue.get(), str); + nsCAutoString cstr = NS_ConvertUTF16toUTF8(str); + addTokenForHeader(headerName.get(), cstr, PR_TRUE); } // important: leave out sender field. 
To strong of an indicator break; - case 'x': // (2) X-Mailer / user-agent works best if it is untokenized, just fold the case and any leading/trailing white space - case 'u': - addTokenForHeader(headerName.get(), headerValue); - break; default: + if (headerName.Equals("x-mozilla-status")) break; + if (headerName.Equals("x-mozilla-status2")) break; + if (headerName.Equals("x-uidl")) break; + if (headerName.Equals("message-id")) break; + if (headerName.Equals("in-reply-to")) break; + if (headerName.Equals("date")) break; + if (headerName.Equals("reference")) break; addTokenForHeader(headerName.get(), headerValue); break; } // end switch @@ -484,7 +600,17 @@ nsString text = NS_ConvertUTF8toUCS2(aText); nsString strippedUCS2; stripHTML(text, strippedUCS2); - + + // convert 0x3000(full width space) into 0x0020 + nsString::iterator substr_start, substr_end; + strippedUCS2.BeginWriting(substr_start); + strippedUCS2.EndWriting(substr_end); + while (substr_start != substr_end) { + if (*substr_start == 0x3000) + *substr_start = 0x0020; + ++substr_start; + } + nsCString strippedStr = NS_ConvertUCS2toUTF8(strippedUCS2); char * strippedText = (char *) strippedStr.get(); // bleh PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("tokenize stripped html: %s", strippedText)); @@ -496,6 +622,10 @@ if (isDecimalNumber(word)) continue; if (isASCII(word)) tokenize_ascii_word(word); + else if (isJapanese(word)){ + tokenize_japanese_word(word); + mLanguage = NS_LITERAL_STRING("JA"); + } else { nsresult rv; // use I18N scanner to break this word into meaningful semantic units. 
@@ -809,7 +939,7 @@ NS_IMPL_ISUPPORTS2(nsBayesianFilter, nsIMsgFilterPlugin, nsIJunkMailPlugin) nsBayesianFilter::nsBayesianFilter() - : mGoodCount(0), mBadCount(0), + : mGoodCount(0), mBadCount(0), mGoodCountJ(0), mBadCountJ(0), mBatchLevel(0), mTrainingDataDirty(PR_FALSE) { if (!BayesianFilterLogModule) @@ -827,8 +957,13 @@ PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("junk probabilty threshold: %f", mJunkProbabilityThreshold)); - PRBool ok = (mGoodTokens && mBadTokens); + PRBool ok = (mGoodTokens && mBadTokens && mGoodTokensJ && mBadTokensJ); NS_ASSERTION(ok, "error allocating tokenizers"); + mGoodTokens.mLanguage = NS_LITERAL_STRING("GENERAL"); + mBadTokens.mLanguage = NS_LITERAL_STRING("GENERAL"); + mGoodTokensJ.mLanguage = NS_LITERAL_STRING("JA"); + mBadTokensJ.mLanguage = NS_LITERAL_STRING("JA"); + if (ok) readTrainingData(); else { @@ -954,6 +1089,11 @@ Token* tokens = tokenizer.copyTokens(); if (!tokens) return; + PRUint32& goodCount = (tokenizer.mLanguage == NS_LITERAL_STRING("JA")) ? mGoodCountJ : mGoodCount; + PRUint32& badCount = (tokenizer.mLanguage == NS_LITERAL_STRING("JA")) ? mBadCountJ : mBadCount; + Tokenizer& goodTokens = (tokenizer.mLanguage == NS_LITERAL_STRING("JA")) ? mGoodTokensJ : mGoodTokens; + Tokenizer& badTokens = (tokenizer.mLanguage == NS_LITERAL_STRING("JA")) ? mBadTokensJ : mBadTokens; + // the algorithm in "A Plan For Spam" assumes that you have a large good // corpus and a large junk corpus. 
// that won't be the case with users who first use the junk mail feature @@ -964,12 +1104,12 @@ // and if there are no bad tokens, assume the message is not junk // this will also "encourage" the user to train // see bug #194238 - if (listener && !mGoodCount && !mGoodTokens.countTokens()) { + if (listener && !goodCount && !goodTokens.countTokens()) { PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("no good tokens, assume junk")); listener->OnMessageClassified(messageURI, nsMsgJunkStatus(nsIJunkMailPlugin::JUNK)); return; } - if (listener && !mBadCount && !mBadTokens.countTokens()) { + if (listener && !badCount && !badTokens.countTokens()) { PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("no bad tokens, assume good")); listener->OnMessageClassified(messageURI, nsMsgJunkStatus(nsIJunkMailPlugin::GOOD)); return; @@ -977,15 +1117,15 @@ /* this part is similar to the Graham algorithm with some adjustments. */ PRUint32 i, goodclues=0, count = tokenizer.countTokens(); - double ngood = mGoodCount, nbad = mBadCount, prob; + double ngood = goodCount, nbad = badCount, prob; for (i = 0; i < count; ++i) { Token& token = tokens[i]; const char* word = token.mWord; - Token* t = mGoodTokens.get(word); + Token* t = goodTokens.get(word); double hamcount = ((t != NULL) ? t->mCount : 0); - t = mBadTokens.get(word); + t = badTokens.get(word); double spamcount = ((t != NULL) ? t->mCount : 0); // if hamcount and spam count are both 0, we could end up with a divide by 0 error, @@ -1182,6 +1322,11 @@ PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("observeMessage(%s) old=%d new=%d", messageURL, oldClassification, newClassification)); TokenEnumeration tokens = tokenizer.getTokens(); + PRUint32& goodCount = (tokenizer.mLanguage == NS_LITERAL_STRING("JA")) ? mGoodCountJ : mGoodCount; + PRUint32& badCount = (tokenizer.mLanguage == NS_LITERAL_STRING("JA")) ? mBadCountJ : mBadCount; + Tokenizer& goodTokens = (tokenizer.mLanguage == NS_LITERAL_STRING("JA")) ? 
mGoodTokensJ : mGoodTokens; + Tokenizer& badTokens = (tokenizer.mLanguage == NS_LITERAL_STRING("JA")) ? mBadTokensJ : mBadTokens; + // Uhoh...if the user is re-training then the message may already be classified and we are classifying it again with the same classification. // the old code would have removed the tokens for this message then added them back. But this really hurts the message occurrence // count for tokens if you just removed training.dat and are re-training. See Bug #237095 for more details. @@ -1194,17 +1339,17 @@ switch (oldClassification) { case nsIJunkMailPlugin::JUNK: // remove tokens from junk corpus. - if (mBadCount > 0) { - --mBadCount; - forgetTokens(mBadTokens, tokens); + if (badCount > 0) { + --badCount; + forgetTokens(badTokens, tokens); mTrainingDataDirty = PR_TRUE; } break; case nsIJunkMailPlugin::GOOD: // remove tokens from good corpus. - if (mGoodCount > 0) { - --mGoodCount; - forgetTokens(mGoodTokens, tokens); + if (goodCount > 0) { + --goodCount; + forgetTokens(goodTokens, tokens); mTrainingDataDirty = PR_TRUE; } break; @@ -1215,14 +1360,14 @@ switch (newClassification) { case nsIJunkMailPlugin::JUNK: // put tokens into junk corpus. - ++mBadCount; - rememberTokens(mBadTokens, tokens); + ++badCount; + rememberTokens(badTokens, tokens); mTrainingDataDirty = PR_TRUE; break; case nsIJunkMailPlugin::GOOD: // put tokens into good corpus. 
- ++mGoodCount; - rememberTokens(mGoodTokens, tokens); + ++goodCount; + rememberTokens(goodTokens, tokens); mTrainingDataDirty = PR_TRUE; break; } @@ -1241,7 +1386,7 @@ nsresult rv = NS_GetSpecialDirectory(NS_APP_USER_PROFILE_50_DIR, getter_AddRefs(profileDir)); NS_ENSURE_SUCCESS(rv, rv); - rv = profileDir->Append(NS_LITERAL_STRING("training.dat")); + rv = profileDir->Append(NS_LITERAL_STRING("training.2.dat")); if (NS_FAILED(rv)) return rv; file = do_QueryInterface(profileDir, &rv); @@ -1335,7 +1480,7 @@ return PR_TRUE; } -static const char kMagicCookie[] = { '\xFE', '\xED', '\xFA', '\xCE' }; +static const char kMagicCookie[] = { '\xFA', '\xCE', '\x00', '\x00' }; void nsBayesianFilter::writeTrainingData() { @@ -1353,7 +1498,12 @@ (writeUInt32(stream, mGoodCount) == 1) && (writeUInt32(stream, mBadCount) == 1) && writeTokens(stream, mGoodTokens) && - writeTokens(stream, mBadTokens))) { + writeTokens(stream, mBadTokens) && + (writeUInt32(stream, mGoodCountJ) == 1) && + (writeUInt32(stream, mBadCountJ) == 1) && + writeTokens(stream, mGoodTokensJ) && + writeTokens(stream, mBadTokensJ) + )) { NS_WARNING("failed to write training data."); fclose(stream); // delete the training data file, since it is potentially corrupt. 
@@ -1386,7 +1536,12 @@ (readUInt32(stream, &mGoodCount) == 1) && (readUInt32(stream, &mBadCount) == 1) && readTokens(stream, mGoodTokens) && - readTokens(stream, mBadTokens))) { + readTokens(stream, mBadTokens) && + (readUInt32(stream, &mGoodCountJ) == 1) && + (readUInt32(stream, &mBadCountJ) == 1) && + readTokens(stream, mGoodTokensJ) && + readTokens(stream, mBadTokensJ) + )) { NS_WARNING("failed to read training data."); PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("failed to read training data.")); } @@ -1397,7 +1552,9 @@ NS_IMETHODIMP nsBayesianFilter::GetUserHasClassified(PRBool *aResult) { *aResult = (mGoodCount && mGoodTokens.countTokens() || - mBadCount && mBadTokens.countTokens()); + mBadCount && mBadTokens.countTokens() || + mGoodCountJ && mGoodTokensJ.countTokens() || + mBadCountJ && mBadTokensJ.countTokens() ); return NS_OK; } @@ -1430,6 +1587,18 @@ mBadCount = 0; } + if (mGoodCountJ && mGoodTokensJ.countTokens()) + { + mGoodTokensJ.clearTokens(); + mGoodCountJ = 0; + } + + if (mBadCountJ && mBadTokensJ.countTokens()) + { + mBadTokensJ.clearTokens(); + mBadCountJ = 0; + } + // now remove training.dat nsCOMPtr file; nsresult rv = getTrainingFile(file); Index: nsBayesianFilter.h =================================================================== RCS file: /cvsroot/mozilla/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.h,v retrieving revision 1.12.54.1 diff -u -r1.12.54.1 nsBayesianFilter.h --- nsBayesianFilter.h 17 May 2004 00:57:48 -0000 1.12.54.1 +++ nsBayesianFilter.h 26 Jan 2005 16:36:05 -0000 @@ -43,6 +43,7 @@ #include "nsIMsgFilterPlugin.h" #include "nsISemanticUnitScanner.h" #include "pldhash.h" +#include "nsString.h" // XXX can't simply byte align arenas, must at least 2-byte align. 
#define PL_ARENA_CONST_ALIGN_MASK 1 @@ -120,9 +121,11 @@ */ void visit(PRBool (*f) (Token*, void*), void* data); + nsString mLanguage; private: char* copyWord(const char* word, PRUint32 len); void tokenize_ascii_word(char * word); + void tokenize_japanese_word(char* chunk, const char* aTokenPrefix = nsnull); inline void addTokenForHeader(const char * aTokenPrefix, nsACString& aValue, PRBool aTokenizeValue = false); nsresult stripHTML(const nsAString& inString, nsAString& outString); @@ -150,9 +153,9 @@ void readTrainingData(); protected: - Tokenizer mGoodTokens, mBadTokens; + Tokenizer mGoodTokens, mBadTokens, mGoodTokensJ, mBadTokensJ; double mJunkProbabilityThreshold; - PRUint32 mGoodCount, mBadCount; + PRUint32 mGoodCount, mBadCount, mGoodCountJ, mBadCountJ; PRUint32 mBatchLevel; // allow for nested batches to happen PRPackedBool mTrainingDataDirty; };