null+****@clear*****
null+****@clear*****
2010年 6月 10日 (木) 16:33:59 JST
Kouhei Sutou 2010-06-10 07:33:59 +0000 (Thu, 10 Jun 2010)

  New Revision: 8873b3d556a921f507b213f928aba9307608c4d1

  Log:
    report encoding difference between groonga and MeCab. #85

  Modified files:
    modules/tokenizers/mecab.c

  Modified: modules/tokenizers/mecab.c (+41 -0)
===================================================================
--- modules/tokenizers/mecab.c 2010-06-10 04:19:42 +0000 (1d97708)
+++ modules/tokenizers/mecab.c 2010-06-10 07:33:59 +0000 (acd5b65)
@@ -158,12 +158,53 @@ mecab_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   return NULL;
 }
 
+static void
+check_mecab_dictionary_encoding(grn_ctx *ctx)
+{
+  mecab_t *mecab;
+
+  mecab = mecab_new(0, NULL);
+  if (mecab) {
+    grn_encoding encoding;
+    const mecab_dictionary_info_t *dictionary;
+    int have_same_encoding_dictionary = 0;
+
+    encoding = GRN_CTX_GET_ENCODING(ctx);
+    dictionary = mecab_dictionary_info(mecab);
+    for (; dictionary; dictionary = dictionary->next) {
+      switch (encoding) {
+      case GRN_ENC_EUC_JP:
+        if (strcmp(dictionary->charset, "EUC-JP") == 0) {
+          have_same_encoding_dictionary = 1;
+        }
+        break;
+      case GRN_ENC_UTF8:
+        if (strcmp(dictionary->charset, "UTF-8") == 0) {
+          have_same_encoding_dictionary = 1;
+        }
+        break;
+      default:
+        break;
+      }
+    }
+    mecab_destroy(mecab);
+
+    if (!have_same_encoding_dictionary) {
+      ERR(GRN_TOKENIZER_ERROR,
+          "MeCab has no dictionary that uses the context encoding: <%s>",
+          grn_enctostr(encoding));
+    }
+  }
+}
+
 grn_rc
 grn_module_init_mecab(grn_ctx *ctx)
 {
   sole_mecab = NULL;
   CRITICAL_SECTION_INIT(sole_mecab_lock);
 
+  check_mecab_dictionary_encoding(ctx);
+
   return GRN_SUCCESS;
 }
 