[Groonga-commit] groonga/groonga [master] report encoding difference between groonga and MeCab. #85

Zurück zum Archiv-Index

null+****@clear***** null+****@clear*****
2010年 6月 10日 (木) 16:33:59 JST


Kouhei Sutou	2010-06-10 07:33:59 +0000 (Thu, 10 Jun 2010)

  New Revision: 8873b3d556a921f507b213f928aba9307608c4d1

  Log:
    report encoding difference between groonga and MeCab. #85

  Modified files:
    modules/tokenizers/mecab.c

  Modified: modules/tokenizers/mecab.c (+41 -0)
===================================================================
--- modules/tokenizers/mecab.c    2010-06-10 04:19:42 +0000 (1d97708)
+++ modules/tokenizers/mecab.c    2010-06-10 07:33:59 +0000 (acd5b65)
@@ -158,12 +158,53 @@ mecab_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   return NULL;
 }
 
+static void
+check_mecab_dictionary_encoding(grn_ctx *ctx)
+{
+  mecab_t *mecab;
+
+  mecab = mecab_new(0, NULL);
+  if (mecab) {
+    grn_encoding encoding;
+    const mecab_dictionary_info_t *dictionary;
+    int have_same_encoding_dictionary = 0;
+
+    encoding = GRN_CTX_GET_ENCODING(ctx);
+    dictionary = mecab_dictionary_info(mecab);
+    for (; dictionary; dictionary = dictionary->next) {
+      switch (encoding) {
+      case GRN_ENC_EUC_JP:
+        if (strcmp(dictionary->charset, "EUC-JP") == 0) {
+          have_same_encoding_dictionary = 1;
+        }
+        break;
+      case GRN_ENC_UTF8:
+        if (strcmp(dictionary->charset, "UTF-8") == 0) {
+          have_same_encoding_dictionary = 1;
+        }
+        break;
+      default:
+        break;
+      }
+    }
+    mecab_destroy(mecab);
+
+    if (!have_same_encoding_dictionary) {
+      ERR(GRN_TOKENIZER_ERROR,
+          "MeCab has no dictionary that uses the context encoding: <%s>",
+          grn_enctostr(encoding));
+    }
+  }
+}
+
 grn_rc
 grn_module_init_mecab(grn_ctx *ctx)
 {
   sole_mecab = NULL;
   CRITICAL_SECTION_INIT(sole_mecab_lock);
 
+  check_mecab_dictionary_encoding(ctx);
+
   return GRN_SUCCESS;
 }
 




Groonga-commit メーリングリストの案内
Zurück zum Archiv-Index