Kouhei Sutou 2018-10-19 19:02:46 +0900 (Fri, 19 Oct 2018) Revision: d6ee2d85f6de6afa2a38e846998b58b68e93bb57 https://github.com/groonga/groonga/commit/d6ee2d85f6de6afa2a38e846998b58b68e93bb57 Message: TokenMecab: support subclasses and negative conditions "+CLASS/SUBCLASS0" matches class and subclass0. "-CLASS/SUBCLASS0" matches class and subclass0 but rejects the token. "+" matches all classes. It is used together with "-..." patterns. Added files: test/command/suite/tokenizers/mecab/options/target_class/negative.test test/command/suite/tokenizers/mecab/options/target_class/positive.expected test/command/suite/tokenizers/mecab/options/target_class/positive.test test/command/suite/tokenizers/mecab/options/target_class/subclass0.expected test/command/suite/tokenizers/mecab/options/target_class/subclass0.test test/command/suite/tokenizers/mecab/options/target_class/subclass1.expected test/command/suite/tokenizers/mecab/options/target_class/subclass1.test test/command/suite/tokenizers/mecab/options/target_class/subclass2.expected test/command/suite/tokenizers/mecab/options/target_class/subclass2.test Copied files: test/command/suite/tokenizers/mecab/options/target_class/negative.expected (from test/command/suite/tokenizers/mecab/options/target_class/one.expected) Modified files: plugins/tokenizers/mecab.c test/command/suite/tokenizers/mecab/options/target_class/one.expected Modified: plugins/tokenizers/mecab.c (+87 -9) =================================================================== --- plugins/tokenizers/mecab.c 2018-10-18 17:30:23 +0900 (536d6da0c) +++ plugins/tokenizers/mecab.c 2018-10-19 19:02:46 +0900 (91a080fbf) @@ -755,6 +755,24 @@ mecab_next_default_format_consume_token(grn_ctx *ctx, return surface_length; } +static grn_bool +mecab_next_default_format_match_class(grn_ctx *ctx, + const char *target_class, + size_t target_class_length, + const char *class, + size_t class_length) +{ + if (class_length == 0) { + return GRN_FALSE; + } + + if (target_class_length < 
class_length) { + return GRN_FALSE; + } + + return memcmp(target_class, class, class_length) == 0; +} + static void mecab_next_default_format_consume_needless_tokens(grn_ctx *ctx, grn_mecab_tokenizer *tokenizer) @@ -775,8 +793,9 @@ mecab_next_default_format_consume_needless_tokens(grn_ctx *ctx, size_t surface_length = 0; unsigned int i; grn_obj *feature_locations; - const char *class = NULL; - size_t class_length; + const size_t n_classes = 4; + const char *classes[4]; + size_t class_lengths[4]; last_next = tokenizer->next; surface_length = mecab_next_default_format_consume_token(ctx, @@ -788,22 +807,81 @@ mecab_next_default_format_consume_needless_tokens(grn_ctx *ctx, } feature_locations = &(tokenizer->feature_locations); - class_length = mecab_get_feature(ctx, - feature_locations, - GRN_MECAB_FEATURE_LOCATION_CLASS, - &class); + class_lengths[0] = mecab_get_feature(ctx, + feature_locations, + GRN_MECAB_FEATURE_LOCATION_CLASS, + &(classes[0])); + class_lengths[1] = mecab_get_feature(ctx, + feature_locations, + GRN_MECAB_FEATURE_LOCATION_SUBCLASS0, + &(classes[1])); + class_lengths[2] = mecab_get_feature(ctx, + feature_locations, + GRN_MECAB_FEATURE_LOCATION_SUBCLASS1, + &(classes[2])); + class_lengths[3] = mecab_get_feature(ctx, + feature_locations, + GRN_MECAB_FEATURE_LOCATION_SUBCLASS2, + &(classes[3])); for (i = 0; i < n_target_classes; i++) { const char *target_class; unsigned int target_class_length; + grn_bool positive = GRN_TRUE; + size_t j; + grn_bool matched = GRN_FALSE; + target_class_length = grn_vector_get_element(ctx, target_classes, i, &target_class, NULL, NULL); - if (target_class_length == class_length && - memcmp(target_class, class, target_class_length) == 0) { - is_target = GRN_TRUE; + if (target_class_length > 0) { + switch (target_class[0]) { + case '+' : + target_class++; + target_class_length--; + break; + case '-' : + positive = GRN_FALSE; + target_class++; + target_class_length--; + break; + default : + break; + } + } + + for (j = 0; j < 
n_classes; j++) { + const size_t class_length = class_lengths[j]; + + if (target_class_length == 0) { + is_target = positive; + matched = GRN_TRUE; + break; + } + + if (!mecab_next_default_format_match_class(ctx, + target_class, + target_class_length, + classes[j], + class_length)) { + break; + } + target_class += class_length; + target_class_length -= class_length; + if (target_class_length == 0) { + is_target = positive; + matched = GRN_TRUE; + break; + } + if (target_class[0] != '/') { + break; + } + target_class++; + target_class_length--; + } + if (matched) { break; } } Copied: test/command/suite/tokenizers/mecab/options/target_class/negative.expected (+5 -23) 69% =================================================================== --- test/command/suite/tokenizers/mecab/options/target_class/one.expected 2018-10-18 17:30:23 +0900 (4b5bd07e2) +++ test/command/suite/tokenizers/mecab/options/target_class/negative.expected 2018-10-19 19:02:46 +0900 (97c86be79) @@ -1,4 +1,4 @@ -tokenize 'TokenMecab("include_class", true)' '私の名前は中野です。' +tokenize 'TokenMecab("include_class", true, "target_class", "-助詞", "target_class", "+")' '私の名前は中野です。' [ [ 0, @@ -17,17 +17,8 @@ tokenize 'TokenMecab("include_class", true)' '私の名前は中野です。 } }, { - "value": "の", - "position": 1, - "force_prefix": false, - "metadata": { - "class": "助詞", - "subclass0": "連体化" - } - }, - { "value": "名前", - "position": 2, + "position": 1, "force_prefix": false, "metadata": { "class": "名詞", @@ -35,17 +26,8 @@ tokenize 'TokenMecab("include_class", true)' '私の名前は中野です。 } }, { - "value": "は", - "position": 3, - "force_prefix": false, - "metadata": { - "class": "助詞", - "subclass0": "係助詞" - } - }, - { "value": "中野", - "position": 4, + "position": 2, "force_prefix": false, "metadata": { "class": "名詞", @@ -56,7 +38,7 @@ tokenize 'TokenMecab("include_class", true)' '私の名前は中野です。 }, { "value": "です", - "position": 5, + "position": 3, "force_prefix": false, "metadata": { "class": "助動詞" @@ -64,7 +46,7 @@ tokenize 
'TokenMecab("include_class", true)' '私の名前は中野です。 }, { "value": "。", - "position": 6, + "position": 4, "force_prefix": false, "metadata": { "class": "記号", Added: test/command/suite/tokenizers/mecab/options/target_class/negative.test (+7 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/mecab/options/target_class/negative.test 2018-10-19 19:02:46 +0900 (71e2ae75a) @@ -0,0 +1,7 @@ +#@on-error omit +tokenize \ + 'TokenMecab("include_class", true, \ + "target_class", "-助詞", \ + "target_class", "+")' \ + '私の名前は中野です。' +#@on-error default Modified: test/command/suite/tokenizers/mecab/options/target_class/one.expected (+3 -38) =================================================================== --- test/command/suite/tokenizers/mecab/options/target_class/one.expected 2018-10-18 17:30:23 +0900 (4b5bd07e2) +++ test/command/suite/tokenizers/mecab/options/target_class/one.expected 2018-10-19 19:02:46 +0900 (b0d35cba0) @@ -1,4 +1,4 @@ -tokenize 'TokenMecab("include_class", true)' '私の名前は中野です。' +tokenize 'TokenMecab("include_class", true, "target_class", "名詞")' '私の名前は中野です。' [ [ 0, @@ -17,17 +17,8 @@ tokenize 'TokenMecab("include_class", true)' '私の名前は中野です。 } }, { - "value": "の", - "position": 1, - "force_prefix": false, - "metadata": { - "class": "助詞", - "subclass0": "連体化" - } - }, - { "value": "名前", - "position": 2, + "position": 1, "force_prefix": false, "metadata": { "class": "名詞", @@ -35,17 +26,8 @@ tokenize 'TokenMecab("include_class", true)' '私の名前は中野です。 } }, { - "value": "は", - "position": 3, - "force_prefix": false, - "metadata": { - "class": "助詞", - "subclass0": "係助詞" - } - }, - { "value": "中野", - "position": 4, + "position": 2, "force_prefix": false, "metadata": { "class": "名詞", @@ -53,23 +35,6 @@ tokenize 'TokenMecab("include_class", true)' '私の名前は中野です。 "subclass1": "人名", "subclass2": "姓" } - }, - { - "value": "です", - "position": 5, - "force_prefix": false, - "metadata": { - "class": "助動詞" - } - }, 
- { - "value": "。", - "position": 6, - "force_prefix": false, - "metadata": { - "class": "記号", - "subclass0": "句点" - } } ] ] Added: test/command/suite/tokenizers/mecab/options/target_class/positive.expected (+40 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/mecab/options/target_class/positive.expected 2018-10-19 19:02:46 +0900 (522fb872c) @@ -0,0 +1,40 @@ +tokenize 'TokenMecab("include_class", true, "target_class", "+名詞")' '私の名前は中野です。' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + { + "value": "私", + "position": 0, + "force_prefix": false, + "metadata": { + "class": "名詞", + "subclass0": "代名詞", + "subclass1": "一般" + } + }, + { + "value": "名前", + "position": 1, + "force_prefix": false, + "metadata": { + "class": "名詞", + "subclass0": "一般" + } + }, + { + "value": "中野", + "position": 2, + "force_prefix": false, + "metadata": { + "class": "名詞", + "subclass0": "固有名詞", + "subclass1": "人名", + "subclass2": "姓" + } + } + ] +] Added: test/command/suite/tokenizers/mecab/options/target_class/positive.test (+6 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/mecab/options/target_class/positive.test 2018-10-19 19:02:46 +0900 (ddc37197a) @@ -0,0 +1,6 @@ +#@on-error omit +tokenize \ + 'TokenMecab("include_class", true, \ + "target_class", "+名詞")' \ + '私の名前は中野です。' +#@on-error default Added: test/command/suite/tokenizers/mecab/options/target_class/subclass0.expected (+20 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/mecab/options/target_class/subclass0.expected 2018-10-19 19:02:46 +0900 (8cce33d12) @@ -0,0 +1,20 @@ +tokenize 'TokenMecab("include_class", true, "target_class", "名詞/代名詞")' '私の名前は中野です。' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + { + "value": "私", + "position": 0, + "force_prefix": false, + "metadata": { + "class": "名詞", + "subclass0": "代名詞", + 
"subclass1": "一般" + } + } + ] +] Added: test/command/suite/tokenizers/mecab/options/target_class/subclass0.test (+6 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/mecab/options/target_class/subclass0.test 2018-10-19 19:02:46 +0900 (b8a5bcd6e) @@ -0,0 +1,6 @@ +#@on-error omit +tokenize \ + 'TokenMecab("include_class", true, \ + "target_class", "名詞/代名詞")' \ + '私の名前は中野です。' +#@on-error default Added: test/command/suite/tokenizers/mecab/options/target_class/subclass1.expected (+21 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/mecab/options/target_class/subclass1.expected 2018-10-19 19:02:46 +0900 (91267ef17) @@ -0,0 +1,21 @@ +tokenize 'TokenMecab("include_class", true, "target_class", "名詞/固有名詞/人名")' '私の名前は中野です。東京生まれです。' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + { + "value": "中野", + "position": 0, + "force_prefix": false, + "metadata": { + "class": "名詞", + "subclass0": "固有名詞", + "subclass1": "人名", + "subclass2": "姓" + } + } + ] +] Added: test/command/suite/tokenizers/mecab/options/target_class/subclass1.test (+6 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/mecab/options/target_class/subclass1.test 2018-10-19 19:02:46 +0900 (ebee2c4b7) @@ -0,0 +1,6 @@ +#@on-error omit +tokenize \ + 'TokenMecab("include_class", true, \ + "target_class", "名詞/固有名詞/人名")' \ + '私の名前は中野です。東京生まれです。' +#@on-error default Added: test/command/suite/tokenizers/mecab/options/target_class/subclass2.expected (+21 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/mecab/options/target_class/subclass2.expected 2018-10-19 19:02:46 +0900 (5ebd03263) @@ -0,0 +1,21 @@ +tokenize 'TokenMecab("include_class", true, "target_class", "名詞/固有名詞/人名/姓")' '私の名前は中野太郎です。' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + { + 
"value": "中野", + "position": 0, + "force_prefix": false, + "metadata": { + "class": "名詞", + "subclass0": "固有名詞", + "subclass1": "人名", + "subclass2": "姓" + } + } + ] +] Added: test/command/suite/tokenizers/mecab/options/target_class/subclass2.test (+6 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/mecab/options/target_class/subclass2.test 2018-10-19 19:02:46 +0900 (c967bcbdc) @@ -0,0 +1,6 @@ +#@on-error omit +tokenize \ + 'TokenMecab("include_class", true, \ + "target_class", "名詞/固有名詞/人名/姓")' \ + '私の名前は中野太郎です。' +#@on-error default -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20181019/9e4e925f/attachment-0001.html>