Kouhei Sutou
null+****@clear*****
Thu Sep 20 17:11:09 JST 2018
Kouhei Sutou 2018-09-20 17:11:09 +0900 (Thu, 20 Sep 2018) Revision: 32b2e82cd29e89c0e26e6226d9baa8b2d8725a54 https://github.com/groonga/groonga/commit/32b2e82cd29e89c0e26e6226d9baa8b2d8725a54 Message: TokenNgram: add unify_alphabet option TokenNgram("unify_alphabet", false) == TokenBigramSplitAlpha (not exist ;p) Added files: test/command/suite/tokenizers/ngram/unify_alphabet.expected test/command/suite/tokenizers/ngram/unify_alphabet.test Modified files: lib/tokenizers.c Modified: lib/tokenizers.c (+30 -24) =================================================================== --- lib/tokenizers.c 2018-09-20 14:11:38 +0900 (657b8fa71) +++ lib/tokenizers.c 2018-09-20 17:11:09 +0900 (ca3037e5c) @@ -260,9 +260,9 @@ static grn_bool grn_ngram_tokenizer_remove_blank_enable = GRN_TRUE; typedef struct { uint8_t unit; - grn_bool uni_alpha; - grn_bool uni_digit; - grn_bool uni_symbol; + grn_bool unify_alphabet; + grn_bool unify_digit; + grn_bool unify_symbol; grn_bool ignore_blank; grn_bool remove_blank; grn_bool loose_symbol; @@ -302,9 +302,9 @@ static void ngram_options_init(grn_ngram_options *options, uint8_t unit) { options->unit = unit; - options->uni_alpha = GRN_TRUE; - options->uni_digit = GRN_TRUE; - options->uni_symbol = GRN_TRUE; + options->unify_alphabet = GRN_TRUE; + options->unify_digit = GRN_TRUE; + options->unify_symbol = GRN_TRUE; options->ignore_blank = GRN_FALSE; options->remove_blank = grn_ngram_tokenizer_remove_blank_enable; options->loose_symbol = GRN_FALSE; @@ -607,7 +607,7 @@ bigrams_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) { grn_ngram_options options; ngram_options_init(&options, 2); - options.uni_symbol = GRN_FALSE; + options.unify_symbol = GRN_FALSE; return ngram_init_deprecated(ctx, nargs, args, user_data, &options); } @@ -616,8 +616,8 @@ bigramsa_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) { grn_ngram_options options; ngram_options_init(&options, 2); - options.uni_symbol = GRN_FALSE; - 
options.uni_alpha = GRN_FALSE; + options.unify_symbol = GRN_FALSE; + options.unify_alphabet = GRN_FALSE; return ngram_init_deprecated(ctx, nargs, args, user_data, &options); } @@ -626,9 +626,9 @@ bigramsad_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data { grn_ngram_options options; ngram_options_init(&options, 2); - options.uni_symbol = GRN_FALSE; - options.uni_alpha = GRN_FALSE; - options.uni_digit = GRN_FALSE; + options.unify_symbol = GRN_FALSE; + options.unify_alphabet = GRN_FALSE; + options.unify_digit = GRN_FALSE; return ngram_init_deprecated(ctx, nargs, args, user_data, &options); } @@ -647,7 +647,7 @@ bigramis_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) grn_ngram_options options; ngram_options_init(&options, 2); options.ignore_blank = GRN_TRUE; - options.uni_symbol = GRN_FALSE; + options.unify_symbol = GRN_FALSE; return ngram_init_deprecated(ctx, nargs, args, user_data, &options); } @@ -657,8 +657,8 @@ bigramisa_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data grn_ngram_options options; ngram_options_init(&options, 2); options.ignore_blank = GRN_TRUE; - options.uni_symbol = GRN_FALSE; - options.uni_alpha = GRN_FALSE; + options.unify_symbol = GRN_FALSE; + options.unify_alphabet = GRN_FALSE; return ngram_init_deprecated(ctx, nargs, args, user_data, &options); } @@ -668,9 +668,9 @@ bigramisad_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_dat grn_ngram_options options; ngram_options_init(&options, 2); options.ignore_blank = GRN_TRUE; - options.uni_symbol = GRN_FALSE; - options.uni_alpha = GRN_FALSE; - options.uni_digit = GRN_FALSE; + options.unify_symbol = GRN_FALSE; + options.unify_alphabet = GRN_FALSE; + options.unify_digit = GRN_FALSE; return ngram_init_deprecated(ctx, nargs, args, user_data, &options); } @@ -730,6 +730,12 @@ ngram_open_options(grn_ctx *ctx, raw_options, i, options->include_removed_source_location); + } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, 
"unify_alphabet")) { + options->unify_alphabet = + grn_vector_get_element_bool(ctx, + raw_options, + i, + options->unify_alphabet); } } GRN_OPTION_VALUES_EACH_END(); @@ -815,7 +821,7 @@ ngram_next(grn_ctx *ctx, LOOSE_NEED_CHECK(cp, tokenizer); - if (cp && tokenizer->options.uni_alpha && + if (cp && tokenizer->options.unify_alphabet && GRN_STR_CTYPE(*cp) == GRN_CHAR_ALPHA) { while ((cl = grn_charlen_(ctx, (char *)r, (char *)e, encoding))) { n_characters++; @@ -827,7 +833,7 @@ ngram_next(grn_ctx *ctx, tokenizer->next = r; tokenizer->overlap = GRN_FALSE; } else if (cp && - tokenizer->options.uni_digit && + tokenizer->options.unify_digit && GRN_STR_CTYPE(*cp) == GRN_CHAR_DIGIT) { while ((cl = grn_charlen_(ctx, (char *)r, (char *)e, encoding))) { n_characters++; @@ -839,7 +845,7 @@ ngram_next(grn_ctx *ctx, tokenizer->next = r; tokenizer->overlap = GRN_FALSE; } else if (cp && - tokenizer->options.uni_symbol && + tokenizer->options.unify_symbol && GRN_STR_CTYPE(*cp) == GRN_CHAR_SYMBOL) { while ((cl = grn_charlen_(ctx, (char *)r, (char *)e, encoding))) { n_characters++; @@ -880,11 +886,11 @@ ngram_next(grn_ctx *ctx, LOOSE_NEED_CHECK(cp, tokenizer); if (!tokenizer->options.ignore_blank && GRN_STR_ISBLANK(*cp)) { break; } cp++; - if ((tokenizer->options.uni_alpha && + if ((tokenizer->options.unify_alphabet && GRN_STR_CTYPE(*cp) == GRN_CHAR_ALPHA) || - (tokenizer->options.uni_digit && + (tokenizer->options.unify_digit && GRN_STR_CTYPE(*cp) == GRN_CHAR_DIGIT) || - (tokenizer->options.uni_symbol && + (tokenizer->options.unify_symbol && GRN_STR_CTYPE(*cp) == GRN_CHAR_SYMBOL)) { break; } Added: test/command/suite/tokenizers/ngram/unify_alphabet.expected (+35 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/ngram/unify_alphabet.expected 2018-09-20 17:11:09 +0900 (1e6d748eb) @@ -0,0 +1,35 @@ +tokenize 'TokenNgram("unify_alphabet", false)' "abcde" NormalizerAuto +[ + [ + 0, + 0.0, + 0.0 + ], + [ + { + 
"value": "ab", + "position": 0, + "force_prefix": false + }, + { + "value": "bc", + "position": 1, + "force_prefix": false + }, + { + "value": "cd", + "position": 2, + "force_prefix": false + }, + { + "value": "de", + "position": 3, + "force_prefix": false + }, + { + "value": "e", + "position": 4, + "force_prefix": false + } + ] +] Added: test/command/suite/tokenizers/ngram/unify_alphabet.test (+4 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/ngram/unify_alphabet.test 2018-09-20 17:11:09 +0900 (b37dfdca6) @@ -0,0 +1,4 @@ +tokenize \ + 'TokenNgram("unify_alphabet", false)' \ + "abcde" \ + NormalizerAuto -------------- next part -------------- HTMLの添付ファイルを保管しました... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180920/48aa8d43/attachment-0001.htm