Kouhei Sutou
null+****@clear*****
Fri Apr 6 15:30:48 JST 2018
Kouhei Sutou 2018-04-06 15:30:48 +0900 (Fri, 06 Apr 2018) New Revision: e30e135a10fc5b2d0169a5738929ed1194e77e16 https://github.com/groonga/groonga/commit/e30e135a10fc5b2d0169a5738929ed1194e77e16 Message: TokenNgram: add "loose_symbol" option Added files: test/command/suite/tokenizers/ngram/loose_symbol/add.expected test/command/suite/tokenizers/ngram/loose_symbol/add.test test/command/suite/tokenizers/ngram/loose_symbol/get.expected test/command/suite/tokenizers/ngram/loose_symbol/get.test test/command/suite/tokenizers/ngram/n.expected test/command/suite/tokenizers/ngram/n.test Copied files: test/command/suite/table_create/default_tokenizer/ngram/options/multiple.expected (from test/command/suite/table_create/default_tokenizer/ngram/options/n.expected) test/command/suite/table_create/default_tokenizer/ngram/options/multiple.test (from test/command/suite/table_create/default_tokenizer/ngram/options/n.test) Modified files: lib/tokenizers.c Renamed files: test/command/suite/table_create/default_tokenizer/ngram/options/one.expected (from test/command/suite/table_create/default_tokenizer/ngram/options/n.expected) test/command/suite/table_create/default_tokenizer/ngram/options/one.test (from test/command/suite/table_create/default_tokenizer/ngram/options/n.test) Modified: lib/tokenizers.c (+126 -11) =================================================================== --- lib/tokenizers.c 2018-04-06 15:29:18 +0900 (f96640b63) +++ lib/tokenizers.c 2018-04-06 15:30:48 +0900 (998b3ffca) @@ -246,6 +246,7 @@ typedef struct { grn_bool uni_digit; grn_bool uni_symbol; grn_bool ignore_blank; + grn_bool loose_symbol; } grn_ngram_options; typedef struct { @@ -253,6 +254,13 @@ typedef struct { grn_tokenizer_query *query; grn_ngram_options options; grn_bool overlap; + struct { + grn_bool ing; + grn_bool need; + grn_bool need_end_mark; + grn_obj text; + uint_least8_t *ctypes; + } loose; int32_t pos; uint32_t skip; const unsigned char *next; @@ -270,6 +278,69 @@ 
ngram_options_init(grn_ngram_options *options, uint8_t unit) options->uni_digit = GRN_TRUE; options->uni_symbol = GRN_TRUE; options->ignore_blank = GRN_FALSE; + options->loose_symbol = GRN_FALSE; +} + +static void +ngram_switch_to_loose_mode(grn_ctx *ctx, + grn_ngram_tokenizer *tokenizer) +{ + const char *normalized; + unsigned int normalized_length_in_bytes; + unsigned int normalized_length_in_chars; + const char *normalized_end; + const uint_least8_t *types = tokenizer->ctypes; + + grn_string_get_normalized(ctx, + tokenizer->query->normalized_query, + &normalized, + &normalized_length_in_bytes, + &normalized_length_in_chars); + normalized_end = normalized + normalized_length_in_bytes; + + if (types) { + uint_least8_t *loose_types; + + tokenizer->loose.ctypes = + GRN_MALLOC(sizeof(uint_least8_t) * normalized_length_in_chars); + if (!tokenizer->loose.ctypes) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "[tokenizer][ngram][loose] " + "failed to allocate memory for character types"); + return; + } + loose_types = tokenizer->loose.ctypes; + while (normalized < normalized_end) { + size_t length; + length = grn_charlen_(ctx, + (char *)normalized, + (char *)normalized_end, + tokenizer->query->encoding); + if (length == 0) { + break; + } + if (!(tokenizer->options.loose_symbol && + GRN_STR_CTYPE(*types) == GRN_CHAR_SYMBOL)) { + GRN_TEXT_PUT(ctx, &(tokenizer->loose.text), normalized, length); + *loose_types = *types; + loose_types++; + } + normalized += length; + types++; + } + tokenizer->next = + (const unsigned char *)GRN_TEXT_VALUE(&(tokenizer->loose.text)); + tokenizer->end = tokenizer->next + GRN_TEXT_LEN(&(tokenizer->loose.text)); + tokenizer->ctypes = tokenizer->loose.ctypes; + } else { + tokenizer->next = normalized; + tokenizer->end = normalized_end; + } + + tokenizer->pos = 0; + tokenizer->skip = 0; + tokenizer->overlap = GRN_FALSE; + tokenizer->loose.ing = GRN_TRUE; } static grn_obj * @@ -310,6 +381,11 @@ ngram_init_raw(grn_ctx *ctx, tokenizer->options = *options; 
tokenizer->overlap = GRN_FALSE; + tokenizer->loose.ing = GRN_FALSE; + tokenizer->loose.need = GRN_FALSE; + tokenizer->loose.need_end_mark = GRN_FALSE; + GRN_TEXT_INIT(&(tokenizer->loose.text), 0); + tokenizer->loose.ctypes = NULL; tokenizer->pos = 0; tokenizer->skip = 0; @@ -320,6 +396,11 @@ ngram_init_raw(grn_ctx *ctx, tokenizer->end = tokenizer->next + normalized_length_in_bytes; tokenizer->ctypes = grn_string_get_types(ctx, tokenizer->query->normalized_query); + + if (tokenizer->query->tokenize_mode == GRN_TOKEN_GET) { + ngram_switch_to_loose_mode(ctx, tokenizer); + } + return NULL; } @@ -431,7 +512,7 @@ ngram_open_options(grn_ctx *ctx, if (!options) { ERR(GRN_NO_MEMORY_AVAILABLE, "[tokenizer][ngram] " - "failed to allocate Ngram options"); + "failed to allocate memory for options"); return NULL; } @@ -447,11 +528,11 @@ ngram_open_options(grn_ctx *ctx, raw_options, i, options->unit); - /* } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "loose_symbol")) { */ - /* options->loose_symbol = grn_vector_get_element_bool(ctx, */ - /* raw_options, */ - /* i, */ - /* options->loose_symbol); */ + } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "loose_symbol")) { + options->loose_symbol = grn_vector_get_element_bool(ctx, + raw_options, + i, + options->loose_symbol); } } GRN_OPTION_VALUES_EACH_END(); @@ -492,6 +573,26 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) int32_t len = 0, pos = tokenizer->pos + tokenizer->skip; grn_token_status status = 0; const uint_least8_t *cp = tokenizer->ctypes ? 
tokenizer->ctypes + pos : NULL; + + if (tokenizer->loose.ing && tokenizer->loose.need_end_mark) { + grn_tokenizer_token_push(ctx, + &(tokenizer->token), + GRN_TOKENIZER_END_MARK_UTF8, + GRN_TOKENIZER_END_MARK_UTF8_LEN, + status); + ngram_switch_to_loose_mode(ctx, tokenizer); + tokenizer->loose.need_end_mark = GRN_FALSE; + return NULL; + } + + if (cp && + !tokenizer->loose.ing && + !tokenizer->loose.need && + tokenizer->options.loose_symbol && + GRN_STR_CTYPE(*cp) == GRN_CHAR_SYMBOL) { + tokenizer->loose.need = GRN_TRUE; + } + if (cp && tokenizer->options.uni_alpha && GRN_STR_CTYPE(*cp) == GRN_CHAR_ALPHA) { while ((cl = grn_charlen_(ctx, (char *)r, (char *)e, @@ -589,11 +690,21 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) tokenizer->skip = tokenizer->overlap ? 1 : len; } if (r == e) { status |= GRN_TOKEN_REACH_END; } - grn_tokenizer_token_push(ctx, - &(tokenizer->token), - (const char *)p, - r - p, - status); + + { + if ((status & (GRN_TOKEN_LAST | GRN_TOKEN_REACH_END)) && + !tokenizer->loose.ing && tokenizer->loose.need) { + status &= ~(GRN_TOKEN_LAST | GRN_TOKEN_REACH_END); + tokenizer->loose.ing = GRN_TRUE; + tokenizer->loose.need_end_mark = GRN_TRUE; + } + grn_tokenizer_token_push(ctx, + &(tokenizer->token), + (const char *)p, + r - p, + status); + } + return NULL; } @@ -604,6 +715,10 @@ ngram_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) if (!tokenizer) { return NULL; } + if (tokenizer->loose.ctypes) { + GRN_FREE(tokenizer->loose.ctypes); + } + GRN_OBJ_FIN(ctx, &(tokenizer->loose.text)); grn_tokenizer_token_fin(ctx, &(tokenizer->token)); grn_tokenizer_query_close(ctx, tokenizer->query); GRN_FREE(tokenizer); Copied: test/command/suite/table_create/default_tokenizer/ngram/options/multiple.expected (+13 -34) 69% =================================================================== --- test/command/suite/table_create/default_tokenizer/ngram/options/n.expected 2018-04-06 15:29:18 +0900 (0e1c9d971) +++ 
test/command/suite/table_create/default_tokenizer/ngram/options/multiple.expected 2018-04-06 15:30:48 +0900 (476e32a4a) @@ -2,13 +2,13 @@ table_create Memos TABLE_NO_KEY [[0,0.0,0.0],true] column_create Memos content COLUMN_SCALAR Text [[0,0.0,0.0],true] -table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer 'TokenNgram("n", 3)' +table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer 'TokenNgram("n", 3, "loose_symbol", true)' --normalizer NormalizerAuto [[0,0.0,0.0],true] column_create Terms memos_content COLUMN_INDEX Memos content [[0,0.0,0.0],true] load --table Memos [ -{"content": "This is a pen."}, +{"content": "090-1234-5678"}, {"content": "これはペンです。"} ] [[0,0.0,0.0],2] @@ -22,7 +22,7 @@ select Terms --output_columns _key --limit -1 [ [ [ - 21 + 14 ], [ [ @@ -31,43 +31,19 @@ select Terms --output_columns _key --limit -1 ] ], [ - " a " + "-" ], [ - " is" + "090" ], [ - " pe" + "09012345678" ], [ - "." + "1234" ], [ - "Thi" - ], - [ - "a p" - ], - [ - "en." - ], - [ - "his" - ], - [ - "is " - ], - [ - "n." 
- ], - [ - "pen" - ], - [ - "s a" - ], - [ - "s i" + "5678" ], [ "。" @@ -76,10 +52,10 @@ select Terms --output_columns _key --limit -1 "これは" ], [ - "す。" + "す" ], [ - "です。" + "です" ], [ "はペン" @@ -92,6 +68,9 @@ select Terms --output_columns _key --limit -1 ], [ "ンです" + ], + [ + "" ] ] ] Copied: test/command/suite/table_create/default_tokenizer/ngram/options/multiple.test (+3 -2) 70% =================================================================== --- test/command/suite/table_create/default_tokenizer/ngram/options/n.test 2018-04-06 15:29:18 +0900 (199aa2878) +++ test/command/suite/table_create/default_tokenizer/ngram/options/multiple.test 2018-04-06 15:30:48 +0900 (a1e9eb08f) @@ -2,12 +2,13 @@ table_create Memos TABLE_NO_KEY column_create Memos content COLUMN_SCALAR Text table_create Terms TABLE_PAT_KEY ShortText \ - --default_tokenizer 'TokenNgram("n", 3)' + --default_tokenizer 'TokenNgram("n", 3, "loose_symbol", true)' \ + --normalizer NormalizerAuto column_create Terms memos_content COLUMN_INDEX Memos content load --table Memos [ -{"content": "This is a pen."}, +{"content": "090-1234-5678"}, {"content": "これはペンです。"} ] Renamed: test/command/suite/table_create/default_tokenizer/ngram/options/one.expected (+0 -0) 100% =================================================================== Renamed: test/command/suite/table_create/default_tokenizer/ngram/options/one.test (+0 -0) 100% =================================================================== Added: test/command/suite/tokenizers/ngram/loose_symbol/add.expected (+45 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/ngram/loose_symbol/add.expected 2018-04-06 15:30:48 +0900 (a8b574720) @@ -0,0 +1,45 @@ +tokenize 'TokenNgram("loose_symbol", true)' "090-1234-5678" NormalizerAuto --mode ADD +[ + [ + 0, + 0.0, + 0.0 + ], + [ + { + "value": "090", + "position": 0, + "force_prefix": false + }, + { + "value": "-", + "position": 1, + "force_prefix": 
false + }, + { + "value": "1234", + "position": 2, + "force_prefix": false + }, + { + "value": "-", + "position": 3, + "force_prefix": false + }, + { + "value": "5678", + "position": 4, + "force_prefix": false + }, + { + "value": "", + "position": 5, + "force_prefix": false + }, + { + "value": "09012345678", + "position": 6, + "force_prefix": false + } + ] +] Added: test/command/suite/tokenizers/ngram/loose_symbol/add.test (+5 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/ngram/loose_symbol/add.test 2018-04-06 15:30:48 +0900 (8fb1d61a2) @@ -0,0 +1,5 @@ +tokenize \ + 'TokenNgram("loose_symbol", true)' \ + "090-1234-5678" \ + NormalizerAuto \ + --mode ADD Added: test/command/suite/tokenizers/ngram/loose_symbol/get.expected (+2 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/ngram/loose_symbol/get.expected 2018-04-06 15:30:48 +0900 (601e8b3cb) @@ -0,0 +1,2 @@ +tokenize 'TokenNgram("loose_symbol", true)' "090-1234-5678" NormalizerAuto --mode GET +[[0,0.0,0.0],[{"value":"09012345678","position":0,"force_prefix":false}]] Added: test/command/suite/tokenizers/ngram/loose_symbol/get.test (+5 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/ngram/loose_symbol/get.test 2018-04-06 15:30:48 +0900 (8af91ffac) @@ -0,0 +1,5 @@ +tokenize \ + 'TokenNgram("loose_symbol", true)' \ + "090-1234-5678" \ + NormalizerAuto \ + --mode GET Added: test/command/suite/tokenizers/ngram/n.expected (+45 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/ngram/n.expected 2018-04-06 15:30:48 +0900 (b4af32bdb) @@ -0,0 +1,45 @@ +tokenize 'TokenNgram("n", 3)' "abcdefg" +[ + [ + 0, + 0.0, + 0.0 + ], + [ + { + "value": "abc", + "position": 0, + "force_prefix": false + }, + { + "value": "bcd", + 
"position": 1, + "force_prefix": false + }, + { + "value": "cde", + "position": 2, + "force_prefix": false + }, + { + "value": "def", + "position": 3, + "force_prefix": false + }, + { + "value": "efg", + "position": 4, + "force_prefix": false + }, + { + "value": "fg", + "position": 5, + "force_prefix": false + }, + { + "value": "g", + "position": 6, + "force_prefix": false + } + ] +] Added: test/command/suite/tokenizers/ngram/n.test (+1 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/ngram/n.test 2018-04-06 15:30:48 +0900 (17d83ba64) @@ -0,0 +1 @@ +tokenize 'TokenNgram("n", 3)' "abcdefg" -------------- next part -------------- An HTML attachment was scrubbed... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180406/34695edc/attachment-0001.htm