Kouhei Sutou
null+****@clear*****
Tue Apr 7 23:47:15 JST 2015
Kouhei Sutou	2015-04-07 23:47:15 +0900 (Tue, 07 Apr 2015)

  New Revision: 6d994a6f3413bef6800d3f3e5b8a40aa326b473c
  https://github.com/groonga/groonga/commit/6d994a6f3413bef6800d3f3e5b8a40aa326b473c

  Message:
    TokenRegexp: don't search overlapped tokens

    They are needless.

  Added files:
    test/command/suite/select/filter/index/regexp/long.expected
    test/command/suite/select/filter/index/regexp/long.test
    test/command/suite/tokenizers/regexp/get/long.expected
    test/command/suite/tokenizers/regexp/get/long.test
  Modified files:
    lib/tokenizers.c
    test/command/suite/tokenizers/regexp/get/end/four.expected

  Modified: lib/tokenizers.c (+9 -0)
===================================================================
--- lib/tokenizers.c    2015-04-07 22:55:41 +0900 (ea85cc6)
+++ lib/tokenizers.c    2015-04-07 23:47:15 +0900 (8ed0b8c)
@@ -475,6 +475,7 @@ typedef struct {
   struct {
     grn_bool have_begin;
     grn_bool have_end;
+    int32_t n_skip_tokens;
   } get;
   grn_bool is_begin;
   grn_bool is_end;
@@ -513,6 +514,7 @@ regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 
   tokenizer->get.have_begin = GRN_FALSE;
   tokenizer->get.have_end = GRN_FALSE;
+  tokenizer->get.n_skip_tokens = 0;
 
   tokenizer->is_begin = GRN_TRUE;
   tokenizer->is_end = GRN_FALSE;
@@ -681,6 +683,13 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
           status |= GRN_TOKEN_FORCE_PREFIX;
         }
       }
+    } else {
+      if (tokenizer->get.n_skip_tokens > 0) {
+        tokenizer->get.n_skip_tokens--;
+        status |= GRN_TOKEN_SKIP;
+      } else {
+        tokenizer->get.n_skip_tokens = ngram_unit - 1;
+      }
     }
   } else {
     if (tokenizer->next == end) {

  Added: test/command/suite/select/filter/index/regexp/long.expected (+52 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/regexp/long.expected    2015-04-07 23:47:15 +0900 (e3a3a5e)
@@ -0,0 +1,52 @@
+table_create Memos TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Memos content COLUMN_SCALAR Text
+[[0,0.0,0.0],true]
+table_create RegexpTokens TABLE_PAT_KEY ShortText --default_tokenizer TokenRegexp
+[[0,0.0,0.0],true]
+column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION Memos content
+[[0,0.0,0.0],true]
+load --table Memos
+[
+{"content": "Groonga"},
+{"content": "Mroonga"},
+{"content": "Rroonga and Ruby"}
+]
+[[0,0.0,0.0],3]
+select Memos --filter 'content @~ "roonga"'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        3
+      ],
+      [
+        [
+          "_id",
+          "UInt32"
+        ],
+        [
+          "content",
+          "Text"
+        ]
+      ],
+      [
+        1,
+        "Groonga"
+      ],
+      [
+        2,
+        "Mroonga"
+      ],
+      [
+        3,
+        "Rroonga and Ruby"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/select/filter/index/regexp/long.test (+16 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/regexp/long.test    2015-04-07 23:47:15 +0900 (758139d)
@@ -0,0 +1,16 @@
+table_create Memos TABLE_NO_KEY
+column_create Memos content COLUMN_SCALAR Text
+
+table_create RegexpTokens TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenRegexp
+column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \
+  Memos content
+
+load --table Memos
+[
+{"content": "Groonga"},
+{"content": "Mroonga"},
+{"content": "Rroonga and Ruby"}
+]
+
+select Memos --filter 'content @~ "roonga"'

  Modified: test/command/suite/tokenizers/regexp/get/end/four.expected (+0 -4)
===================================================================
--- test/command/suite/tokenizers/regexp/get/end/four.expected    2015-04-07 22:55:41 +0900 (b621183)
+++ test/command/suite/tokenizers/regexp/get/end/four.expected    2015-04-07 23:47:15 +0900 (ad58a34)
@@ -47,10 +47,6 @@ table_tokenize Lexicon "abcd\\z" --mode GET
       "position": 0
     },
     {
-      "value": "bc",
-      "position": 1
-    },
-    {
       "value": "cd",
       "position": 2
     },

  Added: test/command/suite/tokenizers/regexp/get/long.expected (+98 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/long.expected    2015-04-07 23:47:15 +0900 (d9023b0)
@@ -0,0 +1,98 @@
+table_create Lexicon TABLE_PAT_KEY ShortText --default_tokenizer TokenRegexp
+[[0,0.0,0.0],true]
+table_tokenize Lexicon "abcdefghijk" --mode ADD
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "",
+      "position": 0
+    },
+    {
+      "value": "ab",
+      "position": 1
+    },
+    {
+      "value": "bc",
+      "position": 2
+    },
+    {
+      "value": "cd",
+      "position": 3
+    },
+    {
+      "value": "de",
+      "position": 4
+    },
+    {
+      "value": "ef",
+      "position": 5
+    },
+    {
+      "value": "fg",
+      "position": 6
+    },
+    {
+      "value": "gh",
+      "position": 7
+    },
+    {
+      "value": "hi",
+      "position": 8
+    },
+    {
+      "value": "ij",
+      "position": 9
+    },
+    {
+      "value": "jk",
+      "position": 10
+    },
+    {
+      "value": "k",
+      "position": 11
+    },
+    {
+      "value": "",
+      "position": 12
+    }
+  ]
+]
+table_tokenize Lexicon "abcdefghijk" --mode GET
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "ab",
+      "position": 0
+    },
+    {
+      "value": "cd",
+      "position": 2
+    },
+    {
+      "value": "ef",
+      "position": 4
+    },
+    {
+      "value": "gh",
+      "position": 6
+    },
+    {
+      "value": "ij",
+      "position": 8
+    },
+    {
+      "value": "jk",
+      "position": 9
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/regexp/get/long.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/regexp/get/long.test    2015-04-07 23:47:15 +0900 (b3b2f0d)
@@ -0,0 +1,5 @@
+table_create Lexicon TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenRegexp
+table_tokenize Lexicon "abcdefghijk" --mode ADD
+
+table_tokenize Lexicon "abcdefghijk" --mode GET
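The sketch below is a minimal, self-contained C program (not Groonga source; only the names ngram_unit and n_skip_tokens are taken from the patch, everything else is illustrative) showing why the skip counter in regexp_next produces the GET-mode token stream in get/long.expected: in search (GET) mode a bigram tokenizer can skip the overlapped bigrams between every ngram_unit-th one, because a skipped bigram covers only characters already covered by its neighbors. The last bigram is still emitted so the end of the query text stays anchored.

  /* Minimal sketch of the overlapped-token skip logic (illustrative,
   * not Groonga code). Prints the tokens a bigram tokenizer would
   * emit for "abcdefghijk" in GET mode. */
  #include <stdio.h>
  #include <string.h>

  int
  main(void)
  {
    const char *text = "abcdefghijk";
    const int ngram_unit = 2;   /* TokenRegexp tokenizes into bigrams */
    int n_skip_tokens = 0;
    size_t length = strlen(text);
    size_t i;

    for (i = 0; i + ngram_unit <= length; i++) {
      int is_last = (i + ngram_unit == length);
      if (!is_last && n_skip_tokens > 0) {
        /* Overlapped token: corresponds to GRN_TOKEN_SKIP in the patch. */
        n_skip_tokens--;
        continue;
      }
      /* Emit this token; skip the next ngram_unit - 1 overlapped ones. */
      n_skip_tokens = ngram_unit - 1;
      printf("token: %.*s (position: %zu)\n", ngram_unit, text + i, i);
    }
    return 0;
  }

This prints ab(0), cd(2), ef(4), gh(6), ij(8), jk(9), matching the expected GET output above. With a WITH_POSITION index, matching these non-overlapped bigrams at the right relative positions is already enough to verify "abcdefghijk"; also looking up "bc", "de", and so on would only repeat work, which is what the commit message means by "they are needless".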