Kouhei Sutou 2018-11-26 10:37:01 +0900 (Mon, 26 Nov 2018) Revision: d658ad001c67b43e4565c9731fba6c3557ced933 https://github.com/groonga/groonga/commit/d658ad001c67b43e4565c9731fba6c3557ced933 Message: TokenDelimit: invert what pattern means Now, pattern specifies delimiter pattern not token pattern. Modified files: lib/tokenizers.c test/command/suite/tokenizers/delimit/options/pattern/no_match.expected test/command/suite/tokenizers/delimit/options/pattern/sentences.expected test/command/suite/tokenizers/delimit/options/pattern/sentences.test Modified: lib/tokenizers.c (+4 -4) =================================================================== --- lib/tokenizers.c 2018-11-26 10:36:49 +0900 (cb4478d44) +++ lib/tokenizers.c 2018-11-26 10:37:01 +0900 (5c5420853) @@ -337,14 +337,14 @@ delimit_next(grn_ctx *ctx, if (position == ONIG_MISMATCH) { grn_token_set_data(ctx, token, - NULL, - 0); + tokenizer->next, + tokenizer->end - tokenizer->next); grn_token_set_status(ctx, token, GRN_TOKEN_LAST); } else { grn_token_set_data(ctx, token, - tokenizer->start + region.beg[0], - region.end[0] - region.beg[0]); + tokenizer->next, + (tokenizer->start + region.beg[0]) - tokenizer->next); grn_token_set_status(ctx, token, GRN_TOKEN_CONTINUE); tokenizer->next = tokenizer->start + region.end[0]; onig_region_free(&region, 0); Modified: test/command/suite/tokenizers/delimit/options/pattern/no_match.expected (+15 -1) =================================================================== --- test/command/suite/tokenizers/delimit/options/pattern/no_match.expected 2018-11-26 10:36:49 +0900 (70e6dbf63) +++ test/command/suite/tokenizers/delimit/options/pattern/no_match.expected 2018-11-26 10:37:01 +0900 (7b2d3a084) @@ -1,2 +1,16 @@ tokenize 'TokenDelimit("pattern", "nonexistent")' "Hello" -[[0,0.0,0.0],[]] +[ + [ + 0, + 0.0, + 0.0 + ], + [ + { + "value": "Hello", + "position": 0, + "force_prefix": false, + "force_prefix_search": false + } + ] +] Modified: 
test/command/suite/tokenizers/delimit/options/pattern/sentences.expected (+5 -11) =================================================================== --- test/command/suite/tokenizers/delimit/options/pattern/sentences.expected 2018-11-26 10:36:49 +0900 (08ac7c822) +++ test/command/suite/tokenizers/delimit/options/pattern/sentences.expected 2018-11-26 10:37:01 +0900 (f8447b9a4) @@ -1,4 +1,4 @@ -tokenize 'TokenDelimit("pattern", "[^\\\\s].*?[.。]")' "りんごです。ペンです。This is an apple. Mr. X." +tokenize 'TokenDelimit("pattern", "(?:(?<!(?:Mr|bldg))[.]|。)\\\\s*")' "りんごです。ペンです。This is an apple. Mr. X." [ [ 0, @@ -7,34 +7,28 @@ tokenize 'TokenDelimit("pattern", "[^\\\\s].*?[.。]")' "りんごです。 ], [ { - "value": "りんごです。", + "value": "りんごです", "position": 0, "force_prefix": false, "force_prefix_search": false }, { - "value": "ペンです。", + "value": "ペンです", "position": 1, "force_prefix": false, "force_prefix_search": false }, { - "value": "This is an apple.", + "value": "This is an apple", "position": 2, "force_prefix": false, "force_prefix_search": false }, { - "value": "Mr.", + "value": "Mr. X", "position": 3, "force_prefix": false, "force_prefix_search": false - }, - { - "value": "X.", - "position": 4, - "force_prefix": false, - "force_prefix_search": false } ] ] Modified: test/command/suite/tokenizers/delimit/options/pattern/sentences.test (+1 -1) =================================================================== --- test/command/suite/tokenizers/delimit/options/pattern/sentences.test 2018-11-26 10:36:49 +0900 (8396e3284) +++ test/command/suite/tokenizers/delimit/options/pattern/sentences.test 2018-11-26 10:37:01 +0900 (22b497abb) @@ -1,3 +1,3 @@ tokenize \ - 'TokenDelimit("pattern", "[^\\\\s].*?[.。]")' \ + 'TokenDelimit("pattern", "(?:(?<!(?:Mr|bldg))[.]|。)\\\\s*")' \ "りんごです。ペンです。This is an apple. Mr. X." -------------- next part -------------- An HTML attachment was scrubbed... 
URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20181126/890ee4d9/attachment-0001.html>