Kouhei Sutou
null+****@clear*****
Tue May 22 18:01:35 JST 2018
Kouhei Sutou 2018-05-22 18:01:35 +0900 (Tue, 22 May 2018) New Revision: d5c6d1f38438c34272f91f046baaf86150f6177c https://github.com/groonga/groonga/commit/d5c6d1f38438c34272f91f046baaf86150f6177c Message: TokenNgram: fix a wrong source offset bug for loose case Added files: test/command/suite/select/function/highlight_html/lexicon/loose.expected test/command/suite/select/function/highlight_html/lexicon/loose.test test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.expected test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.test Modified files: lib/tokenizers.c Modified: lib/tokenizers.c (+19 -12) =================================================================== --- lib/tokenizers.c 2018-05-22 16:45:01 +0900 (cecf8af9a) +++ lib/tokenizers.c 2018-05-22 18:01:35 +0900 (e59fcd6ca) @@ -845,7 +845,6 @@ ngram_next(grn_ctx *ctx, uint32_t source_length = 0; uint32_t source_first_character_length = 0; uint64_t next_offset = tokenizer->source_offset; - grn_bool first_character = GRN_TRUE; grn_token_set_source_offset(ctx, token, tokenizer->source_offset); if (checks[0] == -1) { size_t n_leading_bytes = p - tokenizer->start; @@ -855,22 +854,30 @@ ngram_next(grn_ctx *ctx, if (!tokenizer->overlap) { next_offset += checks[-i]; } - first_character = GRN_FALSE; break; } } } - for (i = 0; i < data_size; i++) { - if (checks[i] > 0) { - if ((tokenizer->overlap && !first_character) || - !tokenizer->overlap) { - next_offset += checks[i]; - } - if (first_character) { - source_first_character_length = checks[i]; + { + uint64_t first_offset = 0; + for (i = 0; i < data_size; i++) { + if (checks[i] > 0) { + if ((tokenizer->overlap && first_offset == 0) || + !tokenizer->overlap) { + if (first_offset == 0) { + first_offset = checks[i]; + } + next_offset += checks[i]; + } + if (source_first_character_length == 0) { + source_first_character_length = checks[i]; + } + source_length += checks[i]; + } else if (checks[i] < 0) { + if 
(tokenizer->overlap) { + next_offset -= first_offset; + } } - source_length += checks[i]; - first_character = GRN_FALSE; } } grn_token_set_source_length(ctx, token, source_length); Added: test/command/suite/select/function/highlight_html/lexicon/loose.expected (+37 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/function/highlight_html/lexicon/loose.expected 2018-05-22 18:01:35 +0900 (070da0b24) @@ -0,0 +1,37 @@ +table_create Entries TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Entries body COLUMN_SCALAR ShortText +[[0,0.0,0.0],true] +table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer 'TokenNgram("loose_symbol", true, "report_source_location", true)' --normalizer 'NormalizerNFKC100' +[[0,0.0,0.0],true] +column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body +[[0,0.0,0.0],true] +load --table Entries +[ +{"body": "(あいうえお)"} +] +[[0,0.0,0.0],1] +select Entries --match_columns body --query 'いうえお' --output_columns 'highlight_html(body, Terms)' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 1 + ], + [ + [ + "highlight_html", + null + ] + ], + [ + "(あ<span class=\"keyword\">いうえお</span>)" + ] + ] + ] +] Added: test/command/suite/select/function/highlight_html/lexicon/loose.test (+20 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/function/highlight_html/lexicon/loose.test 2018-05-22 18:01:35 +0900 (704d557dc) @@ -0,0 +1,20 @@ + +table_create Entries TABLE_NO_KEY +column_create Entries body COLUMN_SCALAR ShortText + +table_create Terms TABLE_PAT_KEY ShortText \ + --default_tokenizer 'TokenNgram("loose_symbol", true, \ + "report_source_location", true)' \ + --normalizer 'NormalizerNFKC100' +column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body + +load --table Entries +[ +{"body": "(あいうえお)"} +] + +select Entries \ + --match_columns body \ + --query 'いうえお' \ + --output_columns 
'highlight_html(body, Terms)' + Added: test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.expected (+114 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.expected 2018-05-22 18:01:35 +0900 (e0ccd2903) @@ -0,0 +1,114 @@ +tokenize 'TokenNgram("loose_symbol", true, "report_source_location", true)' "(あいうえお)" 'NormalizerNFKC100' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + { + "value": "(", + "position": 0, + "force_prefix": false, + "source_offset": 0, + "source_length": 1, + "source_first_character_length": 1 + }, + { + "value": "あい", + "position": 1, + "force_prefix": false, + "source_offset": 1, + "source_length": 6, + "source_first_character_length": 3 + }, + { + "value": "いう", + "position": 2, + "force_prefix": false, + "source_offset": 4, + "source_length": 6, + "source_first_character_length": 3 + }, + { + "value": "うえ", + "position": 3, + "force_prefix": false, + "source_offset": 7, + "source_length": 6, + "source_first_character_length": 3 + }, + { + "value": "えお", + "position": 4, + "force_prefix": false, + "source_offset": 10, + "source_length": 6, + "source_first_character_length": 3 + }, + { + "value": "お", + "position": 5, + "force_prefix": false, + "source_offset": 13, + "source_length": 3, + "source_first_character_length": 3 + }, + { + "value": ")", + "position": 6, + "force_prefix": false, + "source_offset": 16, + "source_length": 1, + "source_first_character_length": 1 + }, + { + "value": "", + "position": 7, + "force_prefix": false, + "source_offset": 17, + "source_length": 0, + "source_first_character_length": 0 + }, + { + "value": "あい", + "position": 8, + "force_prefix": false, + "source_offset": 0, + "source_length": 7, + "source_first_character_length": 4 + }, + { + "value": "いう", + "position": 9, + "force_prefix": false, + "source_offset": 4, + "source_length": 6, +
"source_first_character_length": 3 + }, + { + "value": "うえ", + "position": 10, + "force_prefix": false, + "source_offset": 7, + "source_length": 6, + "source_first_character_length": 3 + }, + { + "value": "えお", + "position": 11, + "force_prefix": false, + "source_offset": 10, + "source_length": 6, + "source_first_character_length": 3 + }, + { + "value": "お", + "position": 12, + "force_prefix": false, + "source_offset": 13, + "source_length": 3, + "source_first_character_length": 3 + } + ] +] Added: test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.test (+5 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.test 2018-05-22 18:01:35 +0900 (8b9a1545a) @@ -0,0 +1,5 @@ +tokenize \ + 'TokenNgram("loose_symbol", true, \ + "report_source_location", true)' \ + "(あいうえお)" \ + 'NormalizerNFKC100' -------------- next part -------------- HTMLの添付ファイルを保管しました... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180522/741f3833/attachment-0001.htm