Kouhei Sutou
null+****@clear*****
Wed May 9 15:07:13 JST 2018
Kouhei Sutou 2018-05-09 15:07:13 +0900 (Wed, 09 May 2018) New Revision: 80c83fa473f396e9d78494cab973bf1ba6e92e45 https://github.com/groonga/groonga/commit/80c83fa473f396e9d78494cab973bf1ba6e92e45 Message: TokenNgram: add report_source_location option Added files: test/command/suite/tokenizers/ngram/report_source_location.expected test/command/suite/tokenizers/ngram/report_source_location.test Modified files: lib/tokenizers.c Modified: lib/tokenizers.c (+120 -14) =================================================================== --- lib/tokenizers.c 2018-05-09 14:19:02 +0900 (52d21a71e) +++ lib/tokenizers.c 2018-05-09 15:07:13 +0900 (b9c578a8c) @@ -267,6 +267,7 @@ typedef struct { grn_bool remove_blank; grn_bool loose_symbol; grn_bool loose_blank; + grn_bool report_source_location; } grn_ngram_options; typedef struct { @@ -280,13 +281,17 @@ typedef struct { grn_bool need_end_mark; grn_obj text; uint_least8_t *ctypes; + int16_t *checks; } loose; int32_t pos; uint32_t skip; + const unsigned char *start; const unsigned char *next; const unsigned char *end; const uint_least8_t *ctypes; + const int16_t *checks; uint32_t tail; + uint64_t source_offset; } grn_ngram_tokenizer; static void @@ -300,6 +305,7 @@ ngram_options_init(grn_ngram_options *options, uint8_t unit) options->remove_blank = grn_ngram_tokenizer_remove_blank_enable; options->loose_symbol = GRN_FALSE; options->loose_blank = GRN_FALSE; + options->report_source_location = GRN_FALSE; } static void @@ -312,6 +318,7 @@ ngram_switch_to_loose_mode(grn_ctx *ctx, unsigned int normalized_length_in_chars; const char *normalized_end; const uint_least8_t *types = tokenizer->ctypes; + const int16_t *checks = tokenizer->checks; string = grn_tokenizer_query_get_normalized_string(ctx, tokenizer->query); grn_string_get_normalized(ctx, @@ -325,6 +332,8 @@ ngram_switch_to_loose_mode(grn_ctx *ctx, grn_encoding encoding = grn_tokenizer_query_get_encoding(ctx, tokenizer->query); uint_least8_t *loose_types; + int16_t *loose_checks 
= NULL; + const int16_t *removed_checks = NULL; tokenizer->loose.ctypes = GRN_MALLOC(sizeof(uint_least8_t) * normalized_length_in_chars); @@ -335,6 +344,18 @@ ngram_switch_to_loose_mode(grn_ctx *ctx, return; } loose_types = tokenizer->loose.ctypes; + if (checks) { + tokenizer->loose.checks = + GRN_CALLOC(sizeof(int16_t) * normalized_length_in_bytes); + if (!tokenizer->loose.checks) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "[tokenizer][ngram][loose] " + "failed to allocate memory for character offsets"); + return; + } + } + loose_types = tokenizer->loose.ctypes; + loose_checks = tokenizer->loose.checks; while (normalized < normalized_end) { size_t length; length = grn_charlen_(ctx, @@ -344,27 +365,50 @@ ngram_switch_to_loose_mode(grn_ctx *ctx, if (length == 0) { break; } - if (!((tokenizer->options.loose_symbol && - GRN_STR_CTYPE(*types) == GRN_CHAR_SYMBOL) || - (!tokenizer->options.remove_blank && - tokenizer->options.loose_blank && - GRN_STR_ISBLANK(*types)))) { + if ((tokenizer->options.loose_symbol && + GRN_STR_CTYPE(*types) == GRN_CHAR_SYMBOL) || + (!tokenizer->options.remove_blank && + tokenizer->options.loose_blank && + GRN_STR_ISBLANK(*types))) { + if (!removed_checks) { + removed_checks = checks; + } + } else { GRN_TEXT_PUT(ctx, &(tokenizer->loose.text), normalized, length); *loose_types = *types; if (tokenizer->options.loose_blank && GRN_STR_ISBLANK(*types)) { *loose_types &= ~GRN_STR_BLANK; } loose_types++; + if (loose_checks) { + size_t i; + for (; removed_checks && removed_checks < checks; removed_checks++) { + if (*removed_checks > 0) { + *loose_checks += *removed_checks; + } + } + removed_checks = NULL; + for (i = 0; i < length; i++) { + loose_checks[i] += checks[i]; + } + loose_checks += length; + } } normalized += length; types++; + if (checks) { + checks += length; + } } - tokenizer->next = + tokenizer->start = (const unsigned char *)GRN_TEXT_VALUE(&(tokenizer->loose.text)); - tokenizer->end = tokenizer->next + GRN_TEXT_LEN(&(tokenizer->loose.text)); + 
tokenizer->next = tokenizer->start; + tokenizer->end = tokenizer->start + GRN_TEXT_LEN(&(tokenizer->loose.text)); tokenizer->ctypes = tokenizer->loose.ctypes; + tokenizer->checks = tokenizer->loose.checks; } else { - tokenizer->next = normalized; + tokenizer->start = normalized; + tokenizer->next = tokenizer->start; tokenizer->end = normalized_end; } @@ -372,6 +416,7 @@ ngram_switch_to_loose_mode(grn_ctx *ctx, tokenizer->skip = 0; tokenizer->overlap = GRN_FALSE; tokenizer->loose.ing = GRN_TRUE; + tokenizer->source_offset = 0; } static void * @@ -388,6 +433,9 @@ ngram_init_raw(grn_ctx *ctx, if (!options->remove_blank) { normalize_flags &= ~GRN_STRING_REMOVE_BLANK; } + if (options->report_source_location) { + normalize_flags |= GRN_STRING_WITH_CHECKS; + } grn_tokenizer_query_set_normalize_flags(ctx, query, normalize_flags); if (!(tokenizer = GRN_MALLOC(sizeof(grn_ngram_tokenizer)))) { @@ -408,8 +456,10 @@ ngram_init_raw(grn_ctx *ctx, tokenizer->loose.need_end_mark = GRN_FALSE; GRN_TEXT_INIT(&(tokenizer->loose.text), 0); tokenizer->loose.ctypes = NULL; + tokenizer->loose.checks = NULL; tokenizer->pos = 0; tokenizer->skip = 0; + tokenizer->source_offset = 0; { grn_obj *string; @@ -421,9 +471,11 @@ ngram_init_raw(grn_ctx *ctx, string, &normalized_raw, &normalized_length_in_bytes, NULL); - tokenizer->next = (const unsigned char *)normalized_raw; - tokenizer->end = tokenizer->next + normalized_length_in_bytes; + tokenizer->start = (const unsigned char *)normalized_raw; + tokenizer->next = tokenizer->start; + tokenizer->end = tokenizer->start + normalized_length_in_bytes; tokenizer->ctypes = grn_string_get_types(ctx, string); + tokenizer->checks = grn_string_get_checks(ctx, string); } if (grn_tokenizer_query_get_mode(ctx, tokenizer->query) == GRN_TOKEN_GET) { @@ -594,9 +646,15 @@ ngram_open_options(grn_ctx *ctx, options->loose_symbol); } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "loose_blank")) { options->loose_blank = grn_vector_get_element_bool(ctx, - raw_options, 
- i, - options->loose_blank); + raw_options, + i, + options->loose_blank); + } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "report_source_location")) { + options->report_source_location = + grn_vector_get_element_bool(ctx, + raw_options, + i, + options->report_source_location); } } GRN_OPTION_VALUES_EACH_END(); @@ -641,14 +699,22 @@ ngram_next(grn_ctx *ctx, int32_t pos = tokenizer->pos + tokenizer->skip; grn_token_status status = 0; const uint_least8_t *cp = tokenizer->ctypes ? tokenizer->ctypes + pos : NULL; + const int16_t *checks = NULL; grn_encoding encoding = grn_tokenizer_query_get_encoding(ctx, query); + if (tokenizer->checks) { + checks = tokenizer->checks + (p - tokenizer->start); + } + if (tokenizer->loose.ing && tokenizer->loose.need_end_mark) { grn_token_set_data(ctx, token, GRN_TOKENIZER_END_MARK_UTF8, GRN_TOKENIZER_END_MARK_UTF8_LEN); grn_token_set_status(ctx, token, status); + if (checks) { + grn_token_set_source_offset(ctx, token, tokenizer->source_offset); + } ngram_switch_to_loose_mode(ctx, tokenizer); tokenizer->loose.need_end_mark = GRN_FALSE; return; @@ -764,14 +830,51 @@ ngram_next(grn_ctx *ctx, if (r == e) { status |= GRN_TOKEN_REACH_END; } { + size_t data_size = r - p; if ((status & (GRN_TOKEN_LAST | GRN_TOKEN_REACH_END)) && !tokenizer->loose.ing && tokenizer->loose.need) { status &= ~(GRN_TOKEN_LAST | GRN_TOKEN_REACH_END); tokenizer->loose.ing = GRN_TRUE; tokenizer->loose.need_end_mark = GRN_TRUE; } - grn_token_set_data(ctx, token, p, r - p); + grn_token_set_data(ctx, token, p, data_size); grn_token_set_status(ctx, token, status); + if (checks) { + size_t i; + uint32_t uncount_offset = 0; + uint32_t source_length = 0; + grn_token_set_source_offset(ctx, token, tokenizer->source_offset); + if (checks[0] == -1) { + size_t n_leading_bytes = p - tokenizer->start; + for (i = 1; i <= n_leading_bytes; i++) { + if (checks[-i] > 0) { + uncount_offset = source_length = checks[-i]; + break; + } + } + } + for (i = 0; i < data_size; i++) { + if 
(checks[i] > 0) { + source_length += checks[i]; + } + } + if (r < e) { + if (checks[i] > 0) { + if (!tokenizer->overlap) { + uncount_offset = 0; + } + } else if (checks[i] == -1) { + for (; i > 0; i--) { + if (checks[i - 1] > 0) { + uncount_offset += checks[i - 1]; + break; + } + } + } + } + grn_token_set_source_length(ctx, token, source_length); + tokenizer->source_offset += source_length - uncount_offset; + } } } @@ -808,6 +911,9 @@ ngram_fin(grn_ctx *ctx, void *user_data) if (tokenizer->loose.ctypes) { GRN_FREE(tokenizer->loose.ctypes); } + if (tokenizer->loose.checks) { + GRN_FREE(tokenizer->loose.checks); + } GRN_OBJ_FIN(ctx, &(tokenizer->loose.text)); grn_tokenizer_token_fin(ctx, &(tokenizer->token)); GRN_FREE(tokenizer); Added: test/command/suite/tokenizers/ngram/report_source_location.expected (+157 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/ngram/report_source_location.expected 2018-05-09 15:07:13 +0900 (085aa799d) @@ -0,0 +1,157 @@ +tokenize 'TokenNgram("report_source_location", true, "loose_symbol", true)' "ア㌕090(1234)56−78" NormalizerAuto +[ + [ + 0, + 0.0, + 0.0 + ], + [ + { + "value": "アキ", + "position": 0, + "force_prefix": false, + "source_offset": 0, + "source_length": 6 + }, + { + "value": "キロ", + "position": 1, + "force_prefix": false, + "source_offset": 3, + "source_length": 3 + }, + { + "value": "ログ", + "position": 2, + "force_prefix": false, + "source_offset": 3, + "source_length": 3 + }, + { + "value": "グラ", + "position": 3, + "force_prefix": false, + "source_offset": 3, + "source_length": 3 + }, + { + "value": "ラム", + "position": 4, + "force_prefix": false, + "source_offset": 3, + "source_length": 3 + }, + { + "value": "ム", + "position": 5, + "force_prefix": false, + "source_offset": 3, + "source_length": 3 + }, + { + "value": "090", + "position": 6, + "force_prefix": false, + "source_offset": 6, + "source_length": 9 + }, + { + "value": "(", + "position": 7, 
+ "force_prefix": false, + "source_offset": 15, + "source_length": 3 + }, + { + "value": "1234", + "position": 8, + "force_prefix": false, + "source_offset": 18, + "source_length": 8 + }, + { + "value": ")", + "position": 9, + "force_prefix": false, + "source_offset": 26, + "source_length": 3 + }, + { + "value": "56", + "position": 10, + "force_prefix": false, + "source_offset": 29, + "source_length": 4 + }, + { + "value": "−", + "position": 11, + "force_prefix": false, + "source_offset": 33, + "source_length": 3 + }, + { + "value": "78", + "position": 12, + "force_prefix": false, + "source_offset": 36, + "source_length": 6 + }, + { + "value": "", + "position": 13, + "force_prefix": false, + "source_offset": 42, + "source_length": 0 + }, + { + "value": "アキ", + "position": 14, + "force_prefix": false, + "source_offset": 0, + "source_length": 6 + }, + { + "value": "キロ", + "position": 15, + "force_prefix": false, + "source_offset": 3, + "source_length": 3 + }, + { + "value": "ログ", + "position": 16, + "force_prefix": false, + "source_offset": 3, + "source_length": 3 + }, + { + "value": "グラ", + "position": 17, + "force_prefix": false, + "source_offset": 3, + "source_length": 3 + }, + { + "value": "ラム", + "position": 18, + "force_prefix": false, + "source_offset": 3, + "source_length": 3 + }, + { + "value": "ム", + "position": 19, + "force_prefix": false, + "source_offset": 3, + "source_length": 3 + }, + { + "value": "09012345678", + "position": 20, + "force_prefix": false, + "source_offset": 6, + "source_length": 36 + } + ] +] Added: test/command/suite/tokenizers/ngram/report_source_location.test (+4 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/tokenizers/ngram/report_source_location.test 2018-05-09 15:07:13 +0900 (e41363c28) @@ -0,0 +1,4 @@ +tokenize \ + 'TokenNgram("report_source_location", true, "loose_symbol", true)' \ + "ア㌕090(1234)56−78" \ + NormalizerAuto -------------- next part -------------- 
An HTML attachment was scrubbed... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180509/7cb663e0/attachment-0001.htm