Kouhei Sutou
null+****@clear*****
Wed Apr 11 16:41:39 JST 2018
Kouhei Sutou 2018-04-11 16:41:39 +0900 (Wed, 11 Apr 2018) New Revision: 87805b9bc59dd11341fd25993e2385e8195438b4 https://github.com/groonga/groonga/commit/87805b9bc59dd11341fd25993e2385e8195438b4 Message: normalizer: support normalizer options NormalizerNFKC100 supports "unify_kana" option. Added files: test/command/suite/normalizers/nfkc100/unify_kana.expected test/command/suite/normalizers/nfkc100/unify_kana.test Modified files: lib/normalizer.c lib/proc/proc_lexicon.c lib/proc/proc_normalize.c Modified: lib/normalizer.c (+133 -23) =================================================================== --- lib/normalizer.c 2018-04-11 16:41:12 +0900 (8ca5aa18e) +++ lib/normalizer.c 2018-04-11 16:41:39 +0900 (cf2db157f) @@ -20,6 +20,7 @@ #include "grn_normalizer.h" #include "grn_string.h" +#include "grn_raw_string.h" #include "grn_nfkc.h" #include <groonga/normalizer.h> #include <groonga/tokenizer.h> @@ -615,12 +616,29 @@ typedef const char *(*grn_nfkc_decompose_func)(const unsigned char *utf8); typedef const char *(*grn_nfkc_compose_func)(const unsigned char *prefix_utf8, const unsigned char *suffix_utf8); +typedef struct { + grn_nfkc_char_type_func char_type_func; + grn_nfkc_decompose_func decompose_func; + grn_nfkc_compose_func compose_func; + grn_bool unify_kana; +} grn_utf8_normalize_options; + +static void +utf8_normalize_options_init(grn_utf8_normalize_options *options, + grn_nfkc_char_type_func char_type_func, + grn_nfkc_decompose_func decompose_func, + grn_nfkc_compose_func compose_func) +{ + options->char_type_func = char_type_func; + options->decompose_func = decompose_func; + options->compose_func = compose_func; + options->unify_kana = GRN_FALSE; +} + grn_inline static grn_obj * utf8_normalize(grn_ctx *ctx, grn_string *nstr, - grn_nfkc_char_type_func char_type_func, - grn_nfkc_decompose_func decompose_func, - grn_nfkc_compose_func compose_func) + grn_utf8_normalize_options *options) { int16_t *ch; const unsigned char *s, *s_, *s__ = NULL, *p, *p2, *pe, *e; 
@@ -668,13 +686,13 @@ utf8_normalize(grn_ctx *ctx, GRN_ENC_UTF8)) { continue; } - if ((p = (unsigned char *)decompose_func(s))) { + if ((p = (unsigned char *)options->decompose_func(s))) { pe = p + strlen((char *)p); } else { p = s; pe = p + ls; } - if (d_ && (p2 = (unsigned char *)compose_func(d_, p))) { + if (d_ && (p2 = (unsigned char *)options->compose_func(d_, p))) { p = p2; pe = p + strlen((char *)p); if (cp) { cp--; } @@ -694,6 +712,8 @@ utf8_normalize(grn_ctx *ctx, if ((*p == ' ' && removeblankp) || *p < 0x20 /* skip unprintable ascii */ ) { if (cp > nstr->ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } } else { + grn_char_type char_type; + if (de <= d + lp) { unsigned char *normalized; ds += (ds >> 1) + lp; @@ -735,11 +755,32 @@ utf8_normalize(grn_ctx *ctx, nstr->ctypes = ctypes; } } - grn_memcpy(d, p, lp); + char_type = options->char_type_func(p); + if (options->unify_kana && char_type == GRN_CHAR_KATAKANA) { + if (lp == 3 && + p[0] == 0xe3 && + /* U+30A1 KATAKANA LETTER SMALL A .. + * U+30F6 KATAKANA LETTER SMALL KE + * + * U+30FD KATAKANA ITERATION MARK .. 
+ * U+30F6 KATAKANA LETTER SMALL KE */ + ((p[1] == 0x82 && 0xa1 <= p[2]) || + (p[1] == 0x83 && p[2] <= 0xb6) || + (p[1] == 0x83 && (0xbd <= p[2] && p[2] <= 0xbe)))) { + d[0] = p[0]; + d[1] = p[1] - 1; + d[2] = p[2] ^ 0x20; + char_type = GRN_CHAR_HIRAGANA; + } else { + grn_memcpy(d, p, lp); + } + } else { + grn_memcpy(d, p, lp); + } d_ = d; d += lp; length++; - if (cp) { *cp++ = char_type_func(p); } + if (cp) { *cp++ = char_type; } if (ch) { size_t i; if (s_ == s + ls) { @@ -1133,11 +1174,14 @@ auto_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) break; case GRN_ENC_UTF8 : #ifdef GRN_WITH_NFKC - utf8_normalize(ctx, - string, - grn_nfkc_char_type, - grn_nfkc_decompose, - grn_nfkc_compose); + { + grn_utf8_normalize_options options; + utf8_normalize_options_init(&options, + grn_nfkc_char_type, + grn_nfkc_decompose, + grn_nfkc_compose); + utf8_normalize(ctx, string, &options); + } #else /* GRN_WITH_NFKC */ ascii_normalize(ctx, string); #endif /* GRN_WITH_NFKC */ @@ -1163,23 +1207,89 @@ static grn_obj * nfkc51_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) { grn_string *string = (grn_string *)(args[0]); - utf8_normalize(ctx, - string, - grn_nfkc50_char_type, - grn_nfkc50_decompose, - grn_nfkc50_compose); + grn_utf8_normalize_options options; + + utf8_normalize_options_init(&options, + grn_nfkc50_char_type, + grn_nfkc50_decompose, + grn_nfkc50_compose); + utf8_normalize(ctx, string, &options); return NULL; } +static void * +nfkc100_open_options(grn_ctx *ctx, + grn_obj *string, + grn_obj *raw_options, + void *user_data) +{ + grn_utf8_normalize_options *options; + + options = GRN_MALLOC(sizeof(grn_utf8_normalize_options)); + if (!options) { + ERR(GRN_NO_MEMORY_AVAILABLE, + "[normalizer][nfkc100] " + "failed to allocate memory for options"); + return NULL; + } + + utf8_normalize_options_init(options, + grn_nfkc100_char_type, + grn_nfkc100_decompose, + grn_nfkc100_compose); + + GRN_OPTION_VALUES_EACH_BEGIN(ctx, raw_options, i, 
name, name_length) { + grn_raw_string name_raw; + name_raw.value = name; + name_raw.length = name_length; + + if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "unify_kana")) { + options->unify_kana = grn_vector_get_element_bool(ctx, + raw_options, + i, + options->unify_kana); + } + } GRN_OPTION_VALUES_EACH_END(); + + return options; +} + +static void +nfkc100_close_options(grn_ctx *ctx, void *data) +{ + grn_utf8_normalize_options *options = data; + GRN_FREE(options); +} + static grn_obj * nfkc100_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) { - grn_string *string = (grn_string *)(args[0]); - utf8_normalize(ctx, - string, - grn_nfkc100_char_type, - grn_nfkc100_decompose, - grn_nfkc100_compose); + grn_obj *string = args[0]; + grn_string *string_ = (grn_string *)string; + grn_obj *table; + grn_utf8_normalize_options *options; + grn_utf8_normalize_options options_raw; + + table = grn_string_get_table(ctx, string); + if (table) { + options = grn_table_cache_normalizer_options(ctx, + table, + string, + nfkc100_open_options, + nfkc100_close_options, + NULL); + if (ctx->rc != GRN_SUCCESS) { + return NULL; + } + } else { + utf8_normalize_options_init(&options_raw, + grn_nfkc100_char_type, + grn_nfkc100_decompose, + grn_nfkc100_compose); + options = &options_raw; + } + + utf8_normalize(ctx, string_, options); return NULL; } #endif /* GRN_WITH_NFKC */ Modified: lib/proc/proc_lexicon.c (+27 -35) =================================================================== --- lib/proc/proc_lexicon.c 2018-04-11 16:41:12 +0900 (ca5bbcba1) +++ lib/proc/proc_lexicon.c 2018-04-11 16:41:39 +0900 (a74c2aec6) @@ -29,35 +29,6 @@ grn_proc_lexicon_open(grn_ctx *ctx, const char *context_tag) { grn_obj *lexicon; - grn_obj *normalizer = NULL; - - if (normalizer_raw->length > 0) { - normalizer = grn_ctx_get(ctx, - normalizer_raw->value, - normalizer_raw->length); - if (!normalizer) { - GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, - "%s nonexistent normalizer: <%.*s>", - 
context_tag, - (int)normalizer_raw->length, - normalizer_raw->value); - return NULL; - } - - if (!grn_obj_is_normalizer_proc(ctx, normalizer)) { - grn_obj inspected; - GRN_TEXT_INIT(&inspected, 0); - grn_inspect(ctx, &inspected, normalizer); - GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, - "%s not normalizer: %.*s", - context_tag, - (int)GRN_TEXT_LEN(&inspected), - GRN_TEXT_VALUE(&inspected)); - GRN_OBJ_FIN(ctx, &inspected); - grn_obj_unlink(ctx, normalizer); - return NULL; - } - } lexicon = grn_table_create(ctx, NULL, 0, NULL, @@ -67,7 +38,9 @@ grn_proc_lexicon_open(grn_ctx *ctx, { grn_obj tokenizer; GRN_TEXT_INIT(&tokenizer, GRN_OBJ_DO_SHALLOW_COPY); - GRN_TEXT_SET(ctx, &tokenizer, tokenizer_raw->value, tokenizer_raw->length); + if (tokenizer_raw) { + GRN_TEXT_SET(ctx, &tokenizer, tokenizer_raw->value, tokenizer_raw->length); + } grn_obj_set_info(ctx, lexicon, GRN_INFO_DEFAULT_TOKENIZER, &tokenizer); GRN_OBJ_FIN(ctx, &tokenizer); } @@ -81,12 +54,31 @@ grn_proc_lexicon_open(grn_ctx *ctx, ctx->errbuf); return NULL; } - if (normalizer) { - grn_obj_set_info(ctx, lexicon, - GRN_INFO_NORMALIZER, normalizer); - grn_obj_unlink(ctx, normalizer); + { + grn_obj normalizer; + GRN_TEXT_INIT(&normalizer, GRN_OBJ_DO_SHALLOW_COPY); + if (normalizer_raw) { + GRN_TEXT_SET(ctx, + &normalizer, + normalizer_raw->value, + normalizer_raw->length); + } + grn_obj_set_info(ctx, lexicon, GRN_INFO_NORMALIZER, &normalizer); + GRN_OBJ_FIN(ctx, &normalizer); + } + if (ctx->rc != GRN_SUCCESS) { + grn_obj_close(ctx, lexicon); + GRN_PLUGIN_ERROR(ctx, ctx->rc, + "%s failed to set normalizer: <%.*s>: %s", + context_tag, + (int)(normalizer_raw->length), + normalizer_raw->value, + ctx->errbuf); + return NULL; + } + if (token_filters_raw) { + grn_proc_table_set_token_filters(ctx, lexicon, token_filters_raw); } - grn_proc_table_set_token_filters(ctx, lexicon, token_filters_raw); return lexicon; } Modified: lib/proc/proc_normalize.c (+45 -54) 
=================================================================== --- lib/proc/proc_normalize.c 2018-04-11 16:41:12 +0900 (998a8030c) +++ lib/proc/proc_normalize.c 2018-04-11 16:41:39 +0900 (63f061e49) @@ -18,20 +18,17 @@ #include "../grn_proc.h" #include "../grn_ctx.h" -#include "../grn_token_cursor.h" #include <groonga/plugin.h> static int -parse_normalize_flags(grn_ctx *ctx, grn_obj *flag_names) +parse_normalize_flags(grn_ctx *ctx, grn_raw_string *flags_raw) { int flags = 0; const char *names, *names_end; - int length; - names = GRN_TEXT_VALUE(flag_names); - length = GRN_TEXT_LEN(flag_names); - names_end = names + length; + names = flags_raw->value; + names_end = names + flags_raw->length; while (names < names_end) { if (*names == '|' || *names == ' ') { names += 1; @@ -64,77 +61,69 @@ parse_normalize_flags(grn_ctx *ctx, grn_obj *flag_names) return flags; } -static grn_bool -is_normalizer(grn_ctx *ctx, grn_obj *object) -{ - if (object->header.type != GRN_PROC) { - return GRN_FALSE; - } - - if (grn_proc_get_type(ctx, object) != GRN_PROC_NORMALIZER) { - return GRN_FALSE; - } - - return GRN_TRUE; -} - static grn_obj * command_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) { - grn_obj *normalizer_name; - grn_obj *string; - grn_obj *flag_names; - - normalizer_name = grn_plugin_proc_get_var(ctx, user_data, "normalizer", -1); - string = grn_plugin_proc_get_var(ctx, user_data, "string", -1); - flag_names = grn_plugin_proc_get_var(ctx, user_data, "flags", -1); - if (GRN_TEXT_LEN(normalizer_name) == 0) { - ERR(GRN_INVALID_ARGUMENT, "normalizer name is missing"); + const char *context_tag = "[normalize]"; + grn_raw_string normalizer_raw; + grn_raw_string string_raw; + grn_raw_string flags_raw; + +#define GET_VALUE(name) \ + name ## _raw.value = \ + grn_plugin_proc_get_var_string(ctx, \ + user_data, \ + #name, \ + strlen(#name), \ + &(name ## _raw.length)) + + GET_VALUE(normalizer); + GET_VALUE(string); + GET_VALUE(flags); + +#undef GET_VALUE 
+ + if (normalizer_raw.length == 0) { + GRN_PLUGIN_ERROR(ctx, + GRN_INVALID_ARGUMENT, + "%s normalizer name is missing", + context_tag); return NULL; } { - grn_obj *normalizer; - grn_obj *grn_string; int flags; + grn_obj *lexicon; + grn_obj *grn_string; unsigned int normalized_length_in_bytes; unsigned int normalized_n_characters; - flags = parse_normalize_flags(ctx, flag_names); - normalizer = grn_ctx_get(ctx, - GRN_TEXT_VALUE(normalizer_name), - GRN_TEXT_LEN(normalizer_name)); - if (!normalizer) { - ERR(GRN_INVALID_ARGUMENT, - "[normalize] nonexistent normalizer: <%.*s>", - (int)GRN_TEXT_LEN(normalizer_name), - GRN_TEXT_VALUE(normalizer_name)); + flags = parse_normalize_flags(ctx, &flags_raw); + if (ctx->rc != GRN_SUCCESS) { return NULL; } - if (!is_normalizer(ctx, normalizer)) { - grn_obj inspected; - GRN_TEXT_INIT(&inspected, 0); - grn_inspect(ctx, &inspected, normalizer); - ERR(GRN_INVALID_ARGUMENT, - "[normalize] not normalizer: %.*s", - (int)GRN_TEXT_LEN(&inspected), - GRN_TEXT_VALUE(&inspected)); - GRN_OBJ_FIN(ctx, &inspected); - grn_obj_unlink(ctx, normalizer); + lexicon = grn_proc_lexicon_open(ctx, + NULL, + &normalizer_raw, + NULL, + context_tag); + if (!lexicon) { return NULL; } grn_string = grn_string_open(ctx, - GRN_TEXT_VALUE(string), GRN_TEXT_LEN(string), - normalizer, flags); - grn_obj_unlink(ctx, normalizer); + string_raw.value, + string_raw.length, + lexicon, + flags); grn_ctx_output_map_open(ctx, "RESULT", 3); { const char *normalized; - grn_string_get_normalized(ctx, grn_string, + grn_string_get_normalized(ctx, + grn_string, &normalized, &normalized_length_in_bytes, &normalized_n_characters); @@ -178,6 +167,8 @@ command_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_d grn_ctx_output_map_close(ctx); grn_obj_unlink(ctx, grn_string); + + grn_obj_unlink(ctx, lexicon); } return NULL; Added: test/command/suite/normalizers/nfkc100/unify_kana.expected (+23 -0) 100644 
=================================================================== --- /dev/null +++ test/command/suite/normalizers/nfkc100/unify_kana.expected 2018-04-11 16:41:39 +0900 (d1f315517) @@ -0,0 +1,23 @@ +normalize 'NormalizerNFKC100("unify_kana", true)' "あイウェおヽヾ" WITH_TYPES +[ + [ + 0, + 0.0, + 0.0 + ], + { + "normalized": "あいうぇおゝゞ", + "types": [ + "hiragana", + "hiragana", + "hiragana", + "hiragana", + "hiragana", + "hiragana", + "hiragana" + ], + "checks": [ + + ] + } +] Added: test/command/suite/normalizers/nfkc100/unify_kana.test (+4 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/normalizers/nfkc100/unify_kana.test 2018-04-11 16:41:39 +0900 (818bc56d3) @@ -0,0 +1,4 @@ +normalize \ + 'NormalizerNFKC100("unify_kana", true)' \ + "あイウェおヽヾ" \ + WITH_TYPES -------------- next part -------------- HTMLの添付ファイルを保管しました... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180411/47b26cd0/attachment-0001.htm