Kouhei Sutou
null+****@clear*****
Tue Apr 17 16:58:04 JST 2018
Kouhei Sutou 2018-04-17 16:58:04 +0900 (Tue, 17 Apr 2018) New Revision: 32465a65fe9d7d5f82b59cf9c79f26eb1df2bdc8 https://github.com/groonga/groonga/commit/32465a65fe9d7d5f82b59cf9c79f26eb1df2bdc8 Message: NormalizerNFKC100: add unify_hyphen option Added files: test/command/suite/normalizers/nfkc100/unify_hyphen.expected test/command/suite/normalizers/nfkc100/unify_hyphen.test Modified files: lib/normalizer.c Modified: lib/normalizer.c (+65 -0) =================================================================== --- lib/normalizer.c 2018-04-17 15:56:06 +0900 (12ab8d6fc) +++ lib/normalizer.c 2018-04-17 16:58:04 +0900 (ec541cc2a) @@ -623,6 +623,7 @@ typedef struct { grn_bool unify_kana; grn_bool unify_kana_case; grn_bool unify_kana_voiced_sound_mark; + grn_bool unify_hyphen; } grn_utf8_normalize_options; static void @@ -637,6 +638,7 @@ utf8_normalize_options_init(grn_utf8_normalize_options *options, options->unify_kana = GRN_FALSE; options->unify_kana_case = GRN_FALSE; options->unify_kana_voiced_sound_mark = GRN_FALSE; + options->unify_hyphen = GRN_FALSE; } grn_inline static const unsigned char * @@ -846,6 +848,53 @@ utf8_normalize_unify_katakana_voiced_sound_mark(const unsigned char *utf8_char, return utf8_char; } +grn_inline static const grn_bool +utf8_normalize_is_hyphen_famity(const unsigned char *utf8_char, + size_t length) +{ + if (length == 2) { + switch (utf8_char[0]) { + case 0xcb : + if (utf8_char[1] == 0x97) { + /* U+02D7 MODIFIER LETTER MINUS SIGN */ + return GRN_TRUE; + } + break; + case 0xd6 : + if (utf8_char[1] == 0x8a) { + /* U+058A ARMENIAN HYPHEN */ + return GRN_TRUE; + } + break; + default : + break; + } + } else if (length == 3) { + if (utf8_char[0] == 0xe2) { + if (utf8_char[1] == 0x80 && + (0x90 <= utf8_char[2] && utf8_char[2] <= 0x93)) { + /* U+2010 HYPHEN .. + * U+2013 EN DASH */ + return GRN_TRUE; + } else if (utf8_char[1] == 0x81 && + (utf8_char[2] == 0x83 || + utf8_char[2] == 0xbb)) { + /* U+2043 HYPHEN BULLET */ + /* U+207B SUPERSCRIPT MINUS */ + return GRN_TRUE; + } else if (utf8_char[1] == 0x82 && utf8_char[2] == 0x8b) { + /* U+208B SUBSCRIPT MINUS */ + return GRN_TRUE; + } else if (utf8_char[1] == 0x88 && utf8_char[2] == 0x92) { + /* U+2212 MINUS SIGN */ + return GRN_TRUE; + } + } + } + + return GRN_FALSE; +} + grn_inline static grn_obj * utf8_normalize(grn_ctx *ctx, grn_string *nstr, @@ -923,6 +972,7 @@ utf8_normalize(grn_ctx *ctx, if ((*p == ' ' && removeblankp) || *p < 0x20 /* skip unprintable ascii */ ) { if (cp > nstr->ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } } else { + size_t lp_original = lp; grn_char_type char_type; char_type = options->char_type_func(p); @@ -973,6 +1023,7 @@ utf8_normalize(grn_ctx *ctx, unsigned char unified_kana[3]; unsigned char unified_kana_case[3]; unsigned char unified_kana_voiced_sound_mark[3]; + unsigned char unified_hyphen[] = "-"; if (options->unify_kana && char_type == GRN_CHAR_KATAKANA && @@ -1019,6 +1070,14 @@ utf8_normalize(grn_ctx *ctx, } } + if (options->unify_hyphen) { + if (utf8_normalize_is_hyphen_famity(p, lp)) { + p = unified_hyphen; + lp = 1; + char_type = GRN_CHAR_SYMBOL; + } + } + grn_memcpy(d, p, lp); p = p_original; } @@ -1037,6 +1096,7 @@ utf8_normalize(grn_ctx *ctx, } for (i = lp; i > 1; i--) { *ch++ = 0; } } + lp = lp_original; } } } @@ -1506,6 +1566,11 @@ nfkc100_open_options(grn_ctx *ctx, raw_options, i, options->unify_kana_voiced_sound_mark); + } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "unify_hyphen")) { + options->unify_hyphen = grn_vector_get_element_bool(ctx, + raw_options, + i, + options->unify_hyphen); } } GRN_OPTION_VALUES_EACH_END(); Added: test/command/suite/normalizers/nfkc100/unify_hyphen.expected (+27 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/normalizers/nfkc100/unify_hyphen.expected 2018-04-17 16:58:04 +0900 (0644ff88c) @@ -0,0 +1,27 @@ +normalize 'NormalizerNFKC100("unify_hyphen", true)' "-˗֊‐‑‒–⁃⁻₋−" WITH_TYPES +[ + [ + 0, + 0.0, + 0.0 + ], + { + "normalized": "-----------", + "types": [ + "symbol", + "symbol", + "symbol", + "symbol", + "symbol", + "symbol", + "symbol", + "symbol", + "symbol", + "symbol", + "symbol" + ], + "checks": [ + + ] + } +] Added: test/command/suite/normalizers/nfkc100/unify_hyphen.test (+4 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/normalizers/nfkc100/unify_hyphen.test 2018-04-17 16:58:04 +0900 (77e31ada3) @@ -0,0 +1,4 @@ +normalize \ + 'NormalizerNFKC100("unify_hyphen", true)' \ + "-˗֊‐‑‒–⁃⁻₋−" \ + WITH_TYPES -------------- next part -------------- HTML����������������������������... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180417/a5772cb5/attachment-0001.htm