null+****@clear*****
null+****@clear*****
2012年 2月 14日 (火) 14:00:52 JST
Kouhei Sutou 2012-02-14 14:00:52 +0900 (Tue, 14 Feb 2012) New Revision: f1d46f851ec86981ff53d2a127cb9d8ccd8b5b7a Log: Revert "[normalizer] implemented normalizer as grn_proc. refs #1164" This reverts commit 67def859ac4bdcfef7345b5d654e0741d34a8710. Removed files: lib/normalizer.c lib/normalizer.h Modified files: include/groonga.h lib/Makefile.am lib/dat.cpp lib/dat.h lib/db.c lib/db.h lib/expr.c lib/hash.c lib/hash.h lib/pat.c lib/pat.h lib/snip.c lib/str.c lib/str.h lib/util.c test/unit/core/dat/test-dat.cpp test/unit/util/test-snip.c test/unit/util/test-string.c Modified: include/groonga.h (+2 -29) =================================================================== --- include/groonga.h 2012-02-14 14:00:31 +0900 (728df55) +++ include/groonga.h 2012-02-14 14:00:52 +0900 (8a078c2) @@ -414,7 +414,6 @@ typedef unsigned short int grn_obj_flags; #define GRN_ACCESSOR_VIEW (0x0a) #define GRN_SNIP (0x0b) #define GRN_PATSNIP (0x0c) -#define GRN_NORMALIZED_TEXT (0x0d) #define GRN_CURSOR_TABLE_HASH_KEY (0x10) #define GRN_CURSOR_TABLE_PAT_KEY (0x11) #define GRN_CURSOR_TABLE_DAT_KEY (0x12) @@ -591,16 +590,6 @@ typedef enum { GRN_DB_TRIGRAM, } grn_builtin_tokenizer; -typedef enum { - GRN_DB_NORMALIZER_ASCII = 96, - GRN_DB_NORMALIZER_UTF8_NFKC, /* Normalization Form KC */ - GRN_DB_NORMALIZER_EUC_JP, - GRN_DB_NORMALIZER_SJIS, - GRN_DB_NORMALIZER_LATIN1, - GRN_DB_NORMALIZER_KOI8R, - GRN_DB_NORMALIZER_UTF8_UCA /* Unicode Collation Algorithm */ -} grn_builtin_normalizer; - GRN_API grn_obj *grn_ctx_at(grn_ctx *ctx, grn_id id); /** @@ -647,8 +636,7 @@ typedef enum { GRN_PROC_TOKENIZER = 1, GRN_PROC_COMMAND, GRN_PROC_FUNCTION, - GRN_PROC_HOOK, - GRN_PROC_NORMALIZER + GRN_PROC_HOOK } grn_proc_type; GRN_API grn_obj *grn_proc_create(grn_ctx *ctx, @@ -2434,7 +2422,7 @@ GRN_API void grn_time_now(grn_ctx *ctx, grn_obj *obj); grn_bulk_write((ctx), (obj), (char *)&_val, sizeof(grn_obj *));\ } while (0) -/* grn_str: deprecated */ +/* grn_str */ typedef struct { const char *orig; @@ -2457,21 +2445,6 @@ GRN_API grn_str *grn_str_open(grn_ctx *ctx, const char *str, unsigned int str_le int flags); GRN_API grn_rc grn_str_close(grn_ctx *ctx, grn_str *nstr); -/* grn_normalized_text */ - -#define GRN_NORMALIZED_TEXT_REMOVE_BLANK (0x01<<0) -#define GRN_NORMALIZED_TEXT_WITH_CTYPES (0x01<<1) -#define GRN_NORMALIZED_TEXT_WITH_CHECKS (0x01<<2) - -GRN_API grn_obj *grn_normalized_text_open(grn_ctx *ctx, grn_obj *normalizer, - const char *str, unsigned int str_len, - grn_encoding encoding, int flags); -GRN_API grn_rc grn_normalized_text_get_value(grn_ctx *ctx, - grn_obj *normalized_text, - const char **value, - unsigned int *length, - unsigned int *binary_length); - GRN_API int grn_charlen(grn_ctx *ctx, const char *str, const char *end); /* expr */ Modified: lib/Makefile.am (+0 -2) =================================================================== --- lib/Makefile.am 2012-02-14 14:00:31 +0900 (c9bea94) +++ lib/Makefile.am 2012-02-14 14:00:52 +0900 (7f8cfcd) @@ -15,7 +15,6 @@ libgroonga_la_SOURCES = \ io.c \ str.c \ nfkc.c \ - normalizer.c \ snip.c \ store.c \ com.c \ @@ -55,7 +54,6 @@ noinst_HEADERS = \ ii.h \ io.h \ nfkc.h \ - normalizer.c \ output.h \ pat.h \ plugin_in.h \ Modified: lib/dat.cpp (+0 -15) =================================================================== --- lib/dat.cpp 2012-02-14 14:00:31 +0900 (eddb5be) +++ lib/dat.cpp 2012-02-14 14:00:52 +0900 (897186a) @@ -22,7 +22,6 @@ #include "str.h" #include "io.h" #include "dat.h" -#include "normalizer.h" #include "util.h" /* @@ -313,14 +312,6 @@ grn_dat_create(grn_ctx *ctx, const char *path, uint32_t, dat->header->encoding = encoding; dat->header->tokenizer = GRN_ID_NIL; dat->header->file_id = 0; - if (dat->header->flags & GRN_OBJ_KEY_NORMALIZE) { - dat->header->flags &= ~GRN_OBJ_KEY_NORMALIZE; - dat->header->normalizer = grn_normalizer_find(ctx, ctx->encoding); - dat->normalizer = grn_ctx_at(ctx, dat->header->normalizer); - } else { - dat->header->normalizer = GRN_ID_NIL; - dat->normalizer = NULL; - } dat->encoding = encoding; dat->tokenizer = NULL; return dat; @@ -356,12 +347,6 @@ grn_dat_open(grn_ctx *ctx, const char *path) dat->encoding = dat->header->encoding; dat->obj.header.flags = dat->header->flags; dat->tokenizer = grn_ctx_at(ctx, dat->header->tokenizer); - if (dat->header->flags & GRN_OBJ_KEY_NORMALIZE) { - dat->header->flags &= ~GRN_OBJ_KEY_NORMALIZE; - dat->header->normalizer = grn_normalizer_find(ctx, ctx->encoding); - - } - dat->normalizer = grn_ctx_at(ctx, dat->header->normalizer); return dat; } Modified: lib/dat.h (+0 -3) =================================================================== --- lib/dat.h 2012-02-14 14:00:31 +0900 (4409467) +++ lib/dat.h 2012-02-14 14:00:52 +0900 (a92b0f7) @@ -36,7 +36,6 @@ struct _grn_dat { void *trie; void *old_trie; grn_obj *tokenizer; - grn_obj *normalizer; grn_critical_section lock; }; @@ -45,8 +44,6 @@ struct grn_dat_header { grn_encoding encoding; grn_id tokenizer; uint32_t file_id; - grn_id normalizer; - uint32_t reserved[235]; }; struct _grn_dat_cursor { Modified: lib/db.c (+6 -21) =================================================================== --- lib/db.c 2012-02-14 14:00:31 +0900 (99099a7) +++ lib/db.c 2012-02-14 14:00:52 +0900 (9b64a10) @@ -22,7 +22,6 @@ #include "ii.h" #include "ctx_impl.h" #include "token.h" -#include "normalizer.h" #include "proc.h" #include "plugin_in.h" #include "geo.h" @@ -33,16 +32,13 @@ #define NEXT_ADDR(p) (((byte *)(p)) + sizeof *(p)) #define WITH_NORMALIZE(table,key,key_size,block) {\ - if ((table)->normalizer) {\ - grn_obj *nstr;\ - if ((nstr = grn_normalized_text_open(ctx, (table)->normalizer,\ - key, key_size,\ - (table)->encoding, 0))) {\ - const char *key;\ - unsigned int key_size;\ - grn_normalized_text_get_value(ctx, nstr, &key, NULL, &key_size);\ + if ((table)->obj.header.flags & GRN_OBJ_KEY_NORMALIZE) {\ + grn_str *nstr;\ + if ((nstr = grn_str_open(ctx, key, key_size, GRN_STR_NORMALIZE))) { \ + char *key = nstr->norm;\ + unsigned int key_size = nstr->norm_blen;\ block\ - grn_obj_close(ctx, nstr);\ + grn_str_close(ctx, nstr);\ }\ } else {\ block\ @@ -143,7 +139,6 @@ grn_db_create(grn_ctx *ctx, const char *path, grn_db_create_optarg *optarg) if ((s->specs = grn_ja_create(ctx, buffer, 65536, 0))) { grn_ctx_use(ctx, (grn_obj *)s); grn_db_init_builtin_types(ctx); - grn_db_init_builtin_normalizers(ctx); GRN_API_RETURN((grn_obj *)s); } else { ERR(GRN_NO_MEMORY_AVAILABLE, "ja create failed"); @@ -152,7 +147,6 @@ grn_db_create(grn_ctx *ctx, const char *path, grn_db_create_optarg *optarg) s->specs = NULL; grn_ctx_use(ctx, (grn_obj *)s); grn_db_init_builtin_types(ctx); - grn_db_init_builtin_normalizers(ctx); GRN_API_RETURN((grn_obj *)s); } if (use_pat_as_db_keys) { @@ -214,7 +208,6 @@ grn_db_open(grn_ctx *ctx, const char *path) } #endif grn_db_init_builtin_tokenizers(ctx); - grn_db_init_builtin_normalizers(ctx); grn_db_init_builtin_query(ctx); GRN_API_RETURN((grn_obj *)s); } @@ -6887,9 +6880,6 @@ grn_obj_close(grn_ctx *ctx, grn_obj *obj) case GRN_ACCESSOR_VIEW : rc = grn_accessor_view_close(ctx, obj); break; - case GRN_NORMALIZED_TEXT : - rc = grn_normalized_text_close(ctx, obj); - break; case GRN_CURSOR_TABLE_PAT_KEY : grn_pat_cursor_close(ctx, (grn_pat_cursor *)obj); break; @@ -8002,11 +7992,6 @@ grn_db_init_builtin_types(grn_ctx *ctx) } #endif grn_db_init_builtin_tokenizers(ctx); - for (id = grn_db_curr_id(ctx, db) + 1; id < GRN_DB_NORMALIZER_ASCII; id++) { - grn_itoh(id, buf + 3, 2); - grn_obj_register(ctx, db, buf, 5); - } - grn_db_init_builtin_normalizers(ctx); for (id = grn_db_curr_id(ctx, db) + 1; id < 128; id++) { grn_itoh(id, buf + 3, 2); grn_obj_register(ctx, db, buf, 5); Modified: lib/db.h (+1 -1) =================================================================== --- lib/db.h 2012-02-14 14:00:31 +0900 (4f76d43) +++ lib/db.h 2012-02-14 14:00:52 +0900 (2f7271b) @@ -92,7 +92,7 @@ grn_id grn_table_get_v(grn_ctx *ctx, grn_obj *table, const void *key, int key_si grn_id grn_table_add_v(grn_ctx *ctx, grn_obj *table, const void *key, int key_size, void **value, int *added); GRN_API grn_rc grn_table_get_info(grn_ctx *ctx, grn_obj *table, grn_obj_flags *flags, - grn_encoding *encoding, grn_obj **tokenizer); + grn_encoding *encoding, grn_obj **tokenizer); const char *_grn_table_key(grn_ctx *ctx, grn_obj *table, grn_id id, uint32_t *key_size); grn_rc grn_table_search(grn_ctx *ctx, grn_obj *table, Modified: lib/expr.c (+7 -24) =================================================================== --- lib/expr.c 2012-02-14 14:00:31 +0900 (795192d) +++ lib/expr.c 2012-02-14 14:00:52 +0900 (3c79202) @@ -22,7 +22,6 @@ #include <float.h> #include "ii.h" #include "geo.h" -#include "normalizer.h" #include "util.h" static inline int @@ -2242,19 +2241,13 @@ grn_proc_call(grn_ctx *ctx, grn_obj *proc, int nargs, grn_obj *caller) void pseudo_query_scan(grn_ctx *ctx, grn_obj *x, grn_obj *y, grn_obj *res) { - grn_id normalizer_id; - grn_obj *normalizer; - grn_obj *a = NULL, *b = NULL; + grn_str *a = NULL, *b = NULL; - normalizer_id = grn_normalizer_find(ctx, ctx->encoding); - normalizer = grn_ctx_at(ctx, normalizer_id); switch (x->header.domain) { case GRN_DB_SHORT_TEXT: case GRN_DB_TEXT: case GRN_DB_LONG_TEXT: - a = grn_normalized_text_open(ctx, normalizer, - GRN_TEXT_VALUE(x), GRN_TEXT_LEN(x), - ctx->encoding, 0); + a = grn_str_open(ctx, GRN_TEXT_VALUE(x), GRN_TEXT_LEN(x), GRN_STR_NORMALIZE); break; default: break; @@ -2264,33 +2257,23 @@ pseudo_query_scan(grn_ctx *ctx, grn_obj *x, grn_obj *y, grn_obj *res) case GRN_DB_SHORT_TEXT: case GRN_DB_TEXT: case GRN_DB_LONG_TEXT: - b = grn_normalized_text_open(ctx, normalizer, - GRN_TEXT_VALUE(y), GRN_TEXT_LEN(y), - ctx->encoding, 0); + b = grn_str_open(ctx, GRN_TEXT_VALUE(y), GRN_TEXT_LEN(y), GRN_STR_NORMALIZE); break; default: break; } /* normalized str doesn't contain '\0'. */ - if (a && b) { - const char *normalized_a, *normalized_b; - grn_normalized_text_get_value(ctx, a, &normalized_a, NULL, NULL); - grn_normalized_text_get_value(ctx, b, &normalized_b, NULL, NULL); - if (strstr(normalized_a, normalized_b)) { - GRN_INT32_SET(ctx, res, 1); - } else { - GRN_INT32_SET(ctx, res, 0); - } + if (a && b && strstr(a->norm, b->norm)) { + GRN_INT32_SET(ctx, res, 1); } else { GRN_INT32_SET(ctx, res, 0); } res->header.type = GRN_BULK; res->header.domain = GRN_DB_INT32; - if (a) { grn_obj_close(ctx, a); } - if (b) { grn_obj_close(ctx, b); } - if (normalizer) { grn_obj_unlink(ctx, normalizer); } + if (a) { grn_str_close(ctx, a); } + if (b) { grn_str_close(ctx, b); } } grn_obj * Modified: lib/hash.c (+1 -18) =================================================================== --- lib/hash.c 2012-02-14 14:00:31 +0900 (9a5455a) +++ lib/hash.c 2012-02-14 14:00:52 +0900 (e30f1f0) @@ -18,7 +18,6 @@ #include "hash.h" #include "pat.h" #include "output.h" -#include "normalizer.h" #include <string.h> #include <limits.h> @@ -869,14 +868,6 @@ io_hash_init(grn_hash *ih, grn_ctx *ctx, const char *path, uint32_t key_size, header->n_entries = 0; header->n_garbages = 0; header->tokenizer = GRN_ID_NIL; - if (header->flags & GRN_OBJ_KEY_NORMALIZE) { - header->flags &= ~GRN_OBJ_KEY_NORMALIZE; - header->normalizer = grn_normalizer_find(ctx, ctx->encoding); - ih->normalizer = grn_ctx_at(ctx, header->normalizer); - } else { - header->normalizer = GRN_ID_NIL; - ih->normalizer = NULL; - } ih->obj.header.flags = flags; ih->ctx = ctx; ih->key_size = key_size; @@ -931,7 +922,6 @@ tiny_hash_init(grn_hash *ah, grn_ctx *ctx, const char *path, uint32_t key_size, ah->n_entries_ = 0; ah->garbages = GRN_ID_NIL; ah->tokenizer = NULL; - ah->normalizer = NULL; grn_tiny_array_init(ctx, &ah->a, entry_size, GRN_TINY_ARRAY_CLEAR); grn_tiny_array_init(ctx, &ah->bitmap, 1, GRN_TINY_ARRAY_CLEAR); return GRN_SUCCESS; @@ -991,11 +981,6 @@ grn_hash_open(grn_ctx *ctx, const char *path) hash->header = header; hash->lock = &header->lock; hash->tokenizer = grn_ctx_at(ctx, header->tokenizer); - if (header->flags & GRN_OBJ_KEY_NORMALIZE) { - header->flags &= ~GRN_OBJ_KEY_NORMALIZE; - header->normalizer = grn_normalizer_find(ctx, ctx->encoding); - } - hash->normalizer = grn_ctx_at(ctx, header->normalizer); return (grn_hash *)hash; } else { GRN_LOG(ctx, GRN_LOG_NOTICE, "invalid hash flag. (%x)", header->flags); @@ -2159,7 +2144,7 @@ grn_hash_check(grn_ctx *ctx, grn_hash *hash) char buf[8]; struct grn_hash_header *h = hash->header; GRN_OUTPUT_ARRAY_OPEN("RESULT", 1); - GRN_OUTPUT_MAP_OPEN("SUMMARY", 25); + GRN_OUTPUT_MAP_OPEN("SUMMARY", 24); GRN_OUTPUT_CSTR("flags"); grn_itoh(h->flags, buf, 8); GRN_OUTPUT_STR(buf, 8); @@ -2169,8 +2154,6 @@ grn_hash_check(grn_ctx *ctx, grn_hash *hash) GRN_OUTPUT_INT64(hash->value_size); GRN_OUTPUT_CSTR("tokenizer"); GRN_OUTPUT_INT64(h->tokenizer); - GRN_OUTPUT_CSTR("normalizer"); - GRN_OUTPUT_INT64(h->normalizer); GRN_OUTPUT_CSTR("curr_rec"); GRN_OUTPUT_INT64(h->curr_rec); GRN_OUTPUT_CSTR("curr_key"); Modified: lib/hash.h (+1 -3) =================================================================== --- lib/hash.h 2012-02-14 14:00:31 +0900 (541835e) +++ lib/hash.h 2012-02-14 14:00:52 +0900 (efe364f) @@ -185,7 +185,6 @@ struct _grn_hash { uint32_t *n_entries; uint32_t *max_offset; grn_obj *tokenizer; - grn_obj *normalizer; /* portions for io_hash */ grn_io *io; struct grn_hash_header *header; @@ -226,8 +225,7 @@ struct grn_hash_header { uint32_t n_entries; uint32_t n_garbages; uint32_t lock; - grn_id normalizer; - uint32_t reserved[15]; + uint32_t reserved[16]; grn_id garbages[GRN_HASH_MAX_KEY_SIZE]; }; Deleted: lib/normalizer.c (+0 -1183) 100644 =================================================================== --- lib/normalizer.c 2012-02-14 14:00:31 +0900 (a21de48) +++ /dev/null @@ -1,1183 +0,0 @@ -/* -*- c-basic-offset: 2 -*- */ -/* - Copyright(C) 2012 Brazil - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License version 2.1 as published by the Free Software Foundation. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -*/ - -#include "groonga_in.h" -#include <string.h> -#include "normalizer.h" -#include "str.h" - -grn_id -grn_normalizer_find(grn_ctx *ctx, grn_encoding encoding) -{ - grn_id normalizer_id = GRN_ID_NIL; - - switch (encoding) { - case GRN_ENC_EUC_JP : - normalizer_id = GRN_DB_NORMALIZER_EUC_JP; - break; - case GRN_ENC_UTF8 : -#ifdef NO_NFKC - normalizer_id = GRN_DB_NORMALIZER_ASCII; -#else /* NO_NFKC */ - normalizer_id = GRN_DB_NORMALIZER_UTF8_NFKC; -#endif /* NO_NFKC */ - break; - case GRN_ENC_SJIS : - normalizer_id = GRN_DB_NORMALIZER_SJIS; - break; - case GRN_ENC_LATIN1 : - normalizer_id = GRN_DB_NORMALIZER_LATIN1; - break; - case GRN_ENC_KOI8R : - normalizer_id = GRN_DB_NORMALIZER_KOI8R; - break; - default : - normalizer_id = GRN_DB_NORMALIZER_ASCII; - break; - } - - return normalizer_id; -} - -grn_rc -grn_normalizer_init(void) -{ - return GRN_SUCCESS; -} - -grn_rc -grn_normalizer_fin(void) -{ - return GRN_SUCCESS; -} - -grn_obj * -grn_normalized_text_open(grn_ctx *ctx, grn_obj *normalizer, - const char *str, unsigned int str_len, - grn_encoding encoding, int flags) -{ - grn_normalized_text *normalized_text; - grn_obj *obj; - - if (!normalizer) { - return NULL; - } - - normalized_text = GRN_MALLOCN(grn_normalized_text, 1); - if (!normalized_text) { - return NULL; - } - - GRN_API_ENTER; - obj = (grn_obj *)normalized_text; - GRN_OBJ_INIT(obj, GRN_NORMALIZED_TEXT, GRN_OBJ_ALLOCATED, GRN_ID_NIL); - normalized_text->orig = str; - normalized_text->orig_blen = str_len; - normalized_text->norm = NULL; - normalized_text->norm_blen = 0; - normalized_text->length = 0; - normalized_text->checks = NULL; - normalized_text->ctypes = NULL; - normalized_text->encoding = encoding; - normalized_text->flags = flags; - - ((grn_proc *)normalizer)->funcs[PROC_NEXT](ctx, 1, &obj, NULL); - - GRN_API_RETURN(obj); -} - -grn_rc -grn_normalized_text_get_value(grn_ctx *ctx, grn_obj *normalized_text, - const char **value, unsigned int *length, - unsigned int *binary_length) -{ - grn_rc rc; - grn_normalized_text *text = (grn_normalized_text *)normalized_text; - GRN_API_ENTER; - if (text) { - if (value) { *value = text->norm; } - if (length) { *length = text->length; } - if (binary_length) { *binary_length = text->norm_blen; } - rc = GRN_SUCCESS; - } else { - rc = GRN_INVALID_ARGUMENT; - } - GRN_API_RETURN(rc); -} - -grn_rc -grn_normalized_text_close(grn_ctx *ctx, grn_obj *normalized_text) -{ - grn_rc rc; - grn_normalized_text *text = (grn_normalized_text *)normalized_text; - if (text) { - if (text->norm) { GRN_FREE(text->norm); } - if (text->ctypes) { GRN_FREE(text->ctypes); } - if (text->checks) { GRN_FREE(text->checks); } - GRN_FREE(text); - rc = GRN_SUCCESS; - } else { - rc = GRN_INVALID_ARGUMENT; - } - return rc; -} - -static unsigned char symbol[] = { - ',', '.', 0, ':', ';', '?', '!', 0, 0, 0, '`', 0, '^', '~', '_', 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, '-', '-', '/', '\\', 0, 0, '|', 0, 0, 0, '\'', 0, - '"', '(', ')', 0, 0, '[', ']', '{', '}', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - '+', '-', 0, 0, 0, '=', 0, '<', '>', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - '$', 0, 0, '%', '#', '&', '*', '@', 0, 0, 0, 0, 0, 0, 0, 0 -}; - -inline static grn_obj * -eucjp_normalize(grn_ctx *ctx, int nargs, grn_obj **args, - grn_user_data *user_data) -{ - static uint16_t hankana[] = { - 0xa1a1, 0xa1a3, 0xa1d6, 0xa1d7, 0xa1a2, 0xa1a6, 0xa5f2, 0xa5a1, 0xa5a3, - 0xa5a5, 0xa5a7, 0xa5a9, 0xa5e3, 0xa5e5, 0xa5e7, 0xa5c3, 0xa1bc, 0xa5a2, - 0xa5a4, 0xa5a6, 0xa5a8, 0xa5aa, 0xa5ab, 0xa5ad, 0xa5af, 0xa5b1, 0xa5b3, - 0xa5b5, 0xa5b7, 0xa5b9, 0xa5bb, 0xa5bd, 0xa5bf, 0xa5c1, 0xa5c4, 0xa5c6, - 0xa5c8, 0xa5ca, 0xa5cb, 0xa5cc, 0xa5cd, 0xa5ce, 0xa5cf, 0xa5d2, 0xa5d5, - 0xa5d8, 0xa5db, 0xa5de, 0xa5df, 0xa5e0, 0xa5e1, 0xa5e2, 0xa5e4, 0xa5e6, - 0xa5e8, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5ef, 0xa5f3, 0xa1ab, - 0xa1eb - }; - static unsigned char dakuten[] = { - 0xf4, 0, 0, 0, 0, 0xac, 0, 0xae, 0, 0xb0, 0, 0xb2, 0, 0xb4, 0, 0xb6, 0, - 0xb8, 0, 0xba, 0, 0xbc, 0, 0xbe, 0, 0xc0, 0, 0xc2, 0, 0, 0xc5, 0, 0xc7, - 0, 0xc9, 0, 0, 0, 0, 0, 0, 0xd0, 0, 0, 0xd3, 0, 0, 0xd6, 0, 0, 0xd9, 0, - 0, 0xdc - }; - static unsigned char handaku[] = { - 0xd1, 0, 0, 0xd4, 0, 0, 0xd7, 0, 0, 0xda, 0, 0, 0xdd - }; - grn_normalized_text *nstr = (grn_normalized_text *)args[0]; - int16_t *ch; - const unsigned char *s, *s_, *e; - unsigned char *d, *d0, *d_, b; - uint_least8_t *cp, *ctypes, ctype; - size_t size = nstr->orig_blen, length = 0; - int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK; - if (!(nstr->norm = GRN_MALLOC(size * 2 + 1))) { - ERR(GRN_NO_MEMORY_AVAILABLE, - "[normalizer][eucjp] failed to allocate normalized text space"); - return NULL; - } - d0 = (unsigned char *) nstr->norm; - if (nstr->flags & GRN_STR_WITH_CHECKS) { - if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) { - GRN_FREE(nstr->norm); - nstr->norm = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[normalizer][eucjp] failed to allocate checks space"); - return NULL; - } - } - ch = nstr->checks; - if (nstr->flags & GRN_STR_WITH_CTYPES) { - if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { - GRN_FREE(nstr->checks); - GRN_FREE(nstr->norm); - nstr->checks = NULL; - nstr->norm = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[normalizer][eucjp] failed to allocate character types space"); - return NULL; - } - } - cp = ctypes = nstr->ctypes; - e = (unsigned char *)nstr->orig + size; - for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) { - if ((*s & 0x80)) { - if (((s + 1) < e) && (*(s + 1) & 0x80)) { - unsigned char c1 = *s++, c2 = *s, c3 = 0; - switch (c1 >> 4) { - case 0x08 : - if (c1 == 0x8e && 0xa0 <= c2 && c2 <= 0xdf) { - uint16_t c = hankana[c2 - 0xa0]; - switch (c) { - case 0xa1ab : - if (d > d0 + 1 && d[-2] == 0xa5 - && 0xa6 <= d[-1] && d[-1] <= 0xdb && (b = dakuten[d[-1] - 0xa6])) { - *(d - 1) = b; - if (ch) { ch[-1] += 2; s_ += 2; } - continue; - } else { - *d++ = c >> 8; *d = c & 0xff; - } - break; - case 0xa1eb : - if (d > d0 + 1 && d[-2] == 0xa5 - && 0xcf <= d[-1] && d[-1] <= 0xdb && (b = handaku[d[-1] - 0xcf])) { - *(d - 1) = b; - if (ch) { ch[-1] += 2; s_ += 2; } - continue; - } else { - *d++ = c >> 8; *d = c & 0xff; - } - break; - default : - *d++ = c >> 8; *d = c & 0xff; - break; - } - ctype = grn_str_katakana; - } else { - *d++ = c1; *d = c2; - ctype = grn_str_others; - } - break; - case 0x09 : - *d++ = c1; *d = c2; - ctype = grn_str_others; - break; - case 0x0a : - switch (c1 & 0x0f) { - case 1 : - switch (c2) { - case 0xbc : - *d++ = c1; *d = c2; - ctype = grn_str_katakana; - break; - case 0xb9 : - *d++ = c1; *d = c2; - ctype = grn_str_kanji; - break; - case 0xa1 : - if (removeblankp) { - if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } - continue; - } else { - *d = ' '; - ctype = GRN_STR_BLANK|grn_str_symbol; - } - break; - default : - if (c2 >= 0xa4 && (c3 = symbol[c2 - 0xa4])) { - *d = c3; - ctype = grn_str_symbol; - } else { - *d++ = c1; *d = c2; - ctype = grn_str_others; - } - break; - } - break; - case 2 : - *d++ = c1; *d = c2; - ctype = grn_str_symbol; - break; - case 3 : - c3 = c2 - 0x80; - if ('a' <= c3 && c3 <= 'z') { - ctype = grn_str_alpha; - *d = c3; - } else if ('A' <= c3 && c3 <= 'Z') { - ctype = grn_str_alpha; - *d = c3 + 0x20; - } else if ('0' <= c3 && c3 <= '9') { - ctype = grn_str_digit; - *d = c3; - } else { - ctype = grn_str_others; - *d++ = c1; *d = c2; - } - break; - case 4 : - *d++ = c1; *d = c2; - ctype = grn_str_hiragana; - break; - case 5 : - *d++ = c1; *d = c2; - ctype = grn_str_katakana; - break; - case 6 : - case 7 : - case 8 : - *d++ = c1; *d = c2; - ctype = grn_str_symbol; - break; - default : - *d++ = c1; *d = c2; - ctype = grn_str_others; - break; - } - break; - default : - *d++ = c1; *d = c2; - ctype = grn_str_kanji; - break; - } - } else { - /* skip invalid character */ - continue; - } - } else { - unsigned char c = *s; - switch (c >> 4) { - case 0 : - case 1 : - /* skip unprintable ascii */ - if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } - continue; - case 2 : - if (c == 0x20) { - if (removeblankp) { - if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } - continue; - } else { - *d = ' '; - ctype = GRN_STR_BLANK|grn_str_symbol; - } - } else { - *d = c; - ctype = grn_str_symbol; - } - break; - case 3 : - *d = c; - ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol; - break; - case 4 : - *d = ('A' <= c) ? c + 0x20 : c; - ctype = (c == 0x40) ? grn_str_symbol : grn_str_alpha; - break; - case 5 : - *d = (c <= 'Z') ? c + 0x20 : c; - ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol; - break; - case 6 : - *d = c; - ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha; - break; - case 7 : - *d = c; - ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol); - break; - default : - *d = c; - ctype = grn_str_others; - break; - } - } - d++; - length++; - if (cp) { *cp++ = ctype; } - if (ch) { - *ch++ = (int16_t)(s + 1 - s_); - s_ = s + 1; - while (++d_ < d) { *ch++ = 0; } - } - } - if (cp) { *cp = grn_str_null; } - *d = '\0'; - nstr->length = length; - nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm); - return NULL; -} - -#ifndef NO_NFKC -uint_least8_t grn_nfkc_ctype(const unsigned char *str); -const char *grn_nfkc_map1(const unsigned char *str); -const char *grn_nfkc_map2(const unsigned char *prefix, const unsigned char *suffix); - -inline static grn_obj * -utf8_nfkc_normalize(grn_ctx *ctx, int nargs, grn_obj **args, - grn_user_data *user_data) -{ - grn_normalized_text *nstr = (grn_normalized_text *)args[0]; - int16_t *ch; - const unsigned char *s, *s_, *s__ = NULL, *p, *p2, *pe, *e; - unsigned char *d, *d_, *de; - uint_least8_t *cp; - size_t length = 0, ls, lp, size = nstr->orig_blen, ds = size * 3; - int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK; - if (!(nstr->norm = GRN_MALLOC(ds + 1))) { - ERR(GRN_NO_MEMORY_AVAILABLE, - "[normalizer][utf8][nfkc] failed to allocate normalized text space"); - return NULL; - } - if (nstr->flags & GRN_STR_WITH_CHECKS) { - if (!(nstr->checks = GRN_MALLOC(ds * sizeof(int16_t) + 1))) { - GRN_FREE(nstr->norm); - nstr->norm = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[normalizer][utf8][nfkc] failed to allocate checks space"); - return NULL; - } - } - ch = nstr->checks; - if (nstr->flags & GRN_STR_WITH_CTYPES) { - if (!(nstr->ctypes = GRN_MALLOC(ds + 1))) { - if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; } - GRN_FREE(nstr->norm); - nstr->norm = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[normalizer][utf8][nfkc] failed to allocate character types space"); - return NULL; - } - } - cp = nstr->ctypes; - d = (unsigned char *)nstr->norm; - de = d + ds; - d_ = NULL; - e = (unsigned char *)nstr->orig + size; - for (s = s_ = (unsigned char *)nstr->orig; ; s += ls) { - if (!(ls = grn_str_charlen_utf8(ctx, s, e))) { - break; - } - if ((p = (unsigned char *)grn_nfkc_map1(s))) { - pe = p + strlen((char *)p); - } else { - p = s; - pe = p + ls; - } - if (d_ && (p2 = (unsigned char *)grn_nfkc_map2(d_, p))) { - p = p2; - pe = p + strlen((char *)p); - if (cp) { cp--; } - if (ch) { - ch -= (d - d_); - s_ = s__; - } - d = d_; - length--; - } - for (; ; p += lp) { - if (!(lp = grn_str_charlen_utf8(ctx, p, pe))) { - break; - } - if ((*p == ' ' && removeblankp) || *p < 0x20 /* skip unprintable ascii */ ) { - if (cp > nstr->ctypes) { *(cp - 1) |= GRN_STR_BLANK; } - } else { - if (de <= d + lp) { - unsigned char *norm; - ds += (ds >> 1) + lp; - if (!(norm = GRN_REALLOC(nstr->norm, ds + 1))) { - if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; } - if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; } - GRN_FREE(nstr->norm); nstr->norm = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[normalizer][utf8][nfkc] " - "failed to reallocate normalized text space"); - return NULL; - } - de = norm + ds; - d = norm + (d - (unsigned char *)nstr->norm); - nstr->norm = norm; - if (ch) { - int16_t *checks; - if (!(checks = GRN_REALLOC(nstr->checks, ds * sizeof(int16_t)+ 1))) { - if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; } - GRN_FREE(nstr->checks); nstr->checks = NULL; - GRN_FREE(nstr->norm); nstr->norm = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[normalizer][utf8][nfkc] " - "failed to reallocate checks space"); - return NULL; - } - ch = checks + (ch - nstr->checks); - nstr->checks = checks; - } - if (cp) { - uint_least8_t *ctypes; - if (!(ctypes = GRN_REALLOC(nstr->ctypes, ds + 1))) { - GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; - if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; } - GRN_FREE(nstr->norm); nstr->norm = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[normalizer][utf8][nfkc] " - "failed to reallocate character types space"); - return NULL; - } - cp = ctypes + (cp - nstr->ctypes); - nstr->ctypes = ctypes; - } - } - memcpy(d, p, lp); - d_ = d; - d += lp; - length++; - if (cp) { *cp++ = grn_nfkc_ctype(p); } - if (ch) { - size_t i; - if (s_ == s + ls) { - *ch++ = -1; - } else { - *ch++ = (int16_t)(s + ls - s_); - s__ = s_; - s_ = s + ls; - } - for (i = lp; i > 1; i--) { *ch++ = 0; } - } - } - } - } - if (cp) { *cp = grn_str_null; } - *d = '\0'; - nstr->length = length; - nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm); - return NULL; -} -#endif /* NO_NFKC */ - -inline static grn_obj * -sjis_normalize(grn_ctx *ctx, int nargs, grn_obj **args, - grn_user_data *user_data) -{ - static uint16_t hankana[] = { - 0x8140, 0x8142, 0x8175, 0x8176, 0x8141, 0x8145, 0x8392, 0x8340, 0x8342, - 0x8344, 0x8346, 0x8348, 0x8383, 0x8385, 0x8387, 0x8362, 0x815b, 0x8341, - 0x8343, 0x8345, 0x8347, 0x8349, 0x834a, 0x834c, 0x834e, 0x8350, 0x8352, - 0x8354, 0x8356, 0x8358, 0x835a, 0x835c, 0x835e, 0x8360, 0x8363, 0x8365, - 0x8367, 0x8369, 0x836a, 0x836b, 0x836c, 0x836d, 0x836e, 0x8371, 0x8374, - 0x8377, 0x837a, 0x837d, 0x837e, 0x8380, 0x8381, 0x8382, 0x8384, 0x8386, - 0x8388, 0x8389, 0x838a, 0x838b, 0x838c, 0x838d, 0x838f, 0x8393, 0x814a, - 0x814b - }; - static unsigned char dakuten[] = { - 0x94, 0, 0, 0, 0, 0x4b, 0, 0x4d, 0, 0x4f, 0, 0x51, 0, 0x53, 0, 0x55, 0, - 0x57, 0, 0x59, 0, 0x5b, 0, 0x5d, 0, 0x5f, 0, 0x61, 0, 0, 0x64, 0, 0x66, - 0, 0x68, 0, 0, 0, 0, 0, 0, 0x6f, 0, 0, 0x72, 0, 0, 0x75, 0, 0, 0x78, 0, - 0, 0x7b - }; - static unsigned char handaku[] = { - 0x70, 0, 0, 0x73, 0, 0, 0x76, 0, 0, 0x79, 0, 0, 0x7c - }; - grn_normalized_text *nstr = (grn_normalized_text *)args[0]; - int16_t *ch; - const unsigned char *s, *s_; - unsigned char *d, *d0, *d_, b, *e; - uint_least8_t *cp, *ctypes, ctype; - size_t size = nstr->orig_blen, length = 0; - int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK; - if (!(nstr->norm = GRN_MALLOC(size * 2 + 1))) { - ERR(GRN_NO_MEMORY_AVAILABLE, - "[normalizer][sjis] failed to allocate normalized text space"); - return NULL; - } - d0 = (unsigned char *) nstr->norm; - if (nstr->flags & GRN_STR_WITH_CHECKS) { - if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) { - GRN_FREE(nstr->norm); - nstr->norm = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[normalizer][sjis] failed to allocate checks space"); - return NULL; - } - } - ch = nstr->checks; - if (nstr->flags & GRN_STR_WITH_CTYPES) { - if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { - GRN_FREE(nstr->checks); - GRN_FREE(nstr->norm); - nstr->checks = NULL; - nstr->norm = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[normalizer][sjis] failed to allocate character types space"); - return NULL; - } - } - cp = ctypes = nstr->ctypes; - e = (unsigned char *)nstr->orig + size; - for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) { - if ((*s & 0x80)) { - if (0xa0 <= *s && *s <= 0xdf) { - uint16_t c = hankana[*s - 0xa0]; - switch (c) { - case 0x814a : - if (d > d0 + 1 && d[-2] == 0x83 - && 0x45 <= d[-1] && d[-1] <= 0x7a && (b = dakuten[d[-1] - 0x45])) { - *(d - 1) = b; - if (ch) { ch[-1]++; s_++; } - continue; - } else { - *d++ = c >> 8; *d = c & 0xff; - } - break; - case 0x814b : - if (d > d0 + 1 && d[-2] == 0x83 - && 0x6e <= d[-1] && d[-1] <= 0x7a && (b = handaku[d[-1] - 0x6e])) { - *(d - 1) = b; - if (ch) { ch[-1]++; s_++; } - continue; - } else { - *d++ = c >> 8; *d = c & 0xff; - } - break; - default : - *d++ = c >> 8; *d = c & 0xff; - break; - } - ctype = grn_str_katakana; - } else { - if ((s + 1) < e && 0x40 <= *(s + 1) && *(s + 1) <= 0xfc) { - unsigned char c1 = *s++, c2 = *s, c3 = 0; - if (0x81 <= c1 && c1 <= 0x87) { - switch (c1 & 0x0f) { - case 1 : - switch (c2) { - case 0x5b : - *d++ = c1; *d = c2; - ctype = grn_str_katakana; - break; - case 0x58 : - *d++ = c1; *d = c2; - ctype = grn_str_kanji; - break; - case 0x40 : - if (removeblankp) { - if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } - continue; - } else { - *d = ' '; - ctype = GRN_STR_BLANK|grn_str_symbol; - } - break; - default : - if (0x43 <= c2 && c2 <= 0x7e && (c3 = symbol[c2 - 0x43])) { - *d = c3; - ctype = grn_str_symbol; - } else if (0x7f <= c2 && c2 <= 0x97 && (c3 = symbol[c2 - 0x44])) { - *d = c3; - ctype = grn_str_symbol; - } else { - *d++ = c1; *d = c2; - ctype = grn_str_others; - } - break; - } - break; - case 2 : - c3 = c2 - 0x1f; - if (0x4f <= c2 && c2 <= 0x58) { - ctype = grn_str_digit; - *d = c2 - 0x1f; - } else if (0x60 <= c2 && c2 <= 0x79) { - ctype = grn_str_alpha; - *d = c2 + 0x01; - } else if (0x81 <= c2 && c2 <= 0x9a) { - ctype = grn_str_alpha; - *d = c2 - 0x20; - } else if (0x9f <= c2 && c2 <= 0xf1) { - *d++ = c1; *d = c2; - ctype = grn_str_hiragana; - } else { - *d++ = c1; *d = c2; - ctype = grn_str_others; - } - break; - case 3 : - if (0x40 <= c2 && c2 <= 0x96) { - *d++ = c1; *d = c2; - ctype = grn_str_katakana; - } else { - *d++ = c1; *d = c2; - ctype = grn_str_symbol; - } - break; - case 4 : - case 7 : - *d++ = c1; *d = c2; - ctype = grn_str_symbol; - break; - default : - *d++ = c1; *d = c2; - ctype = grn_str_others; - break; - } - } else { - *d++ = c1; *d = c2; - ctype = grn_str_kanji; - } - } else { - /* skip invalid character */ - continue; - } - } - } else { - unsigned char c = *s; - switch (c >> 4) { - case 0 : - case 1 : - /* skip unprintable ascii */ - if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } - continue; - case 2 : - if (c == 0x20) { - if (removeblankp) { - if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } - continue; - } else { - *d = ' '; - ctype = GRN_STR_BLANK|grn_str_symbol; - } - } else { - *d = c; - ctype = grn_str_symbol; - } - break; - case 3 : - *d = c; - ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol; - break; - case 4 : - *d = ('A' <= c) ? c + 0x20 : c; - ctype = (c == 0x40) ? grn_str_symbol : grn_str_alpha; - break; - case 5 : - *d = (c <= 'Z') ? c + 0x20 : c; - ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol; - break; - case 6 : - *d = c; - ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha; - break; - case 7 : - *d = c; - ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol); - break; - default : - *d = c; - ctype = grn_str_others; - break; - } - } - d++; - length++; - if (cp) { *cp++ = ctype; } - if (ch) { - *ch++ = (int16_t)(s + 1 - s_); - s_ = s + 1; - while (++d_ < d) { *ch++ = 0; } - } - } - if (cp) { *cp = grn_str_null; } - *d = '\0'; - nstr->length = length; - nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm); - return NULL; -} - -inline static grn_obj * -ascii_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) -{ - grn_normalized_text *nstr = (grn_normalized_text *)args[0]; - int16_t *ch; - const unsigned char *s, *s_, *e; - unsigned char *d, *d0, *d_; - uint_least8_t *cp, *ctypes, ctype; - size_t size = nstr->orig_blen, length = 0; - int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK; - if (!(nstr->norm = GRN_MALLOC(size + 1))) { - ERR(GRN_NO_MEMORY_AVAILABLE, - "[normalizer][ascii] failed to allocate normalized text space"); - return NULL; - } - d0 = (unsigned char *) nstr->norm; - if (nstr->flags & GRN_STR_WITH_CHECKS) { - if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) { - GRN_FREE(nstr->norm); - nstr->norm = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[normalizer][ascii] failed to allocate checks space"); - return NULL; - } - } - ch = nstr->checks; - if (nstr->flags & GRN_STR_WITH_CTYPES) { - if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { - GRN_FREE(nstr->checks); - GRN_FREE(nstr->norm); - nstr->checks = NULL; - nstr->norm = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[normalizer][ascii] failed to allocate character types space"); - return NULL; - } - } - cp = ctypes = nstr->ctypes; - e = (unsigned char *)nstr->orig + size; - for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) { - unsigned char c = *s; - switch (c >> 4) { - case 0 : - case 1 : - /* skip unprintable ascii */ - if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } - continue; - case 2 : - if (c == 0x20) { - if (removeblankp) { - if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } - continue; - } else { - *d = ' '; - ctype = GRN_STR_BLANK|grn_str_symbol; - } - } else { - *d = c; - ctype = grn_str_symbol; - } - break; - case 3 : - *d = c; - ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol; - break; - case 4 : - *d = ('A' <= c) ? c + 0x20 : c; - ctype = (c == 0x40) ? grn_str_symbol : grn_str_alpha; - break; - case 5 : - *d = (c <= 'Z') ? c + 0x20 : c; - ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol; - break; - case 6 : - *d = c; - ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha; - break; - case 7 : - *d = c; - ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol); - break; - default : - *d = c; - ctype = grn_str_others; - break; - } - d++; - length++; - if (cp) { *cp++ = ctype; } - if (ch) { - *ch++ = (int16_t)(s + 1 - s_); - s_ = s + 1; - while (++d_ < d) { *ch++ = 0; } - } - } - if (cp) { *cp = grn_str_null; } - *d = '\0'; - nstr->length = length; - nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm); - return NULL; -} - -/* use cp1252 as latin1 */ -inline static grn_obj * -latin1_normalize(grn_ctx *ctx, int nargs, grn_obj **args, - grn_user_data *user_data) -{ - grn_normalized_text *nstr = (grn_normalized_text *)args[0]; - int16_t *ch; - const unsigned char *s, *s_, *e; - unsigned char *d, *d0, *d_; - uint_least8_t *cp, *ctypes, ctype; - size_t size = strlen(nstr->orig), length = 0; - int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK; - if (!(nstr->norm = GRN_MALLOC(size + 1))) { - ERR(GRN_NO_MEMORY_AVAILABLE, - "[normalizer][latin1] failed to allocate normalized text space"); - return NULL; - } - d0 = (unsigned char *) nstr->norm; - if (nstr->flags & GRN_STR_WITH_CHECKS) { - if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) { - GRN_FREE(nstr->norm); - nstr->norm = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[normalizer][latin1] failed to allocate checks space"); - return NULL; - } - } - ch = nstr->checks; - if (nstr->flags & GRN_STR_WITH_CTYPES) { - if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { - GRN_FREE(nstr->checks); - GRN_FREE(nstr->norm); - nstr->checks = NULL; - nstr->norm = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[normalizer][latin1] failed to allocate character types space"); - return NULL; - } - } - cp = ctypes = nstr->ctypes; - e = (unsigned char *)nstr->orig + size; - for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) { - unsigned char c = *s; - switch (c >> 4) { - case 0 : - case 1 : - /* skip unprintable ascii */ - if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } - continue; - case 2 : - if (c == 0x20) { - if (removeblankp) { - if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } - continue; - } else { - *d = ' '; - ctype = GRN_STR_BLANK|grn_str_symbol; - } - } else { - *d = c; - ctype = grn_str_symbol; - } - break; - case 3 : - *d = c; - ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol; - break; - case 4 : - *d = ('A' <= c) ? c + 0x20 : c; - ctype = (c == 0x40) ? grn_str_symbol : grn_str_alpha; - break; - case 5 : - *d = (c <= 'Z') ? c + 0x20 : c; - ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol; - break; - case 6 : - *d = c; - ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha; - break; - case 7 : - *d = c; - ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol); - break; - case 8 : - if (c == 0x8a || c == 0x8c || c == 0x8e) { - *d = c + 0x10; - ctype = grn_str_alpha; - } else { - *d = c; - ctype = grn_str_symbol; - } - break; - case 9 : - if (c == 0x9a || c == 0x9c || c == 0x9e || c == 0x9f) { - *d = (c == 0x9f) ? c + 0x60 : c; - ctype = grn_str_alpha; - } else { - *d = c; - ctype = grn_str_symbol; - } - break; - case 0x0c : - *d = c + 0x20; - ctype = grn_str_alpha; - break; - case 0x0d : - *d = (c == 0xd7 || c == 0xdf) ? c : c + 0x20; - ctype = (c == 0xd7) ? grn_str_symbol : grn_str_alpha; - break; - case 0x0e : - *d = c; - ctype = grn_str_alpha; - break; - case 0x0f : - *d = c; - ctype = (c == 0xf7) ? grn_str_symbol : grn_str_alpha; - break; - default : - *d = c; - ctype = grn_str_others; - break; - } - d++; - length++; - if (cp) { *cp++ = ctype; } - if (ch) { - *ch++ = (int16_t)(s + 1 - s_); - s_ = s + 1; - while (++d_ < d) { *ch++ = 0; } - } - } - if (cp) { *cp = grn_str_null; } - *d = '\0'; - nstr->length = length; - nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm); - return NULL; -} - -inline static grn_obj * -koi8r_normalize(grn_ctx *ctx, int nargs, grn_obj **args, - grn_user_data *user_data) -{ - grn_normalized_text *nstr = (grn_normalized_text *)args[0]; - int16_t *ch; - const unsigned char *s, *s_, *e; - unsigned char *d, *d0, *d_; - uint_least8_t *cp, *ctypes, ctype; - size_t size = strlen(nstr->orig), length = 0; - int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK; - if (!(nstr->norm = GRN_MALLOC(size + 1))) { - ERR(GRN_NO_MEMORY_AVAILABLE, - "[normalizer][koi8r] failed to allocate normalized text space"); - return NULL; - } - d0 = (unsigned char *) nstr->norm; - if (nstr->flags & GRN_STR_WITH_CHECKS) { - if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) { - GRN_FREE(nstr->norm); - nstr->norm = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[normalizer][koi8r] failed to allocate checks space"); - return NULL; - } - } - ch = nstr->checks; - if (nstr->flags & GRN_STR_WITH_CTYPES) { - if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { - GRN_FREE(nstr->checks); - GRN_FREE(nstr->norm); - nstr->checks = NULL; - nstr->norm = NULL; - ERR(GRN_NO_MEMORY_AVAILABLE, - "[normalizer][koi8r] failed to allocate character types space"); - return NULL; - } - } - cp = ctypes = nstr->ctypes; - e = (unsigned char *)nstr->orig + size; - for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) { - unsigned char c = *s; - switch (c >> 4) { - case 0 : - case 1 : - /* skip unprintable ascii */ - if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } - continue; - case 2 : - if (c == 0x20) { - if (removeblankp) { - if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } - continue; - } else { - *d = ' '; - ctype = GRN_STR_BLANK|grn_str_symbol; - } - } else { - *d = c; - ctype = grn_str_symbol; - } - break; - case 3 : - *d = c; - ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol; - break; - case 4 : - *d = ('A' <= c) ? c + 0x20 : c; - ctype = (c == 0x40) ? grn_str_symbol : grn_str_alpha; - break; - case 5 : - *d = (c <= 'Z') ? c + 0x20 : c; - ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol; - break; - case 6 : - *d = c; - ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha; - break; - case 7 : - *d = c; - ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol); - break; - case 0x0a : - *d = c; - ctype = (c == 0xa3) ? grn_str_alpha : grn_str_others; - break; - case 0x0b : - if (c == 0xb3) { - *d = c - 0x10; - ctype = grn_str_alpha; - } else { - *d = c; - ctype = grn_str_others; - } - break; - case 0x0c : - case 0x0d : - *d = c; - ctype = grn_str_alpha; - break; - case 0x0e : - case 0x0f : - *d = c - 0x20; - ctype = grn_str_alpha; - break; - default : - *d = c; - ctype = grn_str_others; - break; - } - d++; - length++; - if (cp) { *cp++ = ctype; } - if (ch) { - *ch++ = (int16_t)(s + 1 - s_); - s_ = s + 1; - while (++d_ < d) { *ch++ = 0; } - } - } - if (cp) { *cp = grn_str_null; } - *d = '\0'; - nstr->length = length; - nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm); - return NULL; -} - -#define DEF_NORMALIZERIZER(name, normalize)\ - (grn_proc_create(ctx, (name), (sizeof(name) - 1),\ - GRN_PROC_NORMALIZER, NULL, (normalize), NULL, 0, NULL)) - -grn_rc -grn_db_init_builtin_normalizers(grn_ctx *ctx) -{ - grn_obj *obj; - - obj = DEF_NORMALIZERIZER("NormalizerASCII", ascii_normalize); - if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_NORMALIZER_ASCII) { - return GRN_FILE_CORRUPT; - } - obj = DEF_NORMALIZERIZER("NormalizerUTF8NFKC", utf8_nfkc_normalize); - if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_NORMALIZER_UTF8_NFKC) { - return GRN_FILE_CORRUPT; - } - obj = DEF_NORMALIZERIZER("NormalizerEUCJP", eucjp_normalize); - if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_NORMALIZER_EUC_JP) { - return GRN_FILE_CORRUPT; - } - obj = DEF_NORMALIZERIZER("NormalizerSJIS", sjis_normalize); - if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_NORMALIZER_SJIS) { - return GRN_FILE_CORRUPT; - } - obj = DEF_NORMALIZERIZER("NormalizerLATIN1", latin1_normalize); - if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_NORMALIZER_LATIN1) { - return GRN_FILE_CORRUPT; - } - obj = DEF_NORMALIZERIZER("NormalizerKOI8R", koi8r_normalize); - if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_NORMALIZER_KOI8R) { - return GRN_FILE_CORRUPT; - } - /* obj = DEF_NORMALIZERIZER("NormalizerUTF8UCA", utf8_uca_normalize); */ - /* if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_NORMALIZER_UTF8_UCA) { */ - /* return GRN_FILE_CORRUPT; */ - /* } */ - - return GRN_SUCCESS; -} Deleted: lib/normalizer.h (+0 -67) 100644 =================================================================== --- lib/normalizer.h 2012-02-14 14:00:31 +0900 (39c1e36) +++ /dev/null @@ -1,67 +0,0 @@ -/* -*- c-basic-offset: 2 -*- */ -/* - Copyright(C) 2012 Brazil - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License version 2.1 as published by the Free Software Foundation. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -*/ -#ifndef GRN_NORMALIZER_H -#define GRN_NORMALIZER_H - -#ifndef GROONGA_IN_H -#include "groonga_in.h" -#endif /* GROONGA_IN_H */ - -#ifndef GRN_CTX_H -#include "ctx.h" -#endif /* GRN_CTX_H */ - -#ifndef GRN_DB_H -#include "db.h" -#endif /* GRN_DB_H */ - -#ifndef GRN_STR_H -#include "str.h" -#endif /* GRN_STR_H */ - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct { - grn_obj_header header; - const char *orig; - unsigned int orig_blen; - char *norm; - unsigned int norm_blen; - unsigned int length; - short *checks; - unsigned char *ctypes; - grn_encoding encoding; - int flags; -} grn_normalized_text; - -grn_rc grn_normalizer_init(void); -grn_rc grn_normalizer_fin(void); - -grn_rc grn_normalized_text_close(grn_ctx *ctx, grn_obj *normalized_text); - -grn_id grn_normalizer_find(grn_ctx *ctx, grn_encoding encoding); - -grn_rc grn_db_init_builtin_normalizers(grn_ctx *ctx); - -#ifdef __cplusplus -} -#endif - -#endif /* GRN_NORMALIZER_H */ Modified: lib/pat.c (+1 -17) =================================================================== --- lib/pat.c 2012-02-14 14:00:31 +0900 (e575a3e) +++ lib/pat.c 2012-02-14 14:00:52 +0900 (525eba0) @@ -19,7 +19,6 @@ #include <limits.h> #include "pat.h" #include "output.h" -#include "normalizer.h" #include "util.h" #define GRN_PAT_DELETED (GRN_ID_MAX + 1) @@ -423,14 +422,6 @@ _grn_pat_create(grn_ctx *ctx, grn_pat *pat, header->curr_del3 = 0; header->n_garbages = 0; header->tokenizer = GRN_ID_NIL; - if (header->flags & GRN_OBJ_KEY_NORMALIZE) { - header->flags &= ~GRN_OBJ_KEY_NORMALIZE; - header->normalizer = grn_normalizer_find(ctx, ctx->encoding); - pat->normalizer = grn_ctx_at(ctx, header->normalizer); - } else { - header->normalizer = GRN_ID_NIL; - pat->normalizer = NULL; - } pat->io = io; pat->header = header; pat->key_size = key_size; @@ -527,11 +518,6 @@ grn_pat_open(grn_ctx *ctx, const char *path) pat->encoding = header->encoding; pat->obj.header.flags = header->flags; pat->tokenizer = grn_ctx_at(ctx, header->tokenizer); - if (header->flags & GRN_OBJ_KEY_NORMALIZE) { - header->flags &= ~GRN_OBJ_KEY_NORMALIZE; - header->normalizer = grn_normalizer_find(ctx, ctx->encoding); - } - pat->normalizer = grn_ctx_at(ctx, header->normalizer); PAT_AT(pat, 0, node0); if (!node0) { grn_io_close(ctx, io); @@ -2285,7 +2271,7 @@ grn_pat_check(grn_ctx *ctx, grn_pat *pat) char buf[8]; struct grn_pat_header *h = pat->header; GRN_OUTPUT_ARRAY_OPEN("RESULT", 1); - GRN_OUTPUT_MAP_OPEN("SUMMARY", 23); + GRN_OUTPUT_MAP_OPEN("SUMMARY", 22); GRN_OUTPUT_CSTR("flags"); grn_itoh(h->flags, buf, 8); GRN_OUTPUT_STR(buf, 8); @@ -2295,8 +2281,6 @@ grn_pat_check(grn_ctx *ctx, grn_pat *pat) GRN_OUTPUT_INT64(h->value_size); GRN_OUTPUT_CSTR("tokenizer"); GRN_OUTPUT_INT64(h->tokenizer); - GRN_OUTPUT_CSTR("normalizer"); - GRN_OUTPUT_INT64(h->normalizer); GRN_OUTPUT_CSTR("n_entries"); GRN_OUTPUT_INT64(h->n_entries); GRN_OUTPUT_CSTR("curr_rec"); Modified: lib/pat.h (+1 -3) =================================================================== --- lib/pat.h 2012-02-14 14:00:31 +0900 (32acdea) +++ lib/pat.h 2012-02-14 14:00:52 +0900 (30e484a) @@ -38,7 +38,6 @@ struct _grn_pat { uint32_t key_size; uint32_t value_size; grn_obj *tokenizer; - grn_obj *normalizer; grn_id *cache; uint32_t cache_size; }; @@ -65,8 +64,7 @@ struct grn_pat_header { int32_t curr_del2; int32_t curr_del3; uint32_t n_garbages; - grn_id normalizer; - uint32_t reserved[1004]; + uint32_t reserved[1005]; grn_pat_delinfo delinfos[GRN_PAT_NDELINFOS]; grn_id garbages[GRN_PAT_MAX_KEY_SIZE + 1]; }; Modified: lib/snip.c (+1 -1) =================================================================== --- lib/snip.c 2012-02-14 14:00:31 +0900 (0f0f58d) +++ lib/snip.c 2012-02-14 14:00:52 +0900 (cfe958a) @@ -247,7 +247,7 @@ grn_snip_cond_close(grn_ctx *ctx, snip_cond *cond) grn_rc grn_snip_cond_init(grn_ctx *ctx, snip_cond *sc, const char *keyword, unsigned int keyword_len, - grn_encoding enc, int flags) + grn_encoding enc, int flags) { size_t norm_blen; int f = GRN_STR_REMOVEBLANK; Modified: lib/str.c (+988 -32) =================================================================== --- lib/str.c 2012-02-14 14:00:31 +0900 (2865a7b) +++ lib/str.c 2012-02-14 14:00:52 +0900 (f6f518c) @@ -20,14 +20,13 @@ #include <string.h> #include "db.h" #include "str.h" -#include "normalizer.h" #ifndef _ISOC99_SOURCE #define _ISOC99_SOURCE #endif /* _ISOC99_SOURCE */ #include <math.h> -int +inline static int grn_str_charlen_utf8(grn_ctx *ctx, const unsigned char *str, const unsigned char *end) { /* MEMO: This function allows non-null-terminated string as str. */ @@ -171,6 +170,952 @@ grn_charlen(grn_ctx *ctx, const char *str, const char *end) return grn_charlen_(ctx, str, end, ctx->encoding); } +static unsigned char symbol[] = { + ',', '.', 0, ':', ';', '?', '!', 0, 0, 0, '`', 0, '^', '~', '_', 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, '-', '-', '/', '\\', 0, 0, '|', 0, 0, 0, '\'', 0, + '"', '(', ')', 0, 0, '[', ']', '{', '}', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + '+', '-', 0, 0, 0, '=', 0, '<', '>', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + '$', 0, 0, '%', '#', '&', '*', '@', 0, 0, 0, 0, 0, 0, 0, 0 +}; + +inline static grn_rc +normalize_euc(grn_ctx *ctx, grn_str *nstr) +{ + static uint16_t hankana[] = { + 0xa1a1, 0xa1a3, 0xa1d6, 0xa1d7, 0xa1a2, 0xa1a6, 0xa5f2, 0xa5a1, 0xa5a3, + 0xa5a5, 0xa5a7, 0xa5a9, 0xa5e3, 0xa5e5, 0xa5e7, 0xa5c3, 0xa1bc, 0xa5a2, + 0xa5a4, 0xa5a6, 0xa5a8, 0xa5aa, 0xa5ab, 0xa5ad, 0xa5af, 0xa5b1, 0xa5b3, + 0xa5b5, 0xa5b7, 0xa5b9, 0xa5bb, 0xa5bd, 0xa5bf, 0xa5c1, 0xa5c4, 0xa5c6, + 0xa5c8, 0xa5ca, 0xa5cb, 0xa5cc, 0xa5cd, 0xa5ce, 0xa5cf, 0xa5d2, 0xa5d5, + 0xa5d8, 0xa5db, 0xa5de, 0xa5df, 0xa5e0, 0xa5e1, 0xa5e2, 0xa5e4, 0xa5e6, + 0xa5e8, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5ef, 0xa5f3, 0xa1ab, + 0xa1eb + }; + static unsigned char dakuten[] = { + 0xf4, 0, 0, 0, 0, 0xac, 0, 0xae, 0, 0xb0, 0, 0xb2, 0, 0xb4, 0, 0xb6, 0, + 0xb8, 0, 0xba, 0, 0xbc, 0, 0xbe, 0, 0xc0, 0, 0xc2, 0, 0, 0xc5, 0, 0xc7, + 0, 0xc9, 0, 0, 0, 0, 0, 0, 0xd0, 0, 0, 0xd3, 0, 0, 0xd6, 0, 0, 0xd9, 0, + 0, 0xdc + }; + static unsigned char handaku[] = { + 0xd1, 0, 0, 0xd4, 0, 0, 0xd7, 0, 0, 0xda, 0, 0, 0xdd + }; + int16_t *ch; + const unsigned char *s, *s_, *e; + unsigned char *d, *d0, *d_, b; + uint_least8_t *cp, *ctypes, ctype; + size_t size = nstr->orig_blen, length = 0; + int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK; + if (!(nstr->norm = GRN_MALLOC(size * 2 + 1))) { + return GRN_NO_MEMORY_AVAILABLE; + } + d0 = (unsigned char *) nstr->norm; + if (nstr->flags & GRN_STR_WITH_CHECKS) { + if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) { + GRN_FREE(nstr->norm); + nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + } + ch = nstr->checks; + if (nstr->flags & GRN_STR_WITH_CTYPES) { + if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { + GRN_FREE(nstr->checks); + GRN_FREE(nstr->norm); + nstr->checks = NULL; + nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + } + cp = ctypes = nstr->ctypes; + e = (unsigned char *)nstr->orig + size; + for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) { + if ((*s & 0x80)) { + if (((s + 1) < e) && (*(s + 1) & 0x80)) { + unsigned char c1 = *s++, c2 = *s, c3 = 0; + switch (c1 >> 4) { + case 0x08 : + if (c1 == 0x8e && 0xa0 <= c2 && c2 <= 0xdf) { + uint16_t c = hankana[c2 - 0xa0]; + switch (c) { + case 0xa1ab : + if (d > d0 + 1 && d[-2] == 0xa5 + && 0xa6 <= d[-1] && d[-1] <= 0xdb && (b = dakuten[d[-1] - 0xa6])) { + *(d - 1) = b; + if (ch) { ch[-1] += 2; s_ += 2; } + continue; + } else { + *d++ = c >> 8; *d = c & 0xff; + } + break; + case 0xa1eb : + if (d > d0 + 1 && d[-2] == 0xa5 + && 0xcf <= d[-1] && d[-1] <= 0xdb && (b = handaku[d[-1] - 0xcf])) { + *(d - 1) = b; + if (ch) { ch[-1] += 2; s_ += 2; } + continue; + } else { + *d++ = c >> 8; *d = c & 0xff; + } + break; + default : + *d++ = c >> 8; *d = c & 0xff; + break; + } + ctype = grn_str_katakana; + } else { + *d++ = c1; *d = c2; + ctype = grn_str_others; + } + break; + case 0x09 : + *d++ = c1; *d = c2; + ctype = grn_str_others; + break; + case 0x0a : + switch (c1 & 0x0f) { + case 1 : + switch (c2) { + case 0xbc : + *d++ = c1; *d = c2; + ctype = grn_str_katakana; + break; + case 0xb9 : + *d++ = c1; *d = c2; + ctype = grn_str_kanji; + break; + case 0xa1 : + if (removeblankp) { + if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } + continue; + } else { + *d = ' '; + ctype = GRN_STR_BLANK|grn_str_symbol; + } + break; + default : + if (c2 >= 0xa4 && (c3 = symbol[c2 - 0xa4])) { + *d = c3; + ctype = grn_str_symbol; + } else { + *d++ = c1; *d = c2; + ctype = grn_str_others; + } + break; + } + break; + case 2 : + *d++ = c1; *d = c2; + ctype = grn_str_symbol; + break; + case 3 : + c3 = c2 - 0x80; + if ('a' <= c3 && c3 <= 'z') { + ctype = grn_str_alpha; + *d = c3; + } else if ('A' <= c3 && c3 <= 'Z') { + ctype = grn_str_alpha; + *d = c3 + 0x20; + } else if ('0' <= c3 && c3 <= '9') { + ctype = grn_str_digit; + *d = c3; + } else { + ctype = grn_str_others; + *d++ = c1; *d = c2; + } + break; + case 4 : + *d++ = c1; *d = c2; + ctype = grn_str_hiragana; + break; + case 5 : + *d++ = c1; *d = c2; + ctype = grn_str_katakana; + break; + case 6 : + case 7 : + case 8 : + *d++ = c1; *d = c2; + ctype = grn_str_symbol; + break; + default : + *d++ = c1; *d = c2; + ctype = grn_str_others; + break; + } + break; + default : + *d++ = c1; *d = c2; + ctype = grn_str_kanji; + break; + } + } else { + /* skip invalid character */ + continue; + } + } else { + unsigned char c = *s; + switch (c >> 4) { + case 0 : + case 1 : + /* skip unprintable ascii */ + if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } + continue; + case 2 : + if (c == 0x20) { + if (removeblankp) { + if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } + continue; + } else { + *d = ' '; + ctype = GRN_STR_BLANK|grn_str_symbol; + } + } else { + *d = c; + ctype = grn_str_symbol; + } + break; + case 3 : + *d = c; + ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol; + break; + case 4 : + *d = ('A' <= c) ? c + 0x20 : c; + ctype = (c == 0x40) ? grn_str_symbol : grn_str_alpha; + break; + case 5 : + *d = (c <= 'Z') ? c + 0x20 : c; + ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol; + break; + case 6 : + *d = c; + ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha; + break; + case 7 : + *d = c; + ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol); + break; + default : + *d = c; + ctype = grn_str_others; + break; + } + } + d++; + length++; + if (cp) { *cp++ = ctype; } + if (ch) { + *ch++ = (int16_t)(s + 1 - s_); + s_ = s + 1; + while (++d_ < d) { *ch++ = 0; } + } + } + if (cp) { *cp = grn_str_null; } + *d = '\0'; + nstr->length = length; + nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm); + return GRN_SUCCESS; +} + +#ifndef NO_NFKC +uint_least8_t grn_nfkc_ctype(const unsigned char *str); +const char *grn_nfkc_map1(const unsigned char *str); +const char *grn_nfkc_map2(const unsigned char *prefix, const unsigned char *suffix); + +inline static grn_rc +normalize_utf8(grn_ctx *ctx, grn_str *nstr) +{ + int16_t *ch; + const unsigned char *s, *s_, *s__ = NULL, *p, *p2, *pe, *e; + unsigned char *d, *d_, *de; + uint_least8_t *cp; + size_t length = 0, ls, lp, size = nstr->orig_blen, ds = size * 3; + int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK; + if (!(nstr->norm = GRN_MALLOC(ds + 1))) { + return GRN_NO_MEMORY_AVAILABLE; + } + if (nstr->flags & GRN_STR_WITH_CHECKS) { + if (!(nstr->checks = GRN_MALLOC(ds * sizeof(int16_t) + 1))) { + GRN_FREE(nstr->norm); nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + } + ch = nstr->checks; + if (nstr->flags & GRN_STR_WITH_CTYPES) { + if (!(nstr->ctypes = GRN_MALLOC(ds + 1))) { + if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; } + GRN_FREE(nstr->norm); nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + } + cp = nstr->ctypes; + d = (unsigned char *)nstr->norm; + de = d + ds; + d_ = NULL; + e = (unsigned char *)nstr->orig + size; + for (s = s_ = (unsigned char *)nstr->orig; ; s += ls) { + if (!(ls = grn_str_charlen_utf8(ctx, s, e))) { + break; + } + if ((p = (unsigned char *)grn_nfkc_map1(s))) { + pe = p + strlen((char *)p); + } else { + p = s; + pe = p + ls; + } + if (d_ && (p2 = (unsigned char *)grn_nfkc_map2(d_, p))) { + p = p2; + pe = p + strlen((char *)p); + if (cp) { cp--; } + if (ch) { + ch -= (d - d_); + s_ = s__; + } + d = d_; + length--; + } + for (; ; p += lp) { + if (!(lp = grn_str_charlen_utf8(ctx, p, pe))) { + break; + } + if ((*p == ' ' && removeblankp) || *p < 0x20 /* skip unprintable ascii */ ) { + if (cp > nstr->ctypes) { *(cp - 1) |= GRN_STR_BLANK; } + } else { + if (de <= d + lp) { + unsigned char *norm; + ds += (ds >> 1) + lp; + if (!(norm = GRN_REALLOC(nstr->norm, ds + 1))) { + if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; } + if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; } + GRN_FREE(nstr->norm); nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + de = norm + ds; + d = norm + (d - (unsigned char *)nstr->norm); + nstr->norm = norm; + if (ch) { + int16_t *checks; + if (!(checks = GRN_REALLOC(nstr->checks, ds * sizeof(int16_t)+ 1))) { + if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; } + GRN_FREE(nstr->checks); nstr->checks = NULL; + GRN_FREE(nstr->norm); nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + ch = checks + (ch - nstr->checks); + nstr->checks = checks; + } + if (cp) { + uint_least8_t *ctypes; + if (!(ctypes = GRN_REALLOC(nstr->ctypes, ds + 1))) { + GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; + if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; } + GRN_FREE(nstr->norm); nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + cp = ctypes + (cp - nstr->ctypes); + nstr->ctypes = ctypes; + } + } + memcpy(d, p, lp); + d_ = d; + d += lp; + length++; + if (cp) { *cp++ = grn_nfkc_ctype(p); } + if (ch) { + size_t i; + if (s_ == s + ls) { + *ch++ = -1; + } else { + *ch++ = (int16_t)(s + ls - s_); + s__ = s_; + s_ = s + ls; + } + for (i = lp; i > 1; i--) { *ch++ = 0; } + } + } + } + } + if (cp) { *cp = grn_str_null; } + *d = '\0'; + nstr->length = length; + nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm); + return GRN_SUCCESS; +} +#endif /* NO_NFKC */ + +inline static grn_rc +normalize_sjis(grn_ctx *ctx, grn_str *nstr) +{ + static uint16_t hankana[] = { + 0x8140, 0x8142, 0x8175, 0x8176, 0x8141, 0x8145, 0x8392, 0x8340, 0x8342, + 0x8344, 0x8346, 0x8348, 0x8383, 0x8385, 0x8387, 0x8362, 0x815b, 0x8341, + 0x8343, 0x8345, 0x8347, 0x8349, 0x834a, 0x834c, 0x834e, 0x8350, 0x8352, + 0x8354, 0x8356, 0x8358, 0x835a, 0x835c, 0x835e, 0x8360, 0x8363, 0x8365, + 0x8367, 0x8369, 0x836a, 0x836b, 0x836c, 0x836d, 0x836e, 0x8371, 0x8374, + 0x8377, 0x837a, 0x837d, 0x837e, 0x8380, 0x8381, 0x8382, 0x8384, 0x8386, + 0x8388, 0x8389, 0x838a, 0x838b, 0x838c, 0x838d, 0x838f, 0x8393, 0x814a, + 0x814b + }; + static unsigned char dakuten[] = { + 0x94, 0, 0, 0, 0, 0x4b, 0, 0x4d, 0, 0x4f, 0, 0x51, 0, 0x53, 0, 0x55, 0, + 0x57, 0, 0x59, 0, 0x5b, 0, 0x5d, 0, 0x5f, 0, 0x61, 0, 0, 0x64, 0, 0x66, + 0, 0x68, 0, 0, 0, 0, 0, 0, 0x6f, 0, 0, 0x72, 0, 0, 0x75, 0, 0, 0x78, 0, + 0, 0x7b + }; + static unsigned char handaku[] = { + 0x70, 0, 0, 0x73, 0, 0, 0x76, 0, 0, 0x79, 0, 0, 0x7c + }; + int16_t *ch; + const unsigned char *s, *s_; + unsigned char *d, *d0, *d_, b, *e; + uint_least8_t *cp, *ctypes, ctype; + size_t size = nstr->orig_blen, length = 0; + int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK; + if (!(nstr->norm = GRN_MALLOC(size * 2 + 1))) { + return GRN_NO_MEMORY_AVAILABLE; + } + d0 = (unsigned char *) nstr->norm; + if (nstr->flags & GRN_STR_WITH_CHECKS) { + if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) { + GRN_FREE(nstr->norm); + nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + } + ch = nstr->checks; + if (nstr->flags & GRN_STR_WITH_CTYPES) { + if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { + GRN_FREE(nstr->checks); + GRN_FREE(nstr->norm); + nstr->checks = NULL; + nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + } + cp = ctypes = nstr->ctypes; + e = (unsigned char *)nstr->orig + size; + for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) { + if ((*s & 0x80)) { + if (0xa0 <= *s && *s <= 0xdf) { + uint16_t c = hankana[*s - 0xa0]; + switch (c) { + case 0x814a : + if (d > d0 + 1 && d[-2] == 0x83 + && 0x45 <= d[-1] && d[-1] <= 0x7a && (b = dakuten[d[-1] - 0x45])) { + *(d - 1) = b; + if (ch) { ch[-1]++; s_++; } + continue; + } else { + *d++ = c >> 8; *d = c & 0xff; + } + break; + case 0x814b : + if (d > d0 + 1 && d[-2] == 0x83 + && 0x6e <= d[-1] && d[-1] <= 0x7a && (b = handaku[d[-1] - 0x6e])) { + *(d - 1) = b; + if (ch) { ch[-1]++; s_++; } + continue; + } else { + *d++ = c >> 8; *d = c & 0xff; + } + break; + default : + *d++ = c >> 8; *d = c & 0xff; + break; + } + ctype = grn_str_katakana; + } else { + if ((s + 1) < e && 0x40 <= *(s + 1) && *(s + 1) <= 0xfc) { + unsigned char c1 = *s++, c2 = *s, c3 = 0; + if (0x81 <= c1 && c1 <= 0x87) { + switch (c1 & 0x0f) { + case 1 : + switch (c2) { + case 0x5b : + *d++ = c1; *d = c2; + ctype = grn_str_katakana; + break; + case 0x58 : + *d++ = c1; *d = c2; + ctype = grn_str_kanji; + break; + case 0x40 : + if (removeblankp) { + if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } + continue; + } else { + *d = ' '; + ctype = GRN_STR_BLANK|grn_str_symbol; + } + break; + default : + if (0x43 <= c2 && c2 <= 0x7e && (c3 = symbol[c2 - 0x43])) { + *d = c3; + ctype = grn_str_symbol; + } else if (0x7f <= c2 && c2 <= 0x97 && (c3 = symbol[c2 - 0x44])) { + *d = c3; + ctype = grn_str_symbol; + } else { + *d++ = c1; *d = c2; + ctype = grn_str_others; + } + break; + } + break; + case 2 : + c3 = c2 - 0x1f; + if (0x4f <= c2 && c2 <= 0x58) { + ctype = grn_str_digit; + *d = c2 - 0x1f; + } else if (0x60 <= c2 && c2 <= 0x79) { + ctype = grn_str_alpha; + *d = c2 + 0x01; + } else if (0x81 <= c2 && c2 <= 0x9a) { + ctype = grn_str_alpha; + *d = c2 - 0x20; + } else if (0x9f <= c2 && c2 <= 0xf1) { + *d++ = c1; *d = c2; + ctype = grn_str_hiragana; + } else { + *d++ = c1; *d = c2; + ctype = grn_str_others; + } + break; + case 3 : + if (0x40 <= c2 && c2 <= 0x96) { + *d++ = c1; *d = c2; + ctype = grn_str_katakana; + } else { + *d++ = c1; *d = c2; + ctype = grn_str_symbol; + } + break; + case 4 : + case 7 : + *d++ = c1; *d = c2; + ctype = grn_str_symbol; + break; + default : + *d++ = c1; *d = c2; + ctype = grn_str_others; + break; + } + } else { + *d++ = c1; *d = c2; + ctype = grn_str_kanji; + } + } else { + /* skip invalid character */ + continue; + } + } + } else { + unsigned char c = *s; + switch (c >> 4) { + case 0 : + case 1 : + /* skip unprintable ascii */ + if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } + continue; + case 2 : + if (c == 0x20) { + if (removeblankp) { + if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } + continue; + } else { + *d = ' '; + ctype = GRN_STR_BLANK|grn_str_symbol; + } + } else { + *d = c; + ctype = grn_str_symbol; + } + break; + case 3 : + *d = c; + ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol; + break; + case 4 : + *d = ('A' <= c) ? c + 0x20 : c; + ctype = (c == 0x40) ? grn_str_symbol : grn_str_alpha; + break; + case 5 : + *d = (c <= 'Z') ? c + 0x20 : c; + ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol; + break; + case 6 : + *d = c; + ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha; + break; + case 7 : + *d = c; + ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol); + break; + default : + *d = c; + ctype = grn_str_others; + break; + } + } + d++; + length++; + if (cp) { *cp++ = ctype; } + if (ch) { + *ch++ = (int16_t)(s + 1 - s_); + s_ = s + 1; + while (++d_ < d) { *ch++ = 0; } + } + } + if (cp) { *cp = grn_str_null; } + *d = '\0'; + nstr->length = length; + nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm); + return GRN_SUCCESS; +} + +inline static grn_rc +normalize_none(grn_ctx *ctx, grn_str *nstr) +{ + int16_t *ch; + const unsigned char *s, *s_, *e; + unsigned char *d, *d0, *d_; + uint_least8_t *cp, *ctypes, ctype; + size_t size = nstr->orig_blen, length = 0; + int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK; + if (!(nstr->norm = GRN_MALLOC(size + 1))) { + return GRN_NO_MEMORY_AVAILABLE; + } + d0 = (unsigned char *) nstr->norm; + if (nstr->flags & GRN_STR_WITH_CHECKS) { + if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) { + GRN_FREE(nstr->norm); + nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + } + ch = nstr->checks; + if (nstr->flags & GRN_STR_WITH_CTYPES) { + if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { + GRN_FREE(nstr->checks); + GRN_FREE(nstr->norm); + nstr->checks = NULL; + nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + } + cp = ctypes = nstr->ctypes; + e = (unsigned char *)nstr->orig + size; + for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) { + unsigned char c = *s; + switch (c >> 4) { + case 0 : + case 1 : + /* skip unprintable ascii */ + if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } + continue; + case 2 : + if (c == 0x20) { + if (removeblankp) { + if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } + continue; + } else { + *d = ' '; + ctype = GRN_STR_BLANK|grn_str_symbol; + } + } else { + *d = c; + ctype = grn_str_symbol; + } + break; + case 3 : + *d = c; + ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol; + break; + case 4 : + *d = ('A' <= c) ? c + 0x20 : c; + ctype = (c == 0x40) ? grn_str_symbol : grn_str_alpha; + break; + case 5 : + *d = (c <= 'Z') ? c + 0x20 : c; + ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol; + break; + case 6 : + *d = c; + ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha; + break; + case 7 : + *d = c; + ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol); + break; + default : + *d = c; + ctype = grn_str_others; + break; + } + d++; + length++; + if (cp) { *cp++ = ctype; } + if (ch) { + *ch++ = (int16_t)(s + 1 - s_); + s_ = s + 1; + while (++d_ < d) { *ch++ = 0; } + } + } + if (cp) { *cp = grn_str_null; } + *d = '\0'; + nstr->length = length; + nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm); + return GRN_SUCCESS; +} + +/* use cp1252 as latin1 */ +inline static grn_rc +normalize_latin1(grn_ctx *ctx, grn_str *nstr) +{ + int16_t *ch; + const unsigned char *s, *s_, *e; + unsigned char *d, *d0, *d_; + uint_least8_t *cp, *ctypes, ctype; + size_t size = strlen(nstr->orig), length = 0; + int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK; + if (!(nstr->norm = GRN_MALLOC(size + 1))) { + return GRN_NO_MEMORY_AVAILABLE; + } + d0 = (unsigned char *) nstr->norm; + if (nstr->flags & GRN_STR_WITH_CHECKS) { + if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) { + GRN_FREE(nstr->norm); + nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + } + ch = nstr->checks; + if (nstr->flags & GRN_STR_WITH_CTYPES) { + if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { + GRN_FREE(nstr->checks); + GRN_FREE(nstr->norm); + nstr->checks = NULL; + nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + } + cp = ctypes = nstr->ctypes; + e = (unsigned char *)nstr->orig + size; + for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) { + unsigned char c = *s; + switch (c >> 4) { + case 0 : + case 1 : + /* skip unprintable ascii */ + if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } + continue; + case 2 : + if (c == 0x20) { + if (removeblankp) { + if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } + continue; + } else { + *d = ' '; + ctype = GRN_STR_BLANK|grn_str_symbol; + } + } else { + *d = c; + ctype = grn_str_symbol; + } + break; + case 3 : + *d = c; + ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol; + break; + case 4 : + *d = ('A' <= c) ? c + 0x20 : c; + ctype = (c == 0x40) ? grn_str_symbol : grn_str_alpha; + break; + case 5 : + *d = (c <= 'Z') ? c + 0x20 : c; + ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol; + break; + case 6 : + *d = c; + ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha; + break; + case 7 : + *d = c; + ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol); + break; + case 8 : + if (c == 0x8a || c == 0x8c || c == 0x8e) { + *d = c + 0x10; + ctype = grn_str_alpha; + } else { + *d = c; + ctype = grn_str_symbol; + } + break; + case 9 : + if (c == 0x9a || c == 0x9c || c == 0x9e || c == 0x9f) { + *d = (c == 0x9f) ? c + 0x60 : c; + ctype = grn_str_alpha; + } else { + *d = c; + ctype = grn_str_symbol; + } + break; + case 0x0c : + *d = c + 0x20; + ctype = grn_str_alpha; + break; + case 0x0d : + *d = (c == 0xd7 || c == 0xdf) ? c : c + 0x20; + ctype = (c == 0xd7) ? grn_str_symbol : grn_str_alpha; + break; + case 0x0e : + *d = c; + ctype = grn_str_alpha; + break; + case 0x0f : + *d = c; + ctype = (c == 0xf7) ? grn_str_symbol : grn_str_alpha; + break; + default : + *d = c; + ctype = grn_str_others; + break; + } + d++; + length++; + if (cp) { *cp++ = ctype; } + if (ch) { + *ch++ = (int16_t)(s + 1 - s_); + s_ = s + 1; + while (++d_ < d) { *ch++ = 0; } + } + } + if (cp) { *cp = grn_str_null; } + *d = '\0'; + nstr->length = length; + nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm); + return GRN_SUCCESS; +} + +inline static grn_rc +normalize_koi8r(grn_ctx *ctx, grn_str *nstr) +{ + int16_t *ch; + const unsigned char *s, *s_, *e; + unsigned char *d, *d0, *d_; + uint_least8_t *cp, *ctypes, ctype; + size_t size = strlen(nstr->orig), length = 0; + int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK; + if (!(nstr->norm = GRN_MALLOC(size + 1))) { + return GRN_NO_MEMORY_AVAILABLE; + } + d0 = (unsigned char *) nstr->norm; + if (nstr->flags & GRN_STR_WITH_CHECKS) { + if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) { + GRN_FREE(nstr->norm); + nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + } + ch = nstr->checks; + if (nstr->flags & GRN_STR_WITH_CTYPES) { + if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { + GRN_FREE(nstr->checks); + GRN_FREE(nstr->norm); + nstr->checks = NULL; + nstr->norm = NULL; + return GRN_NO_MEMORY_AVAILABLE; + } + } + cp = ctypes = nstr->ctypes; + e = (unsigned char *)nstr->orig + size; + for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) { + unsigned char c = *s; + switch (c >> 4) { + case 0 : + case 1 : + /* skip unprintable ascii */ + if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } + continue; + case 2 : + if (c == 0x20) { + if (removeblankp) { + if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; } + continue; + } else { + *d = ' '; + ctype = GRN_STR_BLANK|grn_str_symbol; + } + } else { + *d = c; + ctype = grn_str_symbol; + } + break; + case 3 : + *d = c; + ctype = (c <= 0x39) ? grn_str_digit : grn_str_symbol; + break; + case 4 : + *d = ('A' <= c) ? c + 0x20 : c; + ctype = (c == 0x40) ? grn_str_symbol : grn_str_alpha; + break; + case 5 : + *d = (c <= 'Z') ? c + 0x20 : c; + ctype = (c <= 0x5a) ? grn_str_alpha : grn_str_symbol; + break; + case 6 : + *d = c; + ctype = (c == 0x60) ? grn_str_symbol : grn_str_alpha; + break; + case 7 : + *d = c; + ctype = (c <= 0x7a) ? grn_str_alpha : (c == 0x7f ? grn_str_others : grn_str_symbol); + break; + case 0x0a : + *d = c; + ctype = (c == 0xa3) ? grn_str_alpha : grn_str_others; + break; + case 0x0b : + if (c == 0xb3) { + *d = c - 0x10; + ctype = grn_str_alpha; + } else { + *d = c; + ctype = grn_str_others; + } + break; + case 0x0c : + case 0x0d : + *d = c; + ctype = grn_str_alpha; + break; + case 0x0e : + case 0x0f : + *d = c - 0x20; + ctype = grn_str_alpha; + break; + default : + *d = c; + ctype = grn_str_others; + break; + } + d++; + length++; + if (cp) { *cp++ = ctype; } + if (ch) { + *ch++ = (int16_t)(s + 1 - s_); + s_ = s + 1; + while (++d_ < d) { *ch++ = 0; } + } + } + if (cp) { *cp = grn_str_null; } + *d = '\0'; + nstr->length = length; + nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm); + return GRN_SUCCESS; +} + static grn_str * grn_fakenstr_open(grn_ctx *ctx, const char *str, size_t str_len, grn_encoding encoding, int flags) { @@ -257,42 +1202,53 @@ grn_fakenstr_open(grn_ctx *ctx, const char *str, size_t str_len, grn_encoding en grn_str * grn_str_open_(grn_ctx *ctx, const char *str, unsigned int str_len, int flags, grn_encoding encoding) { - grn_str *nstr = NULL; - grn_id normalizer_id; - grn_obj *normalizer; - grn_obj *normalized_text_obj; + grn_rc rc; + grn_str *nstr; if (!str || !str_len) { return NULL; } if (!(flags & GRN_STR_NORMALIZE)) { return grn_fakenstr_open(ctx, str, str_len, encoding, flags); } - normalizer_id = grn_normalizer_find(ctx, encoding); - normalizer = grn_ctx_at(ctx, normalizer_id); - normalized_text_obj = grn_normalized_text_open(ctx, normalizer, str, str_len, - encoding, flags); - if (normalized_text_obj) { - grn_normalized_text *normalized_text; - if (!(nstr = GRN_MALLOC(sizeof(grn_str)))) { - GRN_LOG(ctx, GRN_LOG_ALERT, "memory allocation on grn_str_open failed !"); - grn_obj_close(ctx, normalized_text_obj); - return NULL; - } - normalized_text = (grn_normalized_text *)normalized_text_obj; - nstr->orig = normalized_text->orig; - nstr->orig_blen = normalized_text->orig_blen; - nstr->norm = normalized_text->norm; - normalized_text->norm = NULL; - nstr->norm_blen = normalized_text->norm_blen; - normalized_text->norm_blen = 0; - nstr->length = normalized_text->length; - nstr->checks = normalized_text->checks; - normalized_text->checks = NULL; - nstr->ctypes = normalized_text->ctypes; - normalized_text->ctypes = NULL; - nstr->encoding = encoding; - nstr->flags = flags; - grn_obj_close(ctx, normalized_text_obj); + if (!(nstr = GRN_MALLOC(sizeof(grn_str)))) { + GRN_LOG(ctx, GRN_LOG_ALERT, "memory allocation on grn_str_open failed !"); + return NULL; + } + nstr->orig = str; + nstr->orig_blen = str_len; + nstr->norm = NULL; + nstr->norm_blen = 0; + nstr->checks = NULL; + nstr->ctypes = NULL; + nstr->encoding = encoding; + nstr->flags = flags; + switch (encoding) { + case GRN_ENC_EUC_JP : + rc = normalize_euc(ctx, nstr); + break; + case GRN_ENC_UTF8 : +#ifdef NO_NFKC + rc = normalize_none(ctx, nstr); +#else /* NO_NFKC */ + rc = normalize_utf8(ctx, nstr); +#endif /* NO_NFKC */ + break; + case GRN_ENC_SJIS : + rc = normalize_sjis(ctx, nstr); + break; + case GRN_ENC_LATIN1 : + rc = normalize_latin1(ctx, nstr); + break; + case GRN_ENC_KOI8R : + rc = normalize_koi8r(ctx, nstr); + break; + default : + rc = normalize_none(ctx, nstr); + break; + } + if (rc) { + grn_str_close(ctx, nstr); + return NULL; } return nstr; } Modified: lib/str.h (+0 -1) =================================================================== --- lib/str.h 2012-02-14 14:00:31 +0900 (bf98e59) +++ lib/str.h 2012-02-14 14:00:52 +0900 (6bf0ce0) @@ -80,7 +80,6 @@ grn_rc grn_substring(grn_ctx *ctx, char **str, char **str_end, int start, int en void grn_logger_fin(void); GRN_API int grn_charlen_(grn_ctx *ctx, const char *str, const char *end, grn_encoding encoding); -GRN_API int grn_str_charlen_utf8(grn_ctx *ctx, const unsigned char *str, const unsigned char *end); GRN_API grn_str *grn_str_open_(grn_ctx *ctx, const char *str, unsigned int str_len, int flags, grn_encoding encoding); #define GRN_BULK_INCR_LEN(buf,len) {\ Modified: lib/util.c (+0 -3) =================================================================== --- lib/util.c 2012-02-14 14:00:31 +0900 (4913edd) +++ lib/util.c 2012-02-14 14:00:52 +0900 (fad7124) @@ -214,9 +214,6 @@ grn_proc_inspect(grn_ctx *ctx, grn_obj *buf, grn_obj *obj) case GRN_PROC_HOOK : GRN_TEXT_PUTS(ctx, buf, "hook"); break; - case GRN_PROC_NORMALIZER : - GRN_TEXT_PUTS(ctx, buf, "normalizer"); - break; } GRN_TEXT_PUTS(ctx, buf, " "); Modified: test/unit/core/dat/test-dat.cpp (+0 -5) =================================================================== --- test/unit/core/dat/test-dat.cpp 2012-02-14 14:00:31 +0900 (9841566) +++ test/unit/core/dat/test-dat.cpp 2012-02-14 14:00:52 +0900 (1dce81d) @@ -72,7 +72,6 @@ namespace test_dat { const char *base_dir; grn_ctx ctx; - grn_obj *db; void cut_setup(void) { @@ -83,16 +82,12 @@ namespace test_dat g_mkdir_with_parents(base_dir, 0755); grn_ctx_init(&ctx, 0); - db = grn_db_create(&ctx, NULL, NULL); enter_api(&ctx); } void cut_teardown(void) { leave_api(&ctx); - if (db) { - grn_obj_unlink(&ctx, db); - } grn_ctx_fin(&ctx); if (base_dir) { Modified: test/unit/util/test-snip.c (+1 -7) =================================================================== --- test/unit/util/test-snip.c 2012-02-14 14:00:31 +0900 (8573e1f) +++ test/unit/util/test-snip.c 2012-02-14 14:00:52 +0900 (925431e) @@ -1,6 +1,6 @@ /* -*- c-basic-offset: 2; coding: utf-8 -*- */ /* - Copyright (C) 2008-2012 Kouhei Sutou <kou****@clear*****> + Copyright (C) 2008-2009 Kouhei Sutou <kou****@cozmi*****> This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -45,7 +45,6 @@ void test_add_cond_with_too_large_keyword(void); void test_add_cond_with_copy_tag_flag(void); static grn_ctx context; -static grn_obj *db; static grn_snip *snip; static gchar *keyword; static gchar *result; @@ -198,7 +197,6 @@ void cut_setup(void) { grn_ctx_init(&context, GRN_CTX_USE_QL); - db = grn_db_create(&context, NULL, NULL); snip = NULL; keyword = NULL; @@ -235,10 +233,6 @@ cut_teardown(void) g_free(default_close_tag); } - if (db) { - grn_obj_close(&context, db); - } - grn_ctx_fin(&context); } Modified: test/unit/util/test-string.c (+1 -4) =================================================================== --- test/unit/util/test-string.c 2012-02-14 14:00:31 +0900 (3e97014) +++ test/unit/util/test-string.c 2012-02-14 14:00:52 +0900 (2417060) @@ -1,6 +1,6 @@ /* -*- c-basic-offset: 2; coding: utf-8 -*- */ /* - Copyright (C) 2008-2012 Kouhei Sutou <kou****@clear*****> + Copyright (C) 2008-2011 Kouhei Sutou <kou****@clear*****> This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -53,7 +53,6 @@ void data_itoh(void); void test_itoh(gconstpointer data); static grn_ctx context; -static grn_obj *db; static grn_obj buffer; static const gchar text_ja_utf8[] = @@ -76,7 +75,6 @@ void setup (void) { grn_ctx_init(&context, GRN_CTX_USE_QL); - db = grn_db_create(&context, NULL, NULL); GRN_VOID_INIT(&buffer); } @@ -84,7 +82,6 @@ void teardown (void) { GRN_OBJ_FIN(&context, &buffer); - grn_obj_unlink(&context, db); grn_ctx_fin(&context); }