Kouhei Sutou
null+****@clear*****
Tue Nov 13 11:23:18 JST 2012
Kouhei Sutou 2012-11-13 11:23:18 +0900 (Tue, 13 Nov 2012) New Revision: bf674e7d2ddaed3758ed704ee5c6785b8bca2d60 https://github.com/groonga/groonga/commit/bf674e7d2ddaed3758ed704ee5c6785b8bca2d60 Log: Add grn_tokenizer_tokenized_delimiter_next() Modified files: include/groonga/tokenizer.h lib/tokenizer.c Modified: include/groonga/tokenizer.h (+13 -0) =================================================================== --- include/groonga/tokenizer.h 2012-11-13 11:22:32 +0900 (5298106) +++ include/groonga/tokenizer.h 2012-11-13 11:23:18 +0900 (3f270df) @@ -146,6 +146,19 @@ void grn_tokenizer_token_push(grn_ctx *ctx, grn_tokenizer_token *token, grn_tokenizer_status status); /* + grn_tokenizer_tokenized_delimiter_next() extracts the next token + from the string specified by `str_ptr' and `str_length' and pushes + the next token into `token'. It returns the string after the next + token. The returned string may be `NULL' when all tokens are + extracted. + */ +const char *grn_tokenizer_tokenized_delimiter_next(grn_ctx *ctx, + grn_tokenizer_token *token, + const char *str_ptr, + unsigned int str_length, + grn_encoding encoding); + +/* grn_tokenizer_register() registers a plugin to the database which is associated with `ctx'. `plugin_name_ptr' and `plugin_name_length' specify the plugin name. 
Alphabetic letters ('A'-'Z' and 'a'-'z'), digits ('0'-'9') and Modified: lib/tokenizer.c (+38 -0) =================================================================== --- lib/tokenizer.c 2012-11-13 11:22:32 +0900 (566d403) +++ lib/tokenizer.c 2012-11-13 11:23:18 +0900 (7bbcf7b) @@ -242,6 +242,44 @@ grn_tokenizer_token_push(grn_ctx *ctx, grn_tokenizer_token *token, grn_ctx_push(ctx, &token->status); } +const char * +grn_tokenizer_tokenized_delimiter_next(grn_ctx *ctx, + grn_tokenizer_token *token, + const char *str_ptr, + unsigned int str_length, + grn_encoding encoding) +{ + size_t char_length = 0; + const unsigned char *start = str_ptr; + const unsigned char *current; + const unsigned char *end = str_ptr + str_length; + const char *next_start = NULL; + unsigned int token_length; + grn_tokenizer_status status; + + for (current = start; current < end; current += char_length) { + char_length = grn_charlen_(ctx, current, end, encoding); + if (char_length == 0) { + break; + } + if (grn_tokenizer_is_tokenized_delimiter(ctx, current, char_length, + encoding)) { + next_start = str_ptr + (current - start + char_length); + break; + } + } + + token_length = current - start; + if (current == end) { + status = GRN_TOKENIZER_LAST; + } else { + status = GRN_TOKENIZER_CONTINUE; + } + grn_tokenizer_token_push(ctx, token, start, token_length, status); + + return next_start; +} + grn_rc grn_tokenizer_register(grn_ctx *ctx, const char *plugin_name_ptr, unsigned int plugin_name_length, -------------- next part -------------- HTMLの添付ファイルを保管しました...Download