Kouhei Sutou
null+****@clear*****
Mon May 7 17:34:13 JST 2018
Kouhei Sutou 2018-05-07 17:34:13 +0900 (Mon, 07 May 2018) New Revision: 57862cfe8e79fd62918b9d68b986a44cf3bbcd45 https://github.com/groonga/groonga/commit/57862cfe8e79fd62918b9d68b986a44cf3bbcd45 Message: highlighter: add TODO: * Support non HTML mode * Support custom tag * Support lexicon mode Added files: include/groonga/highlighter.h lib/highlighter.c Modified files: include/groonga.h include/groonga/groonga.h lib/c_sources.am lib/db.c lib/proc/proc_highlight.c Modified: include/groonga.h (+1 -0) =================================================================== --- include/groonga.h 2018-05-07 16:59:33 +0900 (dd465e3aa) +++ include/groonga.h 2018-05-07 17:34:13 +0900 (c05ea8304) @@ -35,6 +35,7 @@ #include "groonga/file_reader.h" #include "groonga/geo.h" #include "groonga/hash.h" +#include "groonga/highlighter.h" #include "groonga/id.h" #include "groonga/ii.h" #include "groonga/obj.h" Modified: include/groonga/groonga.h (+1 -0) =================================================================== --- include/groonga/groonga.h 2018-05-07 16:59:33 +0900 (2415d68a4) +++ include/groonga/groonga.h 2018-05-07 17:34:13 +0900 (1af9c3a11) @@ -371,6 +371,7 @@ typedef uint32_t grn_column_flags; #define GRN_SNIP (0x0b) #define GRN_PATSNIP (0x0c) #define GRN_STRING (0x0d) +#define GRN_HIGHLIGHTER (0x0e) #define GRN_CURSOR_TABLE_HASH_KEY (0x10) #define GRN_CURSOR_TABLE_PAT_KEY (0x11) #define GRN_CURSOR_TABLE_DAT_KEY (0x12) Added: include/groonga/highlighter.h (+54 -0) 100644 =================================================================== --- /dev/null +++ include/groonga/highlighter.h 2018-05-07 17:34:13 +0900 (7b84c42e4) @@ -0,0 +1,54 @@ +/* + Copyright(C) 2018 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _grn_highlighter grn_highlighter; + +GRN_API grn_highlighter * +grn_highlighter_open(grn_ctx *ctx); +GRN_API grn_rc +grn_highlighter_close(grn_ctx *ctx, + grn_highlighter *highlighter); +GRN_API grn_rc +grn_highlighter_highlight(grn_ctx *ctx, + grn_highlighter *highlighter, + const char *text, + int64_t text_length, + grn_obj *output); +GRN_API grn_rc +grn_highlighter_set_lexicon(grn_ctx *ctx, + grn_highlighter *highlighter, + grn_obj *lexicon); +GRN_API grn_obj * +grn_highlighter_get_lexicon(grn_ctx *ctx, + grn_highlighter *highlighter); +GRN_API grn_rc +grn_highlighter_add_keyword(grn_ctx *ctx, + grn_highlighter *highlighter, + const char *keyword, + int64_t keyword_length); + + +#ifdef __cplusplus +} +#endif Modified: lib/c_sources.am (+1 -0) =================================================================== --- lib/c_sources.am 2018-05-07 16:59:33 +0900 (d6aae25fe) +++ lib/c_sources.am 2018-05-07 17:34:13 +0900 (3272940f3) @@ -37,6 +37,7 @@ libgroonga_c_sources = \ grn.h \ hash.c \ grn_hash.h \ + highlighter.c \ id.c \ ii.c \ grn_ii.h \ Modified: lib/db.c (+3 -0) =================================================================== --- lib/db.c 2018-05-07 16:59:33 +0900 (df7371f7b) +++ lib/db.c 2018-05-07 17:34:13 +0900 (19d46d3eb) @@ -11214,6 +11214,9 @@ grn_obj_close(grn_ctx *ctx, grn_obj *obj) case GRN_STRING : rc = grn_string_close(ctx, obj); break; + case GRN_HIGHLIGHTER : + rc = grn_highlighter_close(ctx, 
(grn_highlighter *)obj); + break; case GRN_CURSOR_TABLE_PAT_KEY : grn_pat_cursor_close(ctx, (grn_pat_cursor *)obj); break; Added: lib/highlighter.c (+345 -0) 100644 =================================================================== --- /dev/null +++ lib/highlighter.c 2018-05-07 17:34:13 +0900 (f87d4e7e5) @@ -0,0 +1,345 @@ +/* -*- c-basic-offset: 2 -*- */ +/* + Copyright(C) 2018 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include "grn.h" +#include "grn_pat.h" + +/* + * TODO: + * * Support non HTML mode. + * * Support custom tag. 
+ */ +struct _grn_highlighter { + grn_obj_header header; + + grn_bool is_html_mode; + grn_bool need_prepared; + grn_obj raw_keywords; + + struct { + const char *open; + size_t open_length; + const char *close; + size_t close_length; + } tag; + + /* For lexicon mode */ + grn_obj *lexicon; + grn_obj keyword_token_ids; + + /* For patricia trie mode */ + grn_obj *keywords; +}; + +grn_highlighter * +grn_highlighter_open(grn_ctx *ctx) +{ + grn_highlighter *highlighter; + + GRN_API_ENTER; + + highlighter = GRN_MALLOCN(grn_highlighter, 1); + if (!highlighter) { + ERR(ctx->rc, + "[highlighter][open] failed to allocate memory: %s", + ctx->errbuf); + GRN_API_RETURN(NULL); + } + + highlighter->header.type = GRN_HIGHLIGHTER; + highlighter->header.impl_flags = 0; + highlighter->header.flags = 0; + highlighter->header.domain = GRN_ID_NIL; + + highlighter->is_html_mode = GRN_TRUE; + highlighter->need_prepared = GRN_TRUE; + GRN_TEXT_INIT(&(highlighter->raw_keywords), GRN_OBJ_VECTOR); + + highlighter->tag.open = "<span class=\"keyword\">"; + highlighter->tag.open_length = strlen(highlighter->tag.open); + highlighter->tag.close = "</span>"; + highlighter->tag.close_length = strlen(highlighter->tag.close); + + highlighter->lexicon = NULL; + GRN_RECORD_INIT(&(highlighter->keyword_token_ids), GRN_OBJ_VECTOR, GRN_ID_NIL); + + highlighter->keywords = NULL; + + GRN_API_RETURN(highlighter); +} + +grn_rc +grn_highlighter_close(grn_ctx *ctx, + grn_highlighter *highlighter) +{ + GRN_API_ENTER; + + if (!highlighter) { + GRN_API_RETURN(ctx->rc); + } + + if (highlighter->keywords) { + grn_obj_close(ctx, highlighter->keywords); + } + + GRN_OBJ_FIN(ctx, &(highlighter->keyword_token_ids)); + + GRN_OBJ_FIN(ctx, &(highlighter->raw_keywords)); + GRN_FREE(highlighter); + + GRN_API_RETURN(ctx->rc); +} + +static void +grn_highlighter_prepare_lexicon(grn_ctx *ctx, + grn_highlighter *highlighter) +{ + /* TODO */ +} + +static void +grn_highlighter_prepare_patricia_trie(grn_ctx *ctx, + grn_highlighter 
*highlighter) +{ + if (highlighter->keywords) { + grn_obj_close(ctx, highlighter->keywords); + } + + highlighter->keywords = grn_table_create(ctx, + NULL, 0, + NULL, + GRN_OBJ_TABLE_PAT_KEY, + grn_ctx_at(ctx, GRN_DB_SHORT_TEXT), + NULL); + if (!highlighter->keywords) { + grn_rc rc = ctx->rc; + if (rc == GRN_SUCCESS) { + rc = GRN_UNKNOWN_ERROR; + } + ERR(rc, + "[highlighter][prepare][no-lexicon] " + "failed to create an internal patricia trie: %s", + ctx->errbuf); + return; + } + + grn_obj_set_info(ctx, + highlighter->keywords, + GRN_INFO_NORMALIZER, + grn_ctx_get(ctx, "NormalizerAuto", -1)); + + { + unsigned int i, n; + + n = grn_vector_size(ctx, &(highlighter->raw_keywords)); + for (i = 0; i < n; i++) { + const char *keyword; + unsigned int keyword_size; + + keyword_size = grn_vector_get_element(ctx, + &(highlighter->raw_keywords), + i, + &keyword, + NULL, + NULL); + grn_table_add(ctx, + highlighter->keywords, + keyword, + keyword_size, + NULL); + } + } +} + +static void +grn_highlighter_prepare(grn_ctx *ctx, + grn_highlighter *highlighter) +{ + if (highlighter->lexicon) { + grn_highlighter_prepare_lexicon(ctx, highlighter); + } else { + grn_highlighter_prepare_patricia_trie(ctx, highlighter); + } +} + +static void +grn_highlighter_highlight_lexicon(grn_ctx *ctx, + grn_highlighter *highlighter, + const char *text, + size_t text_length, + grn_obj *output) +{ + /* TODO */ +} + +static void +grn_highlighter_highlight_patricia_trie(grn_ctx *ctx, + grn_highlighter *highlighter, + const char *text, + size_t text_length, + grn_obj *output) +{ + const char *current = text; + size_t current_length = text_length; + + while (current_length > 0) { +#define MAX_N_HITS 16 + grn_pat_scan_hit hits[MAX_N_HITS]; + const char *rest; + int i, n_hits; + size_t previous_length = 0; + size_t chunk_length; + + n_hits = grn_pat_scan(ctx, + (grn_pat *)(highlighter->keywords), + current, current_length, + hits, MAX_N_HITS, + &rest); + for (i = 0; i < n_hits; i++) { + if ((hits[i].offset - 
previous_length) > 0) { + grn_text_escape_xml(ctx, + output, + current + previous_length, + hits[i].offset - previous_length); + } + GRN_TEXT_PUT(ctx, + output, + highlighter->tag.open, + highlighter->tag.open_length); + grn_text_escape_xml(ctx, + output, + current + hits[i].offset, + hits[i].length); + GRN_TEXT_PUT(ctx, + output, + highlighter->tag.close, + highlighter->tag.close_length); + previous_length = hits[i].offset + hits[i].length; + } + + chunk_length = rest - current; + if ((chunk_length - previous_length) > 0) { + grn_text_escape_xml(ctx, + output, + current + previous_length, + current_length - previous_length); + } + current_length -= chunk_length; + current = rest; +#undef MAX_N_HITS + } +} + +grn_rc +grn_highlighter_highlight(grn_ctx *ctx, + grn_highlighter *highlighter, + const char *text, + int64_t text_length, + grn_obj *output) +{ + GRN_API_ENTER; + + if (text_length < 0) { + text_length = strlen(text); + } + + if (grn_vector_size(ctx, &(highlighter->raw_keywords)) == 0) { + if (highlighter->is_html_mode) { + grn_text_escape_xml(ctx, + output, + text, + text_length); + } else { + GRN_TEXT_PUT(ctx, output, text, text_length); + } + goto exit; + } + + if (highlighter->need_prepared) { + grn_highlighter_prepare(ctx, highlighter); + if (ctx->rc != GRN_SUCCESS) { + goto exit; + } + } + + if (highlighter->lexicon) { + grn_highlighter_highlight_lexicon(ctx, + highlighter, + text, + text_length, + output); + } else { + grn_highlighter_highlight_patricia_trie(ctx, + highlighter, + text, + text_length, + output); + } + +exit : + GRN_API_RETURN(ctx->rc); +} + +grn_rc +grn_highlighter_set_lexicon(grn_ctx *ctx, + grn_highlighter *highlighter, + grn_obj *lexicon) +{ + GRN_API_ENTER; + + highlighter->lexicon = lexicon; + + GRN_API_RETURN(ctx->rc); +} + +grn_obj * +grn_highlighter_get_lexicon(grn_ctx *ctx, + grn_highlighter *highlighter) +{ + GRN_API_ENTER; + + GRN_API_RETURN(highlighter->lexicon); +} + +grn_rc +grn_highlighter_add_keyword(grn_ctx *ctx, + 
grn_highlighter *highlighter, + const char *keyword, + int64_t keyword_length) +{ + GRN_API_ENTER; + + if (keyword_length < 0) { + keyword_length = strlen(keyword); + } + + if (keyword_length == 0) { + goto exit; + } + + grn_vector_add_element(ctx, + &(highlighter->raw_keywords), + keyword, + keyword_length, + 0, + GRN_DB_TEXT); + highlighter->need_prepared = GRN_TRUE; + +exit : + GRN_API_RETURN(ctx->rc); +} Modified: lib/proc/proc_highlight.c (+29 -40) =================================================================== --- lib/proc/proc_highlight.c 2018-05-07 16:59:33 +0900 (67c6da485) +++ lib/proc/proc_highlight.c 2018-05-07 17:34:13 +0900 (49e04401d) @@ -1,6 +1,6 @@ /* -*- c-basic-offset: 2 -*- */ /* - Copyright(C) 2009-2016 Brazil + Copyright(C) 2009-2018 Brazil This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -394,24 +394,14 @@ grn_proc_init_highlight_full(grn_ctx *ctx) func_highlight_full, NULL, NULL, 0, NULL); } -static grn_obj * -func_highlight_html_create_keywords_table(grn_ctx *ctx, grn_obj *expression) +static grn_highlighter * +func_highlight_html_create_highlighter(grn_ctx *ctx, grn_obj *expression) { - grn_obj *keywords; + grn_highlighter *highlighter; grn_obj *condition_ptr = NULL; grn_obj *condition = NULL; - keywords = grn_table_create(ctx, NULL, 0, NULL, - GRN_OBJ_TABLE_PAT_KEY, - grn_ctx_at(ctx, GRN_DB_SHORT_TEXT), - NULL); - - { - grn_obj *normalizer; - normalizer = grn_ctx_get(ctx, "NormalizerAuto", -1); - grn_obj_set_info(ctx, keywords, GRN_INFO_NORMALIZER, normalizer); - grn_obj_unlink(ctx, normalizer); - } + highlighter = grn_highlighter_open(ctx); condition_ptr = grn_expr_get_var(ctx, expression, GRN_SELECT_INTERNAL_VAR_CONDITION, @@ -436,16 +426,15 @@ func_highlight_html_create_keywords_table(grn_ctx *ctx, grn_obj *expression) &keyword, NULL, NULL); - grn_table_add(ctx, - keywords, - keyword, - keyword_size, - NULL); + grn_highlighter_add_keyword(ctx, + highlighter, 
+ keyword, + keyword_size); } GRN_OBJ_FIN(ctx, &current_keywords); } - return keywords; + return highlighter; } static grn_obj * @@ -458,35 +447,35 @@ func_highlight_html(grn_ctx *ctx, int nargs, grn_obj **args, if (nargs == N_REQUIRED_ARGS) { grn_obj *string = args[0]; grn_obj *expression = NULL; - grn_obj *keywords; - grn_obj *keywords_ptr; - grn_bool use_html_escape = GRN_TRUE; + grn_highlighter *highlighter; + grn_obj *highlighter_ptr; grn_proc_get_info(ctx, user_data, NULL, NULL, &expression); - keywords_ptr = grn_expr_get_var(ctx, expression, - GRN_FUNC_HIGHLIGHT_HTML_CACHE_NAME, - strlen(GRN_FUNC_HIGHLIGHT_HTML_CACHE_NAME)); - if (keywords_ptr) { - keywords = GRN_PTR_VALUE(keywords_ptr); + highlighter_ptr = + grn_expr_get_var(ctx, expression, + GRN_FUNC_HIGHLIGHT_HTML_CACHE_NAME, + strlen(GRN_FUNC_HIGHLIGHT_HTML_CACHE_NAME)); + if (highlighter_ptr) { + highlighter = (grn_highlighter *)GRN_PTR_VALUE(highlighter_ptr); } else { - keywords_ptr = + highlighter_ptr = grn_expr_get_or_add_var(ctx, expression, GRN_FUNC_HIGHLIGHT_HTML_CACHE_NAME, strlen(GRN_FUNC_HIGHLIGHT_HTML_CACHE_NAME)); - GRN_OBJ_FIN(ctx, keywords_ptr); - GRN_PTR_INIT(keywords_ptr, GRN_OBJ_OWN, GRN_DB_OBJECT); + GRN_OBJ_FIN(ctx, highlighter_ptr); + GRN_PTR_INIT(highlighter_ptr, GRN_OBJ_OWN, GRN_DB_OBJECT); - keywords = func_highlight_html_create_keywords_table(ctx, expression); - GRN_PTR_SET(ctx, keywords_ptr, keywords); + highlighter = func_highlight_html_create_highlighter(ctx, expression); + GRN_PTR_SET(ctx, highlighter_ptr, highlighter); } - highlighted = highlight_keywords(ctx, user_data, - string, keywords, use_html_escape, - "<span class=\"keyword\">", - strlen("<span class=\"keyword\">"), - "</span>", - strlen("</span>")); + highlighted = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_TEXT, 0); + grn_highlighter_highlight(ctx, + highlighter, + GRN_TEXT_VALUE(string), + GRN_TEXT_LEN(string), + highlighted); } #undef N_REQUIRED_ARGS -------------- next part -------------- 
HTMLの添付ファイルを保管しました... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180507/e3372d38/attachment-0001.htm