[Groonga-commit] groonga/groonga at 57862cf [master] highlighter: add

Zurück zum Archiv-Index

Kouhei Sutou null+****@clear*****
Mon May 7 17:34:13 JST 2018


Kouhei Sutou	2018-05-07 17:34:13 +0900 (Mon, 07 May 2018)

  New Revision: 57862cfe8e79fd62918b9d68b986a44cf3bbcd45
  https://github.com/groonga/groonga/commit/57862cfe8e79fd62918b9d68b986a44cf3bbcd45

  Message:
    highlighter: add
    
    TODO:
    
      * Support non HTML mode
      * Support custom tag
      * Support lexicon mode

  Added files:
    include/groonga/highlighter.h
    lib/highlighter.c
  Modified files:
    include/groonga.h
    include/groonga/groonga.h
    lib/c_sources.am
    lib/db.c
    lib/proc/proc_highlight.c

  Modified: include/groonga.h (+1 -0)
===================================================================
--- include/groonga.h    2018-05-07 16:59:33 +0900 (dd465e3aa)
+++ include/groonga.h    2018-05-07 17:34:13 +0900 (c05ea8304)
@@ -35,6 +35,7 @@
 #include "groonga/file_reader.h"
 #include "groonga/geo.h"
 #include "groonga/hash.h"
+#include "groonga/highlighter.h"
 #include "groonga/id.h"
 #include "groonga/ii.h"
 #include "groonga/obj.h"

  Modified: include/groonga/groonga.h (+1 -0)
===================================================================
--- include/groonga/groonga.h    2018-05-07 16:59:33 +0900 (2415d68a4)
+++ include/groonga/groonga.h    2018-05-07 17:34:13 +0900 (1af9c3a11)
@@ -371,6 +371,7 @@ typedef uint32_t grn_column_flags;
 #define GRN_SNIP                       (0x0b)
 #define GRN_PATSNIP                    (0x0c)
 #define GRN_STRING                     (0x0d)
+#define GRN_HIGHLIGHTER                (0x0e)
 #define GRN_CURSOR_TABLE_HASH_KEY      (0x10)
 #define GRN_CURSOR_TABLE_PAT_KEY       (0x11)
 #define GRN_CURSOR_TABLE_DAT_KEY       (0x12)

  Added: include/groonga/highlighter.h (+54 -0) 100644
===================================================================
--- /dev/null
+++ include/groonga/highlighter.h    2018-05-07 17:34:13 +0900 (7b84c42e4)
@@ -0,0 +1,54 @@
+/*
+  Copyright(C) 2018 Brazil
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#pragma once
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+typedef struct _grn_highlighter grn_highlighter;
+
+GRN_API grn_highlighter *
+grn_highlighter_open(grn_ctx *ctx);
+GRN_API grn_rc
+grn_highlighter_close(grn_ctx *ctx,
+                      grn_highlighter *highlighter);
+GRN_API grn_rc
+grn_highlighter_highlight(grn_ctx *ctx,
+                          grn_highlighter *highlighter,
+                          const char *text,
+                          int64_t text_length,
+                          grn_obj *output);
+GRN_API grn_rc
+grn_highlighter_set_lexicon(grn_ctx *ctx,
+                            grn_highlighter *highlighter,
+                            grn_obj *lexicon);
+GRN_API grn_obj *
+grn_highlighter_get_lexicon(grn_ctx *ctx,
+                            grn_highlighter *highlighter);
+GRN_API grn_rc
+grn_highlighter_add_keyword(grn_ctx *ctx,
+                            grn_highlighter *highlighter,
+                            const char *keyword,
+                            int64_t keyword_length);
+
+
+#ifdef __cplusplus
+}
+#endif

  Modified: lib/c_sources.am (+1 -0)
===================================================================
--- lib/c_sources.am    2018-05-07 16:59:33 +0900 (d6aae25fe)
+++ lib/c_sources.am    2018-05-07 17:34:13 +0900 (3272940f3)
@@ -37,6 +37,7 @@ libgroonga_c_sources =				\
 	grn.h					\
 	hash.c					\
 	grn_hash.h				\
+	highlighter.c				\
 	id.c					\
 	ii.c					\
 	grn_ii.h				\

  Modified: lib/db.c (+3 -0)
===================================================================
--- lib/db.c    2018-05-07 16:59:33 +0900 (df7371f7b)
+++ lib/db.c    2018-05-07 17:34:13 +0900 (19d46d3eb)
@@ -11214,6 +11214,9 @@ grn_obj_close(grn_ctx *ctx, grn_obj *obj)
     case GRN_STRING :
       rc = grn_string_close(ctx, obj);
       break;
+    case GRN_HIGHLIGHTER :
+      rc = grn_highlighter_close(ctx, (grn_highlighter *)obj);
+      break;
     case GRN_CURSOR_TABLE_PAT_KEY :
       grn_pat_cursor_close(ctx, (grn_pat_cursor *)obj);
       break;

  Added: lib/highlighter.c (+345 -0) 100644
===================================================================
--- /dev/null
+++ lib/highlighter.c    2018-05-07 17:34:13 +0900 (f87d4e7e5)
@@ -0,0 +1,345 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+  Copyright(C) 2018 Brazil
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License version 2.1 as published by the Free Software Foundation.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#include "grn.h"
+#include "grn_pat.h"
+
+/*
+ * State of one highlighter object.
+ *
+ * Instances are allocated by grn_highlighter_open() and released by
+ * grn_highlighter_close(). Keywords are first collected in raw_keywords
+ * and are turned into a search structure by the next
+ * grn_highlighter_highlight() call.
+ *
+ * TODO:
+ *   * Support non HTML mode.
+ *   * Support custom tag.
+ */
+struct _grn_highlighter {
+  grn_obj_header header; /* header.type is GRN_HIGHLIGHTER */
+
+  grn_bool is_html_mode; /* GRN_TRUE by default; currently only consulted
+                          * when no keyword is registered */
+  grn_bool need_prepared; /* set when a keyword is added; checked before
+                           * highlighting */
+  grn_obj raw_keywords; /* text vector of the keywords added so far */
+
+  struct {
+    const char *open; /* written before each highlighted hit */
+    size_t open_length;
+    const char *close; /* written after each highlighted hit */
+    size_t close_length;
+  } tag;
+
+  /* For lexicon mode (not implemented yet). lexicon is stored as given to
+   * grn_highlighter_set_lexicon() and is not closed by
+   * grn_highlighter_close(). */
+  grn_obj *lexicon;
+  grn_obj keyword_token_ids;
+
+  /* For patricia trie mode: temporary table built from raw_keywords, used
+   * when lexicon is NULL and closed by grn_highlighter_close(). */
+  grn_obj *keywords;
+};
+
+/*
+ * Allocates a highlighter with the default configuration: HTML mode,
+ * no keywords, no lexicon and <span class="keyword">...</span> as the
+ * highlight tag.
+ *
+ * Returns NULL (with an error recorded in ctx) when the allocation
+ * fails. The result must be released with grn_highlighter_close().
+ */
+grn_highlighter *
+grn_highlighter_open(grn_ctx *ctx)
+{
+  grn_highlighter *self;
+  grn_obj_header *header;
+
+  GRN_API_ENTER;
+
+  self = GRN_MALLOCN(grn_highlighter, 1);
+  if (self == NULL) {
+    ERR(ctx->rc,
+        "[highlighter][open] failed to allocate memory: %s",
+        ctx->errbuf);
+    GRN_API_RETURN(NULL);
+  }
+
+  /* Object header: lets grn_obj_close() recognize the object. */
+  header = &(self->header);
+  header->type = GRN_HIGHLIGHTER;
+  header->impl_flags = 0;
+  header->flags = 0;
+  header->domain = GRN_ID_NIL;
+
+  /* Nothing is prepared yet; both search structures start empty. */
+  self->lexicon = NULL;
+  self->keywords = NULL;
+  self->is_html_mode = GRN_TRUE;
+  self->need_prepared = GRN_TRUE;
+
+  /* Default highlight tag. */
+  self->tag.open = "<span class=\"keyword\">";
+  self->tag.close = "</span>";
+  self->tag.open_length = strlen(self->tag.open);
+  self->tag.close_length = strlen(self->tag.close);
+
+  GRN_TEXT_INIT(&(self->raw_keywords), GRN_OBJ_VECTOR);
+  GRN_RECORD_INIT(&(self->keyword_token_ids), GRN_OBJ_VECTOR, GRN_ID_NIL);
+
+  GRN_API_RETURN(self);
+}
+
+/*
+ * Releases a highlighter created by grn_highlighter_open().
+ *
+ * The internal patricia trie (if any) and both keyword buffers are
+ * released together with the object itself. The lexicon is left
+ * untouched. Passing NULL is allowed and does nothing.
+ *
+ * Returns ctx->rc.
+ */
+grn_rc
+grn_highlighter_close(grn_ctx *ctx,
+                      grn_highlighter *highlighter)
+{
+  GRN_API_ENTER;
+
+  if (highlighter != NULL) {
+    grn_obj *trie = highlighter->keywords;
+
+    if (trie != NULL) {
+      grn_obj_close(ctx, trie);
+    }
+
+    GRN_OBJ_FIN(ctx, &(highlighter->keyword_token_ids));
+    GRN_OBJ_FIN(ctx, &(highlighter->raw_keywords));
+
+    GRN_FREE(highlighter);
+  }
+
+  GRN_API_RETURN(ctx->rc);
+}
+
+static void
+grn_highlighter_prepare_lexicon(grn_ctx *ctx,
+                                grn_highlighter *highlighter)
+{
+  /* TODO: lexicon mode is not implemented yet. This is the hook that
+   * grn_highlighter_prepare() calls when highlighter->lexicon is set;
+   * it currently does nothing. */
+}
+
+/*
+ * (Re)builds the internal patricia trie from highlighter->raw_keywords.
+ *
+ * Any previously built trie is closed first. The new trie is a
+ * temporary ShortText-keyed table normalized with NormalizerAuto.
+ * On failure an error is recorded in ctx and highlighter->keywords is
+ * left NULL.
+ */
+static void
+grn_highlighter_prepare_patricia_trie(grn_ctx *ctx,
+                                      grn_highlighter *highlighter)
+{
+  if (highlighter->keywords) {
+    grn_obj_close(ctx, highlighter->keywords);
+    /* Never keep a pointer to the closed table around. */
+    highlighter->keywords = NULL;
+  }
+
+  highlighter->keywords = grn_table_create(ctx,
+                                           NULL, 0,
+                                           NULL,
+                                           GRN_OBJ_TABLE_PAT_KEY,
+                                           grn_ctx_at(ctx, GRN_DB_SHORT_TEXT),
+                                           NULL);
+  if (!highlighter->keywords) {
+    grn_rc rc = ctx->rc;
+    if (rc == GRN_SUCCESS) {
+      rc = GRN_UNKNOWN_ERROR;
+    }
+    ERR(rc,
+        "[highlighter][prepare][no-lexicon] "
+        "failed to create an internal patricia trie: %s",
+        ctx->errbuf);
+    return;
+  }
+
+  {
+    grn_obj *normalizer;
+
+    normalizer = grn_ctx_get(ctx, "NormalizerAuto", -1);
+    grn_obj_set_info(ctx,
+                     highlighter->keywords,
+                     GRN_INFO_NORMALIZER,
+                     normalizer);
+    /* Release the reference taken by grn_ctx_get(), as the keyword-table
+     * code this function replaces in proc_highlight.c did. */
+    if (normalizer) {
+      grn_obj_unlink(ctx, normalizer);
+    }
+  }
+
+  {
+    unsigned int i, n;
+
+    n = grn_vector_size(ctx, &(highlighter->raw_keywords));
+    for (i = 0; i < n; i++) {
+      const char *keyword;
+      unsigned int keyword_size;
+
+      keyword_size = grn_vector_get_element(ctx,
+                                            &(highlighter->raw_keywords),
+                                            i,
+                                            &keyword,
+                                            NULL,
+                                            NULL);
+      grn_table_add(ctx,
+                    highlighter->keywords,
+                    keyword,
+                    keyword_size,
+                    NULL);
+    }
+  }
+}
+
+/*
+ * Builds the search structure for the current mode: the patricia trie
+ * when no lexicon is set, the lexicon based one otherwise.
+ */
+static void
+grn_highlighter_prepare(grn_ctx *ctx,
+                        grn_highlighter *highlighter)
+{
+  if (!highlighter->lexicon) {
+    grn_highlighter_prepare_patricia_trie(ctx, highlighter);
+    return;
+  }
+
+  grn_highlighter_prepare_lexicon(ctx, highlighter);
+}
+
+static void
+grn_highlighter_highlight_lexicon(grn_ctx *ctx,
+                                  grn_highlighter *highlighter,
+                                  const char *text,
+                                  size_t text_length,
+                                  grn_obj *output)
+{
+  /* TODO: lexicon mode is not implemented yet. This is the hook that
+   * grn_highlighter_highlight() calls when highlighter->lexicon is set;
+   * it currently writes nothing to output. */
+}
+
+/*
+ * Highlights text with the prepared patricia trie and appends the
+ * result to output.
+ *
+ * The text is scanned in chunks of at most MAX_N_HITS hits. Within a
+ * chunk, text between hits is XML-escaped as is, and each hit is
+ * XML-escaped and wrapped in highlighter->tag.open / tag.close.
+ *
+ * highlighter->keywords must already be prepared.
+ */
+static void
+grn_highlighter_highlight_patricia_trie(grn_ctx *ctx,
+                                        grn_highlighter *highlighter,
+                                        const char *text,
+                                        size_t text_length,
+                                        grn_obj *output)
+{
+  const char *current = text;
+  size_t current_length = text_length;
+
+  while (current_length > 0) {
+#define MAX_N_HITS 16
+    grn_pat_scan_hit hits[MAX_N_HITS];
+    const char *rest = NULL;
+    int i, n_hits;
+    size_t previous_length = 0;
+    size_t chunk_length;
+
+    n_hits = grn_pat_scan(ctx,
+                          (grn_pat *)(highlighter->keywords),
+                          current, current_length,
+                          hits, MAX_N_HITS,
+                          &rest);
+    for (i = 0; i < n_hits; i++) {
+      /* Compare instead of subtracting: the operands are unsigned, so
+       * "(a - b) > 0" is also true when a < b. */
+      if (hits[i].offset > previous_length) {
+        grn_text_escape_xml(ctx,
+                            output,
+                            current + previous_length,
+                            hits[i].offset - previous_length);
+      }
+      GRN_TEXT_PUT(ctx,
+                   output,
+                   highlighter->tag.open,
+                   highlighter->tag.open_length);
+      grn_text_escape_xml(ctx,
+                          output,
+                          current + hits[i].offset,
+                          hits[i].length);
+      GRN_TEXT_PUT(ctx,
+                   output,
+                   highlighter->tag.close,
+                   highlighter->tag.close_length);
+      previous_length = hits[i].offset + hits[i].length;
+    }
+
+    if (!rest || rest <= current) {
+      /* Defensive: the scan did not report any progress (presumably only
+       * possible when it failed). Flush what is left and stop instead of
+       * looping forever or reading an unset pointer. */
+      if (current_length > previous_length) {
+        grn_text_escape_xml(ctx,
+                            output,
+                            current + previous_length,
+                            current_length - previous_length);
+      }
+      break;
+    }
+
+    chunk_length = rest - current;
+    if (chunk_length > previous_length) {
+      /* Only emit up to the end of the scanned chunk. Text after "rest"
+       * is handled by the next iteration; emitting it here as well would
+       * duplicate it when the scan stopped at MAX_N_HITS. */
+      grn_text_escape_xml(ctx,
+                          output,
+                          current + previous_length,
+                          chunk_length - previous_length);
+    }
+    current_length -= chunk_length;
+    current = rest;
+#undef MAX_N_HITS
+  }
+}
+
+/*
+ * Appends text to output with every registered keyword highlighted.
+ *
+ * text_length may be negative, in which case text must be
+ * NUL-terminated and its length is computed with strlen().
+ *
+ * When no keyword is registered the text is copied as is: XML-escaped
+ * in HTML mode, raw otherwise. Otherwise the search structure is
+ * (re)built if it is stale and the text is highlighted with it.
+ *
+ * Returns ctx->rc.
+ */
+grn_rc
+grn_highlighter_highlight(grn_ctx *ctx,
+                          grn_highlighter *highlighter,
+                          const char *text,
+                          int64_t text_length,
+                          grn_obj *output)
+{
+  GRN_API_ENTER;
+
+  if (text_length < 0) {
+    text_length = strlen(text);
+  }
+
+  if (grn_vector_size(ctx, &(highlighter->raw_keywords)) == 0) {
+    if (highlighter->is_html_mode) {
+      grn_text_escape_xml(ctx,
+                          output,
+                          text,
+                          text_length);
+    } else {
+      GRN_TEXT_PUT(ctx, output, text, text_length);
+    }
+    goto exit;
+  }
+
+  /* Prepare only when something changed since the last preparation.
+   * The second condition covers switching back from lexicon mode to
+   * patricia trie mode before a trie was ever built. */
+  if (highlighter->need_prepared ||
+      (!highlighter->lexicon && !highlighter->keywords)) {
+    grn_highlighter_prepare(ctx, highlighter);
+    if (ctx->rc != GRN_SUCCESS) {
+      goto exit;
+    }
+    /* Without this the trie would be rebuilt on every call. */
+    highlighter->need_prepared = GRN_FALSE;
+  }
+
+  if (highlighter->lexicon) {
+    grn_highlighter_highlight_lexicon(ctx,
+                                      highlighter,
+                                      text,
+                                      text_length,
+                                      output);
+  } else {
+    grn_highlighter_highlight_patricia_trie(ctx,
+                                            highlighter,
+                                            text,
+                                            text_length,
+                                            output);
+  }
+
+exit :
+  GRN_API_RETURN(ctx->rc);
+}
+
+/*
+ * Sets the lexicon used for highlighting. NULL selects patricia trie
+ * mode. The lexicon is only referenced: grn_highlighter_close() does
+ * not close it.
+ *
+ * Returns ctx->rc.
+ */
+grn_rc
+grn_highlighter_set_lexicon(grn_ctx *ctx,
+                            grn_highlighter *highlighter,
+                            grn_obj *lexicon)
+{
+  GRN_API_ENTER;
+
+  if (highlighter->lexicon != lexicon) {
+    highlighter->lexicon = lexicon;
+    /* The lexicon decides which search structure is built, so anything
+     * prepared for the previous one is stale, just as after
+     * grn_highlighter_add_keyword(). */
+    highlighter->need_prepared = GRN_TRUE;
+  }
+
+  GRN_API_RETURN(ctx->rc);
+}
+
+/*
+ * Returns the lexicon set with grn_highlighter_set_lexicon(), or NULL
+ * when none is set.
+ */
+grn_obj *
+grn_highlighter_get_lexicon(grn_ctx *ctx,
+                            grn_highlighter *highlighter)
+{
+  grn_obj *lexicon;
+
+  GRN_API_ENTER;
+
+  lexicon = highlighter->lexicon;
+
+  GRN_API_RETURN(lexicon);
+}
+
+/*
+ * Registers one more keyword to highlight.
+ *
+ * keyword_length may be negative, in which case keyword must be
+ * NUL-terminated and its length is computed with strlen(). Empty
+ * keywords are ignored. Adding a keyword marks the highlighter as
+ * needing preparation before the next highlight.
+ *
+ * Returns ctx->rc.
+ */
+grn_rc
+grn_highlighter_add_keyword(grn_ctx *ctx,
+                            grn_highlighter *highlighter,
+                            const char *keyword,
+                            int64_t keyword_length)
+{
+  GRN_API_ENTER;
+
+  if (keyword_length < 0) {
+    keyword_length = strlen(keyword);
+  }
+
+  if (keyword_length != 0) {
+    grn_vector_add_element(ctx,
+                           &(highlighter->raw_keywords),
+                           keyword,
+                           keyword_length,
+                           0,
+                           GRN_DB_TEXT);
+    highlighter->need_prepared = GRN_TRUE;
+  }
+
+  GRN_API_RETURN(ctx->rc);
+}

  Modified: lib/proc/proc_highlight.c (+29 -40)
===================================================================
--- lib/proc/proc_highlight.c    2018-05-07 16:59:33 +0900 (67c6da485)
+++ lib/proc/proc_highlight.c    2018-05-07 17:34:13 +0900 (49e04401d)
@@ -1,6 +1,6 @@
 /* -*- c-basic-offset: 2 -*- */
 /*
-  Copyright(C) 2009-2016 Brazil
+  Copyright(C) 2009-2018 Brazil
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -394,24 +394,14 @@ grn_proc_init_highlight_full(grn_ctx *ctx)
                   func_highlight_full, NULL, NULL, 0, NULL);
 }
 
-static grn_obj *
-func_highlight_html_create_keywords_table(grn_ctx *ctx, grn_obj *expression)
+static grn_highlighter *
+func_highlight_html_create_highlighter(grn_ctx *ctx, grn_obj *expression)
 {
-  grn_obj *keywords;
+  grn_highlighter *highlighter;
   grn_obj *condition_ptr = NULL;
   grn_obj *condition = NULL;
 
-  keywords = grn_table_create(ctx, NULL, 0, NULL,
-                              GRN_OBJ_TABLE_PAT_KEY,
-                              grn_ctx_at(ctx, GRN_DB_SHORT_TEXT),
-                              NULL);
-
-  {
-    grn_obj *normalizer;
-    normalizer = grn_ctx_get(ctx, "NormalizerAuto", -1);
-    grn_obj_set_info(ctx, keywords, GRN_INFO_NORMALIZER, normalizer);
-    grn_obj_unlink(ctx, normalizer);
-  }
+  highlighter = grn_highlighter_open(ctx);
 
   condition_ptr = grn_expr_get_var(ctx, expression,
                                    GRN_SELECT_INTERNAL_VAR_CONDITION,
@@ -436,16 +426,15 @@ func_highlight_html_create_keywords_table(grn_ctx *ctx, grn_obj *expression)
                                             &keyword,
                                             NULL,
                                             NULL);
-      grn_table_add(ctx,
-                    keywords,
-                    keyword,
-                    keyword_size,
-                    NULL);
+      grn_highlighter_add_keyword(ctx,
+                                  highlighter,
+                                  keyword,
+                                  keyword_size);
     }
     GRN_OBJ_FIN(ctx, &current_keywords);
   }
 
-  return keywords;
+  return highlighter;
 }
 
 static grn_obj *
@@ -458,35 +447,35 @@ func_highlight_html(grn_ctx *ctx, int nargs, grn_obj **args,
   if (nargs == N_REQUIRED_ARGS) {
     grn_obj *string = args[0];
     grn_obj *expression = NULL;
-    grn_obj *keywords;
-    grn_obj *keywords_ptr;
-    grn_bool use_html_escape = GRN_TRUE;
+    grn_highlighter *highlighter;
+    grn_obj *highlighter_ptr;
 
     grn_proc_get_info(ctx, user_data, NULL, NULL, &expression);
 
-    keywords_ptr = grn_expr_get_var(ctx, expression,
-                                    GRN_FUNC_HIGHLIGHT_HTML_CACHE_NAME,
-                                    strlen(GRN_FUNC_HIGHLIGHT_HTML_CACHE_NAME));
-    if (keywords_ptr) {
-      keywords = GRN_PTR_VALUE(keywords_ptr);
+    highlighter_ptr =
+      grn_expr_get_var(ctx, expression,
+                       GRN_FUNC_HIGHLIGHT_HTML_CACHE_NAME,
+                       strlen(GRN_FUNC_HIGHLIGHT_HTML_CACHE_NAME));
+    if (highlighter_ptr) {
+      highlighter = (grn_highlighter *)GRN_PTR_VALUE(highlighter_ptr);
     } else {
-      keywords_ptr =
+      highlighter_ptr =
         grn_expr_get_or_add_var(ctx, expression,
                                 GRN_FUNC_HIGHLIGHT_HTML_CACHE_NAME,
                                 strlen(GRN_FUNC_HIGHLIGHT_HTML_CACHE_NAME));
-      GRN_OBJ_FIN(ctx, keywords_ptr);
-      GRN_PTR_INIT(keywords_ptr, GRN_OBJ_OWN, GRN_DB_OBJECT);
+      GRN_OBJ_FIN(ctx, highlighter_ptr);
+      GRN_PTR_INIT(highlighter_ptr, GRN_OBJ_OWN, GRN_DB_OBJECT);
 
-      keywords = func_highlight_html_create_keywords_table(ctx, expression);
-      GRN_PTR_SET(ctx, keywords_ptr, keywords);
+      highlighter = func_highlight_html_create_highlighter(ctx, expression);
+      GRN_PTR_SET(ctx, highlighter_ptr, highlighter);
     }
 
-    highlighted = highlight_keywords(ctx, user_data,
-                                     string, keywords, use_html_escape,
-                                     "<span class=\"keyword\">",
-                                     strlen("<span class=\"keyword\">"),
-                                     "</span>",
-                                     strlen("</span>"));
+    highlighted = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_TEXT, 0);
+    grn_highlighter_highlight(ctx,
+                              highlighter,
+                              GRN_TEXT_VALUE(string),
+                              GRN_TEXT_LEN(string),
+                              highlighted);
   }
 #undef N_REQUIRED_ARGS
 
-------------- next part --------------
HTMLの添付ファイルを保管しました...
URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180507/e3372d38/attachment-0001.htm 



More information about the Groonga-commit mailing list
Zurück zum Archiv-Index