[Groonga-commit] groonga/groonga at a3599bf [master] ii: support offline index construction with tokenizer options

Zurück zum Archiv-Index

Kouhei Sutou null+****@clear*****
Wed Apr 25 15:07:54 JST 2018


Kouhei Sutou	2018-04-25 15:07:54 +0900 (Wed, 25 Apr 2018)

  New Revision: a3599bf4cfe765f5d781ec2e78ae82699d162462
  https://github.com/groonga/groonga/commit/a3599bf4cfe765f5d781ec2e78ae82699d162462

  Message:
    ii: support offline index construction with tokenizer options

  Added files:
    test/command/suite/table_create/default_tokenizer/ngram/options/offline.expected
    test/command/suite/table_create/default_tokenizer/ngram/options/offline.test
  Modified files:
    lib/ii.c

  Modified: lib/ii.c (+24 -12)
===================================================================
--- lib/ii.c    2018-04-25 13:06:13 +0900 (f0b3c7f84)
+++ lib/ii.c    2018-04-25 15:07:54 +0900 (0b767adb2)
@@ -11801,7 +11801,7 @@ typedef struct {
   uint64_t sid_mask;   /* Mask bits for section ID */
 
   grn_obj  *lexicon;    /* Block lexicon (to be closed) */
-  grn_obj  *tokenizer;  /* Lexicon's tokenizer */
+  grn_bool have_tokenizer;  /* Whether lexicon has tokenizer */
   grn_obj  *normalizer; /* Lexicon's normalzier */
 
   uint32_t n;   /* Number of integers appended to the current block */
@@ -11855,7 +11855,7 @@ grn_ii_builder_init(grn_ctx *ctx, grn_ii_builder *builder,
   builder->sid_mask = 0;
 
   builder->lexicon = NULL;
-  builder->tokenizer = NULL;
+  builder->have_tokenizer = GRN_FALSE;
   builder->normalizer = NULL;
 
   builder->n = 0;
@@ -11991,9 +11991,11 @@ grn_ii_builder_create_lexicon(grn_ctx *ctx, grn_ii_builder *builder)
   grn_table_flags flags;
   grn_obj *domain = grn_ctx_at(ctx, builder->ii->lexicon->header.domain);
   grn_obj *range = grn_ctx_at(ctx, DB_OBJ(builder->ii->lexicon)->range);
-  grn_obj *tokenizer, *normalizer, *token_filters;
-  grn_rc rc = grn_table_get_info(ctx, builder->ii->lexicon, &flags, NULL,
-                                 &tokenizer, &normalizer, &token_filters);
+  grn_obj *normalizer, *token_filters;
+  grn_rc rc;
+
+  rc = grn_table_get_info(ctx, builder->ii->lexicon, &flags, NULL,
+                          NULL, &normalizer, &token_filters);
   if (rc != GRN_SUCCESS) {
     return rc;
   }
@@ -12006,11 +12008,21 @@ grn_ii_builder_create_lexicon(grn_ctx *ctx, grn_ii_builder *builder)
     }
     return ctx->rc;
   }
-  builder->tokenizer = tokenizer;
-  builder->normalizer = normalizer;
-  rc = grn_obj_set_info(ctx, builder->lexicon,
-                        GRN_INFO_DEFAULT_TOKENIZER, tokenizer);
+  {
+    grn_obj tokenizer;
+    GRN_TEXT_INIT(&tokenizer, 0);
+    grn_table_get_default_tokenizer_string(ctx,
+                                           builder->ii->lexicon,
+                                           &tokenizer);
+    if (GRN_TEXT_LEN(&tokenizer) > 0) {
+      builder->have_tokenizer = GRN_TRUE;
+      rc = grn_obj_set_info(ctx, builder->lexicon,
+                            GRN_INFO_DEFAULT_TOKENIZER, &tokenizer);
+    }
+    GRN_OBJ_FIN(ctx, &tokenizer);
+  }
   if (rc == GRN_SUCCESS) {
+    builder->normalizer = normalizer;
     rc = grn_obj_set_info(ctx, builder->lexicon,
                           GRN_INFO_NORMALIZER, normalizer);
     if (rc == GRN_SUCCESS) {
@@ -12433,7 +12445,7 @@ grn_ii_builder_append_value(grn_ctx *ctx, grn_ii_builder *builder,
     builder->pos++;
   }
   if (value_size) {
-    if (!builder->tokenizer && !builder->normalizer) {
+    if (!builder->have_tokenizer && !builder->normalizer) {
       grn_id tid;
       switch (builder->lexicon->header.type) {
       case GRN_TABLE_PAT_KEY :
@@ -12534,7 +12546,7 @@ grn_ii_builder_append_obj(grn_ctx *ctx, grn_ii_builder *builder,
           continue;
         }
         if ((builder->ii->header->flags & GRN_OBJ_WITH_SECTION) &&
-            builder->tokenizer) {
+            builder->have_tokenizer) {
           sid = i + 1;
         }
         rc = grn_ii_builder_append_value(ctx, builder, rid, sid, sec->weight,
@@ -12654,7 +12666,7 @@ static grn_rc
 grn_ii_builder_set_sid_bits(grn_ctx *ctx, grn_ii_builder *builder)
 {
   /* Calculate the number of bits required to represent a section ID. */
-  if (builder->n_srcs == 1 && builder->tokenizer &&
+  if (builder->n_srcs == 1 && builder->have_tokenizer &&
       (builder->srcs[0]->header.flags & GRN_OBJ_COLUMN_VECTOR) != 0) {
     /* If the source column is a vector column and the index has a tokenizer, */
     /* the maximum sid equals to the maximum number of elements. */

  Added: test/command/suite/table_create/default_tokenizer/ngram/options/offline.expected (+98 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_create/default_tokenizer/ngram/options/offline.expected    2018-04-25 15:07:54 +0900 (7ce06c019)
@@ -0,0 +1,98 @@
+table_create Memos TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Memos content COLUMN_SCALAR Text
+[[0,0.0,0.0],true]
+load --table Memos
+[
+{"content": "This is a pen."},
+{"content": "これはペンです。"}
+]
+[[0,0.0,0.0],2]
+table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer 'TokenNgram("n", 3)'
+[[0,0.0,0.0],true]
+column_create Terms memos_content COLUMN_INDEX Memos content
+[[0,0.0,0.0],true]
+select Terms --output_columns _key --limit -1
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        21
+      ],
+      [
+        [
+          "_key",
+          "ShortText"
+        ]
+      ],
+      [
+        " a "
+      ],
+      [
+        " is"
+      ],
+      [
+        " pe"
+      ],
+      [
+        "."
+      ],
+      [
+        "Thi"
+      ],
+      [
+        "a p"
+      ],
+      [
+        "en."
+      ],
+      [
+        "his"
+      ],
+      [
+        "is "
+      ],
+      [
+        "n."
+      ],
+      [
+        "pen"
+      ],
+      [
+        "s a"
+      ],
+      [
+        "s i"
+      ],
+      [
+        "。"
+      ],
+      [
+        "これは"
+      ],
+      [
+        "す。"
+      ],
+      [
+        "です。"
+      ],
+      [
+        "はペン"
+      ],
+      [
+        "れはペ"
+      ],
+      [
+        "ペンで"
+      ],
+      [
+        "ンです"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/table_create/default_tokenizer/ngram/options/offline.test (+14 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/table_create/default_tokenizer/ngram/options/offline.test    2018-04-25 15:07:54 +0900 (6253854b1)
@@ -0,0 +1,14 @@
+table_create Memos TABLE_NO_KEY
+column_create Memos content COLUMN_SCALAR Text
+
+load --table Memos
+[
+{"content": "This is a pen."},
+{"content": "これはペンです。"}
+]
+
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer 'TokenNgram("n", 3)'
+column_create Terms memos_content COLUMN_INDEX Memos content
+
+select Terms --output_columns _key --limit -1
-------------- next part --------------
HTML����������������������������...
URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180425/502466bd/attachment-0001.htm 



More information about the Groonga-commit mailing list
Zurück zum Archiv-Index