groonga/groonga at e30e135 [master] TokenNgram: add "loose_symbol" option (Groonga-commit) - Groonga - fulltext search engine.

Kouhei Sutou	2018-04-06 15:30:48 +0900 (Fri, 06 Apr 2018)

  New Revision: e30e135a10fc5b2d0169a5738929ed1194e77e16
  https://github.com/groonga/groonga/commit/e30e135a10fc5b2d0169a5738929ed1194e77e16

  Message:
    TokenNgram: add "loose_symbol" option

  Added files:
    test/command/suite/tokenizers/ngram/loose_symbol/add.expected
    test/command/suite/tokenizers/ngram/loose_symbol/add.test
    test/command/suite/tokenizers/ngram/loose_symbol/get.expected
    test/command/suite/tokenizers/ngram/loose_symbol/get.test
    test/command/suite/tokenizers/ngram/n.expected
    test/command/suite/tokenizers/ngram/n.test
  Copied files:
    test/command/suite/table_create/default_tokenizer/ngram/options/multiple.expected
      (from test/command/suite/table_create/default_tokenizer/ngram/options/n.expected)
    test/command/suite/table_create/default_tokenizer/ngram/options/multiple.test
      (from test/command/suite/table_create/default_tokenizer/ngram/options/n.test)
  Modified files:
    lib/tokenizers.c
  Renamed files:
    test/command/suite/table_create/default_tokenizer/ngram/options/one.expected
      (from test/command/suite/table_create/default_tokenizer/ngram/options/n.expected)
    test/command/suite/table_create/default_tokenizer/ngram/options/one.test
      (from test/command/suite/table_create/default_tokenizer/ngram/options/n.test)

  Modified: lib/tokenizers.c (+126 -11)
===================================================================

--- lib/tokenizers.c    2018-04-06 15:29:18 +0900 (f96640b63)
+++ lib/tokenizers.c    2018-04-06 15:30:48 +0900 (998b3ffca)
@@ -246,6 +246,7 @@ typedef struct {
   grn_bool uni_digit;
   grn_bool uni_symbol;
   grn_bool ignore_blank;
+  grn_bool loose_symbol;
 } grn_ngram_options;
 
 typedef struct {
@@ -253,6 +254,13 @@ typedef struct {
   grn_tokenizer_query *query;
   grn_ngram_options options;
   grn_bool overlap;
+  struct {
+    grn_bool ing;
+    grn_bool need;
+    grn_bool need_end_mark;
+    grn_obj text;
+    uint_least8_t *ctypes;
+  } loose;
   int32_t pos;
   uint32_t skip;
   const unsigned char *next;
@@ -270,6 +278,69 @@ ngram_options_init(grn_ngram_options *options, uint8_t unit)
   options->uni_digit = GRN_TRUE;
   options->uni_symbol = GRN_TRUE;
   options->ignore_blank = GRN_FALSE;
+  options->loose_symbol = GRN_FALSE;
+}
+
+static void
+ngram_switch_to_loose_mode(grn_ctx *ctx,
+                           grn_ngram_tokenizer *tokenizer)
+{
+  const char *normalized;
+  unsigned int normalized_length_in_bytes;
+  unsigned int normalized_length_in_chars;
+  const char *normalized_end;
+  const uint_least8_t *types = tokenizer->ctypes;
+
+  grn_string_get_normalized(ctx,
+                            tokenizer->query->normalized_query,
+                            &normalized,
+                            &normalized_length_in_bytes,
+                            &normalized_length_in_chars);
+  normalized_end = normalized + normalized_length_in_bytes;
+
+  if (types) {
+    uint_least8_t *loose_types;
+
+    tokenizer->loose.ctypes =
+      GRN_MALLOC(sizeof(uint_least8_t) * normalized_length_in_chars);
+    if (!tokenizer->loose.ctypes) {
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[tokenizer][ngram][loose] "
+          "failed to allocate memory for character types");
+      return;
+    }
+    loose_types = tokenizer->loose.ctypes;
+    while (normalized < normalized_end) {
+      size_t length;
+      length = grn_charlen_(ctx,
+                            (char *)normalized,
+                            (char *)normalized_end,
+                            tokenizer->query->encoding);
+      if (length == 0) {
+        break;
+      }
+      if (!(tokenizer->options.loose_symbol &&
+            GRN_STR_CTYPE(*types) == GRN_CHAR_SYMBOL)) {
+        GRN_TEXT_PUT(ctx, &(tokenizer->loose.text), normalized, length);
+        *loose_types = *types;
+        loose_types++;
+      }
+      normalized += length;
+      types++;
+    }
+    tokenizer->next =
+      (const unsigned char *)GRN_TEXT_VALUE(&(tokenizer->loose.text));
+    tokenizer->end = tokenizer->next + GRN_TEXT_LEN(&(tokenizer->loose.text));
+    tokenizer->ctypes = tokenizer->loose.ctypes;
+  } else {
+    tokenizer->next = normalized;
+    tokenizer->end = normalized_end;
+  }
+
+  tokenizer->pos = 0;
+  tokenizer->skip = 0;
+  tokenizer->overlap = GRN_FALSE;
+  tokenizer->loose.ing = GRN_TRUE;
 }
 
 static grn_obj *
@@ -310,6 +381,11 @@ ngram_init_raw(grn_ctx *ctx,
 
   tokenizer->options = *options;
   tokenizer->overlap = GRN_FALSE;
+  tokenizer->loose.ing = GRN_FALSE;
+  tokenizer->loose.need = GRN_FALSE;
+  tokenizer->loose.need_end_mark = GRN_FALSE;
+  GRN_TEXT_INIT(&(tokenizer->loose.text), 0);
+  tokenizer->loose.ctypes = NULL;
   tokenizer->pos = 0;
   tokenizer->skip = 0;
 
@@ -320,6 +396,11 @@ ngram_init_raw(grn_ctx *ctx,
   tokenizer->end = tokenizer->next + normalized_length_in_bytes;
   tokenizer->ctypes =
     grn_string_get_types(ctx, tokenizer->query->normalized_query);
+
+  if (tokenizer->query->tokenize_mode == GRN_TOKEN_GET) {
+    ngram_switch_to_loose_mode(ctx, tokenizer);
+  }
+
   return NULL;
 }
 
@@ -431,7 +512,7 @@ ngram_open_options(grn_ctx *ctx,
   if (!options) {
     ERR(GRN_NO_MEMORY_AVAILABLE,
         "[tokenizer][ngram] "
-        "failed to allocate Ngram options");
+        "failed to allocate memory for options");
     return NULL;
   }
 
@@ -447,11 +528,11 @@ ngram_open_options(grn_ctx *ctx,
                                                    raw_options,
                                                    i,
                                                    options->unit);
-    /* } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "loose_symbol")) { */
-    /*   options->loose_symbol = grn_vector_get_element_bool(ctx, */
-    /*                                                       raw_options, */
-    /*                                                       i, */
-    /*                                                       options->loose_symbol); */
+    } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "loose_symbol")) {
+      options->loose_symbol = grn_vector_get_element_bool(ctx,
+                                                          raw_options,
+                                                          i,
+                                                          options->loose_symbol);
     }
   } GRN_OPTION_VALUES_EACH_END();
 
@@ -492,6 +573,26 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   int32_t len = 0, pos = tokenizer->pos + tokenizer->skip;
   grn_token_status status = 0;
   const uint_least8_t *cp = tokenizer->ctypes ? tokenizer->ctypes + pos : NULL;
+
+  if (tokenizer->loose.ing && tokenizer->loose.need_end_mark) {
+    grn_tokenizer_token_push(ctx,
+                             &(tokenizer->token),
+                             GRN_TOKENIZER_END_MARK_UTF8,
+                             GRN_TOKENIZER_END_MARK_UTF8_LEN,
+                             status);
+    ngram_switch_to_loose_mode(ctx, tokenizer);
+    tokenizer->loose.need_end_mark = GRN_FALSE;
+    return NULL;
+  }
+
+  if (cp &&
+      !tokenizer->loose.ing &&
+      !tokenizer->loose.need &&
+      tokenizer->options.loose_symbol &&
+      GRN_STR_CTYPE(*cp) == GRN_CHAR_SYMBOL) {
+    tokenizer->loose.need = GRN_TRUE;
+  }
+
   if (cp && tokenizer->options.uni_alpha &&
       GRN_STR_CTYPE(*cp) == GRN_CHAR_ALPHA) {
     while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
@@ -589,11 +690,21 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     tokenizer->skip = tokenizer->overlap ? 1 : len;
   }
   if (r == e) { status |= GRN_TOKEN_REACH_END; }
-  grn_tokenizer_token_push(ctx,
-                           &(tokenizer->token),
-                           (const char *)p,
-                           r - p,
-                           status);
+
+  {
+    if ((status & (GRN_TOKEN_LAST | GRN_TOKEN_REACH_END)) &&
+        !tokenizer->loose.ing && tokenizer->loose.need) {
+      status &= ~(GRN_TOKEN_LAST | GRN_TOKEN_REACH_END);
+      tokenizer->loose.ing = GRN_TRUE;
+      tokenizer->loose.need_end_mark = GRN_TRUE;
+    }
+    grn_tokenizer_token_push(ctx,
+                             &(tokenizer->token),
+                             (const char *)p,
+                             r - p,
+                             status);
+  }
+
   return NULL;
 }
 
@@ -604,6 +715,10 @@ ngram_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   if (!tokenizer) {
     return NULL;
   }
+  if (tokenizer->loose.ctypes) {
+    GRN_FREE(tokenizer->loose.ctypes);
+  }
+  GRN_OBJ_FIN(ctx, &(tokenizer->loose.text));
   grn_tokenizer_token_fin(ctx, &(tokenizer->token));
   grn_tokenizer_query_close(ctx, tokenizer->query);
   GRN_FREE(tokenizer);

  Copied: test/command/suite/table_create/default_tokenizer/ngram/options/multiple.expected (+13 -34) 69%
===================================================================
--- test/command/suite/table_create/default_tokenizer/ngram/options/n.expected    2018-04-06 15:29:18 +0900 (0e1c9d971)
+++ test/command/suite/table_create/default_tokenizer/ngram/options/multiple.expected    2018-04-06 15:30:48 +0900 (476e32a4a)
@@ -2,13 +2,13 @@ table_create Memos TABLE_NO_KEY
 [[0,0.0,0.0],true]
 column_create Memos content COLUMN_SCALAR Text
 [[0,0.0,0.0],true]
-table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer 'TokenNgram("n", 3)'
+table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer 'TokenNgram("n", 3, "loose_symbol", true)'   --normalizer NormalizerAuto
 [[0,0.0,0.0],true]
 column_create Terms memos_content COLUMN_INDEX Memos content
 [[0,0.0,0.0],true]
 load --table Memos
 [
-{"content": "This is a pen."},
+{"content": "090-1234-5678"},
 {"content": "これはペンです。"}
 ]
 [[0,0.0,0.0],2]
@@ -22,7 +22,7 @@ select Terms --output_columns _key --limit -1
   [
     [
       [
-        21
+        14
       ],
       [
         [
@@ -31,43 +31,19 @@ select Terms --output_columns _key --limit -1
         ]
       ],
       [
-        " a "
+        "-"
       ],
       [
-        " is"
+        "090"
       ],
       [
-        " pe"
+        "09012345678"
       ],
       [
-        "."
+        "1234"
       ],
       [
-        "Thi"
-      ],
-      [
-        "a p"
-      ],
-      [
-        "en."
-      ],
-      [
-        "his"
-      ],
-      [
-        "is "
-      ],
-      [
-        "n."
-      ],
-      [
-        "pen"
-      ],
-      [
-        "s a"
-      ],
-      [
-        "s i"
+        "5678"
       ],
       [
         "。"
@@ -76,10 +52,10 @@ select Terms --output_columns _key --limit -1
         "これは"
       ],
       [
-        "す。"
+        "す"
       ],
       [
-        "です。"
+        "です"
       ],
       [
         "はペン"
@@ -92,6 +68,9 @@ select Terms --output_columns _key --limit -1
       ],
       [
         "ンです"
+      ],
+      [
+        "￰"
       ]
     ]
   ]

  Copied: test/command/suite/table_create/default_tokenizer/ngram/options/multiple.test (+3 -2) 70%
===================================================================
--- test/command/suite/table_create/default_tokenizer/ngram/options/n.test    2018-04-06 15:29:18 +0900 (199aa2878)
+++ test/command/suite/table_create/default_tokenizer/ngram/options/multiple.test    2018-04-06 15:30:48 +0900 (a1e9eb08f)
@@ -2,12 +2,13 @@ table_create Memos TABLE_NO_KEY
 column_create Memos content COLUMN_SCALAR Text
 
 table_create Terms TABLE_PAT_KEY ShortText \
-  --default_tokenizer 'TokenNgram("n", 3)'
+  --default_tokenizer 'TokenNgram("n", 3, "loose_symbol", true)' \
+  --normalizer NormalizerAuto
 column_create Terms memos_content COLUMN_INDEX Memos content
 
 load --table Memos
 [
-{"content": "This is a pen."},
+{"content": "090-1234-5678"},
 {"content": "これはペンです。"}
 ]
 

  Renamed: test/command/suite/table_create/default_tokenizer/ngram/options/one.expected (+0 -0) 100%
===================================================================

  Renamed: test/command/suite/table_create/default_tokenizer/ngram/options/one.test (+0 -0) 100%
===================================================================

  Added: test/command/suite/tokenizers/ngram/loose_symbol/add.expected (+45 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/ngram/loose_symbol/add.expected    2018-04-06 15:30:48 +0900 (a8b574720)
@@ -0,0 +1,45 @@
+tokenize   'TokenNgram("loose_symbol", true)'   "090-1234-5678"   NormalizerAuto   --mode ADD
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "090",
+      "position": 0,
+      "force_prefix": false
+    },
+    {
+      "value": "-",
+      "position": 1,
+      "force_prefix": false
+    },
+    {
+      "value": "1234",
+      "position": 2,
+      "force_prefix": false
+    },
+    {
+      "value": "-",
+      "position": 3,
+      "force_prefix": false
+    },
+    {
+      "value": "5678",
+      "position": 4,
+      "force_prefix": false
+    },
+    {
+      "value": "￰",
+      "position": 5,
+      "force_prefix": false
+    },
+    {
+      "value": "09012345678",
+      "position": 6,
+      "force_prefix": false
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/ngram/loose_symbol/add.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/ngram/loose_symbol/add.test    2018-04-06 15:30:48 +0900 (8fb1d61a2)
@@ -0,0 +1,5 @@
+tokenize \
+  'TokenNgram("loose_symbol", true)' \
+  "090-1234-5678" \
+  NormalizerAuto \
+  --mode ADD

  Added: test/command/suite/tokenizers/ngram/loose_symbol/get.expected (+2 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/ngram/loose_symbol/get.expected    2018-04-06 15:30:48 +0900 (601e8b3cb)
@@ -0,0 +1,2 @@
+tokenize   'TokenNgram("loose_symbol", true)'   "090-1234-5678"   NormalizerAuto   --mode GET
+[[0,0.0,0.0],[{"value":"09012345678","position":0,"force_prefix":false}]]

  Added: test/command/suite/tokenizers/ngram/loose_symbol/get.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/ngram/loose_symbol/get.test    2018-04-06 15:30:48 +0900 (8af91ffac)
@@ -0,0 +1,5 @@
+tokenize \
+  'TokenNgram("loose_symbol", true)' \
+  "090-1234-5678" \
+  NormalizerAuto \
+  --mode GET

  Added: test/command/suite/tokenizers/ngram/n.expected (+45 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/ngram/n.expected    2018-04-06 15:30:48 +0900 (b4af32bdb)
@@ -0,0 +1,45 @@
+tokenize 'TokenNgram("n", 3)' "abcdefg"
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "abc",
+      "position": 0,
+      "force_prefix": false
+    },
+    {
+      "value": "bcd",
+      "position": 1,
+      "force_prefix": false
+    },
+    {
+      "value": "cde",
+      "position": 2,
+      "force_prefix": false
+    },
+    {
+      "value": "def",
+      "position": 3,
+      "force_prefix": false
+    },
+    {
+      "value": "efg",
+      "position": 4,
+      "force_prefix": false
+    },
+    {
+      "value": "fg",
+      "position": 5,
+      "force_prefix": false
+    },
+    {
+      "value": "g",
+      "position": 6,
+      "force_prefix": false
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/ngram/n.test (+1 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/ngram/n.test    2018-04-06 15:30:48 +0900 (17d83ba64)
@@ -0,0 +1 @@
+tokenize 'TokenNgram("n", 3)' "abcdefg"
-------------- next part --------------
HTMLの添付ファイルを保管しました...
URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180406/34695edc/attachment-0001.htm 


Groonga - fulltext search engine.

[Groonga-commit] groonga/groonga at e30e135 [master] TokenNgram: add "loose_symbol" option