[Groonga-commit] groonga/groonga at d5c6d1f [master] TokenNgram: fix a wrong source offset bug for loose case

Kouhei Sutou null+****@clear*****
Tue May 22 18:01:35 JST 2018


Kouhei Sutou	2018-05-22 18:01:35 +0900 (Tue, 22 May 2018)

  New Revision: d5c6d1f38438c34272f91f046baaf86150f6177c
  https://github.com/groonga/groonga/commit/d5c6d1f38438c34272f91f046baaf86150f6177c

  Message:
    TokenNgram: fix a wrong source offset bug for loose case

  Added files:
    test/command/suite/select/function/highlight_html/lexicon/loose.expected
    test/command/suite/select/function/highlight_html/lexicon/loose.test
    test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.expected
    test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.test
  Modified files:
    lib/tokenizers.c

  Modified: lib/tokenizers.c (+19 -12)
===================================================================
--- lib/tokenizers.c    2018-05-22 16:45:01 +0900 (cecf8af9a)
+++ lib/tokenizers.c    2018-05-22 18:01:35 +0900 (e59fcd6ca)
@@ -845,7 +845,6 @@ ngram_next(grn_ctx *ctx,
       uint32_t source_length = 0;
       uint32_t source_first_character_length = 0;
       uint64_t next_offset = tokenizer->source_offset;
-      grn_bool first_character = GRN_TRUE;
       grn_token_set_source_offset(ctx, token, tokenizer->source_offset);
       if (checks[0] == -1) {
         size_t n_leading_bytes = p - tokenizer->start;
@@ -855,22 +854,30 @@ ngram_next(grn_ctx *ctx,
             if (!tokenizer->overlap) {
               next_offset += checks[-i];
             }
-            first_character = GRN_FALSE;
             break;
           }
         }
       }
-      for (i = 0; i < data_size; i++) {
-        if (checks[i] > 0) {
-          if ((tokenizer->overlap && !first_character) ||
-              !tokenizer->overlap) {
-            next_offset += checks[i];
-          }
-          if (first_character) {
-            source_first_character_length = checks[i];
+      {
+        uint64_t first_offset = 0;
+        for (i = 0; i < data_size; i++) {
+          if (checks[i] > 0) {
+            if ((tokenizer->overlap && first_offset == 0) ||
+                !tokenizer->overlap) {
+              if (first_offset == 0) {
+                first_offset = checks[i];
+              }
+              next_offset += checks[i];
+            }
+            if (source_first_character_length == 0) {
+              source_first_character_length = checks[i];
+            }
+            source_length += checks[i];
+          } else if (checks[i] < 0) {
+            if (tokenizer->overlap) {
+              next_offset -= first_offset;
+            }
           }
-          source_length += checks[i];
-          first_character = GRN_FALSE;
         }
       }
       grn_token_set_source_length(ctx, token, source_length);
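
For reference, the following is a minimal standalone sketch of the accumulation loop introduced above. Only the names checks, overlap, first_offset, next_offset, source_length, and source_first_character_length come from the diff; the struct, the helper function, the assumed meaning of the checks values, and the sample inputs in main() are invented here for illustration and are not Groonga API.

/*
 * Sketch of the fixed loop.  Assumed meaning of checks[i] (taken from
 * how the diff uses it, not verified against the normalizer): > 0 is
 * the number of source bytes behind the character starting at
 * normalized byte i, 0 is a continuation byte, < 0 marks a position
 * whose source character was dropped by the loose pass.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
  uint64_t next_offset;                   /* where the next token starts */
  uint32_t source_length;                 /* source bytes of this token */
  uint32_t source_first_character_length; /* source bytes of its 1st char */
} source_info;

static source_info
accumulate(const int16_t *checks, size_t n, bool overlap, uint64_t offset)
{
  source_info info = {offset, 0, 0};
  uint64_t first_offset = 0;
  size_t i;

  for (i = 0; i < n; i++) {
    if (checks[i] > 0) {
      /* An overlapping token advances the next offset only by its
         first character; a non-overlapping token advances by every
         character it covers. */
      if (!overlap || first_offset == 0) {
        if (first_offset == 0) {
          first_offset = (uint64_t)checks[i];
        }
        info.next_offset += (uint64_t)checks[i];
      }
      if (info.source_first_character_length == 0) {
        info.source_first_character_length = (uint32_t)checks[i];
      }
      info.source_length += (uint32_t)checks[i];
    } else if (checks[i] < 0) {
      /* The fix: a dropped character inside an overlapping token rolls
         the first character's advance back. */
      if (overlap) {
        info.next_offset -= first_offset;
      }
    }
  }
  return info;
}

int
main(void)
{
  /* Two 3-byte characters forming an overlapping bigram: the next
     token starts 3 bytes later, this token covers 6 source bytes. */
  const int16_t plain[] = {3, 0, 0, 3, 0, 0};
  /* Same bigram followed by one dropped character: the first
     character's advance is rolled back.  Sample values only. */
  const int16_t dropped[] = {3, 0, 0, 3, 0, 0, -1};
  source_info a = accumulate(plain, 6, true, 10);
  source_info b = accumulate(dropped, 7, true, 10);

  printf("plain:   next=%llu length=%u first=%u\n",
         (unsigned long long)a.next_offset,
         (unsigned)a.source_length,
         (unsigned)a.source_first_character_length);
  printf("dropped: next=%llu length=%u first=%u\n",
         (unsigned long long)b.next_offset,
         (unsigned)b.source_length,
         (unsigned)b.source_first_character_length);
  return 0;
}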

  Added: test/command/suite/select/function/highlight_html/lexicon/loose.expected (+37 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/function/highlight_html/lexicon/loose.expected    2018-05-22 18:01:35 +0900 (070da0b24)
@@ -0,0 +1,37 @@
+table_create Entries TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Entries body COLUMN_SCALAR ShortText
+[[0,0.0,0.0],true]
+table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer 'TokenNgram("loose_symbol", true,                                   "report_source_location", true)'   --normalizer 'NormalizerNFKC100'
+[[0,0.0,0.0],true]
+column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body
+[[0,0.0,0.0],true]
+load --table Entries
+[
+{"body": "(あいうえお)"}
+]
+[[0,0.0,0.0],1]
+select Entries   --match_columns body   --query 'いうえお'   --output_columns 'highlight_html(body, Terms)'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        1
+      ],
+      [
+        [
+          "highlight_html",
+          null
+        ]
+      ],
+      [
+        "(あ<span class=\"keyword\">いうえお</span>)"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/select/function/highlight_html/lexicon/loose.test (+20 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/function/highlight_html/lexicon/loose.test    2018-05-22 18:01:35 +0900 (704d557dc)
@@ -0,0 +1,20 @@
+
+table_create Entries TABLE_NO_KEY
+column_create Entries body COLUMN_SCALAR ShortText
+
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer 'TokenNgram("loose_symbol", true, \
+                                  "report_source_location", true)' \
+  --normalizer 'NormalizerNFKC100'
+column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body
+
+load --table Entries
+[
+{"body": "(あいうえお)"}
+]
+
+select Entries \
+  --match_columns body \
+  --query 'いうえお' \
+  --output_columns 'highlight_html(body, Terms)'
+

  Added: test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.expected (+114 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.expected    2018-05-22 18:01:35 +0900 (e0ccd2903)
@@ -0,0 +1,114 @@
+tokenize   'TokenNgram("loose_symbol", true,               "report_source_location", true)'   "(あいうえお)"   'NormalizerNFKC100'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "(",
+      "position": 0,
+      "force_prefix": false,
+      "source_offset": 0,
+      "source_length": 1,
+      "source_first_character_length": 1
+    },
+    {
+      "value": "あい",
+      "position": 1,
+      "force_prefix": false,
+      "source_offset": 1,
+      "source_length": 6,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "いう",
+      "position": 2,
+      "force_prefix": false,
+      "source_offset": 4,
+      "source_length": 6,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "うえ",
+      "position": 3,
+      "force_prefix": false,
+      "source_offset": 7,
+      "source_length": 6,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "えお",
+      "position": 4,
+      "force_prefix": false,
+      "source_offset": 10,
+      "source_length": 6,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "お",
+      "position": 5,
+      "force_prefix": false,
+      "source_offset": 13,
+      "source_length": 3,
+      "source_first_character_length": 3
+    },
+    {
+      "value": ")",
+      "position": 6,
+      "force_prefix": false,
+      "source_offset": 16,
+      "source_length": 1,
+      "source_first_character_length": 1
+    },
+    {
+      "value": "￰",
+      "position": 7,
+      "force_prefix": false,
+      "source_offset": 17,
+      "source_length": 0,
+      "source_first_character_length": 0
+    },
+    {
+      "value": "あい",
+      "position": 8,
+      "force_prefix": false,
+      "source_offset": 0,
+      "source_length": 7,
+      "source_first_character_length": 4
+    },
+    {
+      "value": "いう",
+      "position": 9,
+      "force_prefix": false,
+      "source_offset": 4,
+      "source_length": 6,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "うえ",
+      "position": 10,
+      "force_prefix": false,
+      "source_offset": 7,
+      "source_length": 6,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "えお",
+      "position": 11,
+      "force_prefix": false,
+      "source_offset": 10,
+      "source_length": 6,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "お",
+      "position": 12,
+      "force_prefix": false,
+      "source_offset": 13,
+      "source_length": 3,
+      "source_first_character_length": 3
+    }
+  ]
+]
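
The offsets reported for the loose tokens above (positions 8-12) are plain UTF-8 byte arithmetic. Below is a small hand check, assuming a UTF-8 source file and the test's source text "(あいうえお)" (ASCII parentheses, 3-byte hiragana); it is not Groonga code and only recomputes the expected numbers.

/* Recompute the expected source offsets by UTF-8 byte counting. */
#include <assert.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
  assert(strlen("(") == 1);  /* ASCII parenthesis: 1 byte */
  assert(strlen("あ") == 3); /* each hiragana: 3 bytes    */

  /* Strict pass: "あい" starts right after "(", so offset 1, covers
     2 * 3 = 6 bytes, and its first character is 3 bytes. */
  printf("strict あい: offset=%zu length=%zu first=%zu\n",
         strlen("("), strlen("あい"), strlen("あ"));

  /* Loose pass: the dropped "(" is folded into the first token, so
     "あい" starts at offset 0, covers 1 + 6 = 7 bytes, and its first
     character span is "(あ" = 4 bytes; the next loose token then
     starts at 0 + 4 = 4, matching the "いう" entry above. */
  printf("loose あい: offset=0 length=%zu first=%zu\n",
         strlen("(あい"), strlen("(あ"));
  return 0;
}

The highlight_html test added earlier in this commit presumably relies on these loose offsets being correct, since the span boundaries are derived from the lexicon's reported source locations.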

  Added: test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_non_number.test    2018-05-22 18:01:35 +0900 (8b9a1545a)
@@ -0,0 +1,5 @@
+tokenize \
+  'TokenNgram("loose_symbol", true, \
+              "report_source_location", true)' \
+  "(あいうえお)" \
+  'NormalizerNFKC100'
-------------- next part --------------
An HTML attachment was scrubbed...
URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180522/741f3833/attachment-0001.htm 


