• R/O
  • HTTP
  • SSH
  • HTTPS

NMeCabRepo2: Commit


Commit MetaInfo

Revision: 86e8a0ecc6943322303429f842944341537829dd (tree)
Zeit: 2015-03-23 18:45:31
Autor: komutan <t_komuta@nift...>
Committer: komutan

Log Message

.NET FW が対応していない文字コード名に対応

Zusammenfassung der Änderungen

Diff

--- a/src/LibNMeCab/Core/MeCabDictionary.cs
+++ b/src/LibNMeCab/Core/MeCabDictionary.cs
@@ -1,289 +1,289 @@
1-// MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
2-//
3-// Copyright(C) 2001-2006 Taku Kudo <taku@chasen.org>
4-// Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation
5-using System;
6-using System.Collections.Generic;
7-using System.Text;
8-using System.IO;
9-#if MMF_DIC
10-using System.IO.MemoryMappedFiles;
11-#endif
12-
13-namespace NMeCab.Core
14-{
15- public class MeCabDictionary : IDisposable
16- {
17- #region Const/Field/Property
18-
19- private const uint DictionaryMagicID = 0xEF718F77u;
20- private const uint DicVersion = 102u;
21-
22-#if MMF_DIC
23- private MemoryMappedFile mmf;
24- private MemoryMappedViewAccessor tokens;
25- private MemoryMappedViewAccessor features;
26-#else
27- private Token[] tokens;
28- private byte[] features;
29-#endif
30-
31- private DoubleArray da = new DoubleArray();
32-
33- private Encoding encoding;
34-
35- /// <summary>
36- /// 辞書の文字コード
37- /// </summary>
38- public string CharSet
39- {
40- get { return this.encoding.WebName; }
41- }
42-
43- /// <summary>
44- /// バージョン
45- /// </summary>
46- public uint Version { get; private set; }
47-
48- /// <summary>
49- /// 辞書のタイプ
50- /// </summary>
51- public DictionaryType Type { get; private set; }
52-
53- public uint LexSize { get; private set; }
54-
55- /// <summary>
56- /// 左文脈 ID のサイズ
57- /// </summary>
58- public uint LSize { get; private set; }
59-
60- /// <summary>
61- /// 右文脈 ID のサイズ
62- /// </summary>
63- public uint RSize { get; private set; }
64-
65- /// <summary>
66- /// 辞書のファイル名
67- /// </summary>
68- public string FileName { get; private set; }
69-
70- #endregion
71-
72- #region Open
73-
74-#if MMF_DIC
75-
76- public void Open(string filePath)
77- {
78- this.mmf = MemoryMappedFile.CreateFromFile(filePath, FileMode.Open,
79- null, 0L, MemoryMappedFileAccess.Read);
80- this.Open(this.mmf, filePath);
81- }
82-
83- public void Open(MemoryMappedFile mmf, string filePath = null)
84- {
85- this.FileName = filePath;
86-
87- using (MemoryMappedViewStream stream = mmf.CreateViewStream(
88- 0L, 0L, MemoryMappedFileAccess.Read))
89- using (BinaryReader reader = new BinaryReader(stream))
90- {
91- uint magic = reader.ReadUInt32();
92- if (stream.CanSeek && stream.Length < (magic ^ DictionaryMagicID)) //正確なサイズ取得ができないので不等号で代用
93- throw new MeCabInvalidFileException("dictionary file is broken", filePath);
94-
95- this.Version = reader.ReadUInt32();
96- if (this.Version != DicVersion)
97- throw new MeCabInvalidFileException("incompatible version", filePath);
98-
99- this.Type = (DictionaryType)reader.ReadUInt32();
100- this.LexSize = reader.ReadUInt32();
101- this.LSize = reader.ReadUInt32();
102- this.RSize = reader.ReadUInt32();
103- uint dSize = reader.ReadUInt32();
104- uint tSize = reader.ReadUInt32();
105- uint fSize = reader.ReadUInt32();
106- reader.ReadUInt32(); //dummy
107-
108- string charSet = StrUtils.GetString(reader.ReadBytes(32), Encoding.ASCII);
109- this.encoding = Encoding.GetEncoding(charSet);
110-
111- long offset = stream.Position;
112- this.da.Open(mmf, offset, dSize);
113- offset += dSize;
114- this.tokens = mmf.CreateViewAccessor(offset, tSize, MemoryMappedFileAccess.Read);
115- offset += tSize;
116- this.features = mmf.CreateViewAccessor(offset, fSize, MemoryMappedFileAccess.Read);
117- }
118- }
119-
120-#else
121-
122- public void Open(string filePath)
123- {
124- this.FileName = filePath;
125-
126- using (FileStream fileStream = new FileStream(filePath, FileMode.Open, FileAccess.Read))
127- using (BinaryReader reader = new BinaryReader(fileStream))
128- {
129- this.Open(reader);
130- }
131- }
132-
133- public unsafe void Open(BinaryReader reader)
134- {
135- uint magic = reader.ReadUInt32();
136- //CanSeekの時のみストリーム長のチェック
137- if (reader.BaseStream.CanSeek && reader.BaseStream.Length != (magic ^ DictionaryMagicID))
138- throw new MeCabInvalidFileException("dictionary file is broken", this.FileName);
139-
140- this.Version = reader.ReadUInt32();
141- if (this.Version != DicVersion)
142- throw new MeCabInvalidFileException("incompatible version", this.FileName);
143-
144- this.Type = (DictionaryType)reader.ReadUInt32();
145- this.LexSize = reader.ReadUInt32();
146- this.LSize = reader.ReadUInt32();
147- this.RSize = reader.ReadUInt32();
148- uint dSize = reader.ReadUInt32();
149- uint tSize = reader.ReadUInt32();
150- uint fSize = reader.ReadUInt32();
151- reader.ReadUInt32(); //dummy
152-
153- string charSet = StrUtils.GetString(reader.ReadBytes(32), Encoding.ASCII);
154- this.encoding = Encoding.GetEncoding(charSet);
155-
156- this.da.Open(reader, dSize);
157-
158- this.tokens = new Token[tSize / sizeof(Token)];
159- for (int i = 0; i < this.tokens.Length; i++)
160- this.tokens[i] = Token.Create(reader);
161-
162- this.features = reader.ReadBytes((int)fSize);
163-
164- if (reader.BaseStream.ReadByte() != -1)
165- throw new MeCabInvalidFileException("dictionary file is broken", this.FileName);
166- }
167-
168-#endif
169-
170- #endregion
171-
172- #region Search
173-
174- public unsafe DoubleArray.ResultPair ExactMatchSearch(string key)
175- {
176- fixed (char* pKey = key)
177- return this.ExactMatchSearch(pKey, key.Length, 0);
178- }
179-
180- public unsafe DoubleArray.ResultPair ExactMatchSearch(char* key, int len, int nodePos = 0)
181- {
182- //if (this.encoding == Encoding.Unicode)
183- // return this.da.ExactMatchSearch((byte*)key, len, nodePos);
184-
185- //エンコード
186- int maxByteCount = this.encoding.GetMaxByteCount(len);
187- byte* bytes = stackalloc byte[maxByteCount];
188- int bytesLen = this.encoding.GetBytes(key, len, bytes, maxByteCount);
189-
190- DoubleArray.ResultPair result = this.da.ExactMatchSearch(bytes, bytesLen, nodePos);
191-
192- //文字数をデコードしたものに変換
193- result.Length = this.encoding.GetCharCount(bytes, result.Length);
194-
195- return result;
196- }
197-
198- public unsafe int CommonPrefixSearch(char* key, int len, DoubleArray.ResultPair* result, int rLen)
199- {
200- //if (this.encoding == Encoding.Unicode)
201- // return this.da.CommonPrefixSearch((byte*)key, result, rLen, len);
202-
203- //エンコード
204- int maxByteLen = this.encoding.GetMaxByteCount(len);
205- byte* bytes = stackalloc byte[maxByteLen];
206- int bytesLen = this.encoding.GetBytes(key, len, bytes, maxByteLen);
207-
208- int n = this.da.CommonPrefixSearch(bytes, result, rLen, bytesLen);
209-
210- //文字数をデコードしたものに変換
211- for (int i = 0; i < n; i++)
212- result[i].Length = this.encoding.GetCharCount(bytes, result[i].Length);
213-
214- return n;
215- }
216-
217- #endregion
218-
219- #region Get Infomation
220-
221- public unsafe Token[] GetToken(DoubleArray.ResultPair n)
222- {
223- Token[] dist = new Token[0xFF & n.Value];
224- int tokenPos = n.Value >> 8;
225-#if MMF_DIC
226- this.tokens.ReadArray<Token>(tokenPos * sizeof(Token), dist, 0, dist.Length);
227-#else
228- Array.Copy(this.tokens, tokenPos, dist, 0, dist.Length);
229-#endif
230- return dist;
231- }
232-
233- public string GetFeature(uint featurePos)
234- {
235- return StrUtils.GetString(this.features, (long)featurePos, this.encoding);
236- }
237-
238- #endregion
239-
240- #region etc.
241-
242- public bool IsCompatible(MeCabDictionary d)
243- {
244- return (this.Version == d.Version &&
245- this.LSize == d.LSize &&
246- this.RSize == d.RSize &&
247- this.CharSet == d.CharSet);
248- }
249-
250- #endregion
251-
252- #region Dispose
253-
254- private bool disposed;
255-
256- /// <summary>
257- /// 使用されているリソースを開放する
258- /// </summary>
259- public void Dispose()
260- {
261- this.Dispose(true);
262- GC.SuppressFinalize(this);
263- }
264-
265- protected virtual void Dispose(bool disposing)
266- {
267- if (disposed) return;
268-
269- if (disposing)
270- {
271- if (this.da != null) this.da.Dispose();
272-#if MMF_DIC
273- if (this.mmf != null) this.mmf.Dispose();
274- if (this.tokens != null) this.tokens.Dispose();
275- if (this.features != null) this.features.Dispose();
276-#endif
277- }
278-
279- this.disposed = true;
280- }
281-
282- ~MeCabDictionary()
283- {
284- this.Dispose(false);
285- }
286-
287- #endregion
288- }
289-}
1+// MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
2+//
3+// Copyright(C) 2001-2006 Taku Kudo <taku@chasen.org>
4+// Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation
5+using System;
6+using System.Collections.Generic;
7+using System.Text;
8+using System.IO;
9+#if MMF_DIC
10+using System.IO.MemoryMappedFiles;
11+#endif
12+
13+namespace NMeCab.Core
14+{
15+ public class MeCabDictionary : IDisposable
16+ {
17+ #region Const/Field/Property
18+
19+ private const uint DictionaryMagicID = 0xEF718F77u;
20+ private const uint DicVersion = 102u;
21+
22+#if MMF_DIC
23+ private MemoryMappedFile mmf;
24+ private MemoryMappedViewAccessor tokens;
25+ private MemoryMappedViewAccessor features;
26+#else
27+ private Token[] tokens;
28+ private byte[] features;
29+#endif
30+
31+ private DoubleArray da = new DoubleArray();
32+
33+ private Encoding encoding;
34+
35+ /// <summary>
36+ /// 辞書の文字コード
37+ /// </summary>
38+ public string CharSet
39+ {
40+ get { return this.encoding.WebName; }
41+ }
42+
43+ /// <summary>
44+ /// バージョン
45+ /// </summary>
46+ public uint Version { get; private set; }
47+
48+ /// <summary>
49+ /// 辞書のタイプ
50+ /// </summary>
51+ public DictionaryType Type { get; private set; }
52+
53+ public uint LexSize { get; private set; }
54+
55+ /// <summary>
56+ /// 左文脈 ID のサイズ
57+ /// </summary>
58+ public uint LSize { get; private set; }
59+
60+ /// <summary>
61+ /// 右文脈 ID のサイズ
62+ /// </summary>
63+ public uint RSize { get; private set; }
64+
65+ /// <summary>
66+ /// 辞書のファイル名
67+ /// </summary>
68+ public string FileName { get; private set; }
69+
70+ #endregion
71+
72+ #region Open
73+
74+#if MMF_DIC
75+
76+ public void Open(string filePath)
77+ {
78+ this.mmf = MemoryMappedFile.CreateFromFile(filePath, FileMode.Open,
79+ null, 0L, MemoryMappedFileAccess.Read);
80+ this.Open(this.mmf, filePath);
81+ }
82+
83+ public void Open(MemoryMappedFile mmf, string filePath = null)
84+ {
85+ this.FileName = filePath;
86+
87+ using (MemoryMappedViewStream stream = mmf.CreateViewStream(
88+ 0L, 0L, MemoryMappedFileAccess.Read))
89+ using (BinaryReader reader = new BinaryReader(stream))
90+ {
91+ uint magic = reader.ReadUInt32();
92+ if (stream.CanSeek && stream.Length < (magic ^ DictionaryMagicID)) //正確なサイズ取得ができないので不等号で代用
93+ throw new MeCabInvalidFileException("dictionary file is broken", filePath);
94+
95+ this.Version = reader.ReadUInt32();
96+ if (this.Version != DicVersion)
97+ throw new MeCabInvalidFileException("incompatible version", filePath);
98+
99+ this.Type = (DictionaryType)reader.ReadUInt32();
100+ this.LexSize = reader.ReadUInt32();
101+ this.LSize = reader.ReadUInt32();
102+ this.RSize = reader.ReadUInt32();
103+ uint dSize = reader.ReadUInt32();
104+ uint tSize = reader.ReadUInt32();
105+ uint fSize = reader.ReadUInt32();
106+ reader.ReadUInt32(); //dummy
107+
108+ string charSet = StrUtils.GetString(reader.ReadBytes(32), Encoding.ASCII);
109+ this.encoding = StrUtils.GetEncoding(charSet);
110+
111+ long offset = stream.Position;
112+ this.da.Open(mmf, offset, dSize);
113+ offset += dSize;
114+ this.tokens = mmf.CreateViewAccessor(offset, tSize, MemoryMappedFileAccess.Read);
115+ offset += tSize;
116+ this.features = mmf.CreateViewAccessor(offset, fSize, MemoryMappedFileAccess.Read);
117+ }
118+ }
119+
120+#else
121+
122+ public void Open(string filePath)
123+ {
124+ this.FileName = filePath;
125+
126+ using (FileStream fileStream = new FileStream(filePath, FileMode.Open, FileAccess.Read))
127+ using (BinaryReader reader = new BinaryReader(fileStream))
128+ {
129+ this.Open(reader);
130+ }
131+ }
132+
133+ public unsafe void Open(BinaryReader reader)
134+ {
135+ uint magic = reader.ReadUInt32();
136+ //CanSeekの時のみストリーム長のチェック
137+ if (reader.BaseStream.CanSeek && reader.BaseStream.Length != (magic ^ DictionaryMagicID))
138+ throw new MeCabInvalidFileException("dictionary file is broken", this.FileName);
139+
140+ this.Version = reader.ReadUInt32();
141+ if (this.Version != DicVersion)
142+ throw new MeCabInvalidFileException("incompatible version", this.FileName);
143+
144+ this.Type = (DictionaryType)reader.ReadUInt32();
145+ this.LexSize = reader.ReadUInt32();
146+ this.LSize = reader.ReadUInt32();
147+ this.RSize = reader.ReadUInt32();
148+ uint dSize = reader.ReadUInt32();
149+ uint tSize = reader.ReadUInt32();
150+ uint fSize = reader.ReadUInt32();
151+ reader.ReadUInt32(); //dummy
152+
153+ string charSet = StrUtils.GetString(reader.ReadBytes(32), Encoding.ASCII);
154+ this.encoding = StrUtils.GetEncoding(charSet);
155+
156+ this.da.Open(reader, dSize);
157+
158+ this.tokens = new Token[tSize / sizeof(Token)];
159+ for (int i = 0; i < this.tokens.Length; i++)
160+ this.tokens[i] = Token.Create(reader);
161+
162+ this.features = reader.ReadBytes((int)fSize);
163+
164+ if (reader.BaseStream.ReadByte() != -1)
165+ throw new MeCabInvalidFileException("dictionary file is broken", this.FileName);
166+ }
167+
168+#endif
169+
170+ #endregion
171+
172+ #region Search
173+
174+ public unsafe DoubleArray.ResultPair ExactMatchSearch(string key)
175+ {
176+ fixed (char* pKey = key)
177+ return this.ExactMatchSearch(pKey, key.Length, 0);
178+ }
179+
180+ public unsafe DoubleArray.ResultPair ExactMatchSearch(char* key, int len, int nodePos = 0)
181+ {
182+ //if (this.encoding == Encoding.Unicode)
183+ // return this.da.ExactMatchSearch((byte*)key, len, nodePos);
184+
185+ //エンコード
186+ int maxByteCount = this.encoding.GetMaxByteCount(len);
187+ byte* bytes = stackalloc byte[maxByteCount];
188+ int bytesLen = this.encoding.GetBytes(key, len, bytes, maxByteCount);
189+
190+ DoubleArray.ResultPair result = this.da.ExactMatchSearch(bytes, bytesLen, nodePos);
191+
192+ //文字数をデコードしたものに変換
193+ result.Length = this.encoding.GetCharCount(bytes, result.Length);
194+
195+ return result;
196+ }
197+
198+ public unsafe int CommonPrefixSearch(char* key, int len, DoubleArray.ResultPair* result, int rLen)
199+ {
200+ //if (this.encoding == Encoding.Unicode)
201+ // return this.da.CommonPrefixSearch((byte*)key, result, rLen, len);
202+
203+ //エンコード
204+ int maxByteLen = this.encoding.GetMaxByteCount(len);
205+ byte* bytes = stackalloc byte[maxByteLen];
206+ int bytesLen = this.encoding.GetBytes(key, len, bytes, maxByteLen);
207+
208+ int n = this.da.CommonPrefixSearch(bytes, result, rLen, bytesLen);
209+
210+ //文字数をデコードしたものに変換
211+ for (int i = 0; i < n; i++)
212+ result[i].Length = this.encoding.GetCharCount(bytes, result[i].Length);
213+
214+ return n;
215+ }
216+
217+ #endregion
218+
219+ #region Get Infomation
220+
221+ public unsafe Token[] GetToken(DoubleArray.ResultPair n)
222+ {
223+ Token[] dist = new Token[0xFF & n.Value];
224+ int tokenPos = n.Value >> 8;
225+#if MMF_DIC
226+ this.tokens.ReadArray<Token>(tokenPos * sizeof(Token), dist, 0, dist.Length);
227+#else
228+ Array.Copy(this.tokens, tokenPos, dist, 0, dist.Length);
229+#endif
230+ return dist;
231+ }
232+
233+ public string GetFeature(uint featurePos)
234+ {
235+ return StrUtils.GetString(this.features, (long)featurePos, this.encoding);
236+ }
237+
238+ #endregion
239+
240+ #region etc.
241+
242+ public bool IsCompatible(MeCabDictionary d)
243+ {
244+ return (this.Version == d.Version &&
245+ this.LSize == d.LSize &&
246+ this.RSize == d.RSize &&
247+ this.CharSet == d.CharSet);
248+ }
249+
250+ #endregion
251+
252+ #region Dispose
253+
254+ private bool disposed;
255+
256+ /// <summary>
257+ /// 使用されているリソースを開放する
258+ /// </summary>
259+ public void Dispose()
260+ {
261+ this.Dispose(true);
262+ GC.SuppressFinalize(this);
263+ }
264+
265+ protected virtual void Dispose(bool disposing)
266+ {
267+ if (disposed) return;
268+
269+ if (disposing)
270+ {
271+ if (this.da != null) this.da.Dispose();
272+#if MMF_DIC
273+ if (this.mmf != null) this.mmf.Dispose();
274+ if (this.tokens != null) this.tokens.Dispose();
275+ if (this.features != null) this.features.Dispose();
276+#endif
277+ }
278+
279+ this.disposed = true;
280+ }
281+
282+ ~MeCabDictionary()
283+ {
284+ this.Dispose(false);
285+ }
286+
287+ #endregion
288+ }
289+}
--- a/src/LibNMeCab/Core/StrUtils.cs
+++ b/src/LibNMeCab/Core/StrUtils.cs
@@ -1,117 +1,132 @@
1-using System;
2-using System.Collections.Generic;
3-using System.Text;
4-#if MMF_DIC
5-using System.IO.MemoryMappedFiles;
6-#endif
7-
8-namespace NMeCab.Core
9-{
10- public static class StrUtils
11- {
12- private const byte Nul = (byte)0;
13-
14- /// <summary>
15- /// バイト配列の中から終端が\0で表された文字列を取り出す。
16- /// </summary>
17- /// <remarks>
18- /// バイト配列の長さはInt32.MaxValueを超えていても良い。
19- /// </remarks>
20- /// <param name="bytes">バイト配列</param>
21- /// <param name="enc">文字エンコーディング</param>
22- /// <returns>文字列(\0は含まない)</returns>
23- public static string GetString(byte[] bytes, Encoding enc)
24- {
25- return StrUtils.GetString(bytes, 0L, enc);
26- }
27-
28- /// <summary>
29- /// バイト配列の中から終端が\0で表された文字列を取り出す。
30- /// </summary>
31- /// <remarks>
32- /// バイト配列の長さはInt32.MaxValueを超えていても良い。
33- /// </remarks>
34- /// <param name="bytes">バイト配列</param>
35- /// <param name="offset">オフセット位置</param>
36- /// <param name="enc">文字エンコーディング</param>
37- /// <returns>文字列(\0は含まない)</returns>
38- public unsafe static string GetString(byte[] bytes, long offset, Encoding enc)
39- {
40- fixed (byte* pBytes = bytes)
41- return StrUtils.GetString(pBytes + offset, enc);
42- }
43-
44- /// <summary>
45- /// バイト配列の中から終端が\0で表された文字列を取り出す。
46- /// </summary>
47- /// <remarks>
48- /// バイト配列の長さはInt32.MaxValueを超えていても良い。
49- /// </remarks>
50- /// <param name="bytes">デコードする最初のバイトへのポインタ</param>
51- /// <param name="enc">文字エンコーディング</param>
52- /// <returns>文字列(\0は含まない)</returns>
53- public unsafe static string GetString(byte* bytes, Encoding enc)
54- {
55- //バイト長のカウント
56- int byteCount = 0;
57- while (*bytes != Nul) //終端\0に到達するまでシーク
58- {
59- checked { byteCount++; } //文字列のバイト長がInt32.MaxValueを超えたならエラー
60- bytes++;
61- }
62- bytes -= byteCount;
63-
64- //生成されうる最大文字数のバッファを確保
65- int maxCharCount = enc.GetMaxCharCount(byteCount);
66- fixed (char* buff = new char[maxCharCount])
67- {
68- //バイト配列を文字列にデコード
69- int len = enc.GetChars(bytes, byteCount, buff, maxCharCount);
70- return new string(buff, 0, len);
71- }
72- }
73-
74-#if MMF_DIC
75-
76- /// <summary>
77- /// MemoryMappedViewAccessorから終端が\0で表された文字列を取り出す。
78- /// </summary>
79- /// <remarks>
80- /// MemoryMappedViewAccessorの容量はInt32.MaxValueを超えていても良い。
81- /// </remarks>
82- /// <param name="accessor">MemoryMappedViewAccessor</param>
83- /// <param name="index">オフセット位置</param>
84- /// <param name="enc">文字エンコーディング</param>
85- /// <param name="buffSize">内部で使用するバッファの初期サイズ</param>
86- /// <returns>文字列(\0は含まない)</returns>
87- public static string GetString(MemoryMappedViewAccessor accessor, long offset, Encoding enc,
88- int buffSize = 128)
89- {
90- byte[] buff = new byte[buffSize]; //IO回数削減のためのバッファ配列
91- accessor.ReadArray<byte>(offset, buff, 0, buffSize); //初期読込
92-
93- //バイト長のカウント
94- int byteCount = 0;
95- while (buff[byteCount] != Nul) //終端\0に到達するまでシーク
96- {
97- byteCount++;
98-
99- if (byteCount == buffSize) //バッファ配列の終端
100- {
101- //バッファ配列の拡張と追加読込
102- checked { buffSize *= 2; } //Int32.MaxValueを超えたならエラー
103- byte[] newBuff = new byte[buffSize];
104- Buffer.BlockCopy(buff, 0, newBuff, 0, byteCount);
105- accessor.ReadArray<byte>(offset + byteCount, newBuff, byteCount, buffSize - byteCount);
106- buff = newBuff;
107- }
108- }
109-
110- //バッファ配列を文字列にデコード
111- return enc.GetString(buff, 0, byteCount);
112- }
113-
114-#endif
115-
116- }
117-}
1+using System;
2+using System.Collections.Generic;
3+using System.Text;
4+#if MMF_DIC
5+using System.IO.MemoryMappedFiles;
6+#endif
7+
8+namespace NMeCab.Core
9+{
10+ public static class StrUtils
11+ {
12+ private const byte Nul = (byte)0;
13+
14+ /// <summary>
15+ /// バイト配列の中から終端が\0で表された文字列を取り出す。
16+ /// </summary>
17+ /// <remarks>
18+ /// バイト配列の長さはInt32.MaxValueを超えていても良い。
19+ /// </remarks>
20+ /// <param name="bytes">バイト配列</param>
21+ /// <param name="enc">文字エンコーディング</param>
22+ /// <returns>文字列(\0は含まない)</returns>
23+ public static string GetString(byte[] bytes, Encoding enc)
24+ {
25+ return StrUtils.GetString(bytes, 0L, enc);
26+ }
27+
28+ /// <summary>
29+ /// バイト配列の中から終端が\0で表された文字列を取り出す。
30+ /// </summary>
31+ /// <remarks>
32+ /// バイト配列の長さはInt32.MaxValueを超えていても良い。
33+ /// </remarks>
34+ /// <param name="bytes">バイト配列</param>
35+ /// <param name="offset">オフセット位置</param>
36+ /// <param name="enc">文字エンコーディング</param>
37+ /// <returns>文字列(\0は含まない)</returns>
38+ public unsafe static string GetString(byte[] bytes, long offset, Encoding enc)
39+ {
40+ fixed (byte* pBytes = bytes)
41+ return StrUtils.GetString(pBytes + offset, enc);
42+ }
43+
44+ /// <summary>
45+ /// バイト配列の中から終端が\0で表された文字列を取り出す。
46+ /// </summary>
47+ /// <remarks>
48+ /// バイト配列の長さはInt32.MaxValueを超えていても良い。
49+ /// </remarks>
50+ /// <param name="bytes">デコードする最初のバイトへのポインタ</param>
51+ /// <param name="enc">文字エンコーディング</param>
52+ /// <returns>文字列(\0は含まない)</returns>
53+ public unsafe static string GetString(byte* bytes, Encoding enc)
54+ {
55+ //バイト長のカウント
56+ int byteCount = 0;
57+ while (*bytes != Nul) //終端\0に到達するまでシーク
58+ {
59+ checked { byteCount++; } //文字列のバイト長がInt32.MaxValueを超えたならエラー
60+ bytes++;
61+ }
62+ bytes -= byteCount;
63+
64+ //生成されうる最大文字数のバッファを確保
65+ int maxCharCount = enc.GetMaxCharCount(byteCount);
66+ fixed (char* buff = new char[maxCharCount])
67+ {
68+ //バイト配列を文字列にデコード
69+ int len = enc.GetChars(bytes, byteCount, buff, maxCharCount);
70+ return new string(buff, 0, len);
71+ }
72+ }
73+
74+#if MMF_DIC
75+
76+ /// <summary>
77+ /// MemoryMappedViewAccessorから終端が\0で表された文字列を取り出す。
78+ /// </summary>
79+ /// <remarks>
80+ /// MemoryMappedViewAccessorの容量はInt32.MaxValueを超えていても良い。
81+ /// </remarks>
82+ /// <param name="accessor">MemoryMappedViewAccessor</param>
83+ /// <param name="index">オフセット位置</param>
84+ /// <param name="enc">文字エンコーディング</param>
85+ /// <param name="buffSize">内部で使用するバッファの初期サイズ</param>
86+ /// <returns>文字列(\0は含まない)</returns>
87+ public static string GetString(MemoryMappedViewAccessor accessor, long offset, Encoding enc,
88+ int buffSize = 128)
89+ {
90+ byte[] buff = new byte[buffSize]; //IO回数削減のためのバッファ配列
91+ accessor.ReadArray<byte>(offset, buff, 0, buffSize); //初期読込
92+
93+ //バイト長のカウント
94+ int byteCount = 0;
95+ while (buff[byteCount] != Nul) //終端\0に到達するまでシーク
96+ {
97+ byteCount++;
98+
99+ if (byteCount == buffSize) //バッファ配列の終端
100+ {
101+ //バッファ配列の拡張と追加読込
102+ checked { buffSize *= 2; } //Int32.MaxValueを超えたならエラー
103+ byte[] newBuff = new byte[buffSize];
104+ Buffer.BlockCopy(buff, 0, newBuff, 0, byteCount);
105+ accessor.ReadArray<byte>(offset + byteCount, newBuff, byteCount, buffSize - byteCount);
106+ buff = newBuff;
107+ }
108+ }
109+
110+ //バッファ配列を文字列にデコード
111+ return enc.GetString(buff, 0, byteCount);
112+ }
113+
114+#endif
115+
116+ /// <summary>
117+ /// 指定の名前に対応するエンコーディングを取得する(.NET FWが対応していない名前にもアドホックに対応)
118+ /// </summary>
119+ /// <param name="name"></param>
120+ /// <returns></returns>
121+ public static Encoding GetEncoding(string name)
122+ {
123+ switch (name.ToUpper())
124+ {
125+ case "UTF8":
126+ return Encoding.UTF8;
127+ default:
128+ return Encoding.GetEncoding(name);
129+ }
130+ }
131+ }
132+}
Show on old repository browser