Revision | cdb3b2272616484b5e0019f81b8c6d0baccb30b9 (tree) |
---|---|
Zeit | 2020-06-27 13:05:09 |
Autor | Takashi Sawanaka <sdottaka@user...> |
Committer | Takashi Sawanaka |
crystaledit: Use custom word break rules
@@ -438,7 +438,6 @@ CCrystalTextView::CCrystalTextView () | ||
438 | 438 | , m_pFindTextDlg(nullptr) |
439 | 439 | , m_CurSourceDef(nullptr) |
440 | 440 | , m_dwLastDblClickTime(0) |
441 | -, m_iterWord(UBRK_WORD, "en", nullptr, 0) | |
442 | 441 | , m_rxnode(nullptr) |
443 | 442 | , m_pszMatched(nullptr) |
444 | 443 | , m_bSelMargin(true) |
@@ -726,7 +726,6 @@ public : | ||
726 | 726 | static RENDERING_MODE s_nRenderingModeDefault; |
727 | 727 | RENDERING_MODE m_nRenderingMode; |
728 | 728 | |
729 | - ICUBreakIterator m_iterWord; | |
730 | 729 | std::unique_ptr<CCrystalRenderer> m_pCrystalRenderer; |
731 | 730 | CCrystalRenderer *m_pCrystalRendererSaved; |
732 | 731 |
@@ -162,10 +162,10 @@ MoveWordLeft (bool bSelect) | ||
162 | 162 | if (m_ptCursorPos.x > 0) |
163 | 163 | { |
164 | 164 | const TCHAR *pszChars = GetLineChars(m_ptCursorPos.y); |
165 | - m_iterWord.setText(reinterpret_cast<const UChar *>(pszChars), GetLineLength(m_ptCursorPos.y)); | |
166 | - int nPos = m_iterWord.preceding(m_ptCursorPos.x); | |
165 | + auto pIterWord = ICUBreakIterator::getWordBreakIterator(reinterpret_cast<const UChar *>(pszChars), GetLineLength(m_ptCursorPos.y)); | |
166 | + int nPos = pIterWord->preceding(m_ptCursorPos.x); | |
167 | 167 | if (xisspace(pszChars[nPos])) |
168 | - nPos = m_iterWord.preceding(nPos); | |
168 | + nPos = pIterWord->preceding(nPos); | |
169 | 169 | m_ptCursorPos.x = nPos; |
170 | 170 | } |
171 | 171 |
@@ -195,8 +195,8 @@ MoveWordRight (bool bSelect) | ||
195 | 195 | } |
196 | 196 | |
197 | 197 | const TCHAR *pszChars = GetLineChars(m_ptCursorPos.y); |
198 | - m_iterWord.setText(reinterpret_cast<const UChar *>(pszChars), nLength); | |
199 | - int nPos = m_iterWord.following(m_ptCursorPos.x); | |
198 | + auto pIterWord = ICUBreakIterator::getWordBreakIterator(reinterpret_cast<const UChar *>(pszChars), nLength); | |
199 | + int nPos = pIterWord->following(m_ptCursorPos.x); | |
200 | 200 | while (nPos < nLength && xisspace(pszChars[nPos])) |
201 | 201 | ++nPos; |
202 | 202 | m_ptCursorPos.x = nPos; |
@@ -497,8 +497,8 @@ WordToRight (CPoint pt) | ||
497 | 497 | int nLength = GetLineLength (pt.y); |
498 | 498 | if (pt.x < nLength) |
499 | 499 | { |
500 | - m_iterWord.setText(reinterpret_cast<const UChar *>(GetLineChars(pt.y)), nLength); | |
501 | - pt.x = m_iterWord.following(pt.x); | |
500 | + auto pIterWord = ICUBreakIterator::getWordBreakIterator(reinterpret_cast<const UChar *>(GetLineChars(pt.y)), nLength); | |
501 | + pt.x = pIterWord->following(pt.x); | |
502 | 502 | } |
503 | 503 | ASSERT_VALIDTEXTPOS (pt); |
504 | 504 | return pt; |
@@ -510,9 +510,9 @@ WordToLeft (CPoint pt) | ||
510 | 510 | ASSERT_VALIDTEXTPOS (pt); |
511 | 511 | if (pt.x > 0) |
512 | 512 | { |
513 | - m_iterWord.setText(reinterpret_cast<const UChar *>(GetLineChars(pt.y)), GetLineLength(pt.y)); | |
514 | - pt.x = m_iterWord.following(pt.x); | |
515 | - pt.x = m_iterWord.preceding(pt.x); | |
513 | + auto pIterWord = ICUBreakIterator::getWordBreakIterator(reinterpret_cast<const UChar *>(GetLineChars(pt.y)), GetLineLength(pt.y)); | |
514 | + pt.x = pIterWord->following(pt.x); | |
515 | + pt.x = pIterWord->preceding(pt.x); | |
516 | 516 | } |
517 | 517 | ASSERT_VALIDTEXTPOS (pt); |
518 | 518 | return pt; |
@@ -4,10 +4,11 @@ | ||
4 | 4 | |
5 | 5 | static ICULoader m_ICULoader; |
6 | 6 | HMODULE ICULoader::m_hLibrary = nullptr; |
7 | -template <> thread_local std::unique_ptr<ICUBreakIterator> m_pCharaterBreakIterator<1>; | |
8 | -template <> thread_local std::unique_ptr<ICUBreakIterator> m_pCharaterBreakIterator<2>; | |
9 | -template <> thread_local std::unique_ptr<ICUBreakIterator> m_pCharaterBreakIterator<3>; | |
10 | -template <> thread_local std::unique_ptr<ICUBreakIterator> m_pCharaterBreakIterator<4>; | |
7 | +template <> thread_local std::unique_ptr<ICUBreakIterator> m_pCharacterBreakIterator<1>; | |
8 | +template <> thread_local std::unique_ptr<ICUBreakIterator> m_pCharacterBreakIterator<2>; | |
9 | +template <> thread_local std::unique_ptr<ICUBreakIterator> m_pCharacterBreakIterator<3>; | |
10 | +template <> thread_local std::unique_ptr<ICUBreakIterator> m_pCharacterBreakIterator<4>; | |
11 | +thread_local std::unique_ptr<ICUBreakIterator> m_pWordBreakIterator; | |
11 | 12 | |
12 | 13 | // This rule set is based on character-break iterator rules of ICU 63.1 |
13 | 14 | // <https://github.com/unicode-org/icu/blob/release-63-1/icu4c/source/data/brkitr/rules/char.txt>. |
@@ -41,3 +42,83 @@ u"$Extended_Pict $Extend* $ZWJ $Extended_Pict;" | ||
41 | 42 | u"^$Prepend* $Regional_Indicator $Regional_Indicator / $Regional_Indicator;" |
42 | 43 | u"^$Prepend* $Regional_Indicator $Regional_Indicator;" |
43 | 44 | u".;"; |
45 | + | |
46 | +const UChar* ICUBreakIterator::kCustomWordBreakRules = | |
47 | +u"!!chain;" | |
48 | +u"!!quoted_literals_only;" | |
49 | +u"$CR = [\\p{Word_Break = CR}];" | |
50 | +u"$LF = [\\p{Word_Break = LF}];" | |
51 | +u"$Newline = [\\p{Word_Break = Newline} ];" | |
52 | +u"$Extend = [\\p{Word_Break = Extend}];" | |
53 | +u"$ZWJ = [\\p{Word_Break = ZWJ}];" | |
54 | +u"$Regional_Indicator = [\\p{Word_Break = Regional_Indicator}];" | |
55 | +u"$Format = [\\p{Word_Break = Format}];" | |
56 | +u"$Katakana = [\\p{Word_Break = Katakana}];" | |
57 | +u"$Hebrew_Letter = [\\p{Word_Break = Hebrew_Letter}];" | |
58 | +u"$ALetter = [\\p{Word_Break = ALetter}];" | |
59 | +u"$Single_Quote = [\\p{Word_Break = Single_Quote}];" | |
60 | +u"$Double_Quote = [\\p{Word_Break = Double_Quote}];" | |
61 | +u"$MidNumLet = [\\p{Word_Break = MidNumLet} - [.]];" | |
62 | +u"$MidLetter = [\\p{Word_Break = MidLetter} - [\\:]];" | |
63 | +u"$MidNum = [\\p{Word_Break = MidNum} [.]];" | |
64 | +u"$Numeric = [\\p{Word_Break = Numeric}];" | |
65 | +u"$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];" | |
66 | +u"$WSegSpace = [\\p{Word_Break = WSegSpace}];" | |
67 | +u"$Extended_Pict = [:ExtPict:];" | |
68 | +u"$Han = [:Han:];" | |
69 | +u"$Hiragana = [:Hiragana:];" | |
70 | +u"$Control = [\\p{Grapheme_Cluster_Break = Control}];" | |
71 | +u"$HangulSyllable = [\\uac00-\\ud7a3];" | |
72 | +u"$ComplexContext = [:LineBreak = Complex_Context:];" | |
73 | +u"$KanaKanji = [$Han $Hiragana $Katakana];" | |
74 | +u"$dictionaryCJK = [$KanaKanji $HangulSyllable];" | |
75 | +u"$dictionary = [$ComplexContext $dictionaryCJK];" | |
76 | +u"$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];" | |
77 | +u"$KatakanaEx = $Katakana ($Extend | $Format | $ZWJ)*;" | |
78 | +u"$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format | $ZWJ)*;" | |
79 | +u"$ALetterEx = $ALetterPlus ($Extend | $Format | $ZWJ)*;" | |
80 | +u"$Single_QuoteEx = $Single_Quote ($Extend | $Format | $ZWJ)*;" | |
81 | +u"$Double_QuoteEx = $Double_Quote ($Extend | $Format | $ZWJ)*;" | |
82 | +u"$MidNumLetEx = $MidNumLet ($Extend | $Format | $ZWJ)*;" | |
83 | +u"$MidLetterEx = $MidLetter ($Extend | $Format | $ZWJ)*;" | |
84 | +u"$MidNumEx = $MidNum ($Extend | $Format | $ZWJ)*;" | |
85 | +u"$NumericEx = $Numeric ($Extend | $Format | $ZWJ)*;" | |
86 | +u"$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format | $ZWJ)*;" | |
87 | +u"$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format | $ZWJ)*;" | |
88 | +u"$Ideographic = [\\p{Ideographic}];" | |
89 | +u"$HiraganaEx = $Hiragana ($Extend | $Format | $ZWJ)*;" | |
90 | +u"$IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*;" | |
91 | +u"$CR $LF;" | |
92 | +u"$ZWJ $Extended_Pict;" | |
93 | +u"$WSegSpace $WSegSpace;" | |
94 | +u"[^$CR $LF $Newline]? ($Extend | $Format | $ZWJ)+;" | |
95 | +u"$NumericEx {100};" | |
96 | +u"$ALetterEx {200};" | |
97 | +u"$HangulSyllable {200};" | |
98 | +u"$Hebrew_LetterEx{200};" | |
99 | +u"$KatakanaEx {400};" | |
100 | +u"$HiraganaEx {400};" | |
101 | +u"$IdeographicEx {400};" | |
102 | +u"$Extended_Pict ($Extend | $Format | $ZWJ)*;" | |
103 | +u"($ALetterEx | $Hebrew_LetterEx) ($ALetterEx | $Hebrew_LetterEx) {200};" | |
104 | +u"($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200};" | |
105 | +u"$Hebrew_LetterEx $Single_QuoteEx {200};" | |
106 | +u"$Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200};" | |
107 | +u"$NumericEx $NumericEx {100};" | |
108 | +u"($ALetterEx | $Hebrew_LetterEx) $NumericEx {200};" | |
109 | +u"$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};" | |
110 | +u"$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};" | |
111 | +u"$KatakanaEx $KatakanaEx {400};" | |
112 | +u"$ALetterEx $ExtendNumLetEx {200};" | |
113 | +u"$Hebrew_LetterEx $ExtendNumLetEx {200};" | |
114 | +u"$NumericEx $ExtendNumLetEx {100};" | |
115 | +u"$KatakanaEx $ExtendNumLetEx {400};" | |
116 | +u"$ExtendNumLetEx $ExtendNumLetEx {200};" | |
117 | +u"$ExtendNumLetEx $ALetterEx {200};" | |
118 | +u"$ExtendNumLetEx $Hebrew_Letter {200};" | |
119 | +u"$ExtendNumLetEx $NumericEx {100};" | |
120 | +u"$ExtendNumLetEx $KatakanaEx {400};" | |
121 | +u"^$Regional_IndicatorEx $Regional_IndicatorEx;" | |
122 | +u"$HangulSyllable $HangulSyllable {200};" | |
123 | +u"$KanaKanji $KanaKanji {400};" | |
124 | +u".;"; |
@@ -32,7 +32,8 @@ typedef struct UParseError { | ||
32 | 32 | class ICUBreakIterator; |
33 | 33 | |
34 | 34 | template<int N> |
35 | -extern thread_local std::unique_ptr<ICUBreakIterator> m_pCharaterBreakIterator; | |
35 | +extern thread_local std::unique_ptr<ICUBreakIterator> m_pCharacterBreakIterator; | |
36 | +extern thread_local std::unique_ptr<ICUBreakIterator> m_pWordBreakIterator; | |
36 | 37 | |
37 | 38 | typedef UBreakIterator* (*ubrk_open_type)(UBreakIteratorType type, const char* locale, const UChar* text, int32_t textLength, UErrorCode* status); |
38 | 39 | ICU_EXTERN UBreakIterator* (*g_pubrk_open)(UBreakIteratorType type, const char* locale, const UChar* text, int32_t textLength, UErrorCode* status); |
@@ -140,6 +141,11 @@ public: | ||
140 | 141 | UParseError parseError; |
141 | 142 | m_iter = ubrk_openRules(kCustomRules, static_cast<int32_t>(wcslen(reinterpret_cast<const wchar_t *>(kCustomRules))), text, textLength, &parseError, &status); |
142 | 143 | } |
144 | + else if (type == UBRK_WORD) | |
145 | + { | |
146 | + UParseError parseError; | |
147 | + m_iter = ubrk_openRules(kCustomWordBreakRules, static_cast<int32_t>(wcslen(reinterpret_cast<const wchar_t *>(kCustomWordBreakRules))), text, textLength, &parseError, &status); | |
148 | + } | |
143 | 149 | else |
144 | 150 | { |
145 | 151 | m_iter = ubrk_open(type, locale, reinterpret_cast<const UChar *>(text), textLength, &status); |
@@ -231,11 +237,20 @@ public: | ||
231 | 237 | template<int N> |
232 | 238 | static ICUBreakIterator *getCharacterBreakIterator(const UChar * text, int32_t textLength) |
233 | 239 | { |
234 | - if (!m_pCharaterBreakIterator<N>) | |
235 | - m_pCharaterBreakIterator<N>.reset(new ICUBreakIterator(UBRK_CHARACTER, "en", text, textLength)); | |
240 | + if (!m_pCharacterBreakIterator<N>) | |
241 | + m_pCharacterBreakIterator<N>.reset(new ICUBreakIterator(UBRK_CHARACTER, "en", text, textLength)); | |
242 | + else | |
243 | + m_pCharacterBreakIterator<N>->setText(text, textLength); | |
244 | + return m_pCharacterBreakIterator<N>.get(); | |
245 | + } | |
246 | + | |
247 | + static ICUBreakIterator *getWordBreakIterator(const UChar * text, int32_t textLength) | |
248 | + { | |
249 | + if (!m_pWordBreakIterator) | |
250 | + m_pWordBreakIterator.reset(new ICUBreakIterator(UBRK_WORD, "en", text, textLength)); | |
236 | 251 | else |
237 | - m_pCharaterBreakIterator<N>->setText(text, textLength); | |
238 | - return m_pCharaterBreakIterator<N>.get(); | |
252 | + m_pWordBreakIterator->setText(text, textLength); | |
253 | + return m_pWordBreakIterator.get(); | |
239 | 254 | } |
240 | 255 | |
241 | 256 | private: |
@@ -328,5 +343,6 @@ private: | ||
328 | 343 | int m_i; |
329 | 344 | int m_textLength; |
330 | 345 | static const UChar *kCustomRules; |
346 | + static const UChar *kCustomWordBreakRules; | |
331 | 347 | }; |
332 | 348 |