development
Revision | b68c6b1cbec0aa6556a489e49949da6d8790beaf (tree) |
---|---|
Zeit | 2010-12-11 02:07:56 |
Autor | Ken Wakasa <kwakasa@goog...> |
Committer | Ken Wakasa |
Move tools/makedict from platform/development to platform/packages/inputmethods/LatinIME
The corresponding change is I01ef7084
Change-Id: I559207ab75feffe5ef4678c4a85f178a024448c5
@@ -1,24 +0,0 @@ | ||
1 | -# | |
2 | -# Copyright (C) 2009 The Android Open Source Project | |
3 | -# | |
4 | -# Licensed under the Apache License, Version 2.0 (the "License"); | |
5 | -# you may not use this file except in compliance with the License. | |
6 | -# You may obtain a copy of the License at | |
7 | -# | |
8 | -# http://www.apache.org/licenses/LICENSE-2.0 | |
9 | -# | |
10 | -# Unless required by applicable law or agreed to in writing, software | |
11 | -# distributed under the License is distributed on an "AS IS" BASIS, | |
12 | -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 | -# See the License for the specific language governing permissions and | |
14 | -# limitations under the License. | |
15 | -# | |
16 | -LOCAL_PATH := $(call my-dir) | |
17 | -include $(CLEAR_VARS) | |
18 | - | |
19 | -LOCAL_SRC_FILES := $(call all-java-files-under,src) | |
20 | -LOCAL_JAR_MANIFEST := etc/manifest.txt | |
21 | -LOCAL_MODULE := makedict | |
22 | - | |
23 | -include $(BUILD_HOST_JAVA_LIBRARY) | |
24 | -include $(LOCAL_PATH)/etc/Android.mk |
@@ -1,20 +0,0 @@ | ||
1 | -# Copyright (C) 2009 The Android Open Source Project | |
2 | -# | |
3 | -# Licensed under the Apache License, Version 2.0 (the "License"); | |
4 | -# you may not use this file except in compliance with the License. | |
5 | -# You may obtain a copy of the License at | |
6 | -# | |
7 | -# http://www.apache.org/licenses/LICENSE-2.0 | |
8 | -# | |
9 | -# Unless required by applicable law or agreed to in writing, software | |
10 | -# distributed under the License is distributed on an "AS IS" BASIS, | |
11 | -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
12 | -# See the License for the specific language governing permissions and | |
13 | -# limitations under the License. | |
14 | - | |
15 | -LOCAL_PATH := $(call my-dir) | |
16 | -include $(CLEAR_VARS) | |
17 | - | |
18 | -LOCAL_PREBUILT_EXECUTABLES := makedict | |
19 | -include $(BUILD_HOST_PREBUILT) | |
20 | - |
@@ -1,63 +0,0 @@ | ||
1 | -#!/bin/sh | |
2 | -# Copyright 2009, The Android Open Source Project | |
3 | -# | |
4 | -# Licensed under the Apache License, Version 2.0 (the "License"); | |
5 | -# you may not use this file except in compliance with the License. | |
6 | -# You may obtain a copy of the License at | |
7 | -# | |
8 | -# http://www.apache.org/licenses/LICENSE-2.0 | |
9 | -# | |
10 | -# Unless required by applicable law or agreed to in writing, software | |
11 | -# distributed under the License is distributed on an "AS IS" BASIS, | |
12 | -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 | -# See the License for the specific language governing permissions and | |
14 | -# limitations under the License. | |
15 | - | |
16 | -# Set up prog to be the path of this script, including following symlinks, | |
17 | -# and set up progdir to be the fully-qualified pathname of its directory. | |
18 | -prog="$0" | |
19 | -while [ -h "${prog}" ]; do | |
20 | - newProg=`/bin/ls -ld "${prog}"` | |
21 | - newProg=`expr "${newProg}" : ".* -> \(.*\)$"` | |
22 | - if expr "x${newProg}" : 'x/' >/dev/null; then | |
23 | - prog="${newProg}" | |
24 | - else | |
25 | - progdir=`dirname "${prog}"` | |
26 | - prog="${progdir}/${newProg}" | |
27 | - fi | |
28 | -done | |
29 | -oldwd=`pwd` | |
30 | -progdir=`dirname "${prog}"` | |
31 | -cd "${progdir}" | |
32 | -progdir=`pwd` | |
33 | -prog="${progdir}"/`basename "${prog}"` | |
34 | -cd "${oldwd}" | |
35 | - | |
36 | -jarfile=makedict.jar | |
37 | -frameworkdir="$progdir" | |
38 | -if [ ! -r "$frameworkdir/$jarfile" ] | |
39 | -then | |
40 | - frameworkdir=`dirname "$progdir"`/tools/lib | |
41 | - libdir=`dirname "$progdir"`/tools/lib | |
42 | -fi | |
43 | -if [ ! -r "$frameworkdir/$jarfile" ] | |
44 | -then | |
45 | - frameworkdir=`dirname "$progdir"`/framework | |
46 | - libdir=`dirname "$progdir"`/lib | |
47 | -fi | |
48 | -if [ ! -r "$frameworkdir/$jarfile" ] | |
49 | -then | |
50 | - echo `basename "$prog"`": can't find $jarfile" | |
51 | - exit 1 | |
52 | -fi | |
53 | - | |
54 | -if [ "$OSTYPE" = "cygwin" ] ; then | |
55 | - jarpath=`cygpath -w "$frameworkdir/$jarfile"` | |
56 | - progdir=`cygpath -w "$progdir"` | |
57 | -else | |
58 | - jarpath="$frameworkdir/$jarfile" | |
59 | -fi | |
60 | - | |
61 | -# need to use "java.ext.dirs" because "-jar" causes classpath to be ignored | |
62 | -# might need more memory, e.g. -Xmx128M | |
63 | -exec java -Djava.ext.dirs="$frameworkdir" -jar "$jarpath" "$@" |
@@ -1 +0,0 @@ | ||
1 | -Main-Class: com.android.tools.dict.MakeBinaryDictionary |
@@ -1,286 +0,0 @@ | ||
1 | -/* | |
2 | - * Copyright (C) 2010 The Android Open Source Project | |
3 | - * | |
4 | - * Licensed under the Apache License, Version 2.0 (the "License"); | |
5 | - * you may not use this file except in compliance with the License. | |
6 | - * You may obtain a copy of the License at | |
7 | - * | |
8 | - * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | - * | |
10 | - * Unless required by applicable law or agreed to in writing, software | |
11 | - * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 | - * See the License for the specific language governing permissions and | |
14 | - * limitations under the License. | |
15 | - */ | |
16 | - | |
17 | -package com.android.tools.dict; | |
18 | - | |
19 | -import org.xml.sax.Attributes; | |
20 | -import org.xml.sax.helpers.DefaultHandler; | |
21 | - | |
22 | -import java.io.File; | |
23 | -import java.util.ArrayList; | |
24 | -import java.util.HashMap; | |
25 | -import java.util.Map; | |
26 | -import java.util.Set; | |
27 | - | |
28 | -import javax.xml.parsers.SAXParser; | |
29 | -import javax.xml.parsers.SAXParserFactory; | |
30 | - | |
31 | -/** | |
32 | - * Helper for MakeBinaryDictionary | |
33 | - * Deals with all the bigram data | |
34 | - */ | |
35 | -public class BigramDictionary { | |
36 | - | |
37 | - /* | |
38 | - * Must match the values in the client side which is located in dictionary.cpp & dictionary.h | |
39 | - * Changing these values will generate totally different structure which must be also reflected | |
40 | - * on the client side. | |
41 | - */ | |
42 | - public static final int FLAG_BIGRAM_READ = 0x80; | |
43 | - public static final int FLAG_BIGRAM_CHILDEXIST = 0x40; | |
44 | - public static final int FLAG_BIGRAM_CONTINUED = 0x80; | |
45 | - public static final int FLAG_BIGRAM_FREQ = 0x7F; | |
46 | - | |
47 | - public static final int FOR_REVERSE_LOOKUPALL = -99; | |
48 | - | |
49 | - public ArrayList<String> mBigramToFill = new ArrayList<String>(); | |
50 | - public ArrayList<Integer> mBigramToFillAddress = new ArrayList<Integer>(); | |
51 | - | |
52 | - public HashMap<String, Bigram> mBi; | |
53 | - | |
54 | - public boolean mHasBigram; | |
55 | - | |
56 | - public BigramDictionary(String bigramSrcFilename, boolean hasBigram) { | |
57 | - mHasBigram = hasBigram; | |
58 | - loadBigram(bigramSrcFilename); | |
59 | - } | |
60 | - | |
61 | - private void loadBigram(String filename) { | |
62 | - mBi = new HashMap<String, Bigram>(); | |
63 | - if (!mHasBigram) { | |
64 | - System.out.println("Number of bigrams = " + Bigram.sBigramNum); | |
65 | - return; | |
66 | - } | |
67 | - try { | |
68 | - SAXParser parser = SAXParserFactory.newInstance().newSAXParser(); | |
69 | - parser.parse(new File(filename), new DefaultHandler() { | |
70 | - String w1 = null; | |
71 | - boolean inWord1 = false; | |
72 | - boolean inWord2 = false; | |
73 | - int freq = 0, counter = 0; | |
74 | - Bigram tempBigram = null; | |
75 | - | |
76 | - @Override | |
77 | - public void startElement(String uri, String localName, | |
78 | - String qName, Attributes attributes) { | |
79 | - if (qName.equals("bi")) { | |
80 | - inWord1 = true; | |
81 | - w1 = attributes.getValue(0); | |
82 | - int count = Integer.parseInt(attributes.getValue(1)); | |
83 | - tempBigram = new Bigram(count); | |
84 | - counter = 0; | |
85 | - } else if (qName.equals("w")) { | |
86 | - inWord2 = true; | |
87 | - String word2 = attributes.getValue(0); | |
88 | - int freq = Integer.parseInt(attributes.getValue(1)); | |
89 | - tempBigram.setWord2(counter, word2, freq); | |
90 | - counter++; | |
91 | - Bigram.sBigramNum++; | |
92 | - } | |
93 | - } | |
94 | - | |
95 | - @Override | |
96 | - public void endElement(String uri, String localName, | |
97 | - String qName) { | |
98 | - if (inWord2) { | |
99 | - inWord2 = false; | |
100 | - } else if (inWord1) { | |
101 | - inWord1 = false; | |
102 | - mBi.put(w1, tempBigram); | |
103 | - } | |
104 | - } | |
105 | - }); | |
106 | - } catch (Exception ioe) { | |
107 | - System.err.println("Exception in parsing bigram\n" + ioe); | |
108 | - ioe.printStackTrace(); | |
109 | - } | |
110 | - System.out.println("Number of bigrams = " + Bigram.sBigramNum); | |
111 | - } | |
112 | - | |
113 | - byte[] writeBigrams(byte[] dict, Map<String, Integer> mDictionary) { | |
114 | - for (int i = 0; i < mBigramToFill.size(); i++) { | |
115 | - String w1 = mBigramToFill.get(i); | |
116 | - int address = mBigramToFillAddress.get(i); | |
117 | - | |
118 | - Bigram temp = mBi.get(w1); | |
119 | - int word2Count = temp.count; | |
120 | - int j4; | |
121 | - for (int j = 0; j < word2Count; j++) { | |
122 | - if (!mDictionary.containsKey(temp.word2[j])) { | |
123 | - System.out.println("Not in dictionary: " + temp.word2[j]); | |
124 | - System.exit(0); | |
125 | - } else { | |
126 | - j4 = (j * 4); | |
127 | - int addressOfWord2 = mDictionary.get(temp.word2[j]); | |
128 | - dict[address + j4 + 0] = (byte) (((addressOfWord2 & 0x3F0000) >> 16) | |
129 | - | FLAG_BIGRAM_READ); | |
130 | - dict[address + j4 + 1] = (byte) ((addressOfWord2 & 0x00FF00) >> 8); | |
131 | - dict[address + j4 + 2] = (byte) ((addressOfWord2 & 0x0000FF)); | |
132 | - | |
133 | - if (j == (word2Count - 1)) { | |
134 | - dict[address + j4 + 3] = (byte) (temp.freq[j] & FLAG_BIGRAM_FREQ); | |
135 | - } else { | |
136 | - dict[address + j4 + 3] = (byte) ((temp.freq[j] & FLAG_BIGRAM_FREQ) | |
137 | - | FLAG_BIGRAM_CONTINUED); | |
138 | - } | |
139 | - } | |
140 | - } | |
141 | - } | |
142 | - | |
143 | - return dict; | |
144 | - } | |
145 | - | |
146 | - void reverseLookupAll(Map<String, Integer> mDictionary, byte[] dict) { | |
147 | - Set<String> st = mDictionary.keySet(); | |
148 | - for (String s : st) { | |
149 | - searchForTerminalNode(mDictionary.get(s), FOR_REVERSE_LOOKUPALL, dict); | |
150 | - } | |
151 | - } | |
152 | - | |
153 | - void searchForTerminalNode(int bigramAddress, int frequency, byte[] dict) { | |
154 | - StringBuilder sb = new StringBuilder(48); | |
155 | - int pos; | |
156 | - boolean found = false; | |
157 | - int followDownBranchAddress = 2; | |
158 | - char followingChar = ' '; | |
159 | - int depth = 0; | |
160 | - int totalLoopCount = 0; | |
161 | - | |
162 | - while (!found) { | |
163 | - boolean followDownAddressSearchStop = false; | |
164 | - boolean firstAddress = true; | |
165 | - boolean haveToSearchAll = true; | |
166 | - | |
167 | - if (depth > 0) { | |
168 | - sb.append(followingChar); | |
169 | - } | |
170 | - pos = followDownBranchAddress; // pos start at count | |
171 | - int count = dict[pos] & 0xFF; | |
172 | - pos++; | |
173 | - for (int i = 0; i < count; i++) { | |
174 | - totalLoopCount++; | |
175 | - // pos at data | |
176 | - pos++; | |
177 | - // pos now at flag | |
178 | - if (!MakeBinaryDictionary.getFirstBitOfByte(pos, dict)) { // non-terminal | |
179 | - if (!followDownAddressSearchStop) { | |
180 | - int addr = MakeBinaryDictionary.get22BitAddress(pos, dict); | |
181 | - if (addr > bigramAddress) { | |
182 | - followDownAddressSearchStop = true; | |
183 | - if (firstAddress) { | |
184 | - firstAddress = false; | |
185 | - haveToSearchAll = true; | |
186 | - } else if (!haveToSearchAll) { | |
187 | - break; | |
188 | - } | |
189 | - } else { | |
190 | - followDownBranchAddress = addr; | |
191 | - followingChar = (char) (0xFF & dict[pos-1]); | |
192 | - if(firstAddress) { | |
193 | - firstAddress = false; | |
194 | - haveToSearchAll = false; | |
195 | - } | |
196 | - } | |
197 | - } | |
198 | - pos += 3; | |
199 | - } else if (MakeBinaryDictionary.getFirstBitOfByte(pos, dict)) { // terminal | |
200 | - // found !! | |
201 | - if (bigramAddress == (pos-1)) { | |
202 | - sb.append((char) (0xFF & dict[pos-1])); | |
203 | - found = true; | |
204 | - break; | |
205 | - } | |
206 | - | |
207 | - // address + freq (4 byte) | |
208 | - if (MakeBinaryDictionary.getSecondBitOfByte(pos, dict)) { | |
209 | - if (!followDownAddressSearchStop) { | |
210 | - int addr = MakeBinaryDictionary.get22BitAddress(pos, dict); | |
211 | - if (addr > bigramAddress) { | |
212 | - followDownAddressSearchStop = true; | |
213 | - if (firstAddress) { | |
214 | - firstAddress = false; | |
215 | - haveToSearchAll = true; | |
216 | - } else if (!haveToSearchAll) { | |
217 | - break; | |
218 | - } | |
219 | - } else { | |
220 | - followDownBranchAddress = addr; | |
221 | - followingChar = (char) (0xFF & dict[pos-1]); | |
222 | - if(firstAddress) { | |
223 | - firstAddress = false; | |
224 | - haveToSearchAll = true; | |
225 | - } | |
226 | - } | |
227 | - } | |
228 | - pos += 4; | |
229 | - } else { // freq only (2 byte) | |
230 | - pos += 2; | |
231 | - } | |
232 | - // skipping bigram | |
233 | - int bigramExist = (dict[pos] & FLAG_BIGRAM_READ); | |
234 | - if (bigramExist > 0) { | |
235 | - int nextBigramExist = 1; | |
236 | - while (nextBigramExist > 0) { | |
237 | - pos += 3; | |
238 | - nextBigramExist = (dict[pos++] & FLAG_BIGRAM_CONTINUED); | |
239 | - } | |
240 | - } else { | |
241 | - pos++; | |
242 | - } | |
243 | - } | |
244 | - } | |
245 | - depth++; | |
246 | - if (followDownBranchAddress == 2) { | |
247 | - System.out.println("ERROR!!! Cannot find bigram!!"); | |
248 | - System.exit(0); | |
249 | - } | |
250 | - } | |
251 | - | |
252 | - if (frequency == FOR_REVERSE_LOOKUPALL) { | |
253 | - System.out.println("Reverse: " + sb.toString() + " (" + bigramAddress + ")" | |
254 | - + " Loop: " + totalLoopCount); | |
255 | - } else { | |
256 | - System.out.println(" bigram: " + sb.toString() + " (" + bigramAddress + ") freq: " | |
257 | - + frequency + " Loop: " + totalLoopCount); | |
258 | - } | |
259 | - } | |
260 | - | |
261 | - static class Bigram { | |
262 | - String[] word2; | |
263 | - int[] freq; | |
264 | - int count; | |
265 | - static int sBigramNum = 0; | |
266 | - | |
267 | - String getSecondWord(int i) { | |
268 | - return word2[i]; | |
269 | - } | |
270 | - | |
271 | - int getFrequency(int i) { | |
272 | - return (freq[i] == 0) ? 1 : freq[i]; | |
273 | - } | |
274 | - | |
275 | - void setWord2(int index, String word2, int freq) { | |
276 | - this.word2[index] = word2; | |
277 | - this.freq[index] = freq; | |
278 | - } | |
279 | - | |
280 | - public Bigram(int word2Count) { | |
281 | - count = word2Count; | |
282 | - word2 = new String[word2Count]; | |
283 | - freq = new int[word2Count]; | |
284 | - } | |
285 | - } | |
286 | -} |
@@ -1,443 +0,0 @@ | ||
1 | -/* | |
2 | - * Copyright (C) 2009 The Android Open Source Project | |
3 | - * | |
4 | - * Licensed under the Apache License, Version 2.0 (the "License"); | |
5 | - * you may not use this file except in compliance with the License. | |
6 | - * You may obtain a copy of the License at | |
7 | - * | |
8 | - * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | - * | |
10 | - * Unless required by applicable law or agreed to in writing, software | |
11 | - * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 | - * See the License for the specific language governing permissions and | |
14 | - * limitations under the License. | |
15 | - */ | |
16 | - | |
17 | -package com.android.tools.dict; | |
18 | - | |
19 | -import org.xml.sax.Attributes; | |
20 | -import org.xml.sax.helpers.DefaultHandler; | |
21 | - | |
22 | -import java.io.File; | |
23 | -import java.io.FileOutputStream; | |
24 | -import java.io.IOException; | |
25 | -import java.util.ArrayList; | |
26 | -import java.util.Arrays; | |
27 | -import java.util.HashMap; | |
28 | -import java.util.List; | |
29 | -import java.util.Map; | |
30 | - | |
31 | -import javax.xml.parsers.SAXParser; | |
32 | -import javax.xml.parsers.SAXParserFactory; | |
33 | - | |
34 | -/** | |
35 | - * Compresses a list of words, frequencies, and bigram data | |
36 | - * into a tree structured binary dictionary. | |
37 | - * Dictionary Version: 200 (may contain bigrams) | |
38 | - * Version number started from 200 rather than 1 because we wanted to prevent number of roots in | |
39 | - * any old dictionaries being mistaken as the version number. There is not a chance that there | |
40 | - * will be more than 200 roots. Version number should be increased when there is structural change | |
41 | - * in the data. There is no need to increase the version when only the words in the data changes. | |
42 | - */ | |
43 | -public class MakeBinaryDictionary { | |
44 | - | |
45 | - private static final int VERSION_NUM = 200; | |
46 | - | |
47 | - public static final int ALPHA_SIZE = 256; | |
48 | - | |
49 | - public static final String TAG_WORD = "w"; | |
50 | - public static final String ATTR_FREQ = "f"; | |
51 | - | |
52 | - private static final int FLAG_ADDRESS_MASK = 0x400000; | |
53 | - private static final int FLAG_TERMINAL_MASK = 0x800000; | |
54 | - private static final int ADDRESS_MASK = 0x3FFFFF; | |
55 | - | |
56 | - /** | |
57 | - * Unit for this variable is in bytes | |
58 | - * If destination file name is main.dict and file limit causes dictionary to be separated into | |
59 | - * multiple file, it will generate main0.dict, main1.dict, and so forth. | |
60 | - */ | |
61 | - private static int sOutputFileSize; | |
62 | - private static boolean sSplitOutput; | |
63 | - | |
64 | - public static final CharNode EMPTY_NODE = new CharNode(); | |
65 | - | |
66 | - List<CharNode> roots; | |
67 | - Map<String, Integer> mDictionary; | |
68 | - int mWordCount; | |
69 | - | |
70 | - BigramDictionary bigramDict; | |
71 | - | |
72 | - static class CharNode { | |
73 | - char data; | |
74 | - int freq; | |
75 | - boolean terminal; | |
76 | - List<CharNode> children; | |
77 | - static int sNodes; | |
78 | - | |
79 | - public CharNode() { | |
80 | - sNodes++; | |
81 | - } | |
82 | - } | |
83 | - | |
84 | - public static void usage() { | |
85 | - System.err.println("Usage: makedict -s <src_dict.xml> [-b <src_bigram.xml>] " | |
86 | - + "-d <dest.dict> [--size filesize]"); | |
87 | - System.exit(-1); | |
88 | - } | |
89 | - | |
90 | - public static void main(String[] args) { | |
91 | - int checkSource = -1; | |
92 | - int checkBigram = -1; | |
93 | - int checkDest = -1; | |
94 | - int checkFileSize = -1; | |
95 | - for (int i = 0; i < args.length; i+=2) { | |
96 | - if (args[i].equals("-s")) checkSource = (i + 1); | |
97 | - if (args[i].equals("-b")) checkBigram = (i + 1); | |
98 | - if (args[i].equals("-d")) checkDest = (i + 1); | |
99 | - if (args[i].equals("--size")) checkFileSize = (i + 1); | |
100 | - } | |
101 | - if (checkFileSize >= 0) { | |
102 | - sSplitOutput = true; | |
103 | - sOutputFileSize = Integer.parseInt(args[checkFileSize]); | |
104 | - } else { | |
105 | - sSplitOutput = false; | |
106 | - } | |
107 | - if (checkDest >= 0 && !args[checkDest].endsWith(".dict")) { | |
108 | - System.err.println("Error: Dictionary output file extension should be \".dict\""); | |
109 | - usage(); | |
110 | - } else if (checkSource >= 0 && checkBigram >= 0 && checkDest >= 0 && | |
111 | - ((!sSplitOutput && args.length == 6) || (sSplitOutput && args.length == 8))) { | |
112 | - new MakeBinaryDictionary(args[checkSource], args[checkBigram], args[checkDest]); | |
113 | - } else if (checkSource >= 0 && checkDest >= 0 && | |
114 | - ((!sSplitOutput && args.length == 4) || (sSplitOutput && args.length == 6))) { | |
115 | - new MakeBinaryDictionary(args[checkSource], null, args[checkDest]); | |
116 | - } else { | |
117 | - usage(); | |
118 | - } | |
119 | - } | |
120 | - | |
121 | - public MakeBinaryDictionary(String srcFilename, String bigramSrcFilename, String destFilename){ | |
122 | - System.out.println("Generating dictionary version " + VERSION_NUM); | |
123 | - bigramDict = new BigramDictionary(bigramSrcFilename, (bigramSrcFilename != null)); | |
124 | - populateDictionary(srcFilename); | |
125 | - writeToDict(destFilename); | |
126 | - | |
127 | - // Enable the code below to verify that the generated tree is traversable | |
128 | - // and bigram data is stored correctly. | |
129 | - if (false) { | |
130 | - bigramDict.reverseLookupAll(mDictionary, dict); | |
131 | - traverseDict(2, new char[32], 0); | |
132 | - } | |
133 | - } | |
134 | - | |
135 | - private void populateDictionary(String filename) { | |
136 | - roots = new ArrayList<CharNode>(); | |
137 | - mDictionary = new HashMap<String, Integer>(); | |
138 | - try { | |
139 | - SAXParser parser = SAXParserFactory.newInstance().newSAXParser(); | |
140 | - parser.parse(new File(filename), new DefaultHandler() { | |
141 | - boolean inWord; | |
142 | - int freq; | |
143 | - StringBuilder wordBuilder = new StringBuilder(48); | |
144 | - | |
145 | - @Override | |
146 | - public void startElement(String uri, String localName, | |
147 | - String qName, Attributes attributes) { | |
148 | - if (qName.equals("w")) { | |
149 | - inWord = true; | |
150 | - freq = Integer.parseInt(attributes.getValue(0)); | |
151 | - wordBuilder.setLength(0); | |
152 | - } | |
153 | - } | |
154 | - | |
155 | - @Override | |
156 | - public void characters(char[] data, int offset, int length) { | |
157 | - // Ignore other whitespace | |
158 | - if (!inWord) return; | |
159 | - wordBuilder.append(data, offset, length); | |
160 | - } | |
161 | - | |
162 | - @Override | |
163 | - public void endElement(String uri, String localName, | |
164 | - String qName) { | |
165 | - if (qName.equals("w")) { | |
166 | - if (wordBuilder.length() > 1) { | |
167 | - addWordTop(wordBuilder.toString(), freq); | |
168 | - mWordCount++; | |
169 | - } | |
170 | - inWord = false; | |
171 | - } | |
172 | - } | |
173 | - }); | |
174 | - } catch (Exception ioe) { | |
175 | - System.err.println("Exception in parsing\n" + ioe); | |
176 | - ioe.printStackTrace(); | |
177 | - } | |
178 | - System.out.println("Nodes = " + CharNode.sNodes); | |
179 | - } | |
180 | - | |
181 | - private int indexOf(List<CharNode> children, char c) { | |
182 | - if (children == null) { | |
183 | - return -1; | |
184 | - } | |
185 | - for (int i = 0; i < children.size(); i++) { | |
186 | - if (children.get(i).data == c) { | |
187 | - return i; | |
188 | - } | |
189 | - } | |
190 | - return -1; | |
191 | - } | |
192 | - | |
193 | - private void addWordTop(String word, int occur) { | |
194 | - if (occur > 255) occur = 255; | |
195 | - char firstChar = word.charAt(0); | |
196 | - int index = indexOf(roots, firstChar); | |
197 | - if (index == -1) { | |
198 | - CharNode newNode = new CharNode(); | |
199 | - newNode.data = firstChar; | |
200 | - newNode.freq = occur; | |
201 | - index = roots.size(); | |
202 | - roots.add(newNode); | |
203 | - } else { | |
204 | - roots.get(index).freq += occur; | |
205 | - } | |
206 | - if (word.length() > 1) { | |
207 | - addWordRec(roots.get(index), word, 1, occur); | |
208 | - } else { | |
209 | - roots.get(index).terminal = true; | |
210 | - } | |
211 | - } | |
212 | - | |
213 | - private void addWordRec(CharNode parent, String word, int charAt, int occur) { | |
214 | - CharNode child = null; | |
215 | - char data = word.charAt(charAt); | |
216 | - if (parent.children == null) { | |
217 | - parent.children = new ArrayList<CharNode>(); | |
218 | - } else { | |
219 | - for (int i = 0; i < parent.children.size(); i++) { | |
220 | - CharNode node = parent.children.get(i); | |
221 | - if (node.data == data) { | |
222 | - child = node; | |
223 | - break; | |
224 | - } | |
225 | - } | |
226 | - } | |
227 | - if (child == null) { | |
228 | - child = new CharNode(); | |
229 | - parent.children.add(child); | |
230 | - } | |
231 | - child.data = data; | |
232 | - if (child.freq == 0) child.freq = occur; | |
233 | - if (word.length() > charAt + 1) { | |
234 | - addWordRec(child, word, charAt + 1, occur); | |
235 | - } else { | |
236 | - child.terminal = true; | |
237 | - child.freq = occur; | |
238 | - } | |
239 | - } | |
240 | - | |
241 | - byte[] dict; | |
242 | - int dictSize; | |
243 | - static final int CHAR_WIDTH = 8; | |
244 | - static final int FLAGS_WIDTH = 1; // Terminal flag (word end) | |
245 | - static final int ADDR_WIDTH = 23; // Offset to children | |
246 | - static final int FREQ_WIDTH_BYTES = 1; | |
247 | - static final int COUNT_WIDTH_BYTES = 1; | |
248 | - | |
249 | - private void addCount(int count) { | |
250 | - dict[dictSize++] = (byte) (0xFF & count); | |
251 | - } | |
252 | - | |
253 | - private void addNode(CharNode node, String word1) { | |
254 | - if (node.terminal) { // store address of each word1 | |
255 | - mDictionary.put(word1, dictSize); | |
256 | - } | |
257 | - int charData = 0xFFFF & node.data; | |
258 | - if (charData > 254) { | |
259 | - dict[dictSize++] = (byte) 255; | |
260 | - dict[dictSize++] = (byte) ((node.data >> 8) & 0xFF); | |
261 | - dict[dictSize++] = (byte) (node.data & 0xFF); | |
262 | - } else { | |
263 | - dict[dictSize++] = (byte) (0xFF & node.data); | |
264 | - } | |
265 | - if (node.children != null) { | |
266 | - dictSize += 3; // Space for children address | |
267 | - } else { | |
268 | - dictSize += 1; // Space for just the terminal/address flags | |
269 | - } | |
270 | - if ((0xFFFFFF & node.freq) > 255) { | |
271 | - node.freq = 255; | |
272 | - } | |
273 | - if (node.terminal) { | |
274 | - byte freq = (byte) (0xFF & node.freq); | |
275 | - dict[dictSize++] = freq; | |
276 | - // bigram | |
277 | - if (bigramDict.mBi.containsKey(word1)) { | |
278 | - int count = bigramDict.mBi.get(word1).count; | |
279 | - bigramDict.mBigramToFill.add(word1); | |
280 | - bigramDict.mBigramToFillAddress.add(dictSize); | |
281 | - dictSize += (4 * count); | |
282 | - } else { | |
283 | - dict[dictSize++] = (byte) (0x00); | |
284 | - } | |
285 | - } | |
286 | - } | |
287 | - | |
288 | - int nullChildrenCount = 0; | |
289 | - int notTerminalCount = 0; | |
290 | - | |
291 | - private void updateNodeAddress(int nodeAddress, CharNode node, | |
292 | - int childrenAddress) { | |
293 | - if ((dict[nodeAddress] & 0xFF) == 0xFF) { // 3 byte character | |
294 | - nodeAddress += 2; | |
295 | - } | |
296 | - childrenAddress = ADDRESS_MASK & childrenAddress; | |
297 | - if (childrenAddress == 0) { | |
298 | - nullChildrenCount++; | |
299 | - } else { | |
300 | - childrenAddress |= FLAG_ADDRESS_MASK; | |
301 | - } | |
302 | - if (node.terminal) { | |
303 | - childrenAddress |= FLAG_TERMINAL_MASK; | |
304 | - } else { | |
305 | - notTerminalCount++; | |
306 | - } | |
307 | - dict[nodeAddress + 1] = (byte) (childrenAddress >> 16); | |
308 | - if ((childrenAddress & FLAG_ADDRESS_MASK) != 0) { | |
309 | - dict[nodeAddress + 2] = (byte) ((childrenAddress & 0xFF00) >> 8); | |
310 | - dict[nodeAddress + 3] = (byte) ((childrenAddress & 0xFF)); | |
311 | - } | |
312 | - } | |
313 | - | |
314 | - void writeWordsRec(List<CharNode> children, StringBuilder word) { | |
315 | - if (children == null || children.size() == 0) { | |
316 | - return; | |
317 | - } | |
318 | - final int childCount = children.size(); | |
319 | - addCount(childCount); | |
320 | - int[] childrenAddresses = new int[childCount]; | |
321 | - for (int j = 0; j < childCount; j++) { | |
322 | - CharNode node = children.get(j); | |
323 | - childrenAddresses[j] = dictSize; | |
324 | - word.append(children.get(j).data); | |
325 | - addNode(node, word.toString()); | |
326 | - word.deleteCharAt(word.length()-1); | |
327 | - } | |
328 | - for (int j = 0; j < childCount; j++) { | |
329 | - CharNode node = children.get(j); | |
330 | - int nodeAddress = childrenAddresses[j]; | |
331 | - int cacheDictSize = dictSize; | |
332 | - word.append(children.get(j).data); | |
333 | - writeWordsRec(node.children, word); | |
334 | - word.deleteCharAt(word.length()-1); | |
335 | - updateNodeAddress(nodeAddress, node, node.children != null | |
336 | - ? cacheDictSize : 0); | |
337 | - } | |
338 | - } | |
339 | - | |
340 | - void writeToDict(String dictFilename) { | |
341 | - // 4MB max, 22-bit offsets | |
342 | - dict = new byte[4 * 1024 * 1024]; // 4MB upper limit. Actual is probably | |
343 | - // < 1MB in most cases, as there is a limit in the | |
344 | - // resource size in apks. | |
345 | - dictSize = 0; | |
346 | - | |
347 | - dict[dictSize++] = (byte) (0xFF & VERSION_NUM); // version info | |
348 | - dict[dictSize++] = (byte) (0xFF & (bigramDict.mHasBigram ? 1 : 0)); | |
349 | - | |
350 | - StringBuilder word = new StringBuilder(48); | |
351 | - writeWordsRec(roots, word); | |
352 | - dict = bigramDict.writeBigrams(dict, mDictionary); | |
353 | - System.out.println("Dict Size = " + dictSize); | |
354 | - if (!sSplitOutput) { | |
355 | - sOutputFileSize = dictSize; | |
356 | - } | |
357 | - try { | |
358 | - int currentLoc = 0; | |
359 | - int i = 0; | |
360 | - int extension = dictFilename.indexOf(".dict"); | |
361 | - String filename = dictFilename.substring(0, extension); | |
362 | - while (dictSize > 0) { | |
363 | - FileOutputStream fos; | |
364 | - if (sSplitOutput) { | |
365 | - fos = new FileOutputStream(filename + i + ".dict"); | |
366 | - } else { | |
367 | - fos = new FileOutputStream(filename + ".dict"); | |
368 | - } | |
369 | - if (dictSize > sOutputFileSize) { | |
370 | - fos.write(dict, currentLoc, sOutputFileSize); | |
371 | - dictSize -= sOutputFileSize; | |
372 | - currentLoc += sOutputFileSize; | |
373 | - } else { | |
374 | - fos.write(dict, currentLoc, dictSize); | |
375 | - dictSize = 0; | |
376 | - } | |
377 | - fos.close(); | |
378 | - i++; | |
379 | - } | |
380 | - } catch (IOException ioe) { | |
381 | - System.err.println("Error writing dict file:" + ioe); | |
382 | - } | |
383 | - } | |
384 | - | |
385 | - void traverseDict(int pos, char[] word, int depth) { | |
386 | - int count = dict[pos++] & 0xFF; | |
387 | - for (int i = 0; i < count; i++) { | |
388 | - char c = (char) (dict[pos++] & 0xFF); | |
389 | - if (c == 0xFF) { // two byte character | |
390 | - c = (char) (((dict[pos] & 0xFF) << 8) | (dict[pos+1] & 0xFF)); | |
391 | - pos += 2; | |
392 | - } | |
393 | - word[depth] = c; | |
394 | - boolean terminal = getFirstBitOfByte(pos, dict); | |
395 | - int address = 0; | |
396 | - if ((dict[pos] & (FLAG_ADDRESS_MASK >> 16)) > 0) { // address check | |
397 | - address = get22BitAddress(pos, dict); | |
398 | - pos += 3; | |
399 | - } else { | |
400 | - pos += 1; | |
401 | - } | |
402 | - if (terminal) { | |
403 | - showWord(word, depth + 1, dict[pos] & 0xFF); | |
404 | - pos++; | |
405 | - | |
406 | - int bigramExist = (dict[pos] & bigramDict.FLAG_BIGRAM_READ); | |
407 | - if (bigramExist > 0) { | |
408 | - int nextBigramExist = 1; | |
409 | - while (nextBigramExist > 0) { | |
410 | - int bigramAddress = get22BitAddress(pos, dict); | |
411 | - pos += 3; | |
412 | - int frequency = (bigramDict.FLAG_BIGRAM_FREQ & dict[pos]); | |
413 | - bigramDict.searchForTerminalNode(bigramAddress, frequency, dict); | |
414 | - nextBigramExist = (dict[pos++] & bigramDict.FLAG_BIGRAM_CONTINUED); | |
415 | - } | |
416 | - } else { | |
417 | - pos++; | |
418 | - } | |
419 | - } | |
420 | - if (address != 0) { | |
421 | - traverseDict(address, word, depth + 1); | |
422 | - } | |
423 | - } | |
424 | - } | |
425 | - | |
426 | - void showWord(char[] word, int size, int freq) { | |
427 | - System.out.print(new String(word, 0, size) + " " + freq + "\n"); | |
428 | - } | |
429 | - | |
430 | - static int get22BitAddress(int pos, byte[] dict) { | |
431 | - return ((dict[pos + 0] & 0x3F) << 16) | |
432 | - | ((dict[pos + 1] & 0xFF) << 8) | |
433 | - | ((dict[pos + 2] & 0xFF)); | |
434 | - } | |
435 | - | |
436 | - static boolean getFirstBitOfByte(int pos, byte[] dict) { | |
437 | - return (dict[pos] & 0x80) > 0; | |
438 | - } | |
439 | - | |
440 | - static boolean getSecondBitOfByte(int pos, byte[] dict) { | |
441 | - return (dict[pos] & 0x40) > 0; | |
442 | - } | |
443 | -} |