@@ -27,7 +27,7 @@ public CharacterLanguageModel(){
27
27
28
28
// counts maps: history -> { word: count, word : count, etc }
29
29
counts = new HashMap <>();
30
- order = 6 ;
30
+ order = 4 ;
31
31
}
32
32
33
33
@@ -173,6 +173,26 @@ public static List<String> string2list(String s){
173
173
return chars ;
174
174
}
175
175
176
+ // this test is for a large file at :
177
+ // /shared/corpora/ner/clm/wikiEntity_train.out
178
+ public static void test2 () throws Exception {
179
+ List <String > lines = LineIO .read ("/shared/corpora/ner/clm/wikiEntity_train.out" );
180
+ List <List <String >> seqs = new ArrayList <>();
181
+ for (String line : lines ){
182
+ String [] chars = line .trim ().split (" " );
183
+ ArrayList <String > seq = new ArrayList <String >(Arrays .asList (chars ));
184
+ seqs .add (seq );
185
+ }
186
+
187
+ CharacterLanguageModel clm = new CharacterLanguageModel ();
188
+ System .out .println (seqs .size ());
189
+ clm .train (seqs );
190
+
191
+
192
+ System .out .println (clm .perplexity (Arrays .asList ("H o e k s t e n b e r g e r" .split (" " ))));
193
+ System .out .println (clm .perplexity (Arrays .asList ("a b s t r a c t u a l l y" .split (" " ))));
194
+ }
195
+
176
196
public static void test () throws FileNotFoundException {
177
197
String dir = "/home/mayhew/data/pytorch-example/data/names/" ;
178
198
File names = new File (dir );
@@ -243,21 +263,21 @@ public static void test() throws FileNotFoundException {
243
263
244
264
/**
 * Command-line entry point.
 *
 * As shown in this change, the added ('+') lines make main call test2()
 * directly. The removed ('-') lines held the previous body: loading
 * parameters from "config/ner.properties", building train/test Data objects
 * and calling trainEntityNotEntity(trainData, testData). Those statements
 * are now present only as commented-out code.
 *
 * NOTE(review): this looks like a temporary debugging switch -- confirm that
 * the commented-out pipeline is meant to stay disabled.
 *
 * @param args command-line arguments; not read anywhere in the visible body
 * @throws Exception any exception raised by the calls in the body propagates
 */
public static void main (String [] args ) throws Exception {
245
265
// this trains models, and provides perplexities.
246
- //test ();
266
+ test2 ();
247
267
248
- ParametersForLbjCode params = Parameters .readConfigAndLoadExternalData ("config/ner.properties" , false );
268
+ // ParametersForLbjCode params = Parameters.readConfigAndLoadExternalData("config/ner.properties", false);
249
269
250
270
// String trainpath= "/shared/corpora/ner/conll2003/eng-files/Train-json/";
251
271
// String testpath = "/shared/corpora/ner/conll2003/eng-files/Test-json/";
252
272
253
- String trainpath = "/shared/corpora/ner/lorelei-swm-new/ben /Train/" ;
254
- String testpath = "/shared/corpora/ner/lorelei-swm-new/ben /Test/" ;
273
+ // String trainpath= "/shared/corpora/ner/lorelei-swm-new/ara /Train/";
274
+ // String testpath = "/shared/corpora/ner/lorelei-swm-new/ara /Test/";
255
275
256
276
257
- Data trainData = new Data (trainpath , trainpath , "-json" , new String [] {}, new String [] {}, params );
258
- Data testData = new Data (testpath , testpath , "-json" , new String [] {}, new String [] {}, params );
277
+ // Data trainData = new Data(trainpath, trainpath, "-json", new String[] {}, new String[] {}, params);
278
+ // Data testData = new Data(testpath, testpath, "-json", new String[] {}, new String[] {}, params);
259
279
260
- trainEntityNotEntity (trainData , testData );
280
+ // trainEntityNotEntity(trainData, testData);
261
281
}
262
282
263
283
0 commit comments