Skip to content

Commit a03db7b

Browse files
authored
Merge pull request #4 from CogComp/lm
Lm
2 parents 59a9d52 + b91b9ed commit a03db7b

File tree

1 file changed

+28
-8
lines changed

1 file changed

+28
-8
lines changed

ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/CharacterLanguageModel.java

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ public CharacterLanguageModel(){
2727

2828
// counts maps: history -> { word: count, word : count, etc }
2929
counts = new HashMap<>();
30-
order = 6;
30+
order = 4;
3131
}
3232

3333

@@ -173,6 +173,26 @@ public static List<String> string2list(String s){
173173
return chars;
174174
}
175175

176+
// This test uses a large file located at:
177+
// /shared/corpora/ner/clm/wikiEntity_train.out
178+
public static void test2() throws Exception {
179+
List<String> lines = LineIO.read("/shared/corpora/ner/clm/wikiEntity_train.out");
180+
List<List<String>> seqs = new ArrayList<>();
181+
for(String line : lines){
182+
String[] chars = line.trim().split(" ");
183+
ArrayList<String> seq = new ArrayList<String>(Arrays.asList(chars));
184+
seqs.add(seq);
185+
}
186+
187+
CharacterLanguageModel clm = new CharacterLanguageModel();
188+
System.out.println(seqs.size());
189+
clm.train(seqs);
190+
191+
192+
System.out.println(clm.perplexity(Arrays.asList("H o e k s t e n b e r g e r".split(" "))));
193+
System.out.println(clm.perplexity(Arrays.asList("a b s t r a c t u a l l y".split(" "))));
194+
}
195+
176196
public static void test() throws FileNotFoundException {
177197
String dir = "/home/mayhew/data/pytorch-example/data/names/";
178198
File names = new File(dir);
@@ -243,21 +263,21 @@ public static void test() throws FileNotFoundException {
243263

244264
public static void main(String[] args) throws Exception {
245265
// this trains models, and provides perplexities.
246-
//test();
266+
test2();
247267

248-
ParametersForLbjCode params = Parameters.readConfigAndLoadExternalData("config/ner.properties", false);
268+
//ParametersForLbjCode params = Parameters.readConfigAndLoadExternalData("config/ner.properties", false);
249269

250270
// String trainpath= "/shared/corpora/ner/conll2003/eng-files/Train-json/";
251271
// String testpath = "/shared/corpora/ner/conll2003/eng-files/Test-json/";
252272

253-
String trainpath= "/shared/corpora/ner/lorelei-swm-new/ben/Train/";
254-
String testpath = "/shared/corpora/ner/lorelei-swm-new/ben/Test/";
273+
//String trainpath= "/shared/corpora/ner/lorelei-swm-new/ara/Train/";
274+
//String testpath = "/shared/corpora/ner/lorelei-swm-new/ara/Test/";
255275

256276

257-
Data trainData = new Data(trainpath, trainpath, "-json", new String[] {}, new String[] {}, params);
258-
Data testData = new Data(testpath, testpath, "-json", new String[] {}, new String[] {}, params);
277+
//Data trainData = new Data(trainpath, trainpath, "-json", new String[] {}, new String[] {}, params);
278+
//Data testData = new Data(testpath, testpath, "-json", new String[] {}, new String[] {}, params);
259279

260-
trainEntityNotEntity(trainData, testData);
280+
//trainEntityNotEntity(trainData, testData);
261281
}
262282

263283

0 commit comments

Comments
 (0)