Skip to content

Commit

Permalink
fix #12: recognize end of sentence
Browse files Browse the repository at this point in the history
  • Loading branch information
Koji Sekiguchi committed Nov 18, 2014
1 parent 737c0b9 commit 5186af2
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 3 deletions.
13 changes: 12 additions & 1 deletion src/java/com/rondhuit/w2v/Corpus.java
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,20 @@ public void rewind(int numThreads, int id){
eoc = false;
}

/**
 * Reads the next word via {@code nextWord()} and translates it into a result code
 * or a vocabulary index.
 *
 * <p>A {@code null} word is a boundary marker; the {@code eoc} flag is consulted
 * to tell the end of the whole corpus apart from the end of a sentence.
 *
 * @return -3 if end of sentence, -2 if end of corpus, otherwise the value of
 *         {@code searchVocab(word)} (-1 if word not found, or the index value of the word)
 * @throws IOException if reading the next word fails
 */
public int readWordIndex() throws IOException {
  String word = nextWord();
  if (word != null) {
    return searchVocab(word); // index value of the word
  }
  // No word available: eoc decides between end of corpus and end of sentence.
  return eoc ? -2 : -3;
}

/**
Expand Down
2 changes: 1 addition & 1 deletion src/java/com/rondhuit/w2v/Word2vec.java
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ public void run(){
if(word == -2) break; // EOF
if(word == -1) continue;
word_count++;
if (word == 0) break;
if (word == -3) break;
// The subsampling randomly discards frequent words while keeping the ranking same
if (sample > 0) {
double ran = (Math.sqrt(vocab[word].cn / (sample * trainWords)) + 1) * (sample * trainWords) / vocab[word].cn;
Expand Down
5 changes: 4 additions & 1 deletion src/java/com/rondhuit/w2v/lucene/LuceneIndexCorpus.java
Original file line number Diff line number Diff line change
Expand Up @@ -127,11 +127,14 @@ public String nextWord() throws IOException {
tokenStream = analyzer.tokenStream(field, values[valPos++]);
termAtt = tokenStream.getAttribute(CharTermAttribute.class);
tokenStream.reset();
eoc = false;
return null;
}
else{
if(tdPos >= topDocs.totalHits){
tokenStream = null;
return null; // end of index
eoc = true;
return null; // end of index == end of corpus
}
Document doc = reader.document(topDocs.scoreDocs[tdPos++].doc);
values = doc.getValues(field); // This method returns an empty array when there are no matching fields.
Expand Down

0 comments on commit 5186af2

Please sign in to comment.