Skip to content

Commit

Permalink
merge paul's extractor code
Browse files Browse the repository at this point in the history
  • Loading branch information
Chris Dyer committed Apr 23, 2013
2 parents 0e46089 + d467e14 commit c164dc0
Show file tree
Hide file tree
Showing 113 changed files with 18,714 additions and 11,578 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ extools/filter_score_grammar
extools/mr_stripe_rule_reduce
extools/score_grammar
extools/sg_lexer.cc
extractor/*_test
extractor/compile
extractor/run_extractor
gi/clda/src/clda
gi/markov_al/ml
gi/pf/align-lexonly
Expand Down Expand Up @@ -100,7 +103,9 @@ jam-files/bjam
jam-files/engine/bin.*
jam-files/engine/bootstrap/
klm/lm/bin/
klm/lm/builder/builder
klm/lm/build_binary
klm/lm/ngram_query
klm/lm/query
klm/util/bin/
libtool
Expand All @@ -122,6 +127,7 @@ phrasinator/gibbs_train_plm_notables
previous.sh
pro-train/mr_pro_map
pro-train/mr_pro_reduce
python/build
python/setup.py
rampion/rampion_cccp
rst_parser/mst_train
Expand Down
3 changes: 2 additions & 1 deletion Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ SUBDIRS = \
training \
training/liblbfgs \
word-aligner \
example_extff
example_extff \
extractor

#gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava

Expand Down
90 changes: 88 additions & 2 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ esac
AC_PROG_CC
AC_PROG_CXX
AC_LANG_CPLUSPLUS
AC_OPENMP
BOOST_REQUIRE([1.44])
BOOST_FILESYSTEM
BOOST_PROGRAM_OPTIONS
BOOST_SYSTEM
BOOST_SERIALIZATION
Expand Down Expand Up @@ -87,11 +89,94 @@ then
AM_CONDITIONAL([HAVE_CMPH], true)
fi

# HAVE_GTEST starts out false; it is switched to true further down, once the
# Google Test checks have succeeded.
AM_CONDITIONAL([HAVE_GTEST], false)
# --with-gtest=DIR stores DIR in with_gtest; without the option the value is no.
AC_ARG_WITH(gtest,
[AC_HELP_STRING([--with-gtest=DIR], [(optional) path to Google Test library])],
[with_gtest=$withval],
[with_gtest=no]
)

# HAVE_GMOCK starts out false; it is switched to true further down, once the
# Google Mock checks have succeeded.
AM_CONDITIONAL([HAVE_GMOCK], false)
# --with-gmock=DIR stores DIR in with_gmock; without the option the value is no.
AC_ARG_WITH(gmock,
[AC_HELP_STRING([--with-gmock=DIR], [(optional) path to Google Mock library])],
[with_gmock=$withval],
[with_gmock=no]
)

# Detect Google Test and, nested inside, Google Mock when --with-gtest=DIR was
# given. CPPFLAGS, LDFLAGS and LIBS are extended only while the checks run and
# are restored before leaving the block; the flags needed by the test programs
# are exported through the GTEST_* and GMOCK_* substitutions instead.
if test "x$with_gtest" != 'xno'
then
  gtest_CPPFLAGS="-I${with_gtest}/include"
  gtest_LDFLAGS="-L${with_gtest} -L${with_gtest}/lib"
  gtest_LIBS="-lgtest_main -lgtest -lpthread"

  # Must use the same variable name as the restore at the end of this block,
  # otherwise CPPFLAGS is reset to an empty string after the checks.
  SAVE_CPPFLAGS="$CPPFLAGS"
  CPPFLAGS="$CPPFLAGS $gtest_CPPFLAGS"
  AC_CHECK_HEADER(${with_gtest}/include/gtest/gtest.h,
    [AC_DEFINE([HAVE_GTEST], [1], [flag for Google Test header])],
    [AC_MSG_ERROR([Cannot find Google Test headers!])]
  )

  SAVE_LDFLAGS="$LDFLAGS"
  LDFLAGS="$LDFLAGS $gtest_LDFLAGS"
  SAVE_LIBS="$LIBS"
  # Google Test needs pthreads.
  AC_CHECK_LIB([pthread],
    [pthread_mutex_init],
    [],
    [AC_MSG_ERROR([Cannot find pthread library])]
  )
  AX_CXX_CHECK_LIB([gtest],
    [testing::TestInfo::name() const],
    [],
    [AC_MSG_ERROR([Cannot find Google Test library libgtest])]
  )
  AC_CHECK_LIB([gtest_main],
    [main],
    [],
    [AC_MSG_ERROR([Cannot find Google Test library libgtest_main])]
  )

  AC_SUBST(AS_TR_CPP([GTEST_CPPFLAGS]), ["$gtest_CPPFLAGS"])
  AC_SUBST(AS_TR_CPP([GTEST_LDFLAGS]), ["$gtest_LDFLAGS"])
  AC_SUBST(AS_TR_CPP([GTEST_LIBS]), ["$gtest_LIBS"])


  # Google Mock is only probed when Google Test was requested as well; its
  # checks run with the Google Test flags still in effect.
  if test "x$with_gmock" != 'xno'
  then
    gmock_CPPFLAGS="-I${with_gmock}/include"
    gmock_LDFLAGS="-L${with_gmock} -L${with_gmock}/lib"
    gmock_LIBS="-lgmock"

    CPPFLAGS="$CPPFLAGS $gmock_CPPFLAGS"
    AC_CHECK_HEADER(${with_gmock}/include/gmock/gmock.h,
      [AC_DEFINE([HAVE_GMOCK], [1], [flag for Google Mock header])],
      [AC_MSG_ERROR([Cannot find Google Mock headers!])]
    )

    LDFLAGS="$LDFLAGS $gmock_LDFLAGS"
    AX_CXX_CHECK_LIB([gmock],
      [testing::Expectation],
      [],
      [AC_MSG_ERROR([Cannot find Google Mock library libgmock])]
    )

    AC_SUBST(AS_TR_CPP([GMOCK_CPPFLAGS]), ["$gmock_CPPFLAGS"])
    AC_SUBST(AS_TR_CPP([GMOCK_LDFLAGS]), ["$gmock_LDFLAGS"])
    AC_SUBST(AS_TR_CPP([GMOCK_LIBS]), ["$gmock_LIBS"])
    AM_CONDITIONAL([HAVE_GMOCK], true)
  fi

  # Undo the temporary changes made for the checks above.
  CPPFLAGS="$SAVE_CPPFLAGS"
  LDFLAGS="$SAVE_LDFLAGS"
  LIBS="$SAVE_LIBS"
  AM_CONDITIONAL([HAVE_GTEST], true)
fi

#BOOST_THREADS
CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS"
LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS $BOOST_SERIALIZATION_LDFLAGS $BOOST_SYSTEM_LDFLAGS"
LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS $BOOST_SERIALIZATION_LDFLAGS $BOOST_SYSTEM_LDFLAGS $BOOST_FILESYSTEM_LDFLAGS"
# $BOOST_THREAD_LDFLAGS"
LIBS="$LIBS $BOOST_PROGRAM_OPTIONS_LIBS $BOOST_SERIALIZATION_LIBS $BOOST_SYSTEM_LIBS $ZLIBS"
LIBS="$LIBS $BOOST_PROGRAM_OPTIONS_LIBS $BOOST_SERIALIZATION_LIBS $BOOST_SYSTEM_LIBS $BOOST_FILESYSTEM_LIBS $ZLIBS"
# $BOOST_THREAD_LIBS"

AC_CHECK_HEADER(google/dense_hash_map,
Expand All @@ -108,6 +193,7 @@ AC_CONFIG_FILES([mteval/Makefile])
AC_CONFIG_FILES([mteval/meteor_jar.cc])
AC_CONFIG_FILES([decoder/Makefile])
AC_CONFIG_FILES([python/setup.py])
AC_CONFIG_FILES([extractor/Makefile])
AC_CONFIG_FILES([word-aligner/Makefile])

# KenLM stuff
Expand Down
149 changes: 149 additions & 0 deletions extractor/Makefile.am
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
# Programs that are always built and installed.
bin_PROGRAMS = compile run_extractor

# Every unit test binary this directory knows how to build. The subset that is
# actually built and run is selected through RUNNABLE_TESTS.
EXTRA_PROGRAMS = alignment_test \
data_array_test \
fast_intersector_test \
feature_count_source_target_test \
feature_is_source_singleton_test \
feature_is_source_target_singleton_test \
feature_max_lex_source_given_target_test \
feature_max_lex_target_given_source_test \
feature_sample_source_count_test \
feature_target_given_source_coherent_test \
grammar_extractor_test \
matchings_finder_test \
phrase_test \
precomputation_test \
rule_extractor_helper_test \
rule_extractor_test \
rule_factory_test \
sampler_test \
scorer_test \
suffix_array_test \
target_phrase_extractor_test \
translation_table_test

# The unit tests are only selected when configure found Google Test.
# NOTE(review): many of these tests also link against Google Mock, yet the
# list is guarded by HAVE_GTEST alone - confirm whether configuring with
# gtest but without gmock is expected to build.
if HAVE_GTEST
RUNNABLE_TESTS = alignment_test \
data_array_test \
fast_intersector_test \
feature_count_source_target_test \
feature_is_source_singleton_test \
feature_is_source_target_singleton_test \
feature_max_lex_source_given_target_test \
feature_max_lex_target_given_source_test \
feature_sample_source_count_test \
feature_target_given_source_coherent_test \
grammar_extractor_test \
matchings_finder_test \
phrase_test \
precomputation_test \
rule_extractor_helper_test \
rule_extractor_test \
rule_factory_test \
sampler_test \
scorer_test \
suffix_array_test \
target_phrase_extractor_test \
translation_table_test
endif

# The selected tests are built but never installed.
noinst_PROGRAMS = $(RUNNABLE_TESTS)

# The selected tests are the ones executed by "make check".
TESTS = $(RUNNABLE_TESTS)

# Source file and link line of every test program. All tests link against
# Google Test and libextractor.a; some additionally link against Google Mock.
alignment_test_SOURCES = alignment_test.cc
alignment_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a
data_array_test_SOURCES = data_array_test.cc
data_array_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a
fast_intersector_test_SOURCES = fast_intersector_test.cc
fast_intersector_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
feature_count_source_target_test_SOURCES = features/count_source_target_test.cc
feature_count_source_target_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a
feature_is_source_singleton_test_SOURCES = features/is_source_singleton_test.cc
feature_is_source_singleton_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a
feature_is_source_target_singleton_test_SOURCES = features/is_source_target_singleton_test.cc
feature_is_source_target_singleton_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a
feature_max_lex_source_given_target_test_SOURCES = features/max_lex_source_given_target_test.cc
feature_max_lex_source_given_target_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
feature_max_lex_target_given_source_test_SOURCES = features/max_lex_target_given_source_test.cc
feature_max_lex_target_given_source_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
feature_sample_source_count_test_SOURCES = features/sample_source_count_test.cc
feature_sample_source_count_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a
feature_target_given_source_coherent_test_SOURCES = features/target_given_source_coherent_test.cc
feature_target_given_source_coherent_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a
grammar_extractor_test_SOURCES = grammar_extractor_test.cc
grammar_extractor_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
matchings_finder_test_SOURCES = matchings_finder_test.cc
matchings_finder_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
phrase_test_SOURCES = phrase_test.cc
phrase_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
precomputation_test_SOURCES = precomputation_test.cc
precomputation_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
rule_extractor_helper_test_SOURCES = rule_extractor_helper_test.cc
rule_extractor_helper_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
rule_extractor_test_SOURCES = rule_extractor_test.cc
rule_extractor_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
rule_factory_test_SOURCES = rule_factory_test.cc
rule_factory_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
sampler_test_SOURCES = sampler_test.cc
sampler_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
scorer_test_SOURCES = scorer_test.cc
scorer_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
suffix_array_test_SOURCES = suffix_array_test.cc
suffix_array_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
target_phrase_extractor_test_SOURCES = target_phrase_extractor_test.cc
target_phrase_extractor_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
translation_table_test_SOURCES = translation_table_test.cc
translation_table_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a

# Static helper libraries; built for linking only, never installed.
noinst_LIBRARIES = libextractor.a libcompile.a

# compile links against libcompile.a, run_extractor against libextractor.a.
compile_SOURCES = compile.cc
compile_LDADD = libcompile.a
run_extractor_SOURCES = run_extractor.cc
run_extractor_LDADD = libextractor.a

# Sources of libcompile.a, the library the compile program links against.
# Every file listed here is also part of libextractor.a.
libcompile_a_SOURCES = \
alignment.cc \
data_array.cc \
phrase_location.cc \
precomputation.cc \
suffix_array.cc \
time_util.cc \
translation_table.cc

# Sources of libextractor.a, the library linked into run_extractor and into
# every test program.
libextractor_a_SOURCES = \
alignment.cc \
data_array.cc \
fast_intersector.cc \
features/count_source_target.cc \
features/feature.cc \
features/is_source_singleton.cc \
features/is_source_target_singleton.cc \
features/max_lex_source_given_target.cc \
features/max_lex_target_given_source.cc \
features/sample_source_count.cc \
features/target_given_source_coherent.cc \
grammar.cc \
grammar_extractor.cc \
matchings_finder.cc \
matchings_trie.cc \
phrase.cc \
phrase_builder.cc \
phrase_location.cc \
precomputation.cc \
rule.cc \
rule_extractor.cc \
rule_extractor_helper.cc \
rule_factory.cc \
sampler.cc \
scorer.cc \
suffix_array.cc \
target_phrase_extractor.cc \
time_util.cc \
translation_table.cc \
vocabulary.cc

# Flags for every compilation in this directory: warnings, the C++0x language
# level, OpenMP, and the Google Test / Google Mock include paths (the latter
# two are empty when the frameworks were not configured).
AM_CPPFLAGS = -W -Wall -Wno-sign-compare -std=c++0x -fopenmp $(GTEST_CPPFLAGS) $(GMOCK_CPPFLAGS)
# OpenMP has to be enabled at link time as well.
AM_LDFLAGS = -fopenmp
53 changes: 53 additions & 0 deletions extractor/alignment.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#include "alignment.h"

#include <fcntl.h>
#include <unistd.h>

#include <cstdio>
#include <fstream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

#include <boost/algorithm/string.hpp>
#include <boost/filesystem.hpp>

namespace fs = boost::filesystem;
using namespace std;

namespace extractor {

/**
 * Reads the word alignments of a parallel corpus from a text file.
 *
 * Every line of the file describes one sentence as a sequence of integers
 * separated by spaces or dashes (e.g. "0-0 1-2"); consecutive integers are
 * paired up into links. An empty line produces an empty list of links, so the
 * sentence indexes stay in sync with the line numbers of the file.
 *
 * If the file cannot be opened no line is read and the alignment stays empty.
 *
 * Throws invalid_argument when a line contains an odd number of integers, or
 * (raised by stoi) a token that is not a number.
 */
Alignment::Alignment(const string& filename) {
  ifstream infile(filename.c_str());
  string line;
  while (getline(infile, line)) {
    vector<string> items;
    boost::split(items, line, boost::is_any_of(" -"));

    // boost::split yields an empty token for an empty line and next to
    // leading, trailing or repeated delimiters. stoi cannot parse those, so
    // they are skipped instead of aborting the whole load.
    vector<int> indexes;
    indexes.reserve(items.size());
    for (const string& item : items) {
      if (!item.empty()) {
        indexes.push_back(stoi(item));
      }
    }

    // A dangling index cannot form a link; reading its missing partner would
    // run past the end of the token list.
    if (indexes.size() % 2 != 0) {
      throw invalid_argument(
          "Malformed alignment line (odd number of indexes): " + line);
    }

    vector<pair<int, int> > alignment;
    alignment.reserve(indexes.size() / 2);
    for (size_t i = 0; i < indexes.size(); i += 2) {
      alignment.push_back(make_pair(indexes[i], indexes[i + 1]));
    }
    // Move instead of copy: the per-sentence vector is not needed afterwards.
    alignments.push_back(std::move(alignment));
  }
  alignments.shrink_to_fit();
}

// Creates an alignment that holds no sentences.
Alignment::Alignment() = default;

// Nothing to release explicitly; the stored links clean up after themselves.
Alignment::~Alignment() = default;

// Returns a copy of the links stored for the sentence at sentence_index.
// The index is not range checked.
vector<pair<int, int> > Alignment::GetLinks(int sentence_index) const {
  const vector<pair<int, int> >& links = alignments[sentence_index];
  return links;
}

/**
 * Writes all alignments to filepath in binary format.
 *
 * Layout: the number of sentences as an int, then for every sentence the
 * number of links as an int followed by the raw pair<int, int> array.
 *
 * Throws runtime_error if the file cannot be opened or the data cannot be
 * written completely.
 */
void Alignment::WriteBinary(const fs::path& filepath) {
  // "wb": the payload is raw binary and must not go through text translation.
  FILE* file = fopen(filepath.string().c_str(), "wb");
  if (file == nullptr) {
    throw runtime_error("Cannot open " + filepath.string() + " for writing");
  }

  int size = alignments.size();
  bool ok = fwrite(&size, sizeof(int), 1, file) == 1;
  // Iterate by const reference; copying every sentence just to write it out
  // is wasted work.
  for (const vector<pair<int, int> >& alignment : alignments) {
    size = alignment.size();
    ok = ok && fwrite(&size, sizeof(int), 1, file) == 1;
    ok = ok && fwrite(alignment.data(), sizeof(pair<int, int>), size, file) ==
                   static_cast<size_t>(size);
  }

  // Always close the stream, even after a failed write, so the handle is not
  // leaked; fclose also reports errors from flushing buffered data.
  const bool closed = fclose(file) == 0;
  if (!ok || !closed) {
    throw runtime_error("Failed to write alignment to " + filepath.string());
  }
}

} // namespace extractor
39 changes: 39 additions & 0 deletions extractor/alignment.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#ifndef _ALIGNMENT_H_
#define _ALIGNMENT_H_

#include <string>
#include <vector>

#include <boost/filesystem.hpp>

namespace fs = boost::filesystem;
using namespace std;

namespace extractor {

/**
 * Holds the word alignment links of every sentence in a parallel corpus.
 */
class Alignment {
 public:
  // Loads the alignments from a text file.
  Alignment(const string& filename);

  virtual ~Alignment();

  // Gets the alignment links of the sentence with the given index.
  virtual vector<pair<int, int> > GetLinks(int sentence_index) const;

  // Dumps the alignments to the given path in binary format.
  void WriteBinary(const fs::path& filepath);

 protected:
  // Builds an alignment without any sentences; only reachable from subclasses.
  Alignment();

 private:
  // One list of links per sentence, indexed by sentence.
  vector<vector<pair<int, int> > > alignments;
};

} // namespace extractor

#endif
Loading

0 comments on commit c164dc0

Please sign in to comment.