use the pipeline in exercise 02
ogrisel committed Mar 10, 2011
1 parent 19b1256 commit 7101ddd
Showing 2 changed files with 89 additions and 88 deletions.
92 changes: 46 additions & 46 deletions skeletons/exercise_02_language_train_model.py
@@ -4,9 +4,11 @@

import sys

from scikits.learn.feature_extraction.text.sparse import Vectorizer
from scikits.learn.feature_extraction.text.sparse import CountVectorizer
from scikits.learn.feature_extraction.text.sparse import TfidfTransformer
from scikits.learn.feature_extraction.text import CharNGramAnalyzer
from scikits.learn.svm.sparse import LinearSVC
from scikits.learn.pipeline import Pipeline
from scikits.learn.datasets import load_files
from scikits.learn import metrics

@@ -34,61 +36,59 @@ def __repr__(self):
dataset = load_files(languages_data_folder)

# split the dataset in training and test set:
n_samples_total = dataset.filenames.shape[0]

# TODO: define variables 'filenames_train' and 'filenames_test'
# TODO: define variables 'y_train' and 'y_test'
docs_train = [open(f).read()
for f in dataset.filenames[:n_samples_total/2]]
docs_test = [open(f).read()
for f in dataset.filenames[n_samples_total/2:]]


# Build an analyzer that splits strings into sequences of 1 to 3 characters
# using the previous preprocessor

# TODO: define a variable named analyzer


# Build a vectorizer using the analyzer, learn the mapping from feature name to
# feature id on the training data and then transform it into feature vectors.
# Then use the fitted vectorizer on the test data
y_train = dataset.target[:n_samples_total/2]
y_test = dataset.target[n_samples_total/2:]

# TODO: define a variable named 'vectorizer'
# TODO: define a variable named 'X_train'
# TODO: define a variable named 'X_test'

# XXX: Don't forget to read the content of the text files before feeding it to
# the vectorizer

# Build a linear classifier and train it on the training set

# TODO: define a variable named 'clf'
# Build an analyzer that splits strings into sequences of 1 to 3 characters
# after using the previous preprocessor
analyzer = CharNGramAnalyzer(
min_n=1,
max_n=3,
preprocessor=LowerCasePreprocessor(),
)

# Build a vectorizer / classifier pipeline using the previous analyzer
clf = Pipeline([
('vec', CountVectorizer(analyzer=analyzer)),
('tfidf', TfidfTransformer()),
('clf', LinearSVC(loss='l2', penalty='l1', dual=False, C=100)),
])

# Fit the pipeline on the training set
clf.fit(docs_train, y_train)

# Predict the outcome on the testing set
y_predicted = clf.predict(docs_test)

# TODO: define a variable named 'y_predicted'
# Print the classification report
print metrics.classification_report(y_test, y_predicted,
class_names=dataset.target_names)

# Plot the confusion matrix
cm = metrics.confusion_matrix(y_test, y_predicted)
print cm

#
# Evaluation of the quality of the predictions: uncomment the following when all
# of the above has been implemented
#
# import pylab as pl
#pl.matshow(cm)
#pl.show()

## Print the classification report
#
#print metrics.classification_report(y_test, y_predicted,
# class_names=dataset.target_names)
#
## Print the confusion matrix
#
#cm = metrics.confusion_matrix(y_test, y_predicted)
#print cm
#
# Predict the result on some short new sentences:
#sentences = [
# u'This is a language detection test.',
# u'Ceci est un test de d\xe9tection de la langue.',
# u'Dies ist ein Test, um die Sprache zu erkennen.',
#]
#vectors = vectorizer.transform(sentences)
#predicted = clf.predict(vectors)
#
#for s, p in zip(sentences, predicted):
# print u'The language of "%s" is "%s"' % (s, dataset.target_names[p])
sentences = [
u'This is a language detection test.',
u'Ceci est un test de d\xe9tection de la langue.',
u'Dies ist ein Test, um die Sprache zu erkennen.',
]
predicted = clf.predict(sentences)

for s, p in zip(sentences, predicted):
print u'The language of "%s" is "%s"' % (s, dataset.target_names[p])
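
Note: the imports above target the 2011-era scikits.learn package; its sparse text module, CharNGramAnalyzer and the LowerCasePreprocessor helper no longer exist in current scikit-learn. As a point of comparison, here is a minimal sketch (not part of this commit) of roughly the same character n-gram language-identification pipeline written against the modern sklearn API; the command-line argument handling and all sklearn.* names are assumptions about the current library, not code from this repository.

import sys

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn import metrics

# Assumed: the data folder is passed on the command line,
# with one sub-folder of text files per language
languages_data_folder = sys.argv[1]
dataset = load_files(languages_data_folder, encoding="utf-8",
                     decode_error="ignore")

# Use half of the documents for training and half for testing
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.5, random_state=0)

# TfidfVectorizer(analyzer='char') plays the role of the old
# CharNGramAnalyzer + CountVectorizer + TfidfTransformer combination,
# and lowercase=True replaces the LowerCasePreprocessor
clf = Pipeline([
    ('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 3),
                            lowercase=True)),
    ('clf', LinearSVC(penalty='l1', dual=False, C=100)),
])

clf.fit(docs_train, y_train)
y_predicted = clf.predict(docs_test)

print(metrics.classification_report(y_test, y_predicted,
                                    target_names=dataset.target_names))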

85 changes: 43 additions & 42 deletions solutions/exercise_02_language_train_model.py
@@ -4,9 +4,11 @@

import sys

from scikits.learn.feature_extraction.text.sparse import Vectorizer
from scikits.learn.feature_extraction.text.sparse import CountVectorizer
from scikits.learn.feature_extraction.text.sparse import TfidfTransformer
from scikits.learn.feature_extraction.text import CharNGramAnalyzer
from scikits.learn.svm.sparse import LinearSVC
from scikits.learn.pipeline import Pipeline
from scikits.learn.datasets import load_files
from scikits.learn import metrics

@@ -36,57 +38,56 @@ def __repr__(self):
# split the dataset in training and test set:
n_samples_total = dataset.filenames.shape[0]

filenames_train = dataset.filenames[:n_samples_total/2]
filenames_test = dataset.filenames[n_samples_total/2:]
docs_train = [open(f).read()
for f in dataset.filenames[:n_samples_total/2]]
docs_test = [open(f).read()
for f in dataset.filenames[n_samples_total/2:]]


y_train = dataset.target[:n_samples_total/2]
y_test = dataset.target[n_samples_total/2:]


# Build an analyzer that splits strings into sequences of 1 to 3 characters
# after using the previous preprocessor
analyzer = CharNGramAnalyzer(
min_n=1,
max_n=3,
preprocessor=LowerCasePreprocessor(),
)

# Build a vectorizer using the analyzer, learn the mapping from feature name to
# feature id on the training data while transforming it. Then use the fitted
# vectorizer on the test data
vectorizer = Vectorizer(analyzer=analyzer, use_idf=False)
# TODO

# Build a vectorizer / classifier pipeline using the previous analyzer

# TODO: the pipeline instance must be named 'clf'

X_train = vectorizer.fit_transform((open(f) for f in filenames_train))
X_test = vectorizer.transform((open(f) for f in filenames_test))
# Fit the pipeline on the training set

# Build a linear classifier and train it on the training set
clf = LinearSVC(loss='l2', penalty='l1', dual=False, C=100)
clf.fit(X_train, y_train)
# TODO

# Predict the outcome on the testing set
y_predicted = clf.predict(X_test)

# Print the classification report
print metrics.classification_report(y_test, y_predicted,
class_names=dataset.target_names)

# Plot the confusion matrix
cm = metrics.confusion_matrix(y_test, y_predicted)
print cm

# import pylab as pl
#pl.matshow(cm)
#pl.show()

# Predict the result on some short new sentences:
sentences = [
u'This is a language detection test.',
u'Ceci est un test de d\xe9tection de la langue.',
u'Dies ist ein Test, um die Sprache zu erkennen.',
]
vectors = vectorizer.transform(sentences)
predicted = clf.predict(vectors)

for s, p in zip(sentences, predicted):
print u'The language of "%s" is "%s"' % (s, dataset.target_names[p])

# TODO: the predicted outcome must be named 'y_predicted'


# TODO: uncomment the following once all of the above is implemented

## Print the classification report
#print metrics.classification_report(y_test, y_predicted,
# class_names=dataset.target_names)
#
## Plot the confusion matrix
#cm = metrics.confusion_matrix(y_test, y_predicted)
#print cm
#
## import pylab as pl
##pl.matshow(cm)
##pl.show()
#
## Predict the result on some short new sentences:
#sentences = [
# u'This is a language detection test.',
# u'Ceci est un test de d\xe9tection de la langue.',
# u'Dies ist ein Test, um die Sprache zu erkennen.',
#]
#predicted = clf.predict(sentences)
#
#for s, p in zip(sentences, predicted):
# print u'The language of "%s" is "%s"' % (s, dataset.target_names[p])
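
For readers unfamiliar with what CharNGramAnalyzer(min_n=1, max_n=3) feeds the vectorizer, here is a toy illustration of character n-gram extraction. It only mirrors the idea (lowercase the text, then emit every substring of length 1 to 3); it is not the library's implementation, and the char_ngrams helper is hypothetical.

# Toy illustration of character 1-to-3-gram extraction (hypothetical helper,
# not the library's code): lowercase, then slide windows of length 1 to 3
def char_ngrams(text, min_n=1, max_n=3):
    text = text.lower()
    ngrams = []
    for n in range(min_n, max_n + 1):
        for i in range(len(text) - n + 1):
            ngrams.append(text[i:i + n])
    return ngrams

print(char_ngrams("Sprache")[:10])
# -> ['s', 'p', 'r', 'a', 'c', 'h', 'e', 'sp', 'pr', 'ra']

Short character n-grams such as "sch" or "the" are strongly language-specific, which is why a linear classifier over their tf-idf weights is enough for reliable language identification even on short texts.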
