Ability to obtain word count / word frequency from pretrained word vector corpus #5232

aced125 · 2020-03-31T00:54:39Z

aced125
Mar 31, 2020

Feature description

The idea would be to obtain word frequencies for pretrained vectors, e.g. GloVe vectors.

This could allow computing weighted sentence vectors:

For example, SIF embeddings (https://openreview.net/pdf?id=SyK00v5xx)

There may be a way to do this already that I am not aware of.

Could the feature be a custom component or spaCy plugin?

I will provide a custom spacy component for SIF embeddings here:

from spacy.tokens import Span, Doc, Token
import numpy as np

class SIFSentEmbeddings:
    """
    spaCy component that computes SIF sentence embeddings.

    SIF (smoothed inverse frequency) embeddings are described in
    https://openreview.net/pdf?id=SyK00v5xx as a simple way to compute
    sentence similarity: every token vector is scaled by ``a / (a + p(w))``,
    the scaled vectors are averaged per sentence, and the projection onto
    the first ``npc`` principal components of the sentence vectors is
    subtracted.

    Results are stored on the custom extension attributes
    ``token._.weightedVector`` and ``span._.sifEmbedding``.

    NOTE: the principal components are estimated from the sentences of the
    single doc being processed. When a doc has no more sentences than
    ``npc``, those components span every sentence vector, so all resulting
    embeddings are (numerically) zero.
    """

    def __init__(self, word_frequencies, a: float = 0.0001, npc: int = 1):
        """
        Register the extension attributes and store the configuration.

        Parameters
        ----------
        word_frequencies: Dictionary mapping a token's text to the word
        probability used in the weight ``a / (a + p(w))``.
        a: This parameter controls the inverse smoothed
        weight. Set to 0.0001 as in the paper.
        npc: Number of principal components to remove. Default 1.
        """
        # force=True lets the component be constructed more than once
        # without failing on an already-registered extension.
        Token.set_extension("weightedVector", default=None, force=True)
        Span.set_extension("sifEmbedding", default=None, force=True)
        self.word_freq_dict = word_frequencies
        self.a = a
        self.npc = npc

    @staticmethod
    def _compute_pc(X, npc=1):
        """
        Return the first ``npc`` principal directions of ``X``.

        ``X`` holds one sentence vector per row and is deliberately not
        mean-centred. The result has shape ``(k, X.shape[1])`` with
        ``k = min(npc, min(X.shape))``; rows are unit-length right singular
        vectors ordered by decreasing singular value.
        """
        _, _, right_vectors = np.linalg.svd(np.asarray(X), full_matrices=False)
        return right_vectors[:npc]

    @staticmethod
    def _remove_pc(X, npc=1):
        """
        Subtract from each row of ``X`` its projection onto the first
        ``npc`` principal directions of ``X``.
        """
        pc = SIFSentEmbeddings._compute_pc(X, npc)
        # (n, k) coefficients times the (k, d) basis. For k == 1 this equals
        # the broadcast product, so a single expression covers every npc.
        return X - X.dot(pc.transpose()).dot(pc)

    def _compute_token_weight(self, tok: "Token"):
        """
        Return the SIF weight ``a / (a + p(w))`` for ``tok``.

        Tokens whose text is missing from the frequency table are treated
        as having probability 0, which gives them a weight of 1.
        """
        probability = self.word_freq_dict.get(tok.text, 0.0)
        return self.a / (self.a + probability)

    def tag_tokens(self, doc: "Doc") -> None:
        """Store ``token.vector`` scaled by its SIF weight on every token."""
        for token in doc:
            weight = self._compute_token_weight(token)
            token._.weightedVector = token.vector * weight

    def compute_naive_sent_embeddings(self, doc: "Doc"):
        """
        Return an array with one row per sentence in ``doc.sents``: the
        average of that sentence's weighted token vectors.

        ``tag_tokens`` must have been run on ``doc`` first.
        """
        sent_vecs = []
        for sent in doc.sents:
            weighted_sum = np.sum([tok._.weightedVector for tok in sent], axis=0)
            sent_vecs.append(weighted_sum / len(sent))

        return np.array(sent_vecs)

    def tag_sif_embeddings(self, doc: "Doc") -> "Doc":
        """
        Compute the SIF embedding of every sentence in ``doc``, store it on
        ``sent._.sifEmbedding`` and return ``doc``.
        """
        self.tag_tokens(doc)
        sent_vecs = self.compute_naive_sent_embeddings(doc)
        if sent_vecs.size == 0:
            # No sentences, so there is nothing to decompose or tag.
            return doc
        sent_vecs = SIFSentEmbeddings._remove_pc(sent_vecs, npc=self.npc)
        for sent, sent_vec in zip(doc.sents, sent_vecs):
            sent._.sifEmbedding = sent_vec
        return doc

    def __call__(self, doc: "Doc") -> "Doc":
        """Run the component on ``doc`` and return it."""
        return self.tag_sif_embeddings(doc)

adrianeboyd · 2020-03-31T13:37:42Z

adrianeboyd
Mar 31, 2020

Some of the provided spaCy md/lg models (German, Spanish, English, Greek) include word probabilities that come from a different source than the vectors; you can access them per token as token.prob or per lexeme as nlp.vocab["word"].prob. Unfortunately, our documentation about the sources of the probability data isn't great, so while the provided probabilities are a good place to start experimenting, in the end it's probably a good idea to replace them with your own probabilities.

0 replies

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Ability to obtain word count / word frequency from pretrained word vector corpus #5232

{{title}}

Replies: 1 comment

{{title}}

Select a reply

Ability to obtain word count / word frequency from pretrained word vector corpus #5232

aced125 Mar 31, 2020

Feature description

Could the feature be a custom component or spaCy plugin?

Replies: 1 comment

adrianeboyd Mar 31, 2020

aced125
Mar 31, 2020

adrianeboyd
Mar 31, 2020