Skip to content

Commit

Permalink
Merge branch 'master' of github.com:pmbaumgartner/spaCy
Browse files Browse the repository at this point in the history
  • Loading branch information
pmbaumgartner committed Jul 15, 2019
2 parents 9a86d95 + c0e29f7 commit 040bb06
Show file tree
Hide file tree
Showing 11 changed files with 104 additions and 16 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@ requires = ["setuptools",
"cymem>=2.0.2,<2.1.0",
"preshed>=2.0.1,<2.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=7.0.6,<7.1.0",
"thinc>=7.0.8,<7.1.0",
]
build-backend = "setuptools.build_meta"
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=2.0.1,<2.1.0
thinc>=7.0.6,<7.1.0
thinc>=7.0.8,<7.1.0
blis>=0.2.2,<0.3.0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.2.0,<1.1.0
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ def setup_package():
"murmurhash>=0.28.0,<1.1.0",
"cymem>=2.0.2,<2.1.0",
"preshed>=2.0.1,<2.1.0",
"thinc>=7.0.6,<7.1.0",
"thinc>=7.0.8,<7.1.0",
"blis>=0.2.2,<0.3.0",
"plac<1.0.0,>=0.9.6",
"requests>=2.13.0,<3.0.0",
Expand Down
4 changes: 2 additions & 2 deletions spacy/about.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
# fmt: off

__title__ = "spacy"
__version__ = "2.1.5.dev0"
__version__ = "2.1.6"
__summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
__uri__ = "https://spacy.io"
__author__ = "Explosion AI"
__email__ = "contact@explosion.ai"
__license__ = "MIT"
__release__ = False
__release__ = True

__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
Expand Down
4 changes: 3 additions & 1 deletion spacy/attrs.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# Reserve 64 values for flag features
from . cimport symbols

cdef enum attr_id_t:
NULL_ATTR
IS_ALPHA
Expand Down Expand Up @@ -82,10 +84,10 @@ cdef enum attr_id_t:
DEP
ENT_IOB
ENT_TYPE
ENT_KB_ID
HEAD
SENT_START
SPACY
PROB

LANG
ENT_KB_ID = symbols.ENT_KB_ID
3 changes: 2 additions & 1 deletion spacy/lang/da/punctuation.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,11 @@
+ [
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA),
r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]
)

Expand Down
8 changes: 8 additions & 0 deletions spacy/lang/da/tokenizer_exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
{ORTH: "Ons.", LEMMA: "onsdag"},
{ORTH: "Fre.", LEMMA: "fredag"},
{ORTH: "Lør.", LEMMA: "lørdag"},
{ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller", TAG: "CC"},
]:
_exc[exc_data[ORTH]] = [exc_data]

Expand All @@ -64,6 +65,8 @@
"mik.",
"pers.",
"A.D.",
"A/B",
"a/s",
"A/S",
"B.C.",
"BK.",
Expand All @@ -79,7 +82,9 @@
"Kprs.",
"L.A.",
"Ll.",
"m/k",
"m/s",
"m/sek.",
"M/S",
"Mag.",
"Mr.",
Expand All @@ -90,6 +95,7 @@
"Sdr.",
"Skt.",
"Spl.",
"TCP/IP",
"Vg.",
]:
_exc[orth] = [{ORTH: orth}]
Expand Down Expand Up @@ -141,6 +147,7 @@
"brolægn.",
"bto.",
"bygn.",
"c/o",
"ca.",
"cand.",
"d.d.",
Expand Down Expand Up @@ -293,6 +300,7 @@
"kgl.",
"kl.",
"kld.",
"km/t",
"knsp.",
"komm.",
"kons.",
Expand Down
3 changes: 2 additions & 1 deletion spacy/symbols.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,6 @@ cdef enum symbol_t:
DEP
ENT_IOB
ENT_TYPE
ENT_KB_ID
HEAD
SENT_START
SPACY
Expand Down Expand Up @@ -461,3 +460,5 @@ cdef enum symbol_t:
xcomp

acl

ENT_KB_ID
24 changes: 24 additions & 0 deletions spacy/tests/lang/da/test_exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,27 @@ def test_da_tokenizer_handles_custom_base_exc(da_tokenizer):
def test_da_tokenizer_norm_exceptions(da_tokenizer, text, norm):
tokens = da_tokenizer(text)
assert tokens[0].norm_ == norm


@pytest.mark.parametrize(
    "text,n_tokens",
    [
        ("Godt og/eller skidt", 3),
        ("Kør 4 km/t på vejen", 5),
        ("Det blæser 12 m/s.", 5),
        ("Det blæser 12 m/sek. på havnen", 6),
        ("Windows 8/Windows 10", 5),
        ("Billeten virker til bus/tog/metro", 8),
        ("26/02/2019", 1),
        ("Kristiansen c/o Madsen", 3),
        ("Sprogteknologi a/s", 2),
        ("De boede i A/B Bellevue", 5),
        ("Rotorhastigheden er 3400 o/m.", 5),
        ("Jeg købte billet t/r.", 5),
        ("Murerarbejdsmand m/k søges", 3),
        ("Netværket kører over TCP/IP", 4),
    ],
)
def test_da_tokenizer_slash(da_tokenizer, text, n_tokens):
    """Check that slash-containing Danish exceptions tokenize into the expected counts."""
    assert len(da_tokenizer(text)) == n_tokens
51 changes: 51 additions & 0 deletions spacy/tests/regression/test_issue3611.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# coding: utf8
from __future__ import unicode_literals

import pytest
import spacy
from spacy.util import minibatch, compounding


def test_issue3611():
    """Test that adding n-grams in the textcat works even when n is greater
    than the token length of some docs (regression test for issue #3611).

    The test passes if training completes without raising.
    """
    unique_classes = ["offensive", "inoffensive"]
    x_train = [
        "This is an offensive text",
        "This is the second offensive text",
        "inoff",
    ]
    y_train = ["offensive", "offensive", "inoffensive"]

    # Prepare the data: one {label: bool} cats dict per training instance.
    pos_cats = [
        {label: label == train_instance for label in unique_classes}
        for train_instance in y_train
    ]
    train_data = list(zip(x_train, [{"cats": cats} for cats in pos_cats]))

    # Set up a blank model with a text categorizer using the bag-of-words
    # architecture and ngram_size=2, so the one-token doc "inoff" is shorter
    # than the n-gram size — exactly the condition that used to crash.
    nlp = spacy.blank("en")
    textcat = nlp.create_pipe(
        "textcat",
        config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
    )
    for label in unique_classes:
        textcat.add_label(label)
    nlp.add_pipe(textcat, last=True)

    # Train the network for a few epochs; any exception fails the test.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for _ in range(3):
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    docs=texts,
                    golds=annotations,
                    sgd=optimizer,
                    drop=0.1,
                    losses=losses,
                )


17 changes: 9 additions & 8 deletions website/src/widgets/landing.js
Original file line number Diff line number Diff line change
Expand Up @@ -152,20 +152,21 @@ const Landing = ({ data }) => {
<LandingBannerGrid>
<LandingBanner
title="spaCy IRL 2019: Two days of NLP"
label="Join us in Berlin"
to="https://irl.spacy.io/2019"
button="Get tickets"
label="Watch the videos"
to="https://www.youtube.com/playlist?list=PLBmcuObd5An4UC6jvK_-eSl6jCvP1gwXc"
button="Watch the videos"
background="#ffc194"
backgroundImage={irlBackground}
color="#1a1e23"
small
>
We're pleased to invite the spaCy community and other folks working on Natural
We were pleased to invite the spaCy community and other folks working on Natural
Language Processing to Berlin this summer for a small and intimate event{' '}
<strong>July 5-6, 2019</strong>. The event includes a hands-on training day for
teams using spaCy in production, followed by a one-track conference. We've
booked a beautiful venue, hand-picked an awesome lineup of speakers and
scheduled plenty of social time to get to know each other and exchange ideas.
<strong>July 6, 2019</strong>. We booked a beautiful venue, hand-picked an
awesome lineup of speakers and scheduled plenty of social time to get to know
each other and exchange ideas. The YouTube playlist includes 12 talks about NLP
research, development and applications, with keynotes by Sebastian Ruder
(DeepMind) and Yoav Goldberg (Allen AI).
</LandingBanner>

<LandingBanner
Expand Down

0 comments on commit 040bb06

Please sign in to comment.