
Commit 034ac0a

Merge pull request explosion#8787 from adrianeboyd/chore/backport-v3.0.7
Backport bug fixes to v3.0.x
2 parents 02e1892 + 0080454 commit 034ac0a

33 files changed: +209 additions, -105 deletions

MANIFEST.in

Lines changed: 1 addition & 0 deletions
@@ -8,3 +8,4 @@ recursive-exclude spacy/lang *.json
 recursive-include spacy/lang *.json.gz
 recursive-include spacy/cli *.json *.yml
 recursive-include licenses *
+recursive-exclude spacy *.cpp

spacy/about.py

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.0.6"
+__version__ = "3.0.7"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

spacy/cli/convert.py

Lines changed: 2 additions & 1 deletion
@@ -115,7 +115,8 @@ def convert(
     ner_map = srsly.read_json(ner_map) if ner_map is not None else None
     doc_files = []
     for input_loc in walk_directory(Path(input_path), converter):
-        input_data = input_loc.open("r", encoding="utf-8").read()
+        with input_loc.open("r", encoding="utf-8") as infile:
+            input_data = infile.read()
         # Use converter function to convert data
         func = CONVERTERS[converter]
         docs = func(
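
The same one-liner read is replaced with a context manager here and in entity_linker.py and trainable_pipe.pyx below, so the file handle is closed promptly instead of waiting for garbage collection. A minimal sketch of the pattern, using a hypothetical path:

from pathlib import Path

input_loc = Path("corpus/train.json")  # hypothetical input file

# Before: the handle stays open until the temporary file object is collected
input_data = input_loc.open("r", encoding="utf-8").read()

# After: the handle is closed deterministically when the block exits
with input_loc.open("r", encoding="utf-8") as infile:
    input_data = infile.read()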

spacy/cli/package.py

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ def package_cli(
     output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
     code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"),
     meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
-    create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
+    create_meta: bool = Opt(False, "--create-meta", "-C", help="Create meta.json, even if one exists"),
     name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),
     version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
     build: str = Opt("sdist", "--build", "-b", help="Comma-separated formats to build: sdist and/or wheel, or none."),
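
The short flag -c was already claimed by --code in the same command, so --create-meta now keeps only the uppercase -C. spaCy's Opt wraps typer.Option; a small, hypothetical typer sketch of the disambiguated flags:

import typer

app = typer.Typer()


@app.command()
def package(
    # "-c" belongs to --code, so --create-meta only gets the uppercase "-C"
    code_paths: str = typer.Option("", "--code", "-c", help="Comma-separated paths to extra code"),
    create_meta: bool = typer.Option(False, "--create-meta", "-C", help="Create meta.json, even if one exists"),
):
    typer.echo(f"code={code_paths!r} create_meta={create_meta}")


if __name__ == "__main__":
    app()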

spacy/cli/templates/quickstart_training.jinja

Lines changed: 1 addition & 1 deletion
@@ -418,7 +418,7 @@ compound = 1.001

 [initialize]
 {% if use_transformer or optimize == "efficiency" or not word_vectors -%}
-vectors = null
+vectors = ${paths.vectors}
 {% else -%}
 vectors = "{{ word_vectors }}"
 {% endif -%}
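
Instead of hard-coding vectors = null, the generated config now points [initialize] vectors at the ${paths.vectors} variable, so vectors can be supplied at train time via --paths.vectors without editing the file. A rough sketch of how the interpolation resolves, using thinc's Config directly (the section contents are illustrative):

from thinc.api import Config

cfg_text = """
[paths]
vectors = null

[initialize]
vectors = ${paths.vectors}
"""

config = Config().from_str(cfg_text)
# ${paths.vectors} resolves to whatever [paths] vectors holds: None here,
# or a real path when overridden with --paths.vectors on the command line.
print(config["initialize"]["vectors"])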

spacy/errors.py

Lines changed: 5 additions & 0 deletions
@@ -518,6 +518,11 @@ class Errors:
     E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")

     # New errors added in v3.x
+    E867 = ("The 'textcat' component requires at least two labels because it "
+            "uses mutually exclusive classes where exactly one label is True "
+            "for each doc. For binary classification tasks, you can use two "
+            "labels with 'textcat' (LABEL / NOT_LABEL) or alternatively, you "
+            "can use the 'textcat_multilabel' component with one label.")
     E870 = ("Could not serialize the DocBin because it is too large. Consider "
             "splitting up your documents into several doc bins and serializing "
             "each separately. spacy.Corpus.v1 will search recursively for all "
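
E867 is raised by the new check in spacy/pipeline/textcat.py further down, when the exclusive-classes textcat is initialized with fewer than two labels. A minimal sketch of the two setups the message recommends (the label names are made up):

import spacy

# Exclusive classes: use "textcat" with an explicit negative label
nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
textcat.add_label("SPAM")
textcat.add_label("NOT_SPAM")

# A single, independent label: use "textcat_multilabel" instead
nlp_ml = spacy.blank("en")
textcat_ml = nlp_ml.add_pipe("textcat_multilabel")
textcat_ml.add_label("SPAM")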

spacy/lang/az/__init__.py

Lines changed: 0 additions & 5 deletions
@@ -1,16 +1,11 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .stop_words import STOP_WORDS
-from .syntax_iterators import SYNTAX_ITERATORS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language


 class AzerbaijaniDefaults(Language.Defaults):
-    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
-    token_match = TOKEN_MATCH
-    syntax_iterators = SYNTAX_ITERATORS


 class Azerbaijani(Language):

spacy/lang/el/lemmatizer.py

Lines changed: 1 addition & 1 deletion
@@ -57,6 +57,6 @@ def rule_lemmatize(self, token: Token) -> List[str]:
         forms.extend(oov_forms)
         if not forms:
             forms.append(string)
-        forms = list(set(forms))
+        forms = list(dict.fromkeys(forms))
         self.cache[cache_key] = forms
         return forms
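
dict.fromkeys removes duplicates like set() does, but keeps the first-seen order, so the lemma list (and the cache built from it) is stable across runs instead of depending on string hash randomization. A quick illustration:

forms = ["λόγος", "λόγου", "λόγος", "λόγοι"]  # hypothetical candidate lemmas

print(list(set(forms)))            # order can vary between interpreter runs
print(list(dict.fromkeys(forms)))  # always ['λόγος', 'λόγου', 'λόγοι']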

spacy/lang/ru/lemmatizer.py

Lines changed: 2 additions & 3 deletions
@@ -12,7 +12,6 @@


 class RussianLemmatizer(Lemmatizer):
-    _morph = None

     def __init__(
         self,
@@ -31,8 +30,8 @@ def __init__(
                 "The Russian lemmatizer mode 'pymorphy2' requires the "
                 "pymorphy2 library. Install it with: pip install pymorphy2"
             ) from None
-        if RussianLemmatizer._morph is None:
-            RussianLemmatizer._morph = MorphAnalyzer()
+        if getattr(self, "_morph", None) is None:
+            self._morph = MorphAnalyzer()
         super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)

     def pymorphy2_lemmatize(self, token: Token) -> List[str]:
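
The pymorphy2 analyzer moves from a class-level slot, shared by every instance of the class, to a plain instance attribute, so each lemmatizer owns its own MorphAnalyzer (the Ukrainian subclass below gets the same treatment). An illustrative sketch of the difference, with object() standing in for MorphAnalyzer:

class SharedAnalyzer:
    _morph = None  # one slot on the class, shared by all instances

    def __init__(self):
        if SharedAnalyzer._morph is None:
            SharedAnalyzer._morph = object()  # stands in for MorphAnalyzer()
        self.morph = SharedAnalyzer._morph


class PerInstanceAnalyzer:
    def __init__(self):
        if getattr(self, "_morph", None) is None:
            self._morph = object()
        self.morph = self._morph


a, b = SharedAnalyzer(), SharedAnalyzer()
print(a.morph is b.morph)   # True: the analyzer outlives any single pipeline

c, d = PerInstanceAnalyzer(), PerInstanceAnalyzer()
print(c.morph is d.morph)   # False: each instance owns its own analyzer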

spacy/lang/uk/lemmatizer.py

Lines changed: 2 additions & 4 deletions
@@ -7,8 +7,6 @@


 class UkrainianLemmatizer(RussianLemmatizer):
-    _morph = None
-
     def __init__(
         self,
         vocab: Vocab,
@@ -27,6 +25,6 @@ def __init__(
                 "pymorphy2 library and dictionaries. Install them with: "
                 "pip install pymorphy2 pymorphy2-dicts-uk"
             ) from None
-        if UkrainianLemmatizer._morph is None:
-            UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk")
+        if getattr(self, "_morph", None) is None:
+            self._morph = MorphAnalyzer(lang="uk")
         super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)

spacy/matcher/phrasematcher.pyx

Lines changed: 2 additions & 0 deletions
@@ -50,6 +50,8 @@ cdef class PhraseMatcher:
         if isinstance(attr, (int, long)):
            self.attr = attr
         else:
+            if attr is None:
+                attr = "ORTH"
             attr = attr.upper()
             if attr == "TEXT":
                 attr = "ORTH"

spacy/ml/models/multi_task.py

Lines changed: 2 additions & 1 deletion
@@ -3,7 +3,7 @@
 from thinc.api import MultiSoftmax, list2array
 from thinc.api import to_categorical, CosineDistance, L2Distance

-from ...util import registry
+from ...util import registry, OOV_RANK
 from ...errors import Errors
 from ...attrs import ID

@@ -70,6 +70,7 @@ def get_vectors_loss(ops, docs, prediction, distance):
     # and look them up all at once. This prevents data copying.
     ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
     target = docs[0].vocab.vectors.data[ids]
+    target[ids == OOV_RANK] = 0
     d_target, loss = distance(prediction, target)
     return loss, d_target
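
Out-of-vocabulary lexemes all carry the sentinel OOV_RANK row id, so looking them up in the vectors table would hand the pretraining objective an arbitrary row as the target; those rows are now zeroed out. A small numpy sketch of the masking step (shapes and the sentinel value are illustrative; the real constant is what spaCy exports as OOV_RANK from spacy.util):

import numpy

OOV_RANK = numpy.iinfo(numpy.uint64).max  # assumed sentinel for out-of-vocabulary rows

ids = numpy.array([2, OOV_RANK, 1], dtype="uint64")  # token row ids, one OOV
target = numpy.ones((3, 4), dtype="float32")         # stand-in for rows pulled from the vectors table

target[ids == OOV_RANK] = 0  # OOV tokens get an all-zero target instead of a junk row
print(target)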

spacy/pipeline/entity_linker.py

Lines changed: 2 additions & 1 deletion
@@ -481,7 +481,8 @@ def from_disk(

         def load_model(p):
             try:
-                self.model.from_bytes(p.open("rb").read())
+                with p.open("rb") as infile:
+                    self.model.from_bytes(infile.read())
             except AttributeError:
                 raise ValueError(Errors.E149) from None

spacy/pipeline/entityruler.py

Lines changed: 22 additions & 24 deletions
@@ -3,6 +3,7 @@
 from collections import defaultdict
 from pathlib import Path
 import srsly
+import warnings

 from .pipe import Pipe
 from ..training import Example
@@ -102,17 +103,12 @@ def __init__(
         self.overwrite = overwrite_ents
         self.token_patterns = defaultdict(list)
         self.phrase_patterns = defaultdict(list)
+        self._validate = validate
         self.matcher = Matcher(nlp.vocab, validate=validate)
-        if phrase_matcher_attr is not None:
-            if phrase_matcher_attr.upper() == "TEXT":
-                phrase_matcher_attr = "ORTH"
-            self.phrase_matcher_attr = phrase_matcher_attr
-            self.phrase_matcher = PhraseMatcher(
-                nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
-            )
-        else:
-            self.phrase_matcher_attr = None
-            self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate)
+        self.phrase_matcher_attr = phrase_matcher_attr
+        self.phrase_matcher = PhraseMatcher(
+            nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
+        )
         self.ent_id_sep = ent_id_sep
         self._ent_ids = defaultdict(dict)
         if patterns is not None:
@@ -146,7 +142,9 @@ def __call__(self, doc: Doc) -> Doc:

     def match(self, doc: Doc):
         self._require_patterns()
-        matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", message="\\[W036")
+            matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
         matches = set(
             [(m_id, start, end) for m_id, start, end in matches if start != end]
         )
@@ -281,7 +279,7 @@ def add_patterns(self, patterns: List[PatternType]) -> None:
                     current_index = i
                     break
             subsequent_pipes = [
-                pipe for pipe in self.nlp.pipe_names[current_index + 1 :]
+                pipe for pipe in self.nlp.pipe_names[current_index :]
             ]
         except ValueError:
             subsequent_pipes = []
@@ -317,20 +315,22 @@ def add_patterns(self, patterns: List[PatternType]) -> None:
                 pattern = entry["pattern"]
                 if isinstance(pattern, Doc):
                     self.phrase_patterns[label].append(pattern)
+                    self.phrase_matcher.add(label, [pattern])
                 elif isinstance(pattern, list):
                     self.token_patterns[label].append(pattern)
+                    self.matcher.add(label, [pattern])
                 else:
                     raise ValueError(Errors.E097.format(pattern=pattern))
-            for label, patterns in self.token_patterns.items():
-                self.matcher.add(label, patterns)
-            for label, patterns in self.phrase_patterns.items():
-                self.phrase_matcher.add(label, patterns)

     def clear(self) -> None:
         """Reset all patterns."""
         self.token_patterns = defaultdict(list)
         self.phrase_patterns = defaultdict(list)
         self._ent_ids = defaultdict(dict)
+        self.matcher = Matcher(self.nlp.vocab, validate=self._validate)
+        self.phrase_matcher = PhraseMatcher(
+            self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
+        )

     def _require_patterns(self) -> None:
         """Raise a warning if this component has no patterns defined."""
@@ -381,10 +381,9 @@ def from_bytes(
             self.add_patterns(cfg.get("patterns", cfg))
             self.overwrite = cfg.get("overwrite", False)
             self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None)
-            if self.phrase_matcher_attr is not None:
-                self.phrase_matcher = PhraseMatcher(
-                    self.nlp.vocab, attr=self.phrase_matcher_attr
-                )
+            self.phrase_matcher = PhraseMatcher(
+                self.nlp.vocab, attr=self.phrase_matcher_attr
+            )
             self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
         else:
             self.add_patterns(cfg)
@@ -435,10 +434,9 @@ def from_disk(
             self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
             self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)

-            if self.phrase_matcher_attr is not None:
-                self.phrase_matcher = PhraseMatcher(
-                    self.nlp.vocab, attr=self.phrase_matcher_attr
-                )
+            self.phrase_matcher = PhraseMatcher(
+                self.nlp.vocab, attr=self.phrase_matcher_attr
+            )
             from_disk(path, deserializers_patterns, {})
         return self
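
Patterns are now fed to the underlying Matcher and PhraseMatcher as they are added, instead of re-adding every stored pattern on each add_patterns call, and clear() rebuilds both matchers so no stale patterns survive a reset. A short usage sketch (labels and patterns are made up):

import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns([{"label": "ORG", "pattern": "Explosion AI"}])
ruler.add_patterns([{"label": "GPE", "pattern": [{"LOWER": "berlin"}]}])
doc = nlp("Explosion AI is based in Berlin.")
print([(ent.text, ent.label_) for ent in doc.ents])

ruler.clear()  # resets patterns and recreates the internal matchers
print(len(ruler.patterns))  # 0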

spacy/pipeline/textcat.py

Lines changed: 2 additions & 0 deletions
@@ -332,6 +332,8 @@ def initialize(
         else:
             for label in labels:
                 self.add_label(label)
+        if len(self.labels) < 2:
+            raise ValueError(Errors.E867)
         if positive_label is not None:
             if positive_label not in self.labels:
                 err = Errors.E920.format(pos_label=positive_label, labels=self.labels)

spacy/pipeline/trainable_pipe.pyx

Lines changed: 2 additions & 1 deletion
@@ -324,7 +324,8 @@ cdef class TrainablePipe(Pipe):

         def load_model(p):
             try:
-                self.model.from_bytes(p.open("rb").read())
+                with open(p, "rb") as mfile:
+                    self.model.from_bytes(mfile.read())
             except AttributeError:
                 raise ValueError(Errors.E149) from None

spacy/tests/doc/test_doc_api.py

Lines changed: 30 additions & 10 deletions
@@ -351,17 +351,25 @@ def test_doc_from_array_morph(en_vocab):

 @pytest.mark.usefixtures("clean_underscore")
 def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
-    en_texts = ["Merging the docs is fun.", "", "They don't think alike."]
+    en_texts = [
+        "Merging the docs is fun.",
+        "",
+        "They don't think alike. ",
+        "Another doc.",
+    ]
     en_texts_without_empty = [t for t in en_texts if len(t)]
     de_text = "Wie war die Frage?"
     en_docs = [en_tokenizer(text) for text in en_texts]
     en_docs[0].spans["group"] = [en_docs[0][1:4]]
     en_docs[2].spans["group"] = [en_docs[2][1:4]]
-    span_group_texts = sorted([en_docs[0][1:4].text, en_docs[2][1:4].text])
+    en_docs[3].spans["group"] = [en_docs[3][0:1]]
+    span_group_texts = sorted(
+        [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
+    )
     de_doc = de_tokenizer(de_text)
     Token.set_extension("is_ambiguous", default=False)
-    en_docs[0][2]._.is_ambiguous = True # docs
-    en_docs[2][3]._.is_ambiguous = True # think
+    en_docs[0][2]._.is_ambiguous = True  # docs
+    en_docs[2][3]._.is_ambiguous = True  # think
     assert Doc.from_docs([]) is None
     assert de_doc is not Doc.from_docs([de_doc])
     assert str(de_doc) == str(Doc.from_docs([de_doc]))
@@ -371,8 +379,8 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):

     m_doc = Doc.from_docs(en_docs)
     assert len(en_texts_without_empty) == len(list(m_doc.sents))
-    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
-    assert str(m_doc) == " ".join(en_texts_without_empty)
+    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
+    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -384,11 +392,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert not any([t._.is_ambiguous for t in m_doc[3:8]])
     assert "group" in m_doc.spans
     assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
+    assert bool(m_doc[11].whitespace_)

     m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
     assert len(en_texts_without_empty) == len(list(m_doc.sents))
-    assert len(str(m_doc)) == sum(len(t) for t in en_texts)
-    assert str(m_doc) == "".join(en_texts)
+    assert len(m_doc.text) == sum(len(t) for t in en_texts)
+    assert m_doc.text == "".join(en_texts_without_empty)
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and not bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -397,11 +406,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert m_doc[9].idx == think_idx
     assert "group" in m_doc.spans
     assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
+    assert bool(m_doc[11].whitespace_)

     m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
-    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
+    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
     # space delimiter considered, although spacy attribute was missing
-    assert str(m_doc) == " ".join(en_texts_without_empty)
+    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -414,6 +424,16 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     # can merge empty docs
     doc = Doc.from_docs([en_tokenizer("")] * 10)

+    # empty but set spans keys are preserved
+    en_docs = [en_tokenizer(text) for text in en_texts]
+    m_doc = Doc.from_docs(en_docs)
+    assert "group" not in m_doc.spans
+    for doc in en_docs:
+        doc.spans["group"] = []
+    m_doc = Doc.from_docs(en_docs)
+    assert "group" in m_doc.spans
+    assert len(m_doc.spans["group"]) == 0
+

 def test_doc_api_from_docs_ents(en_tokenizer):
     texts = ["Merging the docs is fun.", "They don't think alike."]
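
The expanded test pins down how Doc.from_docs joins texts that already end in whitespace and how it carries span groups over, including groups that are present but empty. A condensed sketch of the behaviour under test, using a blank pipeline:

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
docs = [nlp("Merging the docs is fun."), nlp("They don't think alike. ")]
docs[0].spans["group"] = [docs[0][1:4]]
docs[1].spans["group"] = []  # empty, but the key should still survive the merge

merged = Doc.from_docs(docs)
print(merged.text)                               # a single space separates docs where needed
print([s.text for s in merged.spans["group"]])   # the non-empty group's span is carried over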

spacy/tests/lang/test_initialize.py

Lines changed: 7 additions & 6 deletions
@@ -4,12 +4,13 @@

 # fmt: off
 # Only include languages with no external dependencies
-# excluded: ja, ru, th, uk, vi, zh
-LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
-             "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is",
-             "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk",
-             "sl", "sq", "sr", "sv", "ta", "te", "tl", "tn", "tr", "tt", "ur",
-             "yo"]
+# excluded: ja, ko, th, vi, zh
+LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
+             "en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi",
+             "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
+             "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
+             "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
+             "tr", "tt", "uk", "ur", "xx", "yo"]
 # fmt: on

spacy/tests/matcher/test_matcher_api.py

Lines changed: 1 addition & 0 deletions
@@ -481,6 +481,7 @@ def test_matcher_schema_token_attributes(en_vocab, pattern, text):
     assert len(matches) == 1


+@pytest.mark.filterwarnings("ignore:\\[W036")
 def test_matcher_valid_callback(en_vocab):
     """Test that on_match can only be None or callable."""
     matcher = Matcher(en_vocab)

spacy/tests/matcher/test_matcher_logic.py

Lines changed: 1 addition & 0 deletions
@@ -180,6 +180,7 @@ def test_matcher_sets_return_correct_tokens(en_vocab):
     assert texts == ["zero", "one", "two"]


+@pytest.mark.filterwarnings("ignore:\\[W036")
def test_matcher_remove():
     nlp = English()
     matcher = Matcher(nlp.vocab)
