Skip to content

Commit

Permalink
lint: Add y-015, possible typo: misspelled word
Browse files Browse the repository at this point in the history
  • Loading branch information
acabal committed Oct 31, 2024
1 parent 83bbaf2 commit 26a3b79
Show file tree
Hide file tree
Showing 8 changed files with 28 additions and 23 deletions.
37 changes: 21 additions & 16 deletions se/se_epub_lint.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,8 @@
"y-011", "Possible typo: two or more [text]’[/] in a row."
"y-012", "Possible typo: [text]”[/] directly followed by letter."
"y-013", "Possible typo: punctuation not within [text]’[/]."
"y-014", "Possible typo: Unexpected [text].[/] at the end of quotation. Hint: If a dialog tag follows, should this be [text],[/]?"
"y-014", "Possible typo: unexpected [text].[/] at the end of quotation. Hint: If a dialog tag follows, should this be [text],[/]?"
"y-015", "Possible typo: misspelled word."
"y-016", "Possible typo: consecutive periods ([text]..[/])."
"y-017", "Possible typo: [text]“[/] followed by space."
"y-018", "Possible typo: [text]‘[/] followed by space."
Expand All @@ -481,17 +482,16 @@
"y-024", "Possible typo: dash before [text]the/there/is/and/they/when[/] probably should be em-dash."
"y-025", "Possible typo: letter/comma/quote mark/letter with no intervening space."
"y-026", "Possible typo: no punctuation before conjunction [text]But/And/For/Nor/Yet/Or[/]."
"y-027", "Possible typo: Extra [text]’[/] at end of paragraph."
"y-027", "Possible typo: extra [text]’[/] at end of paragraph."
"y-028", "Possible typo: [xhtml]<abbr>[/] directly preceded or followed by letter."
"y-029", "Possible typo: Italics followed by a letter."
"y-030", "Possible typo: Lowercase quotation following a period. Check either that the period should be a comma, or that the quotation should start with a capital."
"y-031", "Possible typo: Dialog tag missing punctuation."
"y-032", "Possible typo: Italics running into preceding or following characters."
"y-033", "Possible typo: Three-em-dash obscuring an entire word, but not preceded by a space."
"y-029", "Possible typo: italics followed by a letter."
"y-030", "Possible typo: lowercase quotation following a period. Check either that the period should be a comma, or that the quotation should start with a capital."
"y-031", "Possible typo: dialog tag missing punctuation."
"y-032", "Possible typo: italics running into preceding or following characters."
"y-033", "Possible typo: three-em-dash obscuring an entire word, but not preceded by a space."
UNUSED
vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
"y-015", "Possible typo: mis-curled [text]‘[/] or missing [text]’[/]."
"y-021", "Possible typo: Opening [text]‘[/] without preceding [text]“[/]."
"y-021", "Possible typo: opening [text]‘[/] without preceding [text]“[/]."
"y-023", "Possible typo: two opening quotation marks in a run. Hint: Nested quotes should switch between [text]“[/] and [text]‘[/]"
"""

Expand Down Expand Up @@ -3002,7 +3002,12 @@ def _lint_xhtml_typo_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, file_c
# Check for period before dialog tag; try to exclude abbreviations that close a quotation, like `“<abbr>Mr.</abbr>”`.
typos = [node.to_string() for node in dom.xpath("/html/body//p[(re:test(., '\\.”\\s[a-z\\s]*?(\\bsaid|[a-z]+ed\\b)') or re:test(., '\\.”\\s(s?he|they?|we|and)\\b')) and not(.//abbr[following-sibling::node()[re:test(., '^”')]])]")]
if typos:
messages.append(LintMessage("y-014", "Possible typo: Unexpected [text].[/] at the end of quotation. Hint: If a dialog tag follows, should this be [text],[/]?", se.MESSAGE_TYPE_WARNING, filename, typos))
messages.append(LintMessage("y-014", "Possible typo: unexpected [text].[/] at the end of quotation. Hint: If a dialog tag follows, should this be [text],[/]?", se.MESSAGE_TYPE_WARNING, filename, typos))

# Check for some common OCR misspellings
typos = regex.findall(r"\bbad (?:been|seen)\b", file_contents)
if typos:
messages.append(LintMessage("y-015", "Possible typo: misspelled word.", se.MESSAGE_TYPE_WARNING, filename, typos))

# Check for two periods in a row, almost always a typo for one period or a hellip
typos = [node.to_string() for node in dom.xpath("/html/body//p[re:test(., '[^\\.]\\.\\.[^\\.]')]")]
Expand Down Expand Up @@ -3064,7 +3069,7 @@ def _lint_xhtml_typo_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, file_c
# Check for extra closing single quote at the end of dialog
typos = [node.to_string() for node in dom.xpath("/html/body//p[re:test(., '^“[^‘]+”\\s*’$')]")]
if typos:
messages.append(LintMessage("y-027", "Possible typo: Extra [text]’[/] at end of paragraph.", se.MESSAGE_TYPE_WARNING, filename, typos))
messages.append(LintMessage("y-027", "Possible typo: extra [text]’[/] at end of paragraph.", se.MESSAGE_TYPE_WARNING, filename, typos))

# Check for `<abbr>` preceded or followed by text. Ignore plurals (e.g. TVs) and compass directions followed by `ly`, like S.S.W.ly
typos = [node.to_string() for node in dom.xpath("/html/body//abbr[(preceding-sibling::node()[1])[re:test(., '[A-Za-z]$')] or (following-sibling::node()[1])[re:test(., '^[A-Za-z](?<!s\\b)') and not((./preceding-sibling::abbr[1])[contains(@epub:type, 'se:compass')] and re:test(., '^ly\\b'))]]")]
Expand All @@ -3074,29 +3079,29 @@ def _lint_xhtml_typo_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, file_c
# Check for misapplied italics. Ignore 's' because the plural is too common. i with epub:type handled by y-032.
typos = [node.to_string() for node in dom.xpath("/html/body//*[(name() = 'em' or (name() = 'i' and not(@epub:type))) and ./following-sibling::node()[1][re:test(., '^[a-z]\\b', 'i') and not(re:test(., '^s\\b'))]]")]
if typos:
messages.append(LintMessage("y-029", "Possible typo: Italics followed by a letter.", se.MESSAGE_TYPE_WARNING, filename, typos))
messages.append(LintMessage("y-029", "Possible typo: italics followed by a letter.", se.MESSAGE_TYPE_WARNING, filename, typos))

# Check for lowercase letters starting quotations after a preceding period
typos = dom.xpath("/html/body//p/child::text()[re:test(., '\\.\\s[‘“][a-z]')]")
if typos:
messages.append(LintMessage("y-030", "Possible typo: Lowercase quotation following a period. Check either that the period should be a comma, or that the quotation should start with a capital.", se.MESSAGE_TYPE_WARNING, filename, typos))
messages.append(LintMessage("y-030", "Possible typo: lowercase quotation following a period. Check either that the period should be a comma, or that the quotation should start with a capital.", se.MESSAGE_TYPE_WARNING, filename, typos))

# Check for missing punctuation in continued quotations
# ” said Bob “
nodes = dom.xpath("/html/body//p[re:test(., '”\\s(?:said|[A-Za-z]{2,}ed)\\s[A-Za-z]+?(?<!\\bthe)(?<!\\bto)(?<!\\bwith)(?<!\\bfrom)(?<!\\ba\\b)(?<!\\bis)\\s“') or re:test(., '[^.?!]”\\s(he\\b|she\\b|I\\b|[A-Z][a-z]+?)\\s(?:said|[A-Za-z]{2,}ed)\\s“') or re:test(., ',” (?:said|[A-Za-z]{2,}ed) [A-Za-z]+? [A-Za-z]+?ly “') or re:test(., '[a-z]” said s?he[,\\.;]')]")
if nodes:
messages.append(LintMessage("y-031", "Possible typo: Dialog tag missing punctuation.", se.MESSAGE_TYPE_WARNING, filename, [node.to_string() for node in nodes]))
messages.append(LintMessage("y-031", "Possible typo: dialog tag missing punctuation.", se.MESSAGE_TYPE_WARNING, filename, [node.to_string() for node in nodes]))

# Check for italics having epub:type that run into preceding or following characters
# Ignore things like <i>Newspaper</i>s
nodes = dom.xpath("/html/body//i[@epub:type and ( (following-sibling::node()[1][re:test(., '^[a-z]', 'i') and not(re:test(., '^(s|es|er)\\b'))]) or preceding-sibling::node()[1][re:test(., '[a-z]$')]) ]")
if nodes:
messages.append(LintMessage("y-032", "Possible typo: Italics running into preceding or following characters.", se.MESSAGE_TYPE_WARNING, filename, [node.to_string() for node in nodes]))
messages.append(LintMessage("y-032", "Possible typo: italics running into preceding or following characters.", se.MESSAGE_TYPE_WARNING, filename, [node.to_string() for node in nodes]))

# Check for three-em-dashes not preceded by a space
nodes = dom.xpath(f"/html/body//p[re:test(., '[^>“(\\s{se.WORD_JOINER}]{se.WORD_JOINER}?⸻')]")
if nodes:
messages.append(LintMessage("y-033", "Possible typo: Three-em-dash obscuring an entire word, but not preceded by a space.", se.MESSAGE_TYPE_WARNING, filename, [node.to_string() for node in nodes]))
messages.append(LintMessage("y-033", "Possible typo: three-em-dash obscuring an entire word, but not preceded by a space.", se.MESSAGE_TYPE_WARNING, filename, [node.to_string() for node in nodes]))

return messages

Expand Down
2 changes: 1 addition & 1 deletion tests/lint/typos/y-014/golden/y-014-out.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
y-014 [Manual Review] chapter-1.xhtml Possible typo: Unexpected `.` at the end
y-014 [Manual Review] chapter-1.xhtml Possible typo: unexpected `.` at the end
of quotation. Hint: If a dialog tag follows, should this be `,`?
<p>A pruner can hardly be considered a scrannel breath without also
being a flat. “Nowhere is it disputed that antic sailboats show us how smiles
Expand Down
2 changes: 1 addition & 1 deletion tests/lint/typos/y-027/golden/y-027-out.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
y-027 [Manual Review] chapter-1.xhtml Possible typo: Extra `’` at end of
y-027 [Manual Review] chapter-1.xhtml Possible typo: extra `’` at end of
paragraph.
<p>“We can assume that any instance of an hour can be construed as a
plaguy stretch. This is not to discredit the idea that an inept family's coin
Expand Down
2 changes: 1 addition & 1 deletion tests/lint/typos/y-029/golden/y-029-out.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
y-029 [Manual Review] chapter-1.xhtml Possible typo: Italics followed by a
y-029 [Manual Review] chapter-1.xhtml Possible typo: italics followed by a
letter.
<i>caw-caw-caw-caw</i>
<em>Is</em>
2 changes: 1 addition & 1 deletion tests/lint/typos/y-030/golden/y-030-out.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
y-030 [Manual Review] chapter-1.xhtml Possible typo: Lowercase quotation
y-030 [Manual Review] chapter-1.xhtml Possible typo: lowercase quotation
following a period. Check either that the period should be a comma, or that the
quotation should start with a capital.
To be more specific, a manx is a swordfish's edger. “the trout” of a
Expand Down
2 changes: 1 addition & 1 deletion tests/lint/typos/y-031/golden/y-031-out.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
t-005 [Manual Review] chapter-1.xhtml Dialog without ending comma.
fay” she said
y-031 [Manual Review] chapter-1.xhtml Possible typo: Dialog tag missing
y-031 [Manual Review] chapter-1.xhtml Possible typo: dialog tag missing
punctuation.
<p>“Good boy,” said old “Kiowa.” “You’d better go get some supper.”</p>
<p>He pronounced the inhibition lengthily and sonorously, so that the
Expand Down
2 changes: 1 addition & 1 deletion tests/lint/typos/y-032/golden/y-032-out.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
y-032 [Manual Review] chapter-1.xhtml Possible typo: Italics running into
y-032 [Manual Review] chapter-1.xhtml Possible typo: italics running into
preceding or following characters.
<i epub:type="se:name.publication.play">Hamlet</i>
<i epub:type="se:name.publication.book">Threatful Flood</i>
2 changes: 1 addition & 1 deletion tests/lint/typos/y-033/golden/y-033-out.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
y-033 [Manual Review] chapter-1.xhtml Possible typo: Three-em-dash obscuring an
y-033 [Manual Review] chapter-1.xhtml Possible typo: three-em-dash obscuring an
entire word, but not preceded by a space.
<p>Joy to me! Come hither! Give me thy hand⸺ha! let be! aha!⸺Disgust,
disgust, disgust⸻alas to me!</p>

0 comments on commit 26a3b79

Please sign in to comment.