Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Performance enhancements #73

Open
wants to merge 22 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,5 @@ docs/_build/

# PyBuilder
target/
.idea
.DS_Store
8 changes: 4 additions & 4 deletions edtf/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,7 @@ def trim_struct_time(st: struct_time, strip_time: bool = False) -> struct_time:
"""
if strip_time:
return struct_time(list(st[:3]) + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS)
else:
return struct_time(list(st[:6]) + TIME_EMPTY_EXTRAS)
return struct_time(list(st[:6]) + TIME_EMPTY_EXTRAS)


def struct_time_to_jd(st: struct_time) -> float:
Expand Down Expand Up @@ -116,7 +115,7 @@ def jd_to_struct_time(jd: float) -> struct_time:
return struct_time([year, month, day, hour, minute, second] + TIME_EMPTY_EXTRAS)


def _roll_negative_time_fields(year, month, day, hour, minute, second):
def _roll_negative_time_fields(year, month, day, hour, minute, second) -> tuple:
"""
Fix date/time fields which have nonsense negative values for any field
except for year by rolling the overall date/time value backwards, treating
Expand Down Expand Up @@ -152,4 +151,5 @@ def _roll_negative_time_fields(year, month, day, hour, minute, second):
year += int(month / 12.0) # Adjust by whole year in months
year -= 1 # Subtract 1 for negative minutes
month %= 12 # Convert negative month to positive remainder
return (year, month, day, hour, minute, second)

return year, month, day, hour, minute, second
144 changes: 80 additions & 64 deletions edtf/natlang/en.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
"""Utilities to derive an EDTF string from an (English) natural language string."""

import functools
import re
from datetime import datetime
from typing import Optional

from dateutil.parser import ParserError, parse

Expand All @@ -13,19 +15,45 @@
DEFAULT_DATE_1 = datetime(1234, 1, 1, 0, 0)
DEFAULT_DATE_2 = datetime(5678, 10, 10, 0, 0)

SHORT_YEAR_RE = r"(-?)([\dX])([\dX])([\dX])([\dX])"
LONG_YEAR_RE = r"Y(-?)([1-9]\d\d\d\d+)"
CENTURY_RE = r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?"
CE_RE = r"(\d{1,4}) (ad|ce|bc|bce)"
LONG_YEAR_RE = re.compile(r"y(-?)([1-9]\d\d\d\d+)")
CENTURY_RE = re.compile(r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?")
CENTURY_RANGE = re.compile(r"\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]")
CE_RE = re.compile(r"(\d{1,4}) (ad|ce|bc|bce)")
ONE_DIGIT_PARTIAL_FIRST = re.compile(r"\d\D\b")
TWO_DIGIT_PARTIAL_FIRST = re.compile(r"\d\d\b")
PARTIAL_CHECK = re.compile(r"\b\d\d\d\d$")
SLASH_YEAR = re.compile(r"(\d\d\d\d)/(\d\d\d\d)")
BEFORE_CHECK = re.compile(r"\b(?:before|earlier|avant)\b")
AFTER_CHECK = re.compile(r"\b(after|since|later|aprés|apres)\b")
APPROX_CHECK = re.compile(
r"\b(?:ca?\.? ?\d{4}|circa|approx|approximately|around|about|~\d{3,4})|^~"
)
UNCERTAIN_CHECK = re.compile(r"\b(?:uncertain|possibly|maybe|guess|\d{3,4}\?)")
UNCERTAIN_REPL = re.compile(r"(\d{4})\?")
MIGHT_BE_CENTURY = re.compile(r"(\d{2}00)s")
MIGHT_BE_DECADE = re.compile(r"(\d{3}0)s")

APPROX_CENTURY_RE = re.compile(
r"\b(ca?\.?) ?(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?"
)
UNCERTAIN_CENTURY_RE = re.compile(
r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?\?"
)

APPROX_CE_RE = re.compile(r"\b(ca?\.?) ?(\d{1,4}) (ad|ce|bc|bce)")
UNCERTAIN_CE_RE = re.compile(r"(\d{1,4}) (ad|ce|bc|bce)\?")

MENTIONS_YEAR = re.compile(r"\byear\b.+(in|during)\b")
MENTIONS_MONTH = re.compile(r"\bmonth\b.+(in|during)\b")
MENTIONS_DAY = re.compile(r"\bday\b.+(in|during)\b")

# Set of RE rules that will cause us to abort text processing, since we know
# the results will be wrong.
REJECT_RULES = (
r".*dynasty.*", # Don't parse '23rd Dynasty' to 'uuuu-uu-23'
)
REJECT_RULES = re.compile(r".*dynasty.*") # Don't parse '23rd Dynasty' to 'uuuu-uu-23'


def text_to_edtf(text):
@functools.lru_cache
def text_to_edtf(text: str) -> Optional[str]:
"""
Generate EDTF string equivalent of a given natural language date string.
"""
Expand All @@ -35,35 +63,37 @@ def text_to_edtf(text):
t = text.lower()

# try parsing the whole thing
result = text_to_edtf_date(t)
result: Optional[str] = text_to_edtf_date(t)

if not result:
# split by list delims and move fwd with the first thing that returns a non-empty string.
# TODO: assemble multiple dates into a {} or [] structure.
for split in [",", ";", "or"]:
for list_item in t.split(split):
# try parsing as an interval - split by '-'
toks = list_item.split("-")
toks: list[str] = list_item.split("-")

if len(toks) == 2:
d1 = toks[0].strip()
d2 = toks[1].strip()

# match looks from the beginning of the string, search
# looks anywhere.

if re.match(r"\d\D\b", d2): # 1-digit year partial e.g. 1868-9
if re.match(
ONE_DIGIT_PARTIAL_FIRST, d2
): # 1-digit year partial e.g. 1868-9
if re.search(
r"\b\d\d\d\d$", d1
PARTIAL_CHECK, d1
): # TODO: evaluate it and see if it's a year
d2 = d1[-4:-1] + d2
elif re.match(r"\d\d\b", d2): # 2-digit year partial e.g. 1809-10
if re.search(r"\b\d\d\d\d$", d1):
elif re.match(
TWO_DIGIT_PARTIAL_FIRST, d2
): # 2-digit year partial e.g. 1809-10
if re.search(PARTIAL_CHECK, d1):
d2 = d1[-4:-2] + d2
else:
century_range_match = re.search(
r"\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]",
f"{d1}-{d2}",
)
century_range_match = re.search(CENTURY_RANGE, f"{d1}-{d2}")
if century_range_match:
g = century_range_match.groups()
d1 = f"{g[0]}C"
Expand All @@ -73,7 +103,7 @@ def text_to_edtf(text):
r2 = text_to_edtf_date(d2)

if r1 and r2:
result = r1 + "/" + r2
result = f"{r1}/{r2}"
return result

# is it an either/or year "1838/1862" - that has a different
Expand All @@ -82,7 +112,7 @@ def text_to_edtf(text):
# This whole section could be more friendly.

else:
int_match = re.search(r"(\d\d\d\d)\/(\d\d\d\d)", list_item)
int_match = re.search(SLASH_YEAR, list_item)
if int_match:
return f"[{int_match.group(1)}, {int_match.group(2)}]"

Expand All @@ -92,21 +122,19 @@ def text_to_edtf(text):
if result:
break

is_before = re.findall(r"\bbefore\b", t)
is_before = is_before or re.findall(r"\bearlier\b", t)

is_after = re.findall(r"\bafter\b", t)
is_after = is_after or re.findall(r"\bsince\b", t)
is_after = is_after or re.findall(r"\blater\b", t)
is_before = re.findall(BEFORE_CHECK, t)
is_after = re.findall(AFTER_CHECK, t)

if is_before:
result = f"/{result}" # unknown is replaced with null for intervals
result = f"/{result}"
elif is_after:
result = f"{result}/" # unknown is replaced with null for intervals
result = f"{result}/"

return result


def text_to_edtf_date(text):
@functools.lru_cache
def text_to_edtf_date(text: str) -> Optional[str]:
"""
Return EDTF string equivalent of a given natural language date string.

Expand All @@ -115,37 +143,28 @@ def text_to_edtf_date(text):
differ are undefined.
"""
if not text:
return
return None

t = text.lower()
result = ""
result: str = ""

for reject_re in REJECT_RULES:
if re.match(reject_re, t):
return
if re.match(REJECT_RULES, t):
return None

# matches on '1800s'. Needs to happen before is_decade.
could_be_century = re.findall(r"(\d{2}00)s", t)
could_be_century: list = re.findall(MIGHT_BE_CENTURY, t)
# matches on '1800s' and '1910s'. Removes the 's'.
# Needs to happen before is_uncertain because e.g. "1860s?"
t, is_decade = re.subn(r"(\d{3}0)s", r"\1", t)
t, is_decade = re.subn(MIGHT_BE_DECADE, r"\1", t)

# detect approximation signifiers
# a few 'circa' abbreviations just before the year
is_approximate = re.findall(r"\b(ca?\.?) ?\d{4}", t)
is_approximate = re.findall(APPROX_CHECK, t)
# the word 'circa' anywhere
is_approximate = is_approximate or re.findall(r"\bcirca\b", t)
# the word 'approx'/'around'/'about' anywhere
is_approximate = is_approximate or re.findall(r"\b(approx|around|about)", t)
# a ~ before a year-ish number
is_approximate = is_approximate or re.findall(r"\b~\d{4}", t)
# a ~ at the beginning
is_approximate = is_approximate or re.findall(r"^~", t)

# detect uncertainty signifiers
t, is_uncertain = re.subn(r"(\d{4})\?", r"\1", t)
# the words uncertain/maybe/guess anywhere
is_uncertain = is_uncertain or re.findall(r"\b(uncertain|possibly|maybe|guess)", t)
t, is_uncertain = re.subn(UNCERTAIN_REPL, r"\1", t)
is_uncertain = is_uncertain or re.findall(UNCERTAIN_CHECK, t)

# detect century forms
is_century = re.findall(CENTURY_RE, t)
Expand All @@ -154,31 +173,28 @@ def text_to_edtf_date(text):
is_ce = re.findall(CE_RE, t)
if is_century:
result = "%02dXX" % (int(is_century[0][0]) - 1,)
is_approximate = is_approximate or re.findall(r"\b(ca?\.?) ?" + CENTURY_RE, t)
is_uncertain = is_uncertain or re.findall(CENTURY_RE + r"\?", t)
is_approximate = is_approximate or re.findall(APPROX_CENTURY_RE, t)
is_uncertain = is_uncertain or re.findall(UNCERTAIN_CENTURY_RE, t)

try:
is_bc = is_century[0][-1] in ("bc", "bce")
if is_bc:
if is_century[0][-1] in ("bc", "bce"):
result = f"-{result}"
except IndexError:
pass

elif is_ce:
result = "%04d" % (int(is_ce[0][0]))
is_approximate = is_approximate or re.findall(r"\b(ca?\.?) ?" + CE_RE, t)
is_uncertain = is_uncertain or re.findall(CE_RE + r"\?", t)
is_approximate = is_approximate or re.findall(APPROX_CE_RE, t)
is_uncertain = is_uncertain or re.findall(UNCERTAIN_CE_RE, t)

try:
is_bc = is_ce[0][-1] in ("bc", "bce")
if is_bc:
if is_ce[0][-1] in ("bc", "bce"):
result = f"-{result}"
except IndexError:
pass

else:
# try dateutil.parse

try:
# parse twice, using different defaults to see what was
# parsed and what was guessed.
Expand All @@ -205,34 +221,34 @@ def text_to_edtf_date(text):

if dt1.date() == DEFAULT_DATE_1.date() and dt2.date() == DEFAULT_DATE_2.date():
# couldn't parse anything - defaults are untouched.
return
return None

date1 = dt1.isoformat()[:10]
date2 = dt2.isoformat()[:10]

# guess precision of 'unspecified' characters to use
mentions_year = re.findall(r"\byear\b.+(in|during)\b", t)
mentions_month = re.findall(r"\bmonth\b.+(in|during)\b", t)
mentions_day = re.findall(r"\bday\b.+(in|during)\b", t)
mentions_year = re.findall(MENTIONS_YEAR, t)
mentions_month = re.findall(MENTIONS_MONTH, t)
mentions_day = re.findall(MENTIONS_DAY, t)

for i in range(len(date1)):
# if the given year could be a century (e.g. '1800s') then use
# approximate/uncertain markers to decide whether we treat it as
# a century or a decade.
if i == 2 and could_be_century and not (is_approximate or is_uncertain):
result += "X"
elif i == 3 and is_decade > 0:
elif i == 3 and is_decade:
if mentions_year:
result += "X" # previously year precision - now just X
result += "X" # year precision
else:
result += "X" # previously decade precision - now just X
result += "X" # decade precision
elif date1[i] == date2[i]:
# since both attempts at parsing produced the same result
# it must be parsed value, not a default
result += date1[i]
else:
# different values were produced, meaning that it's likely
# a default. Use 'X'
# a default. Use 'unspecified'
result += "X"

# strip off unknown chars from end of string - except the first 4
Expand Down
Loading
Loading