-
Notifications
You must be signed in to change notification settings - Fork 19
/
en.py
278 lines (227 loc) · 9.63 KB
/
en.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
"""Utilities to derive an EDTF string from an (English) natural language string."""
import re
from datetime import datetime
from dateutil.parser import ParserError, parse
from six.moves import xrange
from edtf import appsettings
# Sentinel defaults handed to the dateutil parser. Every digit of the two ISO
# date representations differs (1234-01-01 vs 5678-10-10), and one date is in
# the past while the other is in the future. Parsing the same text against
# both defaults and comparing the results reveals which elements were not
# actually parsed from the text.
DEFAULT_DATE_1 = datetime(year=1234, month=1, day=1)
DEFAULT_DATE_2 = datetime(year=5678, month=10, day=10)

# Optional minus sign followed by exactly four characters, each a digit or 'X'.
SHORT_YEAR_RE = r"(-?)([\dX])([\dX])([\dX])([\dX])"
# 'Y', optional minus sign, then five or more digits with a non-zero lead.
LONG_YEAR_RE = r"Y(-?)([1-9]\d\d\d\d+)"
# One or two digits followed by 'c'/'c.' or an ordinal suffix plus ' century',
# optionally followed by an era marker.
CENTURY_RE = r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?"
# One to four digits, a space, and an era marker.
CE_RE = r"(\d{1,4}) (ad|ce|bc|bce)"

# Patterns that make us abort text processing altogether, because we know the
# results would be wrong.
REJECT_RULES = (
    r".*dynasty.*",  # Don't parse '23rd Dynasty' to 'uuuu-uu-23'
)
def text_to_edtf(text):
    """
    Generate EDTF string equivalent of a given natural language date string.

    The lower-cased text is first handed to ``text_to_edtf_date`` as a whole.
    If that yields nothing, the text is split on each list delimiter in turn
    (",", ";", "or") and every fragment is tried, in order, as:

    * an interval ``a-b``: returned immediately as ``"<a>/<b>"`` when both
      ends parse. Abbreviated end years are expanded from a four-digit start
      year ("1868-9", "1809-10") and century ranges are rewritten to the
      "<nn>C" form before parsing;
    * an either/or pair of four-digit years ("1838/1862"): returned
      immediately as ``"[1838, 1862]"``;
    * a single date.

    If a date was found, the words "before"/"earlier" anywhere in the text
    turn it into ``"/<date>"`` and "after"/"since"/"later" into
    ``"<date>/"``.

    Returns the EDTF string, or ``None`` when ``text`` is empty or no date
    could be derived from it.
    """
    if not text:
        return None

    t = text.lower()

    # try parsing the whole thing
    result = text_to_edtf_date(t)

    if not result:
        # split by list delims and move fwd with the first thing that returns
        # a non-empty string.
        # TODO: assemble multiple dates into a {} or [] structure.
        for delimiter in (",", ";", "or"):
            for list_item in t.split(delimiter):
                # try parsing as an interval - split by '-'
                toks = list_item.split("-")
                if len(toks) == 2:
                    d1 = toks[0].strip()
                    d2 = toks[1].strip()

                    # match looks from the beginning of the string, search
                    # looks anywhere.
                    if re.match(r"\d\b", d2):
                        # 1-digit year partial e.g. 1868-9. The digit must be
                        # followed by a word boundary (not by a mandatory
                        # extra character), otherwise a bare "9" never
                        # qualifies.
                        # TODO: evaluate d1 and see if it's a year
                        if re.search(r"\b\d\d\d\d$", d1):
                            d2 = d1[-4:-1] + d2
                    elif re.match(r"\d\d\b", d2):
                        # 2-digit year partial e.g. 1809-10
                        if re.search(r"\b\d\d\d\d$", d1):
                            d2 = d1[-4:-2] + d2
                    else:
                        century_range_match = re.search(
                            r"\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]",
                            f"{d1}-{d2}",
                        )
                        if century_range_match:
                            g = century_range_match.groups()
                            d1 = f"{g[0]}C"
                            d2 = f"{g[2]}C"

                    r1 = text_to_edtf_date(d1)
                    r2 = text_to_edtf_date(d2)

                    if r1 and r2:
                        # Intervals are returned as-is, without the
                        # before/after handling below.
                        return r1 + "/" + r2
                else:
                    # is it an either/or year "1838/1862" - that has a
                    # different representation in EDTF. If it's 'both', then
                    # we use {}. If it's 'or' then we use []. Assuming the
                    # latter for now.
                    # This whole section could be more friendly.
                    int_match = re.search(r"(\d\d\d\d)\/(\d\d\d\d)", list_item)
                    if int_match:
                        return f"[{int_match.group(1)}, {int_match.group(2)}]"

                # not an interval or year pair: try the fragment on its own
                result = text_to_edtf_date(list_item)
                if result:
                    break
            if result:
                break

    if not result:
        # Nothing could be parsed. Bail out here so that the open-interval
        # formatting below never produces the strings "/None" or "None/".
        return None

    # "before" style keywords take precedence over "after" style keywords.
    if re.search(r"\b(before|earlier)\b", t):
        return f"/{result}"  # unknown is replaced with null for intervals
    if re.search(r"\b(after|since|later)\b", t):
        return f"{result}/"  # unknown is replaced with null for intervals

    return result
def text_to_edtf_date(text):
    """
    Return EDTF string equivalent of a given natural language date string.

    The text is lower-cased and rejected outright if it matches one of
    ``REJECT_RULES``. Approximation markers ("ca.", "circa", "about", "~", ...)
    and uncertainty markers ("?" after a four-digit year, "maybe",
    "possibly", ...) are detected, then the date is derived in one of three
    ways, tried in this order:

    1. a century form matching ``CENTURY_RE``;
    2. a year with an era marker matching ``CE_RE``;
    3. ``dateutil`` parsing. The approach here is to parse the text twice,
       with different default dates, then compare the results to see what
       differs - the parts that differ are undefined and become ``X``.

    Returns the EDTF string, with ``?`` (uncertain), ``~`` (approximate) or
    ``%`` (both) appended where detected, or ``None`` if the text is empty,
    rejected, or nothing could be parsed from it.
    """
    if not text:
        return None

    t = text.lower()

    for reject_re in REJECT_RULES:
        if re.match(reject_re, t):
            return None

    # matches on '1800s'. Needs to happen before the decade substitution
    # below, because that removes the trailing 's'.
    could_be_century = re.findall(r"(\d{2}00)s", t)
    # matches on '1800s' and '1910s'. Removes the 's'.
    # Needs to happen before is_uncertain because e.g. "1860s?"
    t, is_decade = re.subn(r"(\d{3}0)s", r"\1", t)

    # detect approximation signifiers
    approximate_patterns = (
        r"\b(ca?\.?) ?\d{4}",  # a few 'circa' abbreviations just before the year
        r"\bcirca\b",  # the word 'circa' anywhere
        r"\b(approx|around|about)",  # 'approx'/'around'/'about' anywhere
        # a ~ before a year-ish number. No leading \b here: '~' is not a word
        # character, so a word boundary in front of it would only match when
        # the '~' is glued to a preceding word character.
        r"~\d{4}",
        r"^~",  # a ~ at the beginning
    )
    is_approximate = any(re.search(pattern, t) for pattern in approximate_patterns)

    # detect uncertainty signifiers: a '?' right after a 4-digit number is
    # removed from the text and counted.
    t, is_uncertain = re.subn(r"(\d{4})\?", r"\1", t)
    # the words uncertain/possibly/maybe/guess anywhere
    is_uncertain = is_uncertain or re.findall(r"\b(uncertain|possibly|maybe|guess)", t)

    # detect century forms
    is_century = re.findall(CENTURY_RE, t)
    # detect CE/BCE year form
    is_ce = re.findall(CE_RE, t)

    if is_century:
        # First match only; first group is the century number, last group is
        # the (possibly empty) era marker.
        century, era = is_century[0][0], is_century[0][-1]
        result = "%02dXX" % (int(century) - 1,)
        is_approximate = is_approximate or re.findall(r"\b(ca?\.?) ?" + CENTURY_RE, t)
        is_uncertain = is_uncertain or re.findall(CENTURY_RE + r"\?", t)
        if era in ("bc", "bce"):
            result = f"-{result}"

    elif is_ce:
        # First match only; first group is the year, last group is the era.
        year, era = is_ce[0][0], is_ce[0][-1]
        result = "%04d" % (int(year))
        is_approximate = is_approximate or re.findall(r"\b(ca?\.?) ?" + CE_RE, t)
        is_uncertain = is_uncertain or re.findall(CE_RE + r"\?", t)
        if era in ("bc", "bce"):
            result = f"-{result}"

    else:
        # try dateutil.parse
        try:
            # parse twice, using different defaults to see what was
            # parsed and what was guessed.
            dt1 = parse(
                t,
                dayfirst=appsettings.DAY_FIRST,
                yearfirst=False,
                fuzzy=True,  # force a match, even if it's default date
                default=DEFAULT_DATE_1,
            )
            dt2 = parse(
                t,
                dayfirst=appsettings.DAY_FIRST,
                yearfirst=False,
                fuzzy=True,  # force a match, even if it's default date
                default=DEFAULT_DATE_2,
            )
        except ParserError:
            # dateutil found nothing it could interpret as a date
            return None
        except Exception:
            # Deliberately best-effort: any other parser failure also means
            # "no date", never a crash for the caller.
            return None

        if dt1.date() == DEFAULT_DATE_1.date() and dt2.date() == DEFAULT_DATE_2.date():
            # couldn't parse anything - defaults are untouched.
            return None

        date1 = dt1.isoformat()[:10]
        date2 = dt2.isoformat()[:10]

        # guess precision of 'unspecified' characters to use
        mentions_month = re.findall(r"\bmonth\b.+(in|during)\b", t)
        mentions_day = re.findall(r"\bday\b.+(in|during)\b", t)

        chars = []
        for i, (c1, c2) in enumerate(zip(date1, date2)):
            if i == 2 and could_be_century and not (is_approximate or is_uncertain):
                # if the given year could be a century (e.g. '1800s') then
                # use approximate/uncertain markers to decide whether we
                # treat it as a century or a decade.
                chars.append("X")
            elif i == 3 and is_decade > 0:
                # last digit of a decade ('1910s') is unspecified
                chars.append("X")
            elif c1 == c2:
                # since both attempts at parsing produced the same result
                # it must be parsed value, not a default
                chars.append(c1)
            else:
                # different values were produced, meaning that it's likely
                # a default. Use 'X'
                chars.append("X")
        result = "".join(chars)

        # strip off unknown chars from end of string - except the first 4
        # (or 7 / 10 when the text talks about a month / day)
        for i in reversed(range(len(result))):
            if result[i] not in ("X", "-"):
                smallest_length = 4
                if mentions_month:
                    smallest_length = 7
                if mentions_day:
                    smallest_length = 10
                result = result[: max(smallest_length, i + 1)]
                break

        # check for seasons: the first matching entry wins and replaces the
        # month part of the result with its season code
        seasons = (
            (("spring",), "-21"),
            (("summer",), "-22"),
            (("autumn", "fall"), "-23"),
            (("winter",), "-24"),
        )
        for keywords, season_code in seasons:
            if any(keyword in t for keyword in keywords):
                result = result[:4] + season_code + result[7:]
                break
        # end dateutil post-parsing

    if is_uncertain and is_approximate:
        result += "%"
    elif is_uncertain:
        result += "?"
    elif is_approximate:
        result += "~"

    # weed out bad parses
    # NOTE(review): results built above start with a four-character year (or
    # '-'), so this prefix does not look reachable - confirm the intended
    # pattern.
    if result.startswith("XX-XX"):
        return None

    return result