Skip to content

Commit

Permalink
Merge pull request #42 from szegedai/realistic_year_fix
Browse files Browse the repository at this point in the history
Realistic year fix v2
  • Loading branch information
nsomabalint authored Apr 25, 2024
2 parents d76df0a + e491c18 commit dfe55f2
Show file tree
Hide file tree
Showing 7 changed files with 130 additions and 78 deletions.
11 changes: 6 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,18 +86,19 @@ The following formats are currently supported:
### Setting search scope in case of ambiguous input

For the function `text2datetime`, the parameter `search_scope` is used to specify the desired time interval for parsing inputs.

- The default value, `SearchScopes.PRACTICAL_NOT_RESTRICTED`, does not restrict whether the scope of the search is in the past or the future. However, to minimize false matches, it generally restricts year mentions to be between 1900 and 2100.
- The default value, `SearchScopes.NOT_RESTRICTED`, does not restrict whether the scope of the search is in the past or the future.
- For example, when Tuesday is parsed, the date for the Tuesday of the given week will be returned, without considering whether that date is in the past or the future.
- As the number 3000 is unlikely to be considered a year value in everyday contexts, it is ignored.
- The output datetime can still be later than 2100 or earlier than 1900 with mentions of 'Counted Time Frames' (e.g., `100000 nap múlva`).
- `SearchScopes.NOT_RESTRICTED` is similar to `SearchScopes.PRACTICAL_NOT_RESTRICTED` but applies no restrictions at all regarding the range of valid years.
- To prefer future dates in case of ambiguity, use the value `SearchScopes.FUTURE_DAY`.
- In this case, when Tuesday is parsed, the function will return the nearest Tuesday in the future, not necessarily the current week's Tuesday.
- Similarly, to prefer past dates in case of ambiguity, use the value `SearchScopes.PAST_SEARCH`.
- For instance, if May is parsed with this setting while this year's May is still in the future, last year's May will be returned.
- Please note that when there's no ambiguity, the function can still return future or past dates, even when a different preference is specified.

The flag `realistic_year_required` can be set to minimize false matches: when enabled, it generally restricts year mentions to be between 1900 and 2100.
- It defaults to true.
- As the number 3000 is unlikely to be considered a year value in everyday contexts, it is ignored.
- The output datetime can still be later than 2100 or earlier than 1900 with mentions of 'Counted Time Frames' (e.g., `100000 nap múlva`).

An example:
```python
from hun_date_parser import text2datetime
Expand Down
2 changes: 1 addition & 1 deletion hun_date_parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@

__all__ = ["DatetimeTextualizer", "DatetimeExtractor", "datetime2text", "text2datetime", "text2date", "text2time"]

__version__ = "0.2.5"
__version__ = "0.2.6"
11 changes: 8 additions & 3 deletions hun_date_parser/date_parser/date_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,17 @@


def match_iso_date(s: str,
search_scope: SearchScopes = SearchScopes.NOT_RESTRICTED) -> List[Dict[str, Any]]:
realistic_year_restriction: bool = True) -> List[Dict[str, Any]]:
"""
Match ISO date-like format.
:param s: textual input
:param realistic_year_restriction: whether to restrict year candidates to the 1900-2100 range
:return: tuple of date parts
"""

pattern = r'\b\d{4} (darab|forint|huf|eur|usd|ft|fo)\b'
s = re.sub(pattern, '', s.lower())

match = re.findall(R_ISO_DATE, s)
match_rev = re.findall(R_REV_ISO_DATE, s)

Expand All @@ -35,7 +40,7 @@ def match_iso_date(s: str,
for group in match_rev:
group = [int(m.lstrip('0')) for m in group if m.lstrip('0')]

if search_scope == SearchScopes.PRACTICAL_NOT_RESTRICTED and not is_year_realistic(group[2]):
if realistic_year_restriction and not is_year_realistic(group[2]):
continue

res.append({'match': group,
Expand All @@ -46,7 +51,7 @@ def match_iso_date(s: str,
for group in match:
group = [int(m.lstrip('0')) for m in group if m.lstrip('0')]

if search_scope == SearchScopes.PRACTICAL_NOT_RESTRICTED and not is_year_realistic(group[0]):
if realistic_year_restriction and not is_year_realistic(group[0]):
continue

if len(group) == 1:
Expand Down
53 changes: 37 additions & 16 deletions hun_date_parser/date_parser/datetime_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,55 +32,68 @@


def text2datetime(input_sentence: str, now: datetime = datetime.now(),
search_scope: SearchScopes = SearchScopes.PRACTICAL_NOT_RESTRICTED) -> List[Dict[str, datelike]]:
search_scope: SearchScopes = SearchScopes.NOT_RESTRICTED,
realistic_year_required: bool = True) -> List[Dict[str, datelike]]:
"""
Returns the list of datetime intervals found in the input sentence.
:param input_sentence: Input sentence string.
:param now: Current timestamp to calculate relative dates.
:param search_scope: Defines whether the timeframe should be restricted to past or future.
:param realistic_year_required: Defines whether to restrict year candidates to be between 1900 and 2100.
:return: list of datetime interval dictionaries
"""
datetime_extractor = DatetimeExtractor(now=now, output_container='datetime', search_scope=search_scope)
datetime_extractor = DatetimeExtractor(now=now,
output_container='datetime',
search_scope=search_scope,
realistic_year_required=realistic_year_required)
return datetime_extractor.parse_datetime(sentence=input_sentence)


def text2date(input_sentence: str, now: datetime = datetime.now(),
search_scope: SearchScopes = SearchScopes.PRACTICAL_NOT_RESTRICTED) -> List[Dict[str, datelike]]:
search_scope: SearchScopes = SearchScopes.NOT_RESTRICTED,
realistic_year_required: bool = True) -> List[Dict[str, datelike]]:
"""
Returns the list of date intervals found in the input sentence.
:param input_sentence: Input sentence string.
:param now: Current timestamp to calculate relative dates.
:param search_scope: Defines whether the timeframe should be restricted to past or future.
:param realistic_year_required: Defines whether to restrict year candidates to be between 1900 and 2100.
:return: list of date interval dictionaries
"""
datetime_extractor = DatetimeExtractor(now=now, output_container='date', search_scope=search_scope)
datetime_extractor = DatetimeExtractor(now=now, output_container='date',
search_scope=search_scope, realistic_year_required=realistic_year_required)
return datetime_extractor.parse_datetime(sentence=input_sentence)


def text2time(input_sentence: str, now: datetime = datetime.now(),
search_scope: SearchScopes = SearchScopes.PRACTICAL_NOT_RESTRICTED) -> List[Dict[str, datelike]]:
search_scope: SearchScopes = SearchScopes.NOT_RESTRICTED,
realistic_year_required: bool = True) -> List[Dict[str, datelike]]:
"""
Returns the list of time intervals found in the input sentence.
:param input_sentence: Input sentence string.
:param now: Current timestamp to calculate relative dates.
:param search_scope: Defines whether the timeframe should be restricted to past or future.
:param realistic_year_required: Defines whether to restrict year candidates to be between 1900 and 2100.
:return: list of time interval dictionaries
"""
datetime_extractor = DatetimeExtractor(now=now, output_container='time', search_scope=search_scope)
datetime_extractor = DatetimeExtractor(now=now, output_container='time',
search_scope=search_scope, realistic_year_required=realistic_year_required)
return datetime_extractor.parse_datetime(sentence=input_sentence)


def match_rules(now: datetime, sentence: str,
search_scope: SearchScopes = SearchScopes.PRACTICAL_NOT_RESTRICTED) -> List:
search_scope: SearchScopes = SearchScopes.NOT_RESTRICTED,
realistic_year_required: bool = True) -> List:
"""
Matches all rules against input text.
:param now: Current timestamp to calculate relative dates.
:param sentence: Input sentence.
:param search_scope: Defines whether the timeframe should be restricted to past or future.
:param realistic_year_required: Defines whether to restrict year candidates to be between 1900 and 2100.
:return: Parsed date and time classes.
"""
matches = [*match_named_month(sentence, now, search_scope),
*match_iso_date(sentence, search_scope),
*match_iso_date(sentence, realistic_year_required),
*match_relative_day(sentence, now),
*match_weekday(sentence, now, search_scope),
*match_week(sentence, now),
Expand All @@ -100,13 +113,15 @@ def match_rules(now: datetime, sentence: str,


def match_duration_rules(now: datetime, sentence: str,
search_scope: SearchScopes = SearchScopes.PRACTICAL_NOT_RESTRICTED) -> List:
search_scope: SearchScopes = SearchScopes.NOT_RESTRICTED,
realistic_year_required: bool = True) -> List:
"""
Given that it has already been established that a duration is being parsed, matches all
duration-specific rules against the input text.
:param now: Current timestamp to calculate relative dates.
:param sentence: Input sentence.
:param search_scope: Defines whether the timeframe should be restricted to past or future.
:param realistic_year_required: Defines whether to restrict year candidates to be between 1900 and 2100.
:return: Parsed date and time classes.
"""
matches = [
Expand Down Expand Up @@ -150,18 +165,21 @@ class DatetimeExtractor:
"""

def __init__(self, now: datetime = datetime.now(), output_container: str = 'datetime',
search_scope: SearchScopes = SearchScopes.PRACTICAL_NOT_RESTRICTED) -> None:
search_scope: SearchScopes = SearchScopes.NOT_RESTRICTED,
realistic_year_required: bool = True) -> None:
"""
:param now: Current timestamp to calculate relative dates.
:param output_container: datetime object to populate with datetime parts
:param search_scope: Defines whether the timeframe should be restricted to past or future.
:param realistic_year_required: Defines whether to restrict year candidates to be between 1900 and 2100.
"""
self.now = now
self.output_container = output_container
self.search_scope = search_scope
self.realistic_year_required = realistic_year_required

def _get_implicit_intervall(self, sentence_part: str):
matches = match_rules(self.now, sentence_part, self.search_scope)
matches = match_rules(self.now, sentence_part, self.search_scope, self.realistic_year_required)
return [{'start_date': matches, 'end_date': matches}]

@return_on_value_error(None)
Expand Down Expand Up @@ -350,9 +368,9 @@ def parse_datetime(self, sentence: str) -> List[Dict[str, datelike]]:
# end_date: parse_date(jovo kedd)
if interval and not duration_parts:
interval['start_date'] = 'OPEN' if interval['start_date'] == 'OPEN' else match_rules(self.now, interval[
'start_date'], self.search_scope)
'start_date'], self.search_scope, self.realistic_year_required)
interval['end_date'] = 'OPEN' if interval['end_date'] == 'OPEN' else match_rules(self.now, interval[
'end_date'], self.search_scope)
'end_date'], self.search_scope, self.realistic_year_required)
parsed_dates.append(interval)

# ... another way of explicitly expressing time intervals is with a start date and a duration
Expand All @@ -363,9 +381,12 @@ def parse_datetime(self, sentence: str) -> List[Dict[str, datelike]]:
elif duration_parts:
from_part, duration_part = duration_parts

interval['start_date'] = match_rules(self.now, from_part, self.search_scope)
interval['end_date'] = match_rules(self.now, from_part, self.search_scope) + match_duration_rules(
self.now, duration_part, self.search_scope)
interval['start_date'] = match_rules(self.now, from_part, self.search_scope,
self.realistic_year_required)
interval['end_date'] = match_rules(self.now, from_part,
self.search_scope,
self.realistic_year_required) + match_duration_rules(
self.now, duration_part, self.search_scope, self.realistic_year_required)
parsed_dates.append(interval)

# ... else try to determine a time interval implicitly.
Expand Down
1 change: 0 additions & 1 deletion hun_date_parser/utils/general_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,6 @@ class EndDay(DateTimePartConatiner):

class SearchScopes(Enum):
NOT_RESTRICTED = "not_restricted"
PRACTICAL_NOT_RESTRICTED = "practical_not_restricted"
PAST_SEARCH = "past_search"
FUTURE_DAY = "future_day"

Expand Down
Loading

0 comments on commit dfe55f2

Please sign in to comment.