Skip to content

Commit

Permalink
feat(linker): allow parsing of roman numerals
Browse files Browse the repository at this point in the history
  • Loading branch information
nsantacruz committed Oct 30, 2024
1 parent f0f4961 commit e65a9cc
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 3 deletions.
8 changes: 8 additions & 0 deletions sefaria/model/linker/tests/linker_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def test_resolved_raw_ref_clone():


@pytest.mark.parametrize(('resolver_data', 'expected_trefs'), [
[crrd(['@Job', '#xli.', '#5'], lang='en'), ("Job 41:5",)],
# Numbered JAs
[crrd(["@Jerusalem", "@Talmud", "@Yoma", "#5a"], lang='en'), ("Jerusalem Talmud Yoma 1:1:20-25",)],
[crrd(["@Babylonian", "@Talmud", "@Sukkah", "#49b"], lang='en'), ("Sukkah 49b",)],
Expand All @@ -47,6 +48,13 @@ def test_resolved_raw_ref_clone():
[crrd(['@ספר בראשית', '#פסוק א', '#פרק יג']), ("Genesis 13:1",)], # sections out of order
[crrd(['@שמות', '#א', '#ב']), ("Exodus 1:2",)], # used to also match Exodus 2:1 b/c would allow mixing integer parts
# Roman numerals
[crrd(['@Job', '#III', '#5'], lang='en'), ("Job 3:5",)],
[crrd(['@Job', '#ix', '#5'], lang='en'), ("Job 9:5",)],
[crrd(['@Job', '#IV .', '#5'], lang='en'), ("Job 4:5",)],
[crrd(['@Job', '#CIV', '#5'], lang='en'), tuple()], # too high
[crrd(['@Job', '#iiii', '#5'], lang='en'), tuple()], # invalid roman numeral
# Amud split into two parts
[crrd(['@בבלי', '@יבמות', '#סא', '#א']), ("Yevamot 61a",)],
[crrd(["@תוספות", "@פסחים", "#קו", "#א"]), ("Tosafot on Pesachim 106a",)], # amud for commentary that has DH
Expand Down
26 changes: 23 additions & 3 deletions sefaria/model/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -2097,7 +2097,10 @@ def is_special_case(self, s):
def to_numeric_possibilities(self, lang, s, **kwargs):
    """
    Return every numeric value that the address string `s` may stand for.

    :param lang: language code, forwarded unchanged to `toNumber`
    :param s: address string to interpret
    :param kwargs: accepted for interface compatibility; not used here
    :return: the `special_cases` entry for `s` when one exists; otherwise a
        one-element list holding `toNumber(lang, s)`, or an empty list when
        `toNumber` raises ValueError
    """
    if s in self.special_cases:
        return self.special_cases[s]
    try:
        return [self.toNumber(lang, s)]
    except ValueError:
        # A string that cannot be converted has no numeric reading; report
        # that as "no possibilities" rather than propagating the error.
        return []

@classmethod
def can_match_out_of_order(cls, lang, s):
Expand Down Expand Up @@ -2141,7 +2144,7 @@ def get_all_possible_sections_from_string(cls, lang, s, fromSections=None, strip
section_str = curr_s
else:
strict = SuperClass not in {AddressAmud, AddressTalmud} # HACK: AddressTalmud doesn't inherit from AddressInteger so it relies on flexibility of not matching "Daf"
regex_str = addr.regex(lang, strict=strict, group_id='section') + "$" # must match entire string
regex_str = addr.regex(lang, strict=strict, group_id='section', with_roman_numerals=True) + "$" # must match entire string
if regex_str is None: continue
reg = regex.compile(regex_str, regex.VERBOSE)
match = reg.match(curr_s)
Expand Down Expand Up @@ -2584,7 +2587,11 @@ def _core_regex(self, lang, group_id=None, **kwargs):
reg = r"("

if lang == "en":
reg += r"\d+)"
if kwargs.get('with_roman_numerals', False):
# any char valid in roman numerals (I, V, X, L, C, D, M) + optional trailing period
reg += r"(?:\d+|[ivxlcdmIVXLCDM]+(?:\s?\.)?))"
else:
reg += r"\d+)"
elif lang == "he":
reg += self.hebrew_number_regex() + r")"

Expand All @@ -2596,6 +2603,19 @@ def toNumber(self, lang, s, **kwargs):
elif lang == "he":
return decode_hebrew_numeral(s)

def to_numeric_possibilities(self, lang, s, **kwargs):
    """
    Extend the inherited numeric readings of `s` with its Roman-numeral value.

    For English input, `s` is additionally parsed as a Roman numeral
    (case-insensitive, with one optional trailing period). Strings that are
    not valid Roman numerals contribute nothing extra.

    :param lang: language code; Roman numerals are only tried for "en"
    :param s: address string to interpret
    :param kwargs: forwarded unchanged to the inherited implementation
    :return: list of candidate numeric values
    """
    # Copy before appending: the inherited method may hand back a list it
    # shares with other callers (presumably a special-cases entry -- confirm),
    # and appending in place would silently alter it.
    possibilities = list(super().to_numeric_possibilities(lang, s, **kwargs))
    if lang != "en":
        return possibilities

    import roman  # kept as a function-local import, as in the original

    # Drop one optional trailing period and any whitespace around it,
    # so "IV", "IV.", "IV ." and "IV. " all normalise to "IV".
    numeral = re.sub(r"\s*\.\s*$", "", s).strip()
    try:
        possibilities.append(roman.fromRoman(numeral.upper()))
    except roman.InvalidRomanNumeralError:
        # Not a well-formed Roman numeral (e.g. "iiii" or plain digits);
        # keep only the inherited readings.
        pass
    return possibilities

@classmethod
def can_match_out_of_order(cls, lang, s):
"""
Expand Down

0 comments on commit e65a9cc

Please sign in to comment.