From e65a9ccf2b267133d7587e4bf954de3aca5470ea Mon Sep 17 00:00:00 2001
From: nsantacruz
Date: Wed, 30 Oct 2024 15:53:40 +0200
Subject: [PATCH] feat(linker): allow parsing of roman numerals

---
Reviewer note (not part of the commit message): this copy of the patch
arrived with its line breaks and indentation collapsed. Both were
reconstructed to agree with the hunk line counts; verify the context-line
whitespace against the tree (or apply with --ignore-whitespace) before use.
In AddressInteger.to_numeric_possibilities the list returned by super() is
now copied before appending (it may be the shared `special_cases[s]` entry),
the `try` body is narrowed to the `roman.fromRoman` call, and the unused
`as e` binding is dropped. Hunk sizes are unchanged.

 sefaria/model/linker/tests/linker_test.py |  8 +++++++
 sefaria/model/schema.py                   | 26 ++++++++++++++++++++---
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/sefaria/model/linker/tests/linker_test.py b/sefaria/model/linker/tests/linker_test.py
index 759f47d077..d604a2abc6 100644
--- a/sefaria/model/linker/tests/linker_test.py
+++ b/sefaria/model/linker/tests/linker_test.py
@@ -36,6 +36,7 @@ def test_resolved_raw_ref_clone():
 
 
 @pytest.mark.parametrize(('resolver_data', 'expected_trefs'), [
+    [crrd(['@Job', '#xli.', '#5'], lang='en'), ("Job 41:5",)],
     # Numbered JAs
     [crrd(["@Jerusalem", "@Talmud", "@Yoma", "#5a"], lang='en'), ("Jerusalem Talmud Yoma 1:1:20-25",)],
     [crrd(["@Babylonian", "@Talmud", "@Sukkah", "#49b"], lang='en'), ("Sukkah 49b",)],
@@ -47,6 +48,13 @@ def test_resolved_raw_ref_clone():
     [crrd(['@ספר בראשית', '#פסוק א', '#פרק יג']), ("Genesis 13:1",)],  # sections out of order
     [crrd(['@שמות', '#א', '#ב']), ("Exodus 1:2",)],  # used to also match Exodus 2:1 b/c would allow mixing integer parts
 
+    # Roman numerals
+    [crrd(['@Job', '#III', '#5'], lang='en'), ("Job 3:5",)],
+    [crrd(['@Job', '#ix', '#5'], lang='en'), ("Job 9:5",)],
+    [crrd(['@Job', '#IV .', '#5'], lang='en'), ("Job 4:5",)],
+    [crrd(['@Job', '#CIV', '#5'], lang='en'), tuple()],  # too high
+    [crrd(['@Job', '#iiii', '#5'], lang='en'), tuple()],  # invalid roman numeral
+
     # Amud split into two parts
     [crrd(['@בבלי', '@יבמות', '#סא', '#א']), ("Yevamot 61a",)],
     [crrd(["@תוספות", "@פסחים", "#קו", "#א"]), ("Tosafot on Pesachim 106a",)],  # amud for commentary that has DH
diff --git a/sefaria/model/schema.py b/sefaria/model/schema.py
index 2058e68407..8d30d9dba4 100644
--- a/sefaria/model/schema.py
+++ b/sefaria/model/schema.py
@@ -2097,7 +2097,10 @@ def is_special_case(self, s):
     def to_numeric_possibilities(self, lang, s, **kwargs):
         if s in self.special_cases:
             return self.special_cases[s]
-        return [self.toNumber(lang, s)]
+        try:
+            return [self.toNumber(lang, s)]
+        except ValueError:
+            return []
 
     @classmethod
     def can_match_out_of_order(cls, lang, s):
@@ -2141,7 +2144,7 @@ def get_all_possible_sections_from_string(cls, lang, s, fromSections=None, strip
                         section_str = curr_s
                     else:
                         strict = SuperClass not in {AddressAmud, AddressTalmud}  # HACK: AddressTalmud doesn't inherit from AddressInteger so it relies on flexibility of not matching "Daf"
-                        regex_str = addr.regex(lang, strict=strict, group_id='section') + "$"  # must match entire string
+                        regex_str = addr.regex(lang, strict=strict, group_id='section', with_roman_numerals=True) + "$"  # must match entire string
                         if regex_str is None: continue
                         reg = regex.compile(regex_str, regex.VERBOSE)
                         match = reg.match(curr_s)
@@ -2584,7 +2587,11 @@ def _core_regex(self, lang, group_id=None, **kwargs):
             reg = r"("
 
         if lang == "en":
-            reg += r"\d+)"
+            if kwargs.get('with_roman_numerals', False):
+                # any char valid in roman numerals (I, V, X, L, C, D, M) + optional trailing period
+                reg += r"(?:\d+|[ivxlcdmIVXLCDM]+(?:\s?\.)?))"
+            else:
+                reg += r"\d+)"
         elif lang == "he":
             reg += self.hebrew_number_regex() + r")"
 
@@ -2596,6 +2603,19 @@ def toNumber(self, lang, s, **kwargs):
         elif lang == "he":
             return decode_hebrew_numeral(s)
 
+    def to_numeric_possibilities(self, lang, s, **kwargs):
+        import roman
+        from roman import InvalidRomanNumeralError
+
+        possibilities = list(super().to_numeric_possibilities(lang, s, **kwargs))  # copy: super() may hand back the shared special_cases entry
+        if lang == "en":
+            numeral = re.sub(r"\.$", "", s).strip().upper()  # remove trailing period; fromRoman expects upper case
+            try:
+                possibilities.append(roman.fromRoman(numeral))
+            except InvalidRomanNumeralError:
+                pass  # not a valid roman numeral; keep only the possibilities from super()
+        return possibilities
+
     @classmethod
     def can_match_out_of_order(cls, lang, s):
         """