1010import hashlib
1111import io
1212import os
13- import re
1413import sys
1514import traceback
1615from collections import Counter
3433from licensedcode import MIN_MATCH_HIGH_LENGTH
3534from licensedcode import MIN_MATCH_LENGTH
3635from licensedcode import SMALL_RULE
36+ from licensedcode .frontmatter import SaneYAMLHandler
37+ from licensedcode .frontmatter import FrontmatterPost
38+ from licensedcode .frontmatter import dumps_frontmatter
39+ from licensedcode .frontmatter import load_frontmatter
40+ from licensedcode .frontmatter import get_rule_text
3741from licensedcode .languages import LANG_INFO as known_languages
3842from licensedcode .spans import Span
3943from licensedcode .tokenize import index_tokenizer
4044from licensedcode .tokenize import index_tokenizer_with_stopwords
4145from licensedcode .tokenize import key_phrase_tokenizer
4246from licensedcode .tokenize import KEY_PHRASE_OPEN
4347from licensedcode .tokenize import KEY_PHRASE_CLOSE
44- from licensedcode .tokenize import query_lines
4548
4649"""
4750Reference License and license Rule structures persisted as a combo of a YAML
@@ -912,40 +915,32 @@ def load_rules(rules_data_dir=rules_data_dir, with_checks=True):
912915 space_problems = []
913916 model_errors = []
914917
915- for data_file in resource_iter (location = rules_data_dir , with_dirs = False ):
916- if data_file .endswith ('.yml ' ):
917- base_name = file_base_name (data_file )
918+ for rule_file in resource_iter (location = rules_data_dir , with_dirs = False ):
919+ if rule_file .endswith ('.RULE ' ):
920+ base_name = file_base_name (rule_file )
918921
919922 if with_checks and ' ' in base_name :
920- space_problems .append (data_file )
921-
922- text_file = join (rules_data_dir , f'{ base_name } .RULE' )
923+ space_problems .append (rule_file )
923924
924925 try :
925- yield Rule .from_files ( data_file = data_file , text_file = text_file )
926+ yield Rule .from_file ( rule_file = rule_file )
926927 except Exception as re :
927928 if with_checks :
928929 model_errors .append (str (re ))
929930
930931 if with_checks :
931932 # accumulate sets to ensures we do not have illegal names or extra
932933 # orphaned files
933- data_file_lower = data_file .lower ()
934- if data_file_lower in lower_case_files :
935- case_problems .add (data_file_lower )
936- else :
937- lower_case_files .add (data_file_lower )
938-
939- text_file_lower = text_file .lower ()
940- if text_file_lower in lower_case_files :
941- case_problems .add (text_file_lower )
934+ rule_file_lower = rule_file .lower ()
935+ if rule_file_lower in lower_case_files :
936+ case_problems .add (rule_file_lower )
942937 else :
943- lower_case_files .add (text_file_lower )
938+ lower_case_files .add (rule_file_lower )
944939
945- processed_files .update ([ data_file , text_file ] )
940+ processed_files .add ( rule_file )
946941
947- if with_checks and not data_file .endswith ('~' ):
948- seen_files .add (data_file )
942+ if with_checks and not rule_file .endswith ('~' ):
943+ seen_files .add (rule_file )
949944
950945 if with_checks :
951946 unknown_files = seen_files - processed_files
@@ -955,29 +950,29 @@ def load_rules(rules_data_dir=rules_data_dir, with_checks=True):
955950 if model_errors :
956951 errors = '\n ' .join (model_errors )
957952 msg += (
958- '\n Invalid rule YAML file in directory: '
959- f'{ rules_data_dir !r} \n { errors } '
953+ '\n Invalid rule file in directory: '
954+ f'{ rules_data_dir !r} \n '
960955 )
961956
962957 if unknown_files :
963958 files = '\n ' .join (sorted (f'file://{ f } "' for f in unknown_files ))
964959 msg += (
965960 '\n Orphaned files in rule directory: '
966- f'{ rules_data_dir !r} \n { files } '
961+ f'{ rules_data_dir !r} \n '
967962 )
968963
969964 if case_problems :
970965 files = '\n ' .join (sorted (f'"file://{ f } "' for f in case_problems ))
971966 msg += (
972967 '\n Rule files with non-unique name in rule directory: '
973- f'{ rules_data_dir !r} \n { files } '
968+ f'{ rules_data_dir !r} \n '
974969 )
975970
976971 if space_problems :
977972 files = '\n ' .join (sorted (f'"file://{ f } "' for f in space_problems ))
978973 msg += (
979974 '\n Rule filename cannot contain spaces: '
980- f'{ rules_data_dir !r} \n { files } '
975+ f'{ rules_data_dir !r} \n '
981976 )
982977
983978 raise InvalidRule (msg )
@@ -1390,28 +1385,18 @@ class BasicRule:
13901385 'position is using the magic -1 key.' )
13911386 )
13921387
1393- def data_file (
1388+ def rule_file (
13941389 self ,
13951390 rules_data_dir = rules_data_dir ,
13961391 licenses_data_dir = licenses_data_dir ,
13971392 ):
1398- data_file_base_name = file_base_name (self .identifier )
1399- data_file_name = f'{ data_file_base_name } .yml '
1393+ rule_file_base_name = file_base_name (self .identifier )
1394+ rule_file_name = f'{ rule_file_base_name } .RULE '
14001395
14011396 if self .is_from_license :
1402- return join (licenses_data_dir , data_file_name )
1403- else :
1404- return join (rules_data_dir , data_file_name )
1405-
1406- def text_file (
1407- self ,
1408- rules_data_dir = rules_data_dir ,
1409- licenses_data_dir = licenses_data_dir ,
1410- ):
1411- if self .is_from_license :
1412- return join (licenses_data_dir , f'{ self .identifier } ' )
1397+ return join (licenses_data_dir , rule_file_name )
14131398 else :
1414- return join (rules_data_dir , f' { self . identifier } ' )
1399+ return join (rules_data_dir , rule_file_name )
14151400
14161401 def __attrs_post_init__ (self , * args , ** kwargs ):
14171402 self .setup ()
@@ -1431,13 +1416,13 @@ def setup(self):
14311416 trace = traceback .format_exc ()
14321417 raise InvalidRule (
14331418 f'Unable to parse Rule license expression: { exp !r} '
1434- f'for: file://{ self .data_file } \n { trace } '
1419+ f'for: file://{ self .identifier } \n { trace } '
14351420 ) from e
14361421
14371422 if expression is None :
14381423 raise InvalidRule (
14391424 f'Invalid rule License expression parsed to empty: '
1440- f'{ self .license_expression !r} for: file://{ self .data_file } '
1425+ f'{ self .license_expression !r} for: file://{ self .identifier } '
14411426 )
14421427
14431428 self .license_expression = expression .render ()
@@ -1655,15 +1640,7 @@ def to_dict(self):
16551640 return data
16561641
16571642
1658- def get_rule_text (location = None , text = None ):
1659- """
1660- Return the rule ``text`` prepared for indexing.
1661- ###############
1662- # IMPORTANT: we use the same process as used to load query text for symmetry
1663- ###############
1664- """
1665- numbered_lines = query_lines (location = location , query_string = text , plain_text = True )
1666- return '\n ' .join (l .strip () for _ , l in numbered_lines )
1643+
16671644
16681645
16691646def has_only_lower_license_keys (license_expression , licensing = Licensing ()):
@@ -1711,13 +1688,13 @@ def __attrs_post_init__(self, *args, **kwargs):
17111688 self .setup ()
17121689
17131690 @classmethod
1714- def from_files (cls , data_file , text_file ):
1691+ def from_file (cls , rule_file ):
17151692 """
17161693 Return a new Rule object loaded from a data file stored at
17171694 ``data_file`` and a companion ``text_file``.
17181695 """
17191696 rule = Rule ()
1720- rule .load_data (data_file = data_file , text_file = text_file )
1697+ rule .load_data (rule_file = rule_file )
17211698 return rule
17221699
17231700 @classmethod
@@ -1786,29 +1763,29 @@ def _from_expression(cls, license_expression=None, identifier=None, **kwargs):
17861763 rule .setup ()
17871764 return rule
17881765
1789- def load_data (self , data_file , text_file ):
1766+ def load_data (self , rule_file ):
17901767 """
1791- Load data from ``data_file `` and ``text_file``. Check presence of text
1792- file to determine if this is a special synthetic rule.
1768+ Load data from ``rule_file `` which has both the text and the data (as YAML frontmatter).
1769+ Check presence of text file to determine if this is a special synthetic rule.
17931770 """
17941771 if self .is_synthetic :
17951772 if not self .text :
17961773 raise InvalidRule (
17971774 f'Invalid synthetic rule without text: { self } : { self .text !r} ' )
17981775 return self
17991776
1800- if not data_file or not text_file :
1777+ if not rule_file :
18011778 raise InvalidRule (
1802- f'Cannot load rule without its corresponding text_file and data file : '
1803- f'{ self } : file://{ data_file } file:// { text_file } ' )
1779+ f'Cannot load rule without its corresponding rule_file : '
1780+ f'{ self } : file://{ rule_file } ' )
18041781
1805- self .identifier = file_name (text_file )
1782+ self .identifier = file_name (rule_file )
18061783
18071784 try :
1808- self .load (data_file = data_file , text_file = text_file )
1785+ self .load (rule_file = rule_file )
18091786 except Exception :
18101787 trace = traceback .format_exc ()
1811- raise InvalidRule (f'While loading: file://{ data_file } \n { trace } ' )
1788+ raise InvalidRule (f'While loading: file://{ rule_file } \n { trace } ' )
18121789
18131790 return self
18141791
@@ -1895,10 +1872,12 @@ def compute_thresholds(self, small_rule=SMALL_RULE):
18951872
18961873 def dump (self , rules_data_dir ):
18971874 """
1898- Dump a representation of this rule as two files stored in
1899- ``rules_data_dir``:
1900- - a .yml for the rule data in YAML (e.g., data_file)
1901- - a .RULE: the rule text as a UTF-8 file (e.g., text_file)
1875+ Dump a representation of this rule as a .RULE file stored in
1876+ ``rules_data_dir`` as a UTF-8 file having:
1877+ - the rule data as YAML frontmatter
1878+ - the rule text
1879+ and this is a `rule_file`.
1880+
19021881 Does nothing if this rule was created from a License (e.g.,
19031882 `is_from_license` is True)
19041883 """
@@ -1911,28 +1890,35 @@ def write(location, byte_string):
19111890 with io .open (location , 'wb' ) as of :
19121891 of .write (byte_string )
19131892
1914- data_file = self .data_file (rules_data_dir = rules_data_dir )
1915- as_yaml = saneyaml .dump (self .to_dict (), indent = 4 , encoding = 'utf-8' )
1916- write (data_file , as_yaml )
1893+ rule_file = self .rule_file (rules_data_dir = rules_data_dir )
19171894
1918- text_file = self .text_file (rules_data_dir = rules_data_dir )
1919- write (text_file , self .text .encode ('utf-8' ))
1895+ metadata = self .to_dict ()
1896+ content = self .text .encode ('utf-8' )
1897+ rule_post = FrontmatterPost (content = content , handler = SaneYAMLHandler (), ** metadata )
1898+ output_string = dumps_frontmatter (post = rule_post )
19201899
1921- def load (self , data_file , text_file , with_checks = True ):
1900+ write (rule_file , output_string .encode ('utf-8' ))
1901+
1902+ def load (self , rule_file , with_checks = True ):
19221903 """
1923- Load self from a .RULE YAML file stored in data_file and text_file.
1904+ Load self from a .RULE file with YAML frontmatter stored in rule_file.
19241905 Unknown fields are ignored and not bound to the Rule object.
19251906 Optionally check for consistency if ``with_checks`` is True.
19261907 """
19271908 try :
1928- with io .open (data_file , encoding = 'utf-8' ) as f :
1929- data = saneyaml .load (f .read (), allow_duplicate_keys = False )
1909+ post = load_frontmatter (rule_file )
1910+ data = post .metadata
1911+ if not post .content :
1912+ raise InvalidRule (
1913+ f'Cannot load rule with empty text: '
1914+ f'{ self } : file://{ rule_file } '
1915+ )
19301916
1931- self .text = get_rule_text ( location = text_file )
1917+ self .text = post . content
19321918
19331919 except Exception as e :
19341920 print ('#############################' )
1935- print ('INVALID LICENSE RULE FILE:' , f'file://{ data_file } ' , f'file:// { text_file } ' )
1921+ print ('INVALID LICENSE RULE FILE:' , f'file://{ rule_file } ' )
19361922 print ('#############################' )
19371923 print (e )
19381924 print ('#############################' )
0 commit comments