@@ -19,7 +19,7 @@ def extract_field(element, item_type, attribute=None, formatter=None):
19
19
return content
20
20
21
21
22
- class Selector :
22
+ class Extractor :
23
23
"""selector class"""
24
24
def __init__ (self , config , formatters = None ):
25
25
self .config = config
@@ -31,28 +31,48 @@ def __init__(self, config, formatters=None):
31
31
32
32
@classmethod
33
33
def from_yaml_string (cls , yaml_string : str , formatters = None ):
34
- """create selector object from yaml string"""
34
+ """create `Extractor` object from yaml string
35
+
36
+ >>> yaml_string = '''
37
+ title:
38
+ selector: "h1"
39
+ type: Text
40
+ '''
41
+ >>> extractor = Extractor.from_yaml_string(yaml_string)
42
+ """
35
43
config = yaml .safe_load (yaml_string )
36
44
return cls (config , formatters = formatters )
37
45
38
46
@classmethod
39
47
def from_yaml_file (cls , yaml_filename : str , formatters = None ):
40
- """create selector object from yaml file"""
48
+ """create `Extractor` object from yaml file
49
+
50
+ >>> extractor = Extractor.from_yaml_file('selectors.yaml')
51
+ """
41
52
with open (yaml_filename ) as yaml_fileobj :
42
53
config = yaml .safe_load (yaml_fileobj .read ())
43
54
return cls (config , formatters = formatters )
44
55
45
56
def extract (self , html : str , base_url : str = None ):
46
- """returns extracted dict"""
57
+ """
58
+ Args:
59
+ html (str): HTML source to extract data from
60
+ base_url (str, optional): if given, all links in the document are made absolute against this URL before extraction
61
+ Returns:
62
+ dict: extracted data from given html string
63
+
64
+ >>> response = requests.get(url)
65
+ >>> extractor.extract(response.text, base_url=response.url)
66
+ """
47
67
sel = parsel .Selector (html , base_url = base_url )
48
68
if base_url :
49
69
sel .root .make_links_absolute ()
50
70
fields_data = {}
51
71
for selector_name , selector_config in self .config .items ():
52
- fields_data [selector_name ] = self .extract_selector (selector_config , sel )
72
+ fields_data [selector_name ] = self ._extract_selector (selector_config , sel )
53
73
return fields_data
54
74
55
- def extract_selector (self , field_config , parent_parser ):
75
+ def _extract_selector (self , field_config , parent_parser ):
56
76
if 'xpath' in field_config :
57
77
elements = parent_parser .xpath (field_config ['xpath' ])
58
78
else :
@@ -62,7 +82,7 @@ def extract_selector(self, field_config, parent_parser):
62
82
63
83
for element in elements :
64
84
if 'children' in field_config :
65
- value = self .get_child_item (field_config , element )
85
+ value = self ._get_child_item (field_config , element )
66
86
else :
67
87
kwargs = {'attribute' : field_config .get ('attribute' )}
68
88
if 'attribute' in field_config :
@@ -78,10 +98,10 @@ def extract_selector(self, field_config, parent_parser):
78
98
79
99
return values
80
100
81
- def get_child_item (self , field_config , element ):
101
+ def _get_child_item (self , field_config , element ):
82
102
children_config = field_config ['children' ]
83
103
child_item = {}
84
104
for field in children_config :
85
- child_value = self .extract_selector (children_config [field ], element )
105
+ child_value = self ._extract_selector (children_config [field ], element )
86
106
child_item [field ] = child_value
87
107
return child_item
0 commit comments