8
8
9
9
def extract_field (element , item_type , attribute = None , formatter = None ):
10
10
if item_type == 'Text' :
11
- texts = [i .strip () for i in element .xpath ('.//text()' ).getall () if i .strip ()]
11
+ texts = [
12
+ i .strip () for i in element .xpath ('.//text()' ).getall () if i .strip ()
13
+ ]
12
14
content = " " .join (texts )
13
15
elif item_type == 'Link' :
14
16
content = element .xpath ('.//@href' ).get ()
@@ -68,7 +70,8 @@ def extract(self, html: str, base_url: str = None):
68
70
"""
69
71
Args:
70
72
html: html string
71
- base_url (str, optional): specifying the base_url will make all extracted Links absolute
73
+ base_url (str, optional): specifying the base_url will make all
74
+ extracted Links absolute
72
75
Returns:
73
76
dict: extracted data from given html string
74
77
@@ -80,7 +83,9 @@ def extract(self, html: str, base_url: str = None):
80
83
sel .root .make_links_absolute ()
81
84
fields_data = {}
82
85
for selector_name , selector_config in self .config .items ():
83
- fields_data [selector_name ] = self ._extract_selector (selector_config , sel )
86
+ fields_data [selector_name ] = self ._extract_selector (
87
+ selector_config , sel
88
+ )
84
89
return fields_data
85
90
86
91
def _extract_selector (self , field_config , parent_parser ):
@@ -105,7 +110,9 @@ def _extract_selector(self, field_config, parent_parser):
105
110
if 'attribute' in field_config :
106
111
kwargs ['attribute' ] = field_config ['attribute' ]
107
112
if 'format' in field_config :
108
- kwargs ['formatter' ] = self .formatters [field_config ['format' ]]
113
+ kwargs ['formatter' ] = self .formatters [
114
+ field_config ['format' ]
115
+ ]
109
116
value = extract_field (element , item_type , ** kwargs )
110
117
111
118
if field_config .get ('multiple' ) is not True :
@@ -119,7 +126,10 @@ def _get_child_item(self, field_config, element):
119
126
children_config = field_config ['children' ]
120
127
child_item = {}
121
128
for field in children_config :
122
- child_value = self ._extract_selector (children_config [field ], element )
129
+ child_value = self ._extract_selector (
130
+ children_config [field ],
131
+ element
132
+ )
123
133
child_item [field ] = child_value
124
134
return child_item
125
135
0 commit comments