Skip to content

Commit a5baff7

Browse files
committed
cleanup code and docs. rename Selector to Extractor
1 parent 2a63c5d commit a5baff7

File tree

8 files changed

+59
-45
lines changed

8 files changed

+59
-45
lines changed

HISTORY.rst

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,3 @@
11
=======
22
History
33
=======
4-
5-
0.6.0 (2019-05-22)
6-
------------------
7-
8-
* First release on PyPI.

README.rst

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,28 @@ selectorlib
2222

2323
A library to read a YML file with Xpath or CSS Selectors and extract data from HTML pages using them
2424

25-
2625
* Free software: MIT license
2726
* Documentation: https://selectorlib.readthedocs.io.
2827

28+
29+
Example
30+
--------
31+
32+
>>> from selectorlib import Extractor
33+
>>> yaml_string = """
34+
title:
35+
selector: "h1"
36+
type: Text
37+
link:
38+
selector: "h2 a"
39+
type: Link
40+
"""
41+
>>> extractor = Extractor.from_yaml_string(yaml_string)
42+
>>> html = """
43+
<h1>Title</h1>
44+
<h2>Usage
45+
<a class="headerlink" href="http://test">¶</a>
46+
</h2>
47+
"""
48+
>>> extractor.extract(html)
49+
{'title': 'Title', 'link': 'http://test'}

docs/index.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
Welcome to selectorlib's documentation!
2-
======================================
2+
=======================================
33

44
.. include:: ../README.rst
55

docs/installation.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ Or download the `tarball`_:
3838

3939
.. code-block:: console
4040
41-
$ curl -OL https://github.com/scrapehero/selectorlib/tarball/master
41+
$ curl -OL https://github.com/scrapehero/selectorlib/tarball/master
4242
4343
Once you have a copy of the source, you can install it with:
4444

docs/selectorlib.rst

Lines changed: 4 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -5,39 +5,17 @@ Module contents
55
---------------
66

77
.. automodule:: selectorlib
8-
:members: Selector
8+
:members: Extractor
99

1010

1111

1212
Usage
1313
-----
1414

15-
To use selectorlib in a project::
16-
17-
>>> import selectorlib
18-
19-
>>> yaml_string = """
20-
title:
21-
selector: "h1"
22-
type: Text
23-
link:
24-
selector: "h2 a"
25-
type: Link
26-
"""
27-
>>> selector = selectorlib.Selector.from_yaml_string(yaml_string)
28-
>>> html = """
29-
<h1>Title</h1>
30-
<h2>Usage
31-
<a class="headerlink" href="http:://test">¶</a>
32-
</h2>
33-
"""
34-
>>> selector.extract(html)
35-
{'title': 'Title', 'link': 'http:://test'}
36-
37-
To use selectorlib with requests
15+
To use selectorlib with requests:
3816

3917
>>> import requests
40-
>>> from selectorlib import Selector
18+
>>> from selectorlib import Extractor
4119
>>> selector_yaml = """
4220
name:
4321
selector: h1.product_title
@@ -70,7 +48,7 @@ related_products:
7048
price:
7149
selector: .price
7250
"""
73-
>>> selector = Selector.from_yaml_string(selector_yaml)
51+
>>> extractor = Extractor.from_yaml_string(selector_yaml)
7452
>>> url = 'https://scrapeme.live/shop/Bulbasaur/'
7553
>>> response = requests.get(url)
7654
>>> extractor.extract(response.text, base_url=response.url)

selectorlib/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,4 @@
66
__email__ = 'pypi@scrapehero.com'
77
__version__ = '0.10.0'
88

9-
from .selectorlib import Selector # noqa:F401
9+
from .selectorlib import Extractor # noqa:F401

selectorlib/selectorlib.py

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def extract_field(element, item_type, attribute=None, formatter=None):
1919
return content
2020

2121

22-
class Selector:
22+
class Extractor:
2323
"""selector class"""
2424
def __init__(self, config, formatters=None):
2525
self.config = config
@@ -31,28 +31,48 @@ def __init__(self, config, formatters=None):
3131

3232
@classmethod
3333
def from_yaml_string(cls, yaml_string: str, formatters=None):
34-
"""create selector object from yaml string"""
34+
"""create `Extractor` object from yaml string
35+
36+
>>> yaml_string = '''
37+
title:
38+
selector: "h1"
39+
type: Text
40+
'''
41+
>>> extractor = Extractor.from_yaml_string(yaml_string)
42+
"""
3543
config = yaml.safe_load(yaml_string)
3644
return cls(config, formatters=formatters)
3745

3846
@classmethod
3947
def from_yaml_file(cls, yaml_filename: str, formatters=None):
40-
"""create selector object from yaml file"""
48+
"""create `Extractor` object from yaml file
49+
50+
>>> extractor = Extractor.from_yaml_file(yaml_filename='selectors.yaml')
51+
"""
4152
with open(yaml_filename) as yaml_fileobj:
4253
config = yaml.safe_load(yaml_fileobj.read())
4354
return cls(config, formatters=formatters)
4455

4556
def extract(self, html: str, base_url: str = None):
46-
"""returns extracted dict"""
57+
"""
58+
Args:
59+
html: html string
60+
base_url (str, optional): specifying the base_url will make all extracted Links absolute
61+
Returns:
62+
dict: extracted data from given html string
63+
64+
>>> response = requests.get(url)
65+
>>> extractor.extract(response.text, base_url=response.url)
66+
"""
4767
sel = parsel.Selector(html, base_url=base_url)
4868
if base_url:
4969
sel.root.make_links_absolute()
5070
fields_data = {}
5171
for selector_name, selector_config in self.config.items():
52-
fields_data[selector_name] = self.extract_selector(selector_config, sel)
72+
fields_data[selector_name] = self._extract_selector(selector_config, sel)
5373
return fields_data
5474

55-
def extract_selector(self, field_config, parent_parser):
75+
def _extract_selector(self, field_config, parent_parser):
5676
if 'xpath' in field_config:
5777
elements = parent_parser.xpath(field_config['xpath'])
5878
else:
@@ -62,7 +82,7 @@ def extract_selector(self, field_config, parent_parser):
6282

6383
for element in elements:
6484
if 'children' in field_config:
65-
value = self.get_child_item(field_config, element)
85+
value = self._get_child_item(field_config, element)
6686
else:
6787
kwargs = {'attribute': field_config.get('attribute')}
6888
if 'attribute' in field_config:
@@ -78,10 +98,10 @@ def extract_selector(self, field_config, parent_parser):
7898

7999
return values
80100

81-
def get_child_item(self, field_config, element):
101+
def _get_child_item(self, field_config, element):
82102
children_config = field_config['children']
83103
child_item = {}
84104
for field in children_config:
85-
child_value = self.extract_selector(children_config[field], element)
105+
child_value = self._extract_selector(children_config[field], element)
86106
child_item[field] = child_value
87107
return child_item

tests/test_selectorlib.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def output_yaml():
3737
def test_content(html, input_yaml, output_yaml):
3838
base_url = "https://scrapeme.live/shop/Bulbasaur/"
3939
formatters = [formatter.Integer]
40-
selector = selectorlib.Selector.from_yaml_string(input_yaml, formatters=formatters)
40+
selector = selectorlib.Extractor.from_yaml_string(input_yaml, formatters=formatters)
4141
output = selector.extract(html, base_url=base_url)
4242
assert output == yaml.safe_load(output_yaml)
4343

0 commit comments

Comments
 (0)