cleanup code and docs. rename Selector to Extractor

ashwinrajeev · ashwinrajeev · commit a5baff78931f · 2019-05-29T19:40:31.000+05:30
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -1,8 +1,3 @@
 =======
 History
 =======
-
-0.6.0 (2019-05-22)
-------------------
-
-* First release on PyPI.
diff --git a/README.rst b/README.rst
@@ -22,7 +22,28 @@ selectorlib
 
 A library to read a YML file with Xpath or CSS Selectors and extract data from HTML pages using them
 
-
 * Free software: MIT license
 * Documentation: https://selectorlib.readthedocs.io.
 
+
+Example
+--------
+
+>>> from selectorlib import Extractor
+>>> yaml_string = """
+    title:
+        selector: "h1"
+        type: Text
+    link:
+        selector: "h2 a"
+        type: Link
+    """
+>>> extractor = Extractor.from_yaml_string(yaml_string)
+>>> html = """
+    <h1>Title</h1>
+    <h2>Usage
+        <a class="headerlink" href="http://test">¶</a>
+    </h2>
+    """
+>>> extractor.extract(html)
+{'title': 'Title', 'link': 'http://test'}
diff --git a/docs/index.rst b/docs/index.rst
@@ -1,5 +1,5 @@
 Welcome to selectorlib's documentation!
-======================================
+=======================================
 
 .. include:: ../README.rst
 
diff --git a/docs/installation.rst b/docs/installation.rst
@@ -38,7 +38,7 @@ Or download the `tarball`_:
 
 .. code-block:: console
 
-    $ curl  -OL https://github.com/scrapehero/selectorlib/tarball/master
+    $ curl -OL https://github.com/scrapehero/selectorlib/tarball/master
 
 Once you have a copy of the source, you can install it with:
 
diff --git a/docs/selectorlib.rst b/docs/selectorlib.rst
@@ -5,39 +5,17 @@ Module contents
 ---------------
 
 .. automodule:: selectorlib
-    :members: Selector
+    :members: Extractor
 
 
 
 Usage
 -----
 
-To use selectorlib in a project::
-
->>> import selectorlib
-
->>> yaml_string = """
-    title:
-        selector: "h1"
-        type: Text
-    link:
-        selector: "h2 a"
-        type: Link
-    """
->>> selector = selectorlib.Selector.from_yaml_string(yaml_string)
->>> html = """
-    <h1>Title</h1>
-    <h2>Usage
-        <a class="headerlink" href="http:://test">¶</a>
-    </h2>
-    """
->>> selector.extract(html)
-{'title': 'Title', 'link': 'http:://test'}
-
-To use selectorlib with requests
+To use selectorlib with requests:
 
 >>> import requests
->>> from selectorlib import Selector
+>>> from selectorlib import Extractor
 >>> selector_yaml = """
 name:
     selector: h1.product_title
@@ -70,7 +48,7 @@ related_products:
         price:
             selector: .price
 """
->>> selector = Selector.from_yaml_string(selector_yaml)
+>>> extractor = Extractor.from_yaml_string(selector_yaml)
 >>> url = 'https://scrapeme.live/shop/Bulbasaur/'
 >>> response = requests.get(url)
->>> selector.extract(response.text, base_url=response.url)
+>>> extractor.extract(response.text, base_url=response.url)
diff --git a/selectorlib/__init__.py b/selectorlib/__init__.py
@@ -6,4 +6,4 @@
 __email__ = 'pypi@scrapehero.com'
 __version__ = '0.10.0'
 
-from .selectorlib import Selector  # noqa:F401
+from .selectorlib import Extractor  # noqa:F401
diff --git a/selectorlib/selectorlib.py b/selectorlib/selectorlib.py
@@ -19,7 +19,7 @@ def extract_field(element, item_type, attribute=None, formatter=None):
     return content
 
 
-class Selector:
+class Extractor:
     """selector class"""
     def __init__(self, config, formatters=None):
         self.config = config
@@ -31,28 +31,48 @@ def __init__(self, config, formatters=None):
 
     @classmethod
     def from_yaml_string(cls, yaml_string: str, formatters=None):
-        """create selector object from yaml string"""
+        """create `Extractor` object from yaml string
+
+        >>> yaml_string = '''
+            title:
+                selector: "h1"
+                type: Text
+            '''
+        >>> extractor = Extractor.from_yaml_string(yaml_string)
+        """
         config = yaml.safe_load(yaml_string)
         return cls(config, formatters=formatters)
 
     @classmethod
     def from_yaml_file(cls, yaml_filename: str, formatters=None):
-        """create selector object from yaml file"""
+        """create `Extractor` object from yaml file
+
+        >>> extractor = Extractor.from_yaml_file(yaml_filename='selectors.yaml')
+        """
         with open(yaml_filename) as yaml_fileobj:
             config = yaml.safe_load(yaml_fileobj.read())
         return cls(config, formatters=formatters)
 
     def extract(self, html: str, base_url: str = None):
-        """returns extracted dict"""
+        """
+        Args:
+            html: html string
+            base_url (str, optional): specifying the base_url will make all extracted Links absolute
+        Returns:
+            dict: extracted data from given html string
+
+        >>> response = requests.get(url)
+        >>> extractor.extract(response.text, base_url=response.url)
+        """
         sel = parsel.Selector(html, base_url=base_url)
         if base_url:
             sel.root.make_links_absolute()
         fields_data = {}
         for selector_name, selector_config in self.config.items():
-            fields_data[selector_name] = self.extract_selector(selector_config, sel)
+            fields_data[selector_name] = self._extract_selector(selector_config, sel)
         return fields_data
 
-    def extract_selector(self, field_config, parent_parser):
+    def _extract_selector(self, field_config, parent_parser):
         if 'xpath' in field_config:
             elements = parent_parser.xpath(field_config['xpath'])
         else:
@@ -62,7 +82,7 @@ def extract_selector(self, field_config, parent_parser):
 
         for element in elements:
             if 'children' in field_config:
-                value = self.get_child_item(field_config, element)
+                value = self._get_child_item(field_config, element)
             else:
                 kwargs = {'attribute': field_config.get('attribute')}
                 if 'attribute' in field_config:
@@ -78,10 +98,10 @@ def extract_selector(self, field_config, parent_parser):
 
         return values
 
-    def get_child_item(self, field_config, element):
+    def _get_child_item(self, field_config, element):
         children_config = field_config['children']
         child_item = {}
         for field in children_config:
-            child_value = self.extract_selector(children_config[field], element)
+            child_value = self._extract_selector(children_config[field], element)
             child_item[field] = child_value
         return child_item
diff --git a/tests/test_selectorlib.py b/tests/test_selectorlib.py
@@ -37,7 +37,7 @@ def output_yaml():
 def test_content(html, input_yaml, output_yaml):
     base_url = "https://scrapeme.live/shop/Bulbasaur/"
     formatters = [formatter.Integer]
-    selector = selectorlib.Selector.from_yaml_string(input_yaml, formatters=formatters)
+    selector = selectorlib.Extractor.from_yaml_string(input_yaml, formatters=formatters)
     output = selector.extract(html, base_url=base_url)
     assert output == yaml.safe_load(output_yaml)
 

-Original file line number
+Diff line change
@@ @@ -1,8 +1,3 @@ @@
 =======
 History
 =======
+-
 -0.6.0 (2019-05-22)
 -------------------
+-
 -* First release on PyPI.