|
1 |
| -Usage |
2 |
| -====== |
3 |
| - |
4 |
| -Using selectorlib with requests |
5 |
| --------------------------------- |
6 |
| - |
7 |
| ->>> import requests |
8 |
| ->>> from selectorlib import Extractor |
9 |
| ->>> selector_yaml = """ |
10 |
| -name: |
11 |
| - css: h1.product_title |
12 |
| -price: |
13 |
| - css: p.price |
14 |
| -stock: |
15 |
| - css: p.stock |
16 |
| -tags: |
17 |
| - css: span.tagged_as a |
18 |
| -short_description: |
19 |
| - css: .woocommerce-product-details__short-description > p |
20 |
| -description: |
21 |
| - css: div#tab-description p |
22 |
| -attributes: |
23 |
| - css: table.shop_attributes |
24 |
| - multiple: True |
25 |
| - children: |
26 |
| - name: |
27 |
| - css: th |
28 |
| - value: |
29 |
| - css: td |
30 |
| -related_products: |
31 |
| - css: li.product |
32 |
| - multiple: True |
33 |
| - children: |
34 |
| - name: |
35 |
| - css: h2 |
36 |
| - url: |
37 |
| - css: a[href] |
38 |
| - price: |
39 |
| - css: .price |
40 |
| -""" |
41 |
| ->>> extractor = Extractor.from_yaml_string(selector_yaml) |
42 |
| ->>> url = 'https://scrapeme.live/shop/Bulbasaur/' |
43 |
| ->>> response = requests.get(url) |
44 |
| ->>> extractor.extract(response.text, base_url=response.url) |
45 |
| -{'attributes': [{'name': 'Weight', 'value': '15.2 kg'}], |
46 |
| - 'description': 'Bulbasaur can be seen napping in bright sunlight. There is a ' |
47 |
| - 'seed on its back. By soaking up the sun’s rays, the seed ' |
48 |
| - 'grows progressively larger.', |
49 |
| - 'name': 'Bulbasaur', |
50 |
| - 'price': '£ 63.00', |
51 |
| - 'related_products': [{'name': 'Pidgeot', |
52 |
| - 'price': '£ 185.00', |
53 |
| - 'url': 'Pidgeot £ 185.00'}, |
54 |
| - {'name': 'Ekans', |
55 |
| - 'price': '£ 55.00', |
56 |
| - 'url': 'Ekans £ 55.00'}, |
57 |
| - {'name': 'Charizard', |
58 |
| - 'price': '£ 156.00', |
59 |
| - 'url': 'Charizard £ 156.00'}], |
60 |
| - 'short_description': 'Bulbasaur can be seen napping in bright sunlight. There ' |
61 |
| - 'is a seed on its back. By soaking up the sun’s rays, ' |
62 |
| - 'the seed grows progressively larger.', |
63 |
| - 'stock': '45 in stock', |
64 |
| - 'tags': 'bulbasaur'} |
65 |
| - |
66 |
| - |
67 |
| -Using formatter with selectors |
68 |
| -------------------------------- |
69 |
| - |
70 |
| ->>> from selectorlib import Extractor, Formatter |
71 |
| ->>> class Number(Formatter): |
| 1 | +Selectorlib lets you use a YML styled file to specify the selectors for |
| 2 | +the elements or data that you need to extract from a website. You can |
| 3 | +use CSS Selectors, XPaths or both. |
| 4 | + |
| 5 | +YML Structure |
| 6 | +------------- |
| 7 | + |
| 8 | +Let's take a look at this fictional store that sells Pokemon - |
| 9 | +https://scrapeme.live/shop/ |
| 10 | + |
| 11 | +Here is a sample YML that SelectorLib accepts as input to extract the Pokemon listed there: |
| 12 | + |
| 13 | +.. code:: yaml |
| 14 | +
|
| 15 | + pokemon: |
| 16 | + css: li.product |
| 17 | + multiple: true |
| 18 | + type: Text |
| 19 | + children: |
| 20 | + name: |
| 21 | + css: h2.woocommerce-loop-product__title |
| 22 | + type: Text |
| 23 | + price: |
| 24 | + css: span.woocommerce-Price-amount |
| 25 | + type: Text |
| 26 | + image: |
| 27 | + css: img.attachment-woocommerce_thumbnail |
| 28 | + type: Attribute |
| 29 | + attribute: src |
| 30 | + url: |
| 31 | + css: a.woocommerce-LoopProduct-link |
| 32 | + type: Link |
| 33 | +
|
| 34 | +Here ``pokemon`` is the main element and the elements - name, price, |
| 35 | +image and url are inside it and are called the children of the pokemon |
| 36 | +element. |
| 37 | + |
| 38 | +Every element starts with its name and can have these properties |
| 39 | + |
| 40 | +- css |
| 41 | +- xpath |
| 42 | +- type |
| 43 | +- multiple |
| 43 | +- children |
| 44 | +- format |
| 45 | + |
| 46 | +css (default: Blank) |
| 47 | +~~~~~~~~~~~~~~~~~~~~ |
| 48 | + |
| 49 | +The css selector for the element. In our example the element called |
| 50 | +pokemon is in an li with a class product. So it's ``li.product``. |
| 51 | + |
| 52 | +xpath (default: Blank) |
| 53 | +~~~~~~~~~~~~~~~~~~~~~~ |
| 54 | + |
| 55 | +The xpath selector for the element. If we were to use xpaths instead of |
| 56 | +css selectors for the element pokemon above, it would be |
| 57 | +``//li[contains(@class,'product')]``. |
| 59 | + |
| 60 | +Every element needs either css or xpath selectors. If both xpath and css |
| 61 | +are defined, xpath takes precedence. |
| 62 | + |
| 63 | +type (default: Text) |
| 64 | +~~~~~~~~~~~~~~~~~~~~ |
| 65 | + |
| 66 | +The type defines what kind of extraction needs to happen on the selected |
| 67 | +element. Here are accepted types |
| 68 | + |
| 69 | +Text |
| 70 | +^^^^ |
| 71 | + |
| 72 | +This type of extraction just extracts all the text content from the |
| 73 | +selected elements. If you have not specified a type, Text would be used |
| 74 | +as default. |
| 75 | + |
| 76 | +Attribute |
| 77 | +^^^^^^^^^ |
| 78 | + |
| 79 | +This type of extraction lets you extract a particular attribute, |
| 80 | +specified using the ``attribute`` property for the element. This is not |
| 81 | +usually required when you are selecting using xpaths, as you can select the |
| 82 | +attribute directly in the expression, unlike with css selectors, e.g. |
| 83 | +``//img/@src`` |
| 84 | + |
| 85 | +Here is an example that extracts the src attribute of an img element |
| 86 | + |
| 87 | +.. code:: yaml |
| 88 | +
|
| 89 | + image: |
| 90 | + css: img.attachment-woocommerce_thumbnail |
| 91 | + type: Attribute |
| 92 | + attribute: src |
| 93 | +
|
| 94 | +Link |
| 95 | +^^^^ |
| 96 | + |
| 97 | +This type is a shortcut for getting the href attribute from any links in |
| 98 | +the html defined using an ``<a>`` tag |
| 99 | + |
| 100 | +Example, |
| 101 | + |
| 102 | +.. code:: yaml |
| 103 | +
|
| 104 | + url: |
| 105 | + css: a.woocommerce-LoopProduct-link |
| 106 | + type: Link |
| 107 | +
|
| 108 | +HTML |
| 109 | +^^^^ |
| 110 | + |
| 111 | +HTML type, just gives you the full HTML content of the element. This is |
| 112 | +useful when you need the html as is for some custom extraction or |
| 113 | +checking a few conditions. |
| 114 | + |
| 115 | +multiple (default: False) |
| 116 | +~~~~~~~~~~~~~~~~~~~~~~~~~ |
| 117 | + |
| 118 | +If you need multiple matches on the selector of an element use multiple |
| 119 | +as true. If you only need to get the first match, use multiple as false |
| 120 | +or leave it blank. For example, the element pokemon has multiple matches |
| 121 | +on the same page, so we have set ``multiple: true`` in it to get all of them. |
| 122 | + |
| 123 | +children (default: Blank) |
| 124 | +~~~~~~~~~~~~~~~~~~~~~~~~~ |
| 125 | + |
| 126 | +An element can have multiple child elements. In the example above the |
| 127 | +parent element ``pokemon`` has these "children" - |
| 128 | +``name``,\ ``price``,\ ``image``,\ ``url``. Each child element could |
| 129 | +also more children and can be nested. If an element has children, it's |
| 130 | +``type`` property is ignored. |
| 131 | + |
| 132 | +format |
| 133 | +~~~~~~ |
| 134 | + |
| 135 | +You can define custom formatters, which can be used for minor |
| 136 | +transformations on the extracted data. In Python, these formatters are |
| 137 | +defined as |
| 138 | + |
| 139 | +:: |
| 140 | + |
| 141 | + from selectorlib.formatter import Formatter |
| 142 | + |
| 143 | + class Price(Formatter): |
72 | 144 | def format(self, text):
|
73 |
| - return int(text) |
74 |
| ->>> yaml_string = """ |
75 |
| - title: |
76 |
| - css: "h1" |
| 145 | + return text.replace('\\n','').strip() |
| 146 | + |
| 147 | +Used in the YAML as |
| 148 | + |
| 149 | +.. code:: yaml |
| 150 | +
|
| 151 | + price: |
| 152 | + css: span.woocommerce-Price-amount |
| 153 | + type: Text |
| 154 | + format: Price |
| 155 | +
|
| 156 | +And passed to the Extractor when it is initialized |
| 157 | + |
| 158 | +.. code:: python |
| 159 | +
|
| 160 | + formatters = Formatter.get_all() |
| 161 | + Extractor.from_yaml_file('a.yaml', formatters=formatters) |
| 162 | +
|
| 163 | +Python Example |
| 164 | +-------------- |
| 165 | + |
| 166 | +``scrapeme_listing_page.yml`` |
| 167 | + |
| 168 | +.. code:: yaml |
| 169 | +
|
| 170 | + pokemon: |
| 171 | + css: li.product |
| 172 | + multiple: true |
77 | 173 | type: Text
|
78 |
| - num: |
79 |
| - css: "h2 span" |
80 |
| - format: Number |
81 |
| - """ |
82 |
| ->>> formatters = Formatter.get_all() |
83 |
| ->>> extractor = Extractor.from_yaml_string(yaml_string, formatters=formatters) |
84 |
| ->>> html = """ |
85 |
| - <h1>Title</h1> |
86 |
| - <h2> |
87 |
| - <span>123</span> |
88 |
| - </h2> |
89 |
| - """ |
90 |
| ->>> extractor.extract(html) |
| 174 | + children: |
| 175 | + name: |
| 176 | + css: h2.woocommerce-loop-product__title |
| 177 | + type: Text |
| 178 | + price: |
| 179 | + css: span.woocommerce-Price-amount |
| 180 | + type: Text |
| 181 | + image: |
| 182 | + css: img.attachment-woocommerce_thumbnail |
| 183 | + type: Attribute |
| 184 | + attribute: src |
| 185 | + url: |
| 186 | + css: a.woocommerce-LoopProduct-link |
| 187 | + type: Link |
| 188 | +
|
| 189 | +``extract.py`` |
| 190 | + |
| 191 | +.. code:: python |
| 192 | +
|
| 193 | + import requests |
| 194 | + from selectorlib import Extractor, Formatter |
| 195 | + from pprint import pprint |
| 196 | + import re |
| 197 | +
|
| 198 | + # Define a formatter for Price |
| 199 | + class Price(Formatter): |
| 200 | + def format(self, text): |
| 201 | + price = re.findall(r'\d+\.\d+',text) |
| 202 | + if price: |
| 203 | + return price[0] |
| 204 | + return None |
| 205 | + formatters = Formatter.get_all() |
| 206 | + extractor = Extractor.from_yaml_file('./scrapeme_listing_page.yml',formatters=formatters) |
| 207 | +
|
| 208 | + #Download the HTML and use Extractor |
| 209 | + r = requests.get('https://scrapeme.live/shop/') |
| 210 | + data = extractor.extract(r.text) |
| 211 | + pprint(data) |
| 212 | +
|
| 213 | +:: |
| 214 | + |
| 215 | + $ python extract.py |
| 216 | + |
| 217 | +:: |
| 218 | + |
| 219 | + {'pokemon': [{'image': 'https://scrapeme.live/wp-content/uploads/2018/08/001-350x350.png', |
| 220 | + 'name': 'Bulbasaur', |
| 221 | + 'price': '63.00', |
| 222 | + 'url': 'https://scrapeme.live/shop/Bulbasaur/'}, |
| 223 | + {'image': 'https://scrapeme.live/wp-content/uploads/2018/08/002-350x350.png', |
| 224 | + 'name': 'Ivysaur', |
| 225 | + 'price': '87.00', |
| 226 | + 'url': 'https://scrapeme.live/shop/Ivysaur/'}, |
| 227 | + {'image': 'https://scrapeme.live/wp-content/uploads/2018/08/003-350x350.png', |
| 228 | + 'name': 'Venusaur', |
| 229 | + 'price': '105.00', |
| 230 | + 'url': 'https://scrapeme.live/shop/Venusaur/'}, |
| 231 | + {'image': 'https://scrapeme.live/wp-content/uploads/2018/08/004-350x350.png', |
| 232 | + 'name': 'Charmander', |
| 233 | + 'price': '48.00', |
| 234 | + 'url': 'https://scrapeme.live/shop/Charmander/'}, |
| 235 | + {'image': 'https://scrapeme.live/wp-content/uploads/2018/08/005-350x350.png', |
| 236 | + 'name': 'Charmeleon', |
| 237 | + 'price': '165.00', |
| 238 | + 'url': 'https://scrapeme.live/shop/Charmeleon/'}, |
| 239 | + {'image': 'https://scrapeme.live/wp-content/uploads/2018/08/006-350x350.png', |
| 240 | + 'name': 'Charizard', |
| 241 | + 'price': '156.00', |
| 242 | + 'url': 'https://scrapeme.live/shop/Charizard/'}, |
| 243 | + {'image': 'https://scrapeme.live/wp-content/uploads/2018/08/007-350x350.png', |
| 244 | + 'name': 'Squirtle', |
| 245 | + 'price': '130.00', |
| 246 | + 'url': 'https://scrapeme.live/shop/Squirtle/'}, |
| 247 | + {'image': 'https://scrapeme.live/wp-content/uploads/2018/08/008-350x350.png', |
| 248 | + 'name': 'Wartortle', |
| 249 | + 'price': '123.00', |
| 250 | + 'url': 'https://scrapeme.live/shop/Wartortle/'}, |
| 251 | + {'image': 'https://scrapeme.live/wp-content/uploads/2018/08/009-350x350.png', |
| 252 | + 'name': 'Blastoise', |
| 253 | + 'price': '76.00', |
| 254 | + 'url': 'https://scrapeme.live/shop/Blastoise/'}, |
| 255 | + {'image': 'https://scrapeme.live/wp-content/uploads/2018/08/010-350x350.png', |
| 256 | + 'name': 'Caterpie', |
| 257 | + 'price': '73.00', |
| 258 | + 'url': 'https://scrapeme.live/shop/Caterpie/'}, |
| 259 | + {'image': 'https://scrapeme.live/wp-content/uploads/2018/08/011-350x350.png', |
| 260 | + 'name': 'Metapod', |
| 261 | + 'price': '148.00', |
| 262 | + 'url': 'https://scrapeme.live/shop/Kakuna/'}, |
| 263 | + {'image': 'https://scrapeme.live/wp-content/uploads/2018/08/015-350x350.png', |
| 264 | + 'name': 'Beedrill', |
| 265 | + 'price': '168.00', |
| 266 | + 'url': 'https://scrapeme.live/shop/Beedrill/'}, |
| 267 | + {'image': 'https://scrapeme.live/wp-content/uploads/2018/08/016-350x350.png', |
| 268 | + 'name': 'Pidgey', |
| 269 | + 'price': '159.00', |
| 270 | + 'url': 'https://scrapeme.live/shop/Pidgey/'}]} |
| 271 | + |
0 commit comments