Skip to content

Commit

Permalink
extract_many
Browse files Browse the repository at this point in the history
  • Loading branch information
AkulS1008 committed Jun 14, 2022
1 parent bdc019d commit 4c908ea
Show file tree
Hide file tree
Showing 6 changed files with 2,134 additions and 39 deletions.
Binary file modified .DS_Store
Binary file not shown.
87 changes: 83 additions & 4 deletions dputils/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def __clean_url__(url):
url = url.split('?')[0]
return url

def get_webpage_data(url, headers = None, cookies = None) -> BeautifulSoup:
def get_webpage_data(url, headers = None, cookies = None, clean = True) -> BeautifulSoup:
"""
Obtains data from any website
Returns data as a BeautifulSoup object
Expand All @@ -21,8 +21,10 @@ def get_webpage_data(url, headers = None, cookies = None) -> BeautifulSoup:
url (str): url of the website to take data from
headers (str): default value is None, which then creates fake useragent
cookies (str): default value is None, which then satisfies given website's cookie policy
clean (bool): default value is True, which cleans url
"""
url = __clean_url__(url)
if clean:
url = __clean_url__(url)
if not __validate_url__(url):
print("Invalid url")
return None
Expand Down Expand Up @@ -83,5 +85,82 @@ def extract_one(soup : BeautifulSoup, **selectors) -> dict:
print("Could not extract data")
raise e

def extract_many():
pass
"""
selectorList : [{'tag' : 'span', 'attrs' : {'id' : 'productTitle'}},...]
for key,info in selectors.items():
tag = info.get('tag', 'div')
attrs = info.get('attrs', None)
target = soup.find(tag, attrs = attrs)
if target is None:
print("No data found")
items = target.find_all(tag, attrs = attrs)
item_count = len(items)
data_list = []
if item_count == 0:
print("No data found")
else:
print(f"{item_count} items found")
for item in items:
try:
title = item.find(tag, attrs = attrs).text
except:
title = None
if title is not None:
data_list.append({'title' : title})
return data_list
"""

#target is part of selectors. target = {'tag' : 'div', 'attrs' : {...}}
#items is mandatory. refers to repeating blocks of html code
def extract_many(soup : BeautifulSoup, **selectors) -> list:
    """
    Extracts data from repeating blocks of html code
    Returns a list of dicts, one dict per repeating block found

    Args:
        soup (BeautifulSoup): parsed webpage to extract data from
        **selectors: keyword args describing what to extract:
            target (dict, optional): {'tag' : ..., 'attrs' : ...} locating the
                section that contains the items; defaults to the whole soup
            items (dict, required): {'tag' : ..., 'attrs' : ...} describing the
                repeating blocks of html code
            any other key (dict): {'tag' : ..., 'attrs' : ..., 'output' : ...};
                'output' may be 'text' (default), 'href' or 'src', and the
                extracted value is stored under that keyword in each dict

    Returns:
        list of dicts (empty when no blocks match), or None when the
        selectors are invalid or 'items' is missing
    """
    # Narrow the search space to the target section, if one was given
    if 'target' in selectors:
        tag = selectors['target'].get('tag')
        attrs = selectors['target'].get('attrs')
        if tag is None:
            print("Please give valid selectors")
            print("Example: target = {'tag' : 'div', 'attrs' : {...}}")
            return None
        target = soup.find(tag, attrs)
        if target is None:
            print(f"Could not find target section with this {tag} and {attrs}")
            return None
    else:
        target = soup
    if 'items' not in selectors:
        print("items is required as a parameter containing dict containing tag, attrs as keys")
        print("Example: items = {'tag' : 'div', 'attrs' : {...}}")
        return None
    data_list = []
    items = target.find_all(selectors['items'].get('tag'), attrs = selectors['items'].get('attrs'))
    items_count = len(items)
    if items_count == 0:
        print("No data found")
        return data_list
    print(f"{items_count} items found")
    # Drop the structural selectors; whatever remains maps output keys to
    # per-item fields. pop with a default: 'target' may legitimately be
    # absent (the original raised KeyError here when no target was given).
    selectors.pop('target', None)
    selectors.pop('items', None)
    for idx, item in enumerate(items):
        data = {}
        try:
            for key, info in selectors.items():
                tag = info.get('tag', 'div')
                attrs = info.get('attrs', None)
                output = info.get('output', 'text')
                if output == 'text':
                    data[key] = item.find(tag, attrs = attrs).text.strip()
                elif output == 'href':
                    data[key] = item.find(tag, attrs = attrs).attrs.get('href')
                elif output == 'src':
                    data[key] = item.find(tag, attrs = attrs).attrs.get('src')
                else:
                    print('Not suitable output')
            data_list.append(data)
        except AttributeError:
            # item.find(...) returned None for one of the selectors
            print("Item skipped at index:", idx)
    print("All items extracted")
    return data_list

Loading

0 comments on commit 4c908ea

Please sign in to comment.