Skip to content

Commit

Permalink
extract_many
Browse files Browse the repository at this point in the history
  • Loading branch information
AkulS1008 committed Jun 14, 2022
1 parent bdc019d commit 4c908ea
Show file tree
Hide file tree
Showing 6 changed files with 2,134 additions and 39 deletions.
Binary file modified .DS_Store
Binary file not shown.
87 changes: 83 additions & 4 deletions dputils/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def __clean_url__(url):
url = url.split('?')[0]
return url

def get_webpage_data(url, headers = None, cookies = None) -> BeautifulSoup:
def get_webpage_data(url, headers = None, cookies = None, clean = True) -> BeautifulSoup:
"""
Obtains data from any website
Returns data as a BeautifulSoup object
Expand All @@ -21,8 +21,10 @@ def get_webpage_data(url, headers = None, cookies = None) -> BeautifulSoup:
url (str): url of the website to take data from
headers (str): default value is None, which then creates fake useragent
cookies (str): default value is None, which then satisfies given website's cookie policy
clean (bool): default value is True, which cleans url
"""
url = __clean_url__(url)
if clean:
url = __clean_url__(url)
if not __validate_url__(url):
print("Invalid url")
return None
Expand Down Expand Up @@ -83,5 +85,82 @@ def extract_one(soup : BeautifulSoup, **selectors) -> dict:
print("Could not extract data")
raise e

def extract_many():
pass
"""
selectorList : [{'tag' : 'span', 'attrs' : {'id' : 'productTitle'}},...]
for key,info in selectors.items():
tag = info.get('tag', 'div')
attrs = info.get('attrs', None)
target = soup.find(tag, attrs = attrs)
if target is None:
print("No data found")
items = target.find_all(tag, attrs = attrs)
item_count = len(items)
data_list = []
if item_count == 0:
print("No data found")
else:
print(f"{item_count} items found")
for item in items:
try:
title = item.find(tag, attrs = attrs).text
except:
title = None
if title is not None:
data_list.append({'title' : title})
return data_list
"""

#target is part of selectors. target = {'tag' : 'div', 'attrs' : {...}}
#items is mandatory. refers to repeating blocks of html code
def extract_many(soup : BeautifulSoup, **selectors) -> list:
    """
    Extracts data from repeating blocks of html code
    Returns a list of dicts, one dict per repeating block found

    Args:
        soup (BeautifulSoup): parsed webpage to extract data from
        **selectors: keyword args describing what to extract:
            target (dict, optional): {'tag' : ..., 'attrs' : ...} locating the
                section that contains the items; defaults to the whole soup
            items (dict, required): {'tag' : ..., 'attrs' : ...} describing the
                repeating blocks of html code
            any other key (dict): {'tag' : ..., 'attrs' : ..., 'output' : ...};
                'output' may be 'text' (default), 'href' or 'src', and the
                extracted value is stored under that keyword in each dict

    Returns:
        list of dicts (empty when no blocks match), or None when the
        selectors are invalid or 'items' is missing
    """
    # Narrow the search space to the target section, if one was given
    if 'target' in selectors:
        tag = selectors['target'].get('tag')
        attrs = selectors['target'].get('attrs')
        if tag is None:
            print("Please give valid selectors")
            print("Example: target = {'tag' : 'div', 'attrs' : {...}}")
            return None
        target = soup.find(tag, attrs)
        if target is None:
            print(f"Could not find target section with this {tag} and {attrs}")
            return None
    else:
        target = soup
    if 'items' not in selectors:
        print("items is required as a parameter containing dict containing tag, attrs as keys")
        print("Example: items = {'tag' : 'div', 'attrs' : {...}}")
        return None
    data_list = []
    items = target.find_all(selectors['items'].get('tag'), attrs = selectors['items'].get('attrs'))
    items_count = len(items)
    if items_count == 0:
        print("No data found")
        return data_list
    print(f"{items_count} items found")
    # Drop the structural selectors; whatever remains maps output keys to
    # per-item fields. pop with a default: 'target' may legitimately be
    # absent (the original raised KeyError here when no target was given).
    selectors.pop('target', None)
    selectors.pop('items', None)
    for idx, item in enumerate(items):
        data = {}
        try:
            for key, info in selectors.items():
                tag = info.get('tag', 'div')
                attrs = info.get('attrs', None)
                output = info.get('output', 'text')
                if output == 'text':
                    data[key] = item.find(tag, attrs = attrs).text.strip()
                elif output == 'href':
                    data[key] = item.find(tag, attrs = attrs).attrs.get('href')
                elif output == 'src':
                    data[key] = item.find(tag, attrs = attrs).attrs.get('src')
                else:
                    print('Not suitable output')
            data_list.append(data)
        except AttributeError:
            # item.find(...) returned None for one of the selectors
            print("Item skipped at index:", idx)
    print("All items extracted")
    return data_list

Loading

0 comments on commit 4c908ea

Please sign in to comment.