Added Flipkart scraper for GSSoC'23 contribution #1478

Merged
43 changes: 43 additions & 0 deletions FlipkartScraper/README.md
@@ -0,0 +1,43 @@
# Flipkart Scraper
This is a simple scraper designed to extract product information from Flipkart, an e-commerce platform. The scraper is written in Python and consists of the following files:

1. `dbConnector.py`: Contains the code for connecting to the SQLite database and the operations used to store the scraped data.

2. `genricHtmlib.py`: Provides a set of generic functions and utilities for fetching pages, parsing HTML, and extracting data from web pages.

3. `main.py`: The main entry point of the scraper. It initializes the necessary components and orchestrates the scraping process (a rough sketch of this flow follows this list).

4. `productList.py`: Contains the list of categories that you want to scrape.

5. `__pycache__/`: Contains the compiled Python bytecode for faster start-up. You can safely ignore this directory.

6. `useragent.py`: Defines the User-Agent string that the scraper uses for making HTTP requests. It helps mimic the behavior of a real web browser.

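Since `main.py` is not shown in this part of the diff, the following is only a rough, hypothetical sketch of how these pieces could fit together. The category names, search URL, and XPaths are illustrative assumptions; only `SeleniumScraper` and `FlipkartDatabaseConnector` come from the files in this PR.

```
# Hypothetical sketch only; the real main.py in this PR may differ.
from datetime import datetime

import lxml.html as html

from dbConnector import FlipkartDatabaseConnector
from genricHtmlib import SeleniumScraper


def run():
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    scraper = SeleniumScraper()
    db = FlipkartDatabaseConnector(stamp)
    db.schemaMaker()  # creates the tables; raises if they already exist in flipkart.db

    categories = ["laptops", "headphones"]  # assumed examples; normally read from productList.py
    for category in categories:
        url = "https://www.flipkart.com/search?q={}".format(category)
        page = scraper.fetch_request_normal(url)
        if page is None:
            continue
        doc = html.fromstring(page)
        # Placeholder XPaths; the real selectors live in main.py.
        names = scraper.cleanData(scraper.get_xpath_data(doc, "//div[@class='_4rR01T']/text()") or [])
        prices = scraper.cleanData(scraper.get_xpath_data(doc, "//div[@class='_30jeq3 _1_WHN1']/text()") or [])
        for name, price in zip(names, prices):
            db.insertProduct({
                "sku": name,
                "name": name,
                "description": name,
                "image_path": "",
                "category": category,
                "timestamp": stamp,
                "URL": url,
                "price": price,
            })
    db.removeDuplicates()


if __name__ == "__main__":
    run()
```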
## Usage

To use the Flipkart scraper, follow these steps:

- Make sure you have Python installed on your system.
- Create a virtual environment by running the following command:
```
python3 -m venv venv
```

- Activate the virtual environment, then install the required dependencies by running the following command:
```
pip install -r requirements.txt
```
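
`requirements.txt` itself is not shown in this part of the diff; based on the imports in `genricHtmlib.py` it needs at least the following packages (pinned versions omitted):

```
requests
pandas
lxml
selenium
```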

- Open `productList.py` and add the categories that you want to scrape; a hypothetical example of its layout is shown below.
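
The exact layout of `productList.py` is not shown in this part of the diff; as a hypothetical example, it could be as simple as a list of category search terms:

```
# productList.py (hypothetical example; match whatever structure main.py expects)
productList = [
    "mobiles",
    "laptops",
    "headphones",
]
```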

- Execute the scraper by running the following command:

```
python main.py
```

The scraper will process the product URLs one by one, extracting details such as the product name, price, and description, plus any other fields specified in the code. The scraped data is stored in the local SQLite database (`flipkart.db`) or whichever output format is configured.
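
Since `dbConnector.py` writes to a local SQLite file (`flipkart.db`), a finished run can be inspected with Python's built-in `sqlite3` module; a minimal sketch:

```
import sqlite3

# Read back a few of the scraped products stored by dbConnector.py.
conn = sqlite3.connect("flipkart.db")
cur = conn.cursor()
cur.execute("SELECT name, price, URL FROM products LIMIT 5")
for name, price, url in cur.fetchall():
    print(name, price, url)
conn.close()
```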

Please note that web scraping should be done responsibly and in compliance with the terms and conditions of the target website. Make sure to respect the website's policies regarding scraping frequency and data usage.

If you encounter any issues or have any questions, feel free to open an issue or reach out to the project maintainer.

Built with ❤️ by [Paritosh Tripathi](https://github.com/paritoshtripathi935)
44 changes: 44 additions & 0 deletions FlipkartScraper/dbConnector.py
@@ -0,0 +1,44 @@
import sqlite3
import os

class FlipkartDatabaseConnector:
def __init__(self, stamp):
self.dbPath = "flipkart.db"
self.conn = sqlite3.connect(self.dbPath)
self.cur = self.conn.cursor()
self.welcomeMessage = "Welcome to Flipkart Scraper. This is the database for the Flipkart Scraper. This database was created on {}.".format(stamp)

def schemaMaker(self):
# creating tables
self.cur.execute("""CREATE TABLE products (
id INTEGER PRIMARY KEY AUTOINCREMENT,
sku TEXT NOT NULL,
name TEXT NOT NULL,
description TEXT NOT NULL,
image_path TEXT NOT NULL,
category TEXT NOT NULL,
timestamp TEXT NOT NULL,
URL TEXT NOT NULL,
price TEXT NOT NULL
);""")
self.conn.commit()
self.cur.execute("CREATE TABLE product_matches (id INTEGER PRIMARY KEY AUTOINCREMENT, product_id INTEGER NOT NULL, product_sku INTEGER NOT NULL, match_id INTEGER NOT NULL, match_sku INTEGER NOT NULL);")
self.conn.commit()

def insertProduct(self, productDetails):
self.cur.execute("INSERT INTO products (sku, name, description, image_path, category, timestamp, URL, price) VALUES (?, ?, ?, ?, ?, ?, ?, ?)", (productDetails["sku"], productDetails["name"], productDetails["description"], productDetails["image_path"], productDetails["category"], productDetails["timestamp"], productDetails["URL"], productDetails["price"]))
self.conn.commit()

def fetchAllProducts(self):
self.cur.execute("SELECT * FROM products")
return self.cur.fetchall()

def clearDatabase(self):
self.cur.execute("DELETE FROM products")
self.conn.commit()
self.cur.execute("DELETE FROM product_matches")
self.conn.commit()

def removeDuplicates(self):
self.cur.execute("DELETE FROM products WHERE rowid NOT IN (SELECT MIN(rowid) FROM products GROUP BY sku)")
self.conn.commit()
Binary file added FlipkartScraper/flipkart.db
Binary file not shown.
153 changes: 153 additions & 0 deletions FlipkartScraper/genricHtmlib.py
@@ -0,0 +1,153 @@
from multiprocessing import Pool
import os
from datetime import datetime
import lxml.html as html
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import warnings
import requests
warnings.filterwarnings("ignore")

class SeleniumScraper:
def __init__(self, timeout=10):
self.timeout = timeout
self.reqSession = requests.Session()
self.stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
self.storagePath = os.path.join(
os.path.dirname(os.path.abspath(__file__))
)

self.headers = {
'authority': 'www.amazon.com',
'pragma': 'no-cache',
'cache-control': 'no-cache',
'dnt': '1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'sec-fetch-site': 'none',
'sec-fetch-mode': 'navigate',
'sec-fetch-dest': 'document',
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
}

def fetch_request_normal(self, url, params=None):
try:
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
}
response = self.reqSession.get(url, headers=headers)

if response.status_code == 200:
return response.text

if response.status_code == 301:
# retry with redirect
response = requests.get(response.headers['Location'])
response.raise_for_status()
if response.status_code == 200:
return response.text

if response.status_code == 503:
#print("Request Failed Response status code for url: {} and status code: {}".format(url, 503))
return None

        except Exception as e:
            print("Exception occurred for url: {} and exception: {}".format(url, e))
        return None

    def get_xpath_link(self, doc, xpath, website):
        try:
            name = doc.xpath("".join(xpath))
            # Prefix relative links with the website's base URL.
            for i in range(len(name)):
                if name[i].startswith("/"):
                    name[i] = website + name[i]
            return name

        except Exception as e:
            print("Error in getting links for xpath {}: {}".format(xpath, e))
            return None

def get_selenium_driver(self):
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-logging")
chrome_options.add_argument("--log-level=3")
chrome_options.add_argument("--silent")
chrome_options.add_argument("--blink-settings=imagesEnabled=false")
        # Selenium 4 removed the deprecated chrome_options keyword; use options instead.
        driver = webdriver.Chrome(options=chrome_options)
return driver

def fetch_request_selenium(self, url, waiting_time=1):
try:
driver = self.get_selenium_driver()
driver.get(url)
time.sleep(waiting_time)
doc = html.fromstring(driver.page_source)
driver.close()
return doc

        except Exception as e:
            print("Exception occurred for url: {} and exception: {}".format(url, e))
            return None

    def get_xpath_data(self, doc, xpath):
        try:
            return doc.xpath(xpath)

        except Exception as e:
            print("Error in getting data for xpath {}: {}".format(xpath, e))
            return None

def slow_page_scroll(self, driver, speed):
current_scroll_position = driver.execute_script("return window.pageYOffset;")
while current_scroll_position < driver.execute_script(
"return document.body.scrollHeight;"
):
driver.execute_script(
"window.scrollTo(0, arguments[0]);", current_scroll_position
)
current_scroll_position += 1000
time.sleep(speed)

def data_storage(self, df_list, unique_id, name, storageFormat, storagePath=None):
df_combined = pd.concat(df_list, ignore_index=True)
df_combined.drop_duplicates(subset=unique_id, inplace=True)
if storageFormat == "csv":
df_combined.to_csv(
self.storagePath +"/{}_{}.csv".format(name, self.stamp),
index=False,
)
elif storageFormat == "json":
df_combined.to_json(
self.storagePath + "/{}_{}.json".format(name, self.stamp),
orient="records",
)

def cleanData(self, array):
array = [x.strip() for x in array]
array = list(filter(None, array))
array = [x.encode("ascii", "ignore").decode() for x in array]
array = [x.replace("\n", "") for x in array]
return array


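For reference, a minimal sketch of how the Selenium path and the pandas-based `data_storage` helper above could be used together (requires a local Chrome/chromedriver install; the URL and XPath are illustrative assumptions, not taken from the project):

```
import pandas as pd

from genricHtmlib import SeleniumScraper

scraper = SeleniumScraper()

# Render the page with headless Chrome, then pull text nodes out of the DOM.
doc = scraper.fetch_request_selenium("https://www.flipkart.com/search?q=laptops", waiting_time=2)
if doc is not None:
    titles = scraper.cleanData(scraper.get_xpath_data(doc, "//div[@class='_4rR01T']/text()") or [])
    df = pd.DataFrame({"name": titles})
    # Writes a de-duplicated laptops_<stamp>.csv next to genricHtmlib.py.
    scraper.data_storage([df], unique_id="name", name="laptops", storageFormat="csv")
```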