|
| 1 | +import os |
| 2 | +import re |
| 3 | +import requests |
| 4 | +from bs4 import BeautifulSoup |
| 5 | +import pandas as pd |
| 6 | +import csv |
| 7 | + |
| 8 | + |
def scrape_html_content(url, timeout=10):
    """Download *url* and return the raw response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a silent host.

    Returns:
        The decoded body of the response (``response.text``).

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Any other transport failure, including
            ``requests.Timeout`` when *timeout* elapses.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text
| 13 | + |
| 14 | + |
def scrape_text_content(url, timeout=10):
    """Download *url* and return only the text found in its HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a silent host.

    Returns:
        The concatenated text nodes of the parsed document, as produced by
        ``BeautifulSoup.get_text()``.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Any other transport failure, including
            ``requests.Timeout`` when *timeout* elapses.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    return soup.get_text()
| 20 | + |
| 21 | + |
def scrape_connected_pages(url, num_pages, timeout=10):
    """Collect the text of up to *num_pages* pages, following links.

    Starting at *url*, each page's text is recorded and the first ``<a>``
    element that carries an ``href`` is followed to reach the next page.
    The walk stops early when a page has no such link.

    Args:
        url: Address of the first page.
        num_pages: Maximum number of pages to fetch.
        timeout: Seconds to wait for each request before giving up.

    Returns:
        The text of every visited page joined with newlines.

    Raises:
        requests.HTTPError: A page answered with a 4xx/5xx status.
        requests.RequestException: Any other transport failure, including a
            link whose scheme ``requests`` cannot fetch (e.g. ``mailto:``).
    """
    # Imported locally so this function is self-contained with respect to
    # the module's import block.
    from urllib.parse import urljoin

    texts = []
    for _ in range(num_pages):
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        texts.append(soup.get_text())

        next_page = soup.find("a", href=True)
        if not next_page:
            break
        # hrefs are frequently relative ("/about", "page2.html"); resolve
        # them against the URL actually served (after redirects) so the
        # next request receives an absolute address.
        url = urljoin(response.url, next_page["href"])
    return "\n".join(texts)
| 36 | + |
| 37 | + |
def scrape_specific_tags(url, tag_name, timeout=10):
    """Return the text of every *tag_name* element found at *url*.

    Args:
        url: Address of the page to fetch.
        tag_name: HTML tag to look for, e.g. ``"p"``, ``"h1"`` or ``"a"``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with one string per matching element, in document order.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Any other transport failure, including
            ``requests.Timeout`` when *timeout* elapses.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    return [tag.get_text() for tag in soup.find_all(tag_name)]
| 44 | + |
| 45 | + |
def scrape_email_addresses(url, timeout=10):
    """Extract e-mail addresses from the raw HTML served at *url*.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A single string with one address per line, in order of first
        appearance and without duplicates. Empty when nothing matches.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Any other transport failure, including
            ``requests.Timeout`` when *timeout* elapses.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # The search runs over raw markup, so the pattern must stop at
    # characters that cannot belong to an address. A bare ``\S+@\S+`` would
    # swallow surrounding HTML such as ``href="mailto:a@b.com">a@b.com</a>``.
    email_pattern = r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"
    emails = re.findall(email_pattern, response.text)

    # The same address usually appears in both the mailto link and its
    # label; dict.fromkeys removes repeats while keeping document order.
    return "\n".join(dict.fromkeys(emails))
| 53 | + |
| 54 | + |
def scrape_phone_numbers(url, timeout=10):
    """Find phone-number-like digit groups in the raw HTML served at *url*.

    The pattern accepts three shapes, each with an optional ``-``, ``.`` or
    whitespace separator between groups: ``ddd dddd dddd``,
    ``(ddd) dddd dddd`` and ``ddd dddd``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of the matched strings in document order.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Any other transport failure, including
            ``requests.Timeout`` when *timeout* elapses.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # The whole alternation is one capture group, so findall yields the
    # full match for each hit rather than a tuple of sub-groups.
    phone_pattern = r"(\d{3}[-\.\s]??\d{4}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{4}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})"
    return re.findall(phone_pattern, response.text)
| 62 | + |
| 63 | + |
def scrape_addresses(url, timeout=10):
    """Return the text of every ``<address>`` and ``<span>`` element at *url*.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of stripped, non-empty strings, one per matching element, in
        document order.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Any other transport failure, including
            ``requests.Timeout`` when *timeout* elapses.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    addresses = []
    # No ``text=True`` / ``string=True`` filter here: that filter only keeps
    # tags whose ``.string`` is set, i.e. tags with a single text child, so
    # multi-line addresses written with ``<br>`` would be dropped.
    for element in soup.find_all(["address", "span"]):
        address_text = element.get_text().strip()
        if address_text:  # skip elements that hold only whitespace
            addresses.append(address_text)
    return addresses
| 78 | + |
| 79 | + |
| 80 | +# ... (Rest of the code) |
def scrape_html_content_and_store(url, folder_path, timeout=10):
    """Download *url* and save its HTML as ``scraped_content.html``.

    Args:
        url: Address of the page to fetch.
        folder_path: Directory that receives the file. It is created,
            together with any missing parent directories, when absent.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Any other transport failure, including
            ``requests.Timeout`` when *timeout* elapses.
        OSError: *folder_path* exists but is not a directory, or the file
            cannot be written.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    html_content = response.text

    # makedirs also builds missing parents and, unlike a swallowed
    # FileExistsError, still reports a path that exists as a regular file.
    os.makedirs(folder_path, exist_ok=True)

    file_path = os.path.join(folder_path, "scraped_content.html")
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(html_content)
| 95 | + |
| 96 | + |
def scrape_text_and_store_as_txt(url, file_path, timeout=10):
    """Download *url* and write its text content to *file_path*.

    The page text is split on blank lines and every resulting paragraph is
    written followed by a blank line.

    Args:
        url: Address of the page to fetch.
        file_path: Destination of the UTF-8 text file; overwritten if it
            already exists.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Any other transport failure, including
            ``requests.Timeout`` when *timeout* elapses.
        OSError: The file cannot be opened or written.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    paragraphs = soup.get_text().split("\n\n")

    with open(file_path, "w", encoding="utf-8") as f:
        # One batched call instead of a write per paragraph; each paragraph
        # keeps its trailing blank line.
        f.writelines(paragraph + "\n\n" for paragraph in paragraphs)
| 109 | + |
| 110 | + |
def main():
    """Run the interactive scraper menu until the user chooses to exit.

    The banner is printed once, then the user is repeatedly asked for a URL
    and a menu option. Entering ``0`` at either prompt ends the session.
    Invalid menu input, failed requests and file-system errors are reported
    and the menu is shown again instead of aborting the program.
    """
    print(
        "\t\t******************************************WEB_SCRAPPER_TOOL*****************************************\n\t\t******************************************MADE_BY_DAYANANDA*****************************************"
    )

    # The loop lives inside main() so that importing this module has no side
    # effects and the banner appears before the first prompt.
    while True:
        url = input("Enter the URL you want to Scrap: ")
        if url == "0":
            break

        try:
            option = int(
                input(
                    "[$]Select an option:\n"
                    "[1] Scrap only the HTML content of the URL\n"
                    "[2] Scrap only the text from the URL\n"
                    "[3] Scrap text from multiple connected webpages\n"
                    "[4] Scrap particular tags from the URL\n"
                    "[5] Scrap Email Address from the URL\n"
                    "[6] Scrap Phone Number from the URL\n"
                    "[7] Scrap Address from the URL\n"
                    "[8] Scrap the HTML content from the URL and store in a folder\n"
                    "[9] Scrap the text from the URL and store as txt\n"
                    "[0] Exit.\n"
                    "##################################################################\n"
                    "Enter option number: \n"
                    "------------------>> "
                )
            )
        except ValueError:
            # Non-numeric input is just another invalid choice.
            print("Invalid option.")
            continue

        if option == 0:
            break

        try:
            if option == 1:
                print(scrape_html_content(url))
            elif option == 2:
                print(scrape_text_content(url))
            elif option == 3:
                try:
                    num_pages = int(input("Enter the number of pages to scrape: "))
                except ValueError:
                    print("Invalid number of pages.")
                    continue
                print(scrape_connected_pages(url, num_pages))
            elif option == 4:
                tag_name = input("Enter the tag name to scrape (e.g., 'p', 'h1', 'a'): ")
                for text in scrape_specific_tags(url, tag_name):
                    print(text)
            elif option == 5:
                print(scrape_email_addresses(url))
            elif option == 6:
                for phone_number in scrape_phone_numbers(url):
                    print(phone_number)
            elif option == 7:
                for address in scrape_addresses(url):
                    print(address)
            elif option == 8:
                folder_path = input("Enter the path to the folder to store the HTML content: ")
                scrape_html_content_and_store(url, folder_path)
                print("HTML content saved in the folder:", folder_path)
            elif option == 9:
                txt_file_path = input("Enter the path where you want to save the text file: ")
                scrape_text_and_store_as_txt(url, txt_file_path)
                print("Scraped text has been saved to the text file.")
            else:
                print("Invalid option.")
        except requests.RequestException as exc:
            # Must precede OSError: RequestException derives from it.
            print("Request failed:", exc)
        except OSError as exc:
            print("Could not write output:", exc)


if __name__ == "__main__":
    main()
0 commit comments