Skip to content

Commit ecb7fbe

Browse files
committed
Updated
0 parents  commit ecb7fbe

4 files changed

Lines changed: 197 additions & 0 deletions

File tree

.vscode/settings.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"C_Cpp.errorSquiggles": "enabled"
3+
}

Log_Files.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
1. Installation of Python and packages on Windows.
2+
2. Download From This Site- https://www.python.org/downloads/
3+
3. Save the .exe file in the folder where you want to download it.
4+
4. You have to add python.exe to the PATH environment variable of your system.
5+
5.Install the pip dependencies from the command - |pip install requests| paste it in your command line or you can copy from the source -https://pypi.org/project/requests/
6+
6. Install the package called Beautiful Soup from the source - https://pypi.org/project/beautifulsoup4/ - copy the command, paste it in the cmd and wait for the installation, or use the command - |pip install beautifulsoup4|.
7+
7.Displaying websites html source code in chrome - 1.open chrome 2.right click on your mouse 3.click on inspect
8+
8.Obtain HTML using beautifulsoup4.
9+
9.

Requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
pip install requests
2+
pip install beautifulsoup4
3+
pip install pandas

main.py

Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
import os
2+
import re
3+
import requests
4+
from bs4 import BeautifulSoup
5+
import pandas as pd
6+
import csv
7+
8+
9+
def scrape_html_content(url):
    """Fetch *url* and return the body of the response as text.

    Args:
        url: Address to request with ``requests.get``.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` when the server
            answers with an error status.
    """
    page = requests.get(url)
    # Fail loudly on 4xx/5xx instead of returning an error page as content.
    page.raise_for_status()
    return page.text
13+
14+
15+
def scrape_text_content(url):
    """Fetch *url* and return the text of the page with the markup removed.

    Args:
        url: Address to request with ``requests.get``.

    Returns:
        The string produced by ``BeautifulSoup.get_text()`` for the page,
        parsed with the ``"html.parser"`` backend.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` when the server
            answers with an error status.
    """
    page = requests.get(url)
    page.raise_for_status()
    return BeautifulSoup(page.text, "html.parser").get_text()
20+
21+
22+
def scrape_connected_pages(url, num_pages):
    """Scrape text from up to *num_pages* pages, following one link per page.

    Starting at *url*, each page is fetched, its text is collected, and the
    first usable link on it becomes the next page to fetch. A link is usable
    when, after being resolved against the current page's URL, it is an
    http(s) address that has not been fetched yet.

    Args:
        url: Address of the first page.
        num_pages: Maximum number of pages to fetch.

    Returns:
        The text of every fetched page, joined with newlines. The walk stops
        early when a page offers no usable link.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` when a page
            answers with an error status.
    """
    # Local import keeps this function independent of the file's import block.
    from urllib.parse import urldefrag, urljoin, urlsplit

    texts = []
    visited = set()
    for _ in range(num_pages):
        response = requests.get(url)
        response.raise_for_status()

        # Remember both the requested and the final (post-redirect) address,
        # without fragments, so "#section" links do not re-fetch this page.
        visited.add(urldefrag(url)[0])
        visited.add(urldefrag(response.url)[0])

        soup = BeautifulSoup(response.text, "html.parser")
        texts.append(soup.get_text())

        next_url = None
        for anchor in soup.find_all("a", href=True):
            # hrefs are frequently relative ("/about", "page2.html"); passing
            # them to requests.get unresolved raises MissingSchema.
            candidate = urldefrag(urljoin(response.url, anchor["href"]))[0]
            if urlsplit(candidate).scheme not in ("http", "https"):
                continue  # mailto:, javascript:, tel:, ...
            if candidate in visited:
                continue
            next_url = candidate
            break

        if next_url is None:
            break
        url = next_url
    return "\n".join(texts)
36+
37+
38+
def scrape_specific_tags(url, tag_name):
    """Fetch *url* and return the text of every ``<tag_name>`` element.

    Args:
        url: Address to request with ``requests.get``.
        tag_name: Tag name handed to ``BeautifulSoup.find_all``
            (for example ``"p"``, ``"h1"`` or ``"a"``).

    Returns:
        A list with one ``get_text()`` string per matching element, in the
        order ``find_all`` returns them.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` when the server
            answers with an error status.
    """
    page = requests.get(url)
    page.raise_for_status()
    document = BeautifulSoup(page.text, "html.parser")
    matches = document.find_all(tag_name)
    return [element.get_text() for element in matches]
44+
45+
46+
def scrape_email_addresses(url):
    """Fetch *url* and return the e-mail addresses found in its raw HTML.

    Args:
        url: Address to request with ``requests.get``.

    Returns:
        A single string with one address per line, in order of first
        appearance and without duplicates. Empty when nothing matches.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` when the server
            answers with an error status.
    """
    response = requests.get(url)
    response.raise_for_status()

    # The search runs over raw markup, so the pattern must stop at characters
    # that cannot belong to an address; a bare "\S+@\S+" would capture
    # surrounding HTML such as 'href="mailto:a@b.com">a@b.com</a>'.
    email_pattern = r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"
    emails = re.findall(email_pattern, response.text)

    # An address usually appears twice (mailto: link and visible text);
    # dict.fromkeys removes repeats while keeping first-seen order.
    return "\n".join(dict.fromkeys(emails))
53+
54+
55+
def scrape_phone_numbers(url):
    """Fetch *url* and return the phone-number-like strings in its raw HTML.

    The pattern accepts three shapes, where the separators between digit
    groups are an optional ``-``, ``.`` or whitespace character:
    ``ddd dddd dddd``, ``(ddd) dddd dddd`` and ``ddd dddd``.

    Args:
        url: Address to request with ``requests.get``.

    Returns:
        A list of the matched strings, in document order.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` when the server
            answers with an error status.
    """
    page = requests.get(url)
    page.raise_for_status()
    # The whole pattern sits in one capture group, so findall yields strings.
    matcher = re.compile(
        r"(\d{3}[-\.\s]??\d{4}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{4}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})"
    )
    return matcher.findall(page.text)
62+
63+
64+
def scrape_addresses(url):
    """Fetch *url* and return the text of its ``<address>`` and ``<span>`` elements.

    Args:
        url: Address to request with ``requests.get``.

    Returns:
        A list of stripped, non-empty text strings, one per matching element.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` when the server
            answers with an error status.
    """
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Find elements that contain address information. ``string=True`` is the
    # current spelling of the deprecated ``text=True`` filter.
    # NOTE(review): this filter only keeps elements whose ``.string`` is set,
    # so an <address> containing nested tags (e.g. <br>) is presumably
    # skipped - confirm whether that is intended.
    address_elements = soup.find_all(["address", "span"], string=True)

    # Whitespace-only elements pass the filter above; drop them so callers do
    # not receive empty entries.
    stripped_texts = (element.get_text().strip() for element in address_elements)
    return [text for text in stripped_texts if text]
78+
79+
80+
# ... (Rest of the code)
81+
def scrape_html_content_and_store(url, folder_path):
    """Fetch *url* and save its HTML as ``scraped_content.html`` in *folder_path*.

    Args:
        url: Address to request with ``requests.get``.
        folder_path: Directory to write into. It is created, together with
            any missing parent directories, when it does not exist. An empty
            string means the current working directory.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` when the server
            answers with an error status.
        OSError: When the directory cannot be created (for example because
            *folder_path* names an existing file) or the file cannot be
            written.
    """
    response = requests.get(url)
    response.raise_for_status()
    html_content = response.text

    # os.mkdir cannot create nested paths such as "out/pages"; makedirs can,
    # and exist_ok=True tolerates an existing directory while still raising
    # when the path is occupied by a regular file.
    if folder_path:
        os.makedirs(folder_path, exist_ok=True)

    file_path = os.path.join(folder_path, "scraped_content.html")

    with open(file_path, "w", encoding="utf-8") as f:
        f.write(html_content)
95+
96+
97+
def scrape_text_and_store_as_txt(url, file_path):
    """Fetch *url* and write the page's text to *file_path* as UTF-8.

    The text is split on blank lines (``"\\n\\n"``) and every resulting piece
    is written followed by a blank line.

    Args:
        url: Address to request with ``requests.get``.
        file_path: Destination file; it is opened in ``"w"`` mode.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` when the server
            answers with an error status.
    """
    page = requests.get(url)
    page.raise_for_status()
    page_text = BeautifulSoup(page.text, "html.parser").get_text()

    with open(file_path, "w", encoding="utf-8") as out:
        # One write per paragraph, each terminated by a double newline.
        out.writelines(chunk + "\n\n" for chunk in page_text.split("\n\n"))
109+
110+
111+
def _prompt_int(prompt):
    """Show *prompt* and return the reply as an int, or None if it is not one."""
    try:
        return int(input(prompt))
    except ValueError:
        return None


def main():
    """Run the interactive scraper menu until the user chooses to leave.

    Each round asks for a URL and a menu option, runs the matching scraper
    and prints or stores its result. Entering ``0`` as the URL ends the loop;
    choosing option ``0`` raises ``SystemExit``. Non-numeric menu input and
    network or file errors are reported and the loop continues with the next
    round instead of aborting the program.
    """
    print(
        "\t\t******************************************WEB_SCRAPPER_TOOL*****************************************\n\t\t******************************************MADE_BY_DAYANANDA*****************************************"
    )

    menu = (
        "[$]Select an option:\n"
        "[1] Scrap only the HTML content of the URL\n"
        "[2] Scrap only the text from the URL\n"
        "[3] Scrap text from multiple connected webpages\n"
        "[4] Scrap particular tags from the URL\n"
        "[5] Scrap Email Address from the URL\n"
        "[6] Scrap Phone Number from the URL\n"
        "[7] Scrap Address from the URL\n"
        "[8] Scrap the HTML content from the URL and store in a folder\n"
        "[9] Scrap the text from the URL and store as txt\n"
        "[0] Exit.\n"
        "##################################################################\n"
        "Enter option number: \n"
        "------------------>> "
    )

    while True:
        url = input("Enter the URL you want to Scrap: ")
        if url == "0":
            break

        option = _prompt_int(menu)
        if option is None:
            # A typo at the prompt should not end the session with a traceback.
            print("Invalid option.")
            continue
        if option == 0:
            # Same effect as the site-provided exit(), which is meant for the
            # interactive shell and is absent when Python runs without `site`.
            raise SystemExit

        try:
            if option == 1:
                html_content = scrape_html_content(url)
                print(html_content)
            elif option == 2:
                text_content = scrape_text_content(url)
                print(text_content)
            elif option == 3:
                num_pages = _prompt_int("Enter the number of pages to scrape: ")
                if num_pages is None:
                    print("Invalid number of pages.")
                    continue
                multi_page_text = scrape_connected_pages(url, num_pages)
                print(multi_page_text)
            elif option == 4:
                tag_name = input("Enter the tag name to scrape (e.g., 'p', 'h1', 'a'): ")
                tags_text = scrape_specific_tags(url, tag_name)
                for text in tags_text:
                    print(text)
            elif option == 5:
                emails = scrape_email_addresses(url)
                print(emails)
            elif option == 6:
                phone_numbers = scrape_phone_numbers(url)
                for phone_number in phone_numbers:
                    print(phone_number)
            elif option == 7:
                addresses = scrape_addresses(url)
                for address in addresses:
                    print(address)
            elif option == 8:
                folder_path = input("Enter the path to the folder to store the HTML content: ")
                scrape_html_content_and_store(url, folder_path)
                print("HTML content saved in the folder:", folder_path)
            elif option == 9:
                txt_file_path = input("Enter the path where you want to save the text file: ")
                scrape_text_and_store_as_txt(url, txt_file_path)
                print("Scraped text has been saved to the text file.")
            else:
                print("Invalid option.")
        except (requests.RequestException, OSError) as error:
            # Malformed URLs, unreachable hosts, HTTP error statuses and
            # unwritable paths all land here; report and keep the menu alive.
            print("Scraping failed:", error)
179+
180+
181+
# Start the interactive menu only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)