Skip to content

Commit

Permalink
Added Web Scraping with BeautifulSoup module
Browse files Browse the repository at this point in the history
  • Loading branch information
asryan11 committed Oct 6, 2023
1 parent 7f90135 commit 9bbd329
Show file tree
Hide file tree
Showing 2 changed files with 371 additions and 0 deletions.
24 changes: 24 additions & 0 deletions Web Scraping/Requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
async-generator==1.10
attrs==21.4.0
beautifulsoup4==4.10.0
beautifultable==1.0.1
certifi==2021.10.8
cffi==1.15.0
charset-normalizer==2.0.12
cryptography==36.0.1
h11==0.13.0
idna==3.3
outcome==1.1.0
pycparser==2.21
pyOpenSSL==22.0.0
PySocks==1.7.1
requests==2.27.1
selenium==4.1.2
sniffio==1.2.0
sortedcontainers==2.4.0
soupsieve==2.3.1
trio==0.20.0
trio-websocket==0.9.2
urllib3==1.26.8
wcwidth==0.2.5
wsproto==1.1.0
347 changes: 347 additions & 0 deletions Web Scraping/Web Scraping with BeautifulSoup.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,347 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"source": [
"# Requirements for this notebook (run in a terminal, or via %pip in a cell):\n",
"#   pip3 install requests\n",
"#   pip3 install bs4   # installs the beautifulsoup4 package"
],
"metadata": {
"id": "-gVGfpsFg4Ns"
},
"execution_count": 11,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# bs4 gives us BeautifulSoup, which turns raw HTML into a searchable tree.\n",
"from bs4 import BeautifulSoup\n",
"# requests performs the HTTP fetch; it has a simpler API than urllib.\n",
"import requests\n",
"\n",
"# The page to scrape - a personal blog here; change it to any URL you like.\n",
"url=\"https://getpython.wordpress.com/\"\n",
"\n",
"# Download the page. raise_for_status() fails loudly on an HTTP error\n",
"# (4xx/5xx) instead of silently parsing an error page below.\n",
"source=requests.get(url)\n",
"source.raise_for_status()\n",
"\n",
"# Build the soup from the response body. Use the built-in 'html.parser'\n",
"# explicitly: passing just 'html' makes bs4 guess among installed parsers,\n",
"# which can give different results on different machines.\n",
"soup=BeautifulSoup(source.text,'html.parser')\n",
"\n",
"# find() returns the FIRST matching element (or None if there is no match).\n",
"title=soup.find('title') # pass the HTML tag name you want to look up.\n",
"print(\"this is with html tags :\",title)\n",
"\n",
"qwery=soup.find('h1') # first h1 tag on the page.\n",
"\n",
"# .text extracts only the text content, stripped of HTML tags.\n",
"print(\"this is without html tags:\",qwery.text)\n",
"\n",
"\n",
"links=soup.find('a') # first hyperlink, via the \"a\" (anchor) tag\n",
"print(links)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "29YQuSxKg_py",
"outputId": "fad81c49-e0ad-4f16-d716-97e5a820a146"
},
"execution_count": 12,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"this is with html tags : <title>Fun with python programming – A programming language for revolution</title>\n",
"this is without html tags: Fun with python programming\n",
"<a class=\"screen-reader-text skip-link\" href=\"#content\">Skip to content</a>\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Tag objects support dict-style attribute access: pull the href attribute\n",
"# out of the anchor tag found in the previous cell (`links`).\n",
"print(links['href'])"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "dNci1cV-hDne",
"outputId": "4fa5d4cb-9d0c-4d6a-e025-631e879f5c0a"
},
"execution_count": 13,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"#content\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# find_all() returns EVERY matching element as a list, not just the first.\n",
"many_link=soup.find_all('a') # every anchor tag on the page\n",
"total_links=len(many_link) # len() gives the number of matches found\n",
"print(\"total links in my website :\",total_links)\n",
"print()\n",
"for i in many_link[:6]: # slice to show only the first 6 links\n",
" print(i)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "95HDbH_6hGRa",
"outputId": "80fd7795-6972-4aea-a292-a8906ee82043"
},
"execution_count": 14,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"total links in my website : 100\n",
"\n",
"<a class=\"screen-reader-text skip-link\" href=\"#content\">Skip to content</a>\n",
"<a href=\"https://getpython.wordpress.com/\" rel=\"home\">\n",
"<div class=\"cover\"></div>\n",
"</a>\n",
"<a class=\"screen-reader-text search-toggle\" href=\"#search-container\">Search</a>\n",
"<a href=\"https://getpython.wordpress.com/\" rel=\"home\">Fun with python programming</a>\n",
"<a aria-current=\"page\" href=\"/\">Home</a>\n",
"<a href=\"https://getpython.wordpress.com/contact/\">Contact</a>\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"second_link=many_link[1] # the second link on the page (index 1 of many_link)\n",
"print(second_link)\n",
"print()\n",
"print(\"href is :\",second_link['href']) # just the href attribute of that anchor tag"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Y8bBLk4phJbk",
"outputId": "d527d42f-4bcd-4f60-a7cd-80a75a2a8f6c"
},
"execution_count": 15,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"<a href=\"https://getpython.wordpress.com/\" rel=\"home\">\n",
"<div class=\"cover\"></div>\n",
"</a>\n",
"\n",
"href is : https://getpython.wordpress.com/\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# find() also works on a Tag, searching only inside it: grab the div nested\n",
"# within the second link.\n",
"nested_div=second_link.find('div')\n",
"# The extracted div element, including any inner elements it contains.\n",
"print(nested_div)\n",
"print()\n",
"# The 'class' attribute comes back as a LIST (a tag can have several classes).\n",
"z=(nested_div['class'])\n",
"print(z)\n",
"print(type(z))\n",
"print()\n",
"# \" \".join() turns the list of class names back into one space-separated string.\n",
"print(\"class name of div is :\",\" \".join(nested_div['class']))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "S8IgJqz7hMWC",
"outputId": "8f59199c-222b-4eb4-a155-a06564817d01"
},
"execution_count": 16,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"<div class=\"cover\"></div>\n",
"\n",
"['cover']\n",
"<class 'list'>\n",
"\n",
"class name of div is : cover\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Same workflow on a different site: fetch the Wikipedia article on World War II.\n",
"wiki=requests.get(\"https://en.wikipedia.org/wiki/World_War_II\")\n",
"# NOTE: this rebinds `soup`, replacing the blog soup built earlier in the notebook.\n",
"# Use 'html.parser' explicitly: passing just 'html' makes bs4 guess among\n",
"# installed parsers, which can behave differently on different machines.\n",
"soup=BeautifulSoup(wiki.text,'html.parser')\n",
"print(soup.find('title'))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "HapUBD0xhPOI",
"outputId": "bf68db0d-e780-4dad-dcd2-5f12aeb49c1f"
},
"execution_count": 17,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"<title>World War II - Wikipedia</title>\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Collect the article's table of contents: every div with CSS class 'toc'.\n",
"# NOTE(review): this cell produced no output; current Wikipedia pages may not\n",
"# include a div.toc in the served HTML, so find_all can return an empty list.\n",
"ww2_contents=soup.find_all(\"div\",class_='toc')\n",
"for i in ww2_contents:\n",
" print(i.text)"
],
"metadata": {
"id": "uhxBXUPwhSMD"
},
"execution_count": 18,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Grab the article's infobox summary table(s) and dump their text content.\n",
"# class_ (trailing underscore) is bs4's keyword for filtering by CSS class,\n",
"# since `class` is a reserved word in Python.\n",
"overview=soup.find_all('table',class_='infobox vevent')\n",
"for z in overview:\n",
" print(z.text)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "FdBYCr3fhU9P",
"outputId": "75fe439e-6638-457a-8bed-eb8f96e42dc4"
},
"execution_count": 19,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"World War IIClockwise from top left: \n",
"German Stuka dive bombers on the Eastern Front, 1943\n",
"British Matilda II tanks during the North African campaign, 1941\n",
"Soviet troops at the Battle of Stalingrad, 1942–1943\n",
"U.S. warships in Lingayen Gulf in the Philippines, 1945\n",
"Soviet soldier raising a flag over the Reichstag after the Battle of Berlin, 1945\n",
"U.S. atomic bombing of Nagasaki in Japan, 1945\n",
"Date1 September 1939 – 2 September 1945[a] (6 years, 1 day)LocationMajor theaters: \n",
"Europe\n",
"Pacific\n",
"Atlantic\n",
"Indian Ocean\n",
"South-East Asia\n",
"China\n",
"Japan\n",
"Middle East\n",
"Mediterranean\n",
"North Africa\n",
"Horn of Africa\n",
"Central Africa\n",
"Australia\n",
"Caribbean\n",
"North and South America\n",
"Result\n",
"Allied victory\n",
"Fall of Nazi Germany, Fascist Italy, and Imperial Japan\n",
"Allied military occupations of Germany, Japan, Austria, and Korea\n",
"Beginning of the Nuclear Age\n",
"Dissolution of the League of Nations and creation of the United Nations\n",
"Decolonisation of Asia and Africa and decline of European international influence\n",
"Emergence of the United States and the Soviet Union as rival superpowers and beginning of the Cold War (see Aftermath of World War II)Participants\n",
"Allies\n",
"AxisCommanders and leaders\n",
"Main Allied leaders:\n",
" Winston Churchill\n",
" Joseph Stalin\n",
" Franklin D. Roosevelt\n",
" Chiang Kai-shek\n",
"\n",
"Main Axis leaders:\n",
" Adolf Hitler\n",
" Hirohito\n",
" Benito Mussolini\n",
"Casualties and losses\n",
"\n",
"Military dead:\n",
"Over 16,000,000\n",
"Civilian dead:\n",
"Over 45,000,000\n",
"Total dead:\n",
"Over 61,000,000\n",
"(1937–1945)\n",
"...further details\n",
"\n",
"\n",
"Military dead:\n",
"Over 8,000,000\n",
"Civilian dead:\n",
"Over 4,000,000\n",
"Total dead:\n",
"Over 12,000,000\n",
"(1937–1945)\n",
"...further details\n",
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "NBFRwleXhdEy"
},
"execution_count": 19,
"outputs": []
}
]
}

0 comments on commit 9bbd329

Please sign in to comment.