Skip to content

Commit

Permalink
Added Web Scraping with BeautifulSoup module
Browse files Browse the repository at this point in the history
  • Loading branch information
asryan11 committed Oct 6, 2023
1 parent 7f90135 commit 9bbd329
Show file tree
Hide file tree
Showing 2 changed files with 371 additions and 0 deletions.
24 changes: 24 additions & 0 deletions Web Scraping/Requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
async-generator==1.10
attrs==21.4.0
beautifulsoup4==4.10.0
beautifultable==1.0.1
certifi==2021.10.8
cffi==1.15.0
charset-normalizer==2.0.12
cryptography==36.0.1
h11==0.13.0
idna==3.3
outcome==1.1.0
pycparser==2.21
pyOpenSSL==22.0.0
PySocks==1.7.1
requests==2.27.1
selenium==4.1.2
sniffio==1.2.0
sortedcontainers==2.4.0
soupsieve==2.3.1
trio==0.20.0
trio-websocket==0.9.2
urllib3==1.26.8
wcwidth==0.2.5
wsproto==1.1.0
347 changes: 347 additions & 0 deletions Web Scraping/Web Scraping with BeautifulSoup.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,347 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"source": [
"# Requirements for this notebook (run in a terminal, or via %pip in a cell):\n",
"#   pip3 install requests\n",
"#   pip3 install bs4   # installs the beautifulsoup4 package"
],
"metadata": {
"id": "-gVGfpsFg4Ns"
},
"execution_count": 11,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# bs4 gives us BeautifulSoup, which turns raw HTML into a searchable tree.\n",
"from bs4 import BeautifulSoup\n",
"# requests performs the HTTP fetch; it has a simpler API than urllib.\n",
"import requests\n",
"\n",
"# The page to scrape - a personal blog here; change it to any URL you like.\n",
"url=\"https://getpython.wordpress.com/\"\n",
"\n",
"# Download the page. raise_for_status() fails loudly on an HTTP error\n",
"# (4xx/5xx) instead of silently parsing an error page below.\n",
"source=requests.get(url)\n",
"source.raise_for_status()\n",
"\n",
"# Build the soup from the response body. Use the built-in 'html.parser'\n",
"# explicitly: passing just 'html' makes bs4 guess among installed parsers,\n",
"# which can give different results on different machines.\n",
"soup=BeautifulSoup(source.text,'html.parser')\n",
"\n",
"# find() returns the FIRST matching element (or None if there is no match).\n",
"title=soup.find('title') # pass the HTML tag name you want to look up.\n",
"print(\"this is with html tags :\",title)\n",
"\n",
"qwery=soup.find('h1') # first h1 tag on the page.\n",
"\n",
"# .text extracts only the text content, stripped of HTML tags.\n",
"print(\"this is without html tags:\",qwery.text)\n",
"\n",
"\n",
"links=soup.find('a') # first hyperlink, via the \"a\" (anchor) tag\n",
"print(links)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "29YQuSxKg_py",
"outputId": "fad81c49-e0ad-4f16-d716-97e5a820a146"
},
"execution_count": 12,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"this is with html tags : <title>Fun with python programming – A programming language for revolution</title>\n",
"this is without html tags: Fun with python programming\n",
"<a class=\"screen-reader-text skip-link\" href=\"#content\">Skip to content</a>\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Tag objects support dict-style attribute access: pull the href attribute\n",
"# out of the anchor tag found in the previous cell (`links`).\n",
"print(links['href'])"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "dNci1cV-hDne",
"outputId": "4fa5d4cb-9d0c-4d6a-e025-631e879f5c0a"
},
"execution_count": 13,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"#content\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# find_all() returns EVERY matching element as a list, not just the first.\n",
"many_link=soup.find_all('a') # every anchor tag on the page\n",
"total_links=len(many_link) # len() gives the number of matches found\n",
"print(\"total links in my website :\",total_links)\n",
"print()\n",
"for i in many_link[:6]: # slice to show only the first 6 links\n",
" print(i)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "95HDbH_6hGRa",
"outputId": "80fd7795-6972-4aea-a292-a8906ee82043"
},
"execution_count": 14,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"total links in my website : 100\n",
"\n",
"<a class=\"screen-reader-text skip-link\" href=\"#content\">Skip to content</a>\n",
"<a href=\"https://getpython.wordpress.com/\" rel=\"home\">\n",
"<div class=\"cover\"></div>\n",
"</a>\n",
"<a class=\"screen-reader-text search-toggle\" href=\"#search-container\">Search</a>\n",
"<a href=\"https://getpython.wordpress.com/\" rel=\"home\">Fun with python programming</a>\n",
"<a aria-current=\"page\" href=\"/\">Home</a>\n",
"<a href=\"https://getpython.wordpress.com/contact/\">Contact</a>\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"second_link=many_link[1] # the second link on the page (index 1 of many_link)\n",
"print(second_link)\n",
"print()\n",
"print(\"href is :\",second_link['href']) # just the href attribute of that anchor tag"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Y8bBLk4phJbk",
"outputId": "d527d42f-4bcd-4f60-a7cd-80a75a2a8f6c"
},
"execution_count": 15,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"<a href=\"https://getpython.wordpress.com/\" rel=\"home\">\n",
"<div class=\"cover\"></div>\n",
"</a>\n",
"\n",
"href is : https://getpython.wordpress.com/\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# find() also works on a Tag, searching only inside it: grab the div nested\n",
"# within the second link.\n",
"nested_div=second_link.find('div')\n",
"# The extracted div element, including any inner elements it contains.\n",
"print(nested_div)\n",
"print()\n",
"# The 'class' attribute comes back as a LIST (a tag can have several classes).\n",
"z=(nested_div['class'])\n",
"print(z)\n",
"print(type(z))\n",
"print()\n",
"# \" \".join() turns the list of class names back into one space-separated string.\n",
"print(\"class name of div is :\",\" \".join(nested_div['class']))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "S8IgJqz7hMWC",
"outputId": "8f59199c-222b-4eb4-a155-a06564817d01"
},
"execution_count": 16,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"<div class=\"cover\"></div>\n",
"\n",
"['cover']\n",
"<class 'list'>\n",
"\n",
"class name of div is : cover\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Same workflow on a different site: fetch the Wikipedia article on World War II.\n",
"wiki=requests.get(\"https://en.wikipedia.org/wiki/World_War_II\")\n",
"# NOTE: this rebinds `soup`, replacing the blog soup built earlier in the notebook.\n",
"# Use 'html.parser' explicitly: passing just 'html' makes bs4 guess among\n",
"# installed parsers, which can behave differently on different machines.\n",
"soup=BeautifulSoup(wiki.text,'html.parser')\n",
"print(soup.find('title'))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "HapUBD0xhPOI",
"outputId": "bf68db0d-e780-4dad-dcd2-5f12aeb49c1f"
},
"execution_count": 17,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"<title>World War II - Wikipedia</title>\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Collect the article's table of contents: every div with CSS class 'toc'.\n",
"# NOTE(review): this cell produced no output; current Wikipedia pages may not\n",
"# include a div.toc in the served HTML, so find_all can return an empty list.\n",
"ww2_contents=soup.find_all(\"div\",class_='toc')\n",
"for i in ww2_contents:\n",
" print(i.text)"
],
"metadata": {
"id": "uhxBXUPwhSMD"
},
"execution_count": 18,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Grab the article's infobox summary table(s) and dump their text content.\n",
"# class_ (trailing underscore) is bs4's keyword for filtering by CSS class,\n",
"# since `class` is a reserved word in Python.\n",
"overview=soup.find_all('table',class_='infobox vevent')\n",
"for z in overview:\n",
" print(z.text)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "FdBYCr3fhU9P",
"outputId": "75fe439e-6638-457a-8bed-eb8f96e42dc4"
},
"execution_count": 19,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"World War IIClockwise from top left: \n",
"German Stuka dive bombers on the Eastern Front, 1943\n",
"British Matilda II tanks during the North African campaign, 1941\n",
"Soviet troops at the Battle of Stalingrad, 1942–1943\n",
"U.S. warships in Lingayen Gulf in the Philippines, 1945\n",
"Soviet soldier raising a flag over the Reichstag after the Battle of Berlin, 1945\n",
"U.S. atomic bombing of Nagasaki in Japan, 1945\n",
"Date1 September 1939 – 2 September 1945[a] (6 years, 1 day)LocationMajor theaters: \n",
"Europe\n",
"Pacific\n",
"Atlantic\n",
"Indian Ocean\n",
"South-East Asia\n",
"China\n",
"Japan\n",
"Middle East\n",
"Mediterranean\n",
"North Africa\n",
"Horn of Africa\n",
"Central Africa\n",
"Australia\n",
"Caribbean\n",
"North and South America\n",
"Result\n",
"Allied victory\n",
"Fall of Nazi Germany, Fascist Italy, and Imperial Japan\n",
"Allied military occupations of Germany, Japan, Austria, and Korea\n",
"Beginning of the Nuclear Age\n",
"Dissolution of the League of Nations and creation of the United Nations\n",
"Decolonisation of Asia and Africa and decline of European international influence\n",
"Emergence of the United States and the Soviet Union as rival superpowers and beginning of the Cold War (see Aftermath of World War II)Participants\n",
"Allies\n",
"AxisCommanders and leaders\n",
"Main Allied leaders:\n",
" Winston Churchill\n",
" Joseph Stalin\n",
" Franklin D. Roosevelt\n",
" Chiang Kai-shek\n",
"\n",
"Main Axis leaders:\n",
" Adolf Hitler\n",
" Hirohito\n",
" Benito Mussolini\n",
"Casualties and losses\n",
"\n",
"Military dead:\n",
"Over 16,000,000\n",
"Civilian dead:\n",
"Over 45,000,000\n",
"Total dead:\n",
"Over 61,000,000\n",
"(1937–1945)\n",
"...further details\n",
"\n",
"\n",
"Military dead:\n",
"Over 8,000,000\n",
"Civilian dead:\n",
"Over 4,000,000\n",
"Total dead:\n",
"Over 12,000,000\n",
"(1937–1945)\n",
"...further details\n",
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "NBFRwleXhdEy"
},
"execution_count": 19,
"outputs": []
}
]
}

0 comments on commit 9bbd329

Please sign in to comment.