
Commit 8700200

Added lyrics scraper
1 parent a6091ae commit 8700200

File tree

1 file changed: 45 additions & 0 deletions


lyrics-scraper/scrape.py

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def get_lyrics(artist):
    """Print all lyrics for the given artist on azlyrics.com.

    Takes an artist name as an argument, follows all song links
    on the artist's page, and prints the lyrics for each song."""

    base_url = 'http://www.azlyrics.com/'

    # Artist pages follow the pattern "azlyrics.com/w/west.html":
    # the first letter of the name, then the name itself.
    artist_url = base_url + artist[0] + '/' + artist + '.html'

    # Fetch the artist's page with a browser-style User-Agent so the
    # request looks like an ordinary page view.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'}
    response = requests.get(artist_url, headers=headers)

    # Make the html soup object!
    soup = BeautifulSoup(response.text, "lxml")

    # Find all song links on the page (skipping the first '_blank' match)
    # and create a new html soup object for each linked page.
    # For each song page, remove irrelevant text and print the lyrics.
    for song_link in soup.find_all('a', attrs={'target': '_blank'})[1:]:
        # Song hrefs start with "../"; drop it before joining with the base URL.
        link = urljoin(base_url, song_link['href'][3:])
        print(link)
        response = requests.get(link, headers=headers)
        song_soup = BeautifulSoup(response.text, "lxml")

        # Remove unnecessary text items on the page
        # (the lyrics live in a div with no class or other selectors).
        for text in song_soup.find_all(['a', 'small', 'script', 'title', 'span', 'b', 'h1']):
            text.decompose()

        for text in song_soup.select('.hidden'):
            text.decompose()

        # Get the remaining text (the lyrics) from the page.
        print(song_soup.get_text(strip=True, separator=" "))


get_lyrics("gunsandroses")
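A note on running this: azlyrics.com tends to block clients that fetch many pages in quick succession, so pausing between requests is prudent. Below is a minimal sketch of how the two requests.get calls could be wrapped; polite_get is a hypothetical helper that is not part of this commit, and the two-second default delay is an assumed courtesy pause, not a documented rate limit.

import time

import requests


def polite_get(url, headers, delay=2.0):
    """Fetch a URL after a short pause so the scraper does not hammer
    the server. The 2-second default is an assumed courtesy delay,
    not a documented azlyrics limit."""
    time.sleep(delay)
    response = requests.get(url, headers=headers)
    response.raise_for_status()  # surface 404s and blocks instead of parsing an error page
    return response

With this helper, each requests.get(..., headers=headers) call in get_lyrics would become polite_get(..., headers).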
