from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def get_lyrics(artist):
    """Print all lyrics for the given artist from azlyrics.com.

    Fetches the artist's index page, follows every song link found on
    it, and prints each song's URL followed by its lyrics text.

    Args:
        artist: Artist name in azlyrics URL-slug form (lowercase, no
            spaces or punctuation), e.g. "gunsandroses".

    Raises:
        requests.HTTPError: if the artist page or any song page
            returns an unsuccessful HTTP status code.
    """
    base_url = 'https://www.azlyrics.com/'

    # Artist pages live at "<base>/<first letter>/<slug>.html",
    # e.g. azlyrics.com/g/gunsandroses.html.  Reuse base_url rather
    # than repeating the literal.
    artist_url = base_url + artist[0] + '/' + artist + '.html'

    # Browser-like User-Agent: azlyrics rejects default client UAs.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'}

    response = requests.get(artist_url, headers=headers)
    # Fail loudly on 404/403 instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    # Song links are the <a target="_blank"> anchors on the artist
    # page.  The first such anchor is skipped — presumably site chrome
    # rather than a song link (preserves the original [1:] behavior).
    for song_link in soup.find_all('a', attrs={'target': '_blank'})[1:]:
        # Resolve the href against the page it appeared on instead of
        # slicing off a hard-coded "../" prefix; this also tolerates
        # absolute hrefs should the site layout change.
        link = urljoin(artist_url, song_link['href'])
        print(link)

        song_response = requests.get(link, headers=headers)
        song_response.raise_for_status()
        # Distinct name: don't clobber the artist-page soup mid-loop.
        song_soup = BeautifulSoup(song_response.text, 'lxml')

        # The lyrics sit in a <div> with no class or id, so instead of
        # selecting it directly, strip every element that is definitely
        # not lyrics and take whatever text remains.
        for tag in song_soup.find_all(['a', 'small', 'script', 'title', 'span', 'b', 'h1']):
            tag.decompose()
        for tag in song_soup.select('.hidden'):
            tag.decompose()

        print(song_soup.get_text(strip=True, separator=' '))
| 41 | + |
| 42 | + |
if __name__ == "__main__":
    # Example run: Guns N' Roses (azlyrics URL slug).  Guarded so that
    # importing this module does not trigger a full site scrape.
    get_lyrics("gunsandroses")
| 44 | + |
| 45 | + |