"""Scrape CNET computing articles and store them in a SQLite database.

Reconstructed from the patch "added webscraping_example" (commit
13fcaaca22cde76a535ba655f649d12711e6f1f7), which adds this script as
``Webscraping Start to End/webscraping_example.py``.

The script runs in two stages:

1. DATA SCRAPING - walk the listing pages, build an ``Article`` for every
   ``.item`` entry and fetch its date and body text.
2. DATA STORAGE - insert the collected articles into the ``articles`` table
   of ``cnet_articles.db``.
"""

import sqlite3
from contextlib import closing

# Listing-page URL template; ``{page}`` is replaced by the page number.
# Kept under its own name so that per-article URLs can never overwrite it.
LISTING_URL = 'https://www.cnet.com/tech/computing/{page}/'

# SQLite database file the scraped articles are written to.
DB_PATH = 'cnet_articles.db'


def _new_session():
    """Create a new ``requests_html.HTMLSession``."""
    # Imported lazily so the Article model and the storage helper can be
    # imported without the scraping dependency being installed.
    import requests_html

    return requests_html.HTMLSession()


class Article:
    """A single scraped article.

    ``title`` and ``url`` come from the listing page; the remaining
    attributes start as ``None`` and are filled in while scraping.
    """

    def __init__(self, title, url):
        self.title = title
        self.url = url
        self.author = None
        self.short_description = None
        self.date = None
        self.text = None

    def get_full_article(self, session=None):
        """Fetch the article page and fill in ``date`` and ``text``.

        Args:
            session: Optional HTTP session to reuse. When omitted, a
                temporary session is created and closed again before
                returning.

        ``date`` is read from the ``datetime`` attribute of the first
        ``<time>`` element; ``text`` is the newline-joined text of every
        ``<p>`` inside the first ``.article-main-body`` element. Whatever
        is missing on the page is left as ``None`` instead of raising.
        """
        owns_session = session is None
        if owns_session:
            session = _new_session()
        try:
            page = session.get(self.url).html

            # find(..., first=True) yields None when nothing matches, so
            # each element is checked before it is dereferenced.
            time_el = page.find('time', first=True)
            if time_el is not None:
                try:
                    self.date = time_el.attrs['datetime']
                except KeyError as e:
                    print(f'KeyError: {e}')

            body_el = page.find('.article-main-body', first=True)
            if body_el is not None:
                self.text = '\n'.join(p.text for p in body_el.find('p'))
        finally:
            # Only close sessions created here; a caller-supplied session
            # stays open for further requests.
            if owns_session:
                session.close()

    def __str__(self):
        return "{}, {}".format(self.title, self.url)


def _parse_listing_item(item):
    """Build an ``Article`` from one ``.item`` element of a listing page.

    Returns ``None`` when the entry has no ``<h3>`` title or no link other
    than author-profile links, so the caller can skip it.
    """
    heading = item.find('h3', first=True)
    # Links containing 'profiles' are excluded. If several links remain,
    # the first one in the collection's iteration order is used.
    links = [link for link in item.absolute_links if 'profiles' not in link]
    if heading is None or not links:
        return None

    article = Article(heading.text, links[0])
    summary = item.find('p', first=True)
    if summary is not None:
        article.short_description = summary.text
    byline = item.find('span', first=True)
    if byline is not None:
        article.author = byline.text.replace('by ', '')
    return article


def scrape_articles(session, pages=range(1, 2)):
    """Scrape every article listed on the given listing pages.

    Args:
        session: HTTP session used for the listing and article requests.
        pages: Iterable of page numbers substituted into ``LISTING_URL``.
            Defaults to the first page only.

    Returns:
        A list of ``Article`` objects in the order they were scraped.
    """
    articles = []
    for page in pages:
        listing = session.get(LISTING_URL.format(page=page)).html
        for item in listing.find('.item'):
            article = _parse_listing_item(item)
            if article is None:
                continue
            print('Scraping article:', article.title)
            article.get_full_article(session)
            articles.append(article)
            print('Scraped article:', article.title)
            print('-' * 20)
    return articles


def store_articles(conn, articles):
    """Insert ``articles`` into the ``articles`` table, creating it if needed.

    Args:
        conn: Open ``sqlite3`` connection.
        articles: Iterable of ``Article`` objects.

    The title is the primary key and rows are written with
    ``INSERT OR IGNORE``, so an article whose title is already stored is
    skipped. All statements run in one transaction that is committed on
    success and rolled back if an error occurs.
    """
    with conn:
        conn.execute(
            '''CREATE TABLE IF NOT EXISTS articles (
                artTitle TEXT PRIMARY KEY,
                artUrl TEXT,
                artAuthor TEXT,
                artShortDesc TEXT,
                artDate TEXT,
                artText TEXT
            )'''
        )
        conn.executemany(
            'INSERT OR IGNORE INTO articles VALUES (?,?,?,?,?,?)',
            (
                (
                    article.title,
                    article.url,
                    article.author,
                    article.short_description,
                    article.date,
                    article.text,
                )
                for article in articles
            ),
        )


def main():
    """Run the scrape and persist the results to ``DB_PATH``."""
    # DATA SCRAPING
    with _new_session() as session:
        articles = scrape_articles(session)

    # DATA STORAGE
    with closing(sqlite3.connect(DB_PATH)) as conn:
        store_articles(conn, articles)
        rows = conn.execute('SELECT * FROM articles').fetchall()
    print(f'{len(rows)} articles stored in {DB_PATH}')


if __name__ == '__main__':
    main()