link_preview.py
import requests
import json
import os
import time
from bs4 import BeautifulSoup
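# link_preview.py scrapes a URL and prints a small "link preview"
# (title, description, image), caching results in a local JSON file.
# Note: `requests` and `bs4` are third-party packages, installable with
# `pip install requests beautifulsoup4`.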
# Scrape the page title: prefer Open Graph, then Twitter card,
# then the <title> tag, then the first heading.
def getTitle(soup):
    ogTitle = soup.find("meta", property="og:title")
    twitterTitle = soup.find("meta", attrs={"name": "twitter:title"})
    documentTitle = soup.find("title")
    h1Title = soup.find("h1")
    h2Title = soup.find("h2")
    res = ogTitle or twitterTitle or documentTitle or h1Title or h2Title
    if res is None:
        return "Not available"
    res = res.get("content", None) or res.get_text()
    if len(res.split()) == 0:
        return "Not available"
    if len(res) > 60:
        res = res[:60]
    return res.strip()
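# Note: the `a or b or c` chains above work as fallbacks because
# BeautifulSoup's find() returns None when a tag is absent.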
# Scrape the page description: prefer Open Graph, then Twitter card,
# then the description meta tag, then the first paragraph.
def getDesc(soup):
    ogDesc = soup.find("meta", property="og:description")
    twitterDesc = soup.find("meta", attrs={"name": "twitter:description"})
    metaDesc = soup.find("meta", attrs={"name": "description"})
    pDesc = soup.find("p")
    res = ogDesc or twitterDesc or metaDesc or pDesc
    if res is None:
        return "Not available"
    res = res.get("content", None) or res.get_text()
    if len(res.split()) == 0:
        return "Not available"
    if len(res) > 60:
        res = res[:60]
    return res.strip()
# Scrape the preview image: prefer Open Graph, then Twitter card,
# then the image_src link, then the first <img> on the page.
def getImage(soup, url):
    ogImg = soup.find("meta", property="og:image")
    twitterImg = soup.find("meta", attrs={"name": "twitter:image"})
    metaImg = soup.find("link", attrs={"rel": "image_src"})
    img = soup.find("img")
    res = ogImg or twitterImg or metaImg or img
    if res is None:
        return "Not available"
    res = res.get("content", None) or res.get("href", None) or res.get("src", None)
    if res is None or len(res.split()) == 0:
        return "Not available"
    # strip leading "." and "/" so relative paths join cleanly with the base url
    res = res.lstrip("./")
    if "http://" not in res and "https://" not in res:
        res = url + "/" + res
    return res
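# All three scrapers share the same precedence: Open Graph tags first,
# then Twitter card tags, then a generic HTML fallback; title and
# description are truncated to 60 characters.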
# print the preview data dictionary
def printData(data):
    for key, value in data.items():
        print(f'{key.capitalize()}: {value}')
# start
print("\nloading.....")
print("- previewing your link -")
print("loading.......\n")
# now, let us get the url from the user
url = input("Enter URL you want to preview : ")
# parse and normalise the url; default to google if nothing was entered
if url == "":
    url = 'www.google.com'
if "http://" not in url and "https://" not in url:
    url = "https://" + url
# check the local cache first
DB_PATH = 'linkPreview/db.json'
db = {}
# create the cache file (and its folder) if it doesn't exist yet
os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)
if not os.path.exists(DB_PATH):
    with open(DB_PATH, "w") as f:
        f.write("{}")
# read the cache: a dict mapping url -> preview data
with open(DB_PATH, 'r') as file:
    data = file.read()
    if len(data) == 0:
        data = "{}"
    db = json.loads(data)
# serve from the cache if the entry hasn't expired yet
if url in db and db[url]["time"] > round(time.time()):
    printData(db[url])
else:
    # not cached (or expired): fetch the html over the network
    r = requests.get(url, timeout=10)
    soup = BeautifulSoup(r.text, "html.parser")
    sevenDaysInSec = 7 * 24 * 60 * 60
    newData = {
        "title": getTitle(soup),
        "description": getDesc(soup),
        "url": url,
        "image": getImage(soup, url),
        # expiry timestamp: seven days from now, in epoch seconds
        "time": round(time.time()) + sevenDaysInSec
    }
    printData(newData)
printData(newData)
# this is parse the file
db[url] = newData
with open('Link-Preview/db.json', 'w') as file:
json.dump(db, file)
print("\n--END--\n")
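# Illustrative session (actual values depend on the page scraped):
#   Enter URL you want to preview : www.example.com
#   Title: Example Domain
#   Description: This domain is for use in illustrative examples
#   Url: https://www.example.com
#   Image: Not available
#   Time: 1700604800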