1
+ import json
2
+ import httpx
3
+ from urllib .parse import quote
4
+ from typing import Dict
5
+ import jmespath
6
+
7
# Shared HTTP client reused by every request in this module.  The headers
# imitate a regular desktop-browser session; "x-ig-app-id" is the public
# application id Instagram's own web frontend sends with API calls —
# presumably required for the web_profile_info endpoint to answer (TODO
# confirm against current Instagram behavior).
client = httpx.Client(
    headers={
        "x-ig-app-id": "936619743392459",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9,ru;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept": "*/*",
    }
)
17
def scrape_user_id(username: str):
    """Look up an Instagram user's numeric id by username.

    Args:
        username: Instagram handle; it is percent-encoded before being
            placed in the query string.

    Returns:
        The user's id (a string) or None if the response carries no id.

    Raises:
        httpx.HTTPStatusError: if the endpoint answers with an error status.
        KeyError: if the response JSON lacks the expected data/user keys.
    """
    # BUG FIX: the original f-string was f"...username={ username } ", which
    # appended a literal trailing space to the URL; the username was also
    # interpolated raw, breaking handles with reserved characters.
    result = client.get(
        "https://i.instagram.com/api/v1/users/web_profile_info/"
        f"?username={quote(username)}"
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    result.raise_for_status()
    data = result.json()  # equivalent to json.loads(result.content), but idiomatic
    user_data = data["data"]["user"]
    return user_data.get("id")
27
def parse_post(data: Dict) -> Dict:
    """Reduce a raw GraphQL post node to the fields this scraper cares about.

    Args:
        data: one post "node" dict from the edge_owner_to_timeline_media feed.

    Returns:
        {"shortcode": ..., "likes": ..., "comments": ...}; any field missing
        from the input comes back as None, matching the original JMESPath
        multiselect behavior.
    """
    # Improvement: the original re-parsed a JMESPath expression on every call
    # and pulled in a third-party library for a three-field extraction that
    # plain dict access expresses directly.
    likes = data.get("edge_media_preview_like") or {}
    comments = data.get("edge_media_to_comment") or {}
    return {
        "shortcode": data.get("shortcode"),
        "likes": likes.get("count"),
        "comments": comments.get("count"),
    }
35
def scrape_user_posts(user_id: str, session: httpx.Client, page_size=12):
    """Yield parsed posts of *user_id*, paging through the GraphQL feed.

    Args:
        user_id: numeric Instagram user id, as a string.
        session: HTTP client used for all page requests.
        page_size: posts requested per page (the API's "first" variable).

    Yields:
        One dict per post, as produced by parse_post.

    Raises:
        httpx.HTTPStatusError: if a page request answers with an error status.
    """
    base_url = (
        "https://www.instagram.com/graphql/query/"
        "?query_hash=e769aa130647d2354c40ea6a439bfc08&variables="
    )
    variables = {
        "id": user_id,
        "first": page_size,
        "after": None,  # pagination cursor; None requests the first page
    }
    _page_number = 1
    while True:
        resp = session.get(base_url + quote(json.dumps(variables)))
        # Surface HTTP failures directly instead of a confusing decode error.
        resp.raise_for_status()
        all_posts_data = resp.json()
        posts = all_posts_data["data"]["user"]["edge_owner_to_timeline_media"]
        # BUG FIX: "edges" may be missing or null (e.g. empty profile); the
        # original iterated posts.get("edges") and crashed with TypeError.
        for post in posts.get("edges") or []:
            yield parse_post(post.get("node"))
        page_info = posts.get("page_info") or {}
        if _page_number == 1:
            print(f"scraping total {posts['count']} posts of {user_id}")
        else:
            print(f"scraping page {_page_number}")
        # .get() so a malformed page_info ends pagination instead of raising.
        if not page_info.get("has_next_page"):
            break
        # Guard against a server that echoes the same cursor forever.
        if variables["after"] == page_info["end_cursor"]:
            break
        variables["after"] = page_info["end_cursor"]
        _page_number += 1
def get_all_posts(user_id):
    """Collect every post of *user_id* into a list using a fresh HTTP session.

    A dedicated httpx.Client with no timeout is opened for the duration of
    the crawl and closed before returning.
    """
    with httpx.Client(timeout=None) as http_session:
        return list(scrape_user_posts(str(user_id), http_session))