1010import sys
1111import dns .resolver
1212import collections
13+ from typing import List , Dict , Union
1314
# Upper bound on the number of seeds kept per network; passed to
# filterbyasn() as `max_per_net`.
NSEEDS = 512

# Upper bound on the number of seeds kept per ASN, keyed by network name.
# filterbyasn() looks the limit up with the entry's 'net' value.
MAX_SEEDS_PER_ASN = {
    'ipv4': 2,
    'ipv6': 10,
}

# Entries whose reported 'blocks' value is below this are dropped.
MIN_BLOCKS = 730000
1923
2024# These are hosts that have been observed to be behaving strangely (e.g.
2125# aggressively connecting to every node).
4044 r"23.99"
4145 r")" )
4246
43- def parseline (line ):
47+ def parseline (line : str ) -> Union [dict , None ]:
48+ """ Parses a line from `seeds_main.txt` into a dictionary of details for that line,
49+ or returns `None` if the line could not be parsed.
50+ """
4451 sline = line .split ()
4552 if len (sline ) < 11 :
53+ # line too short to be valid, skip it.
4654 return None
4755 m = PATTERN_IPV4 .match (sline [0 ])
4856 sortkey = None
@@ -107,25 +115,26 @@ def parseline(line):
107115 'sortkey' : sortkey ,
108116 }
109117
def dedup(ips: List[Dict]) -> List[Dict]:
    """Remove duplicates from `ips` where multiple entries share address and port.

    Entries are keyed by their ('ip', 'port') pair. When several entries
    share a key, the last one in `ips` wins, but it is emitted at the
    position where that key first appeared.
    """
    # A dict keeps first-insertion order while later values overwrite
    # earlier ones, which gives exactly the "last wins" semantics above.
    unique = {(ip['ip'], ip['port']): ip for ip in ips}
    return list(unique.values())
116124
def filtermultiport(ips: List[Dict]) -> List[Dict]:
    """Filter out hosts that appear with more than one node per IP.

    Entries are grouped by their 'sortkey' value. Only entries whose
    'sortkey' occurs exactly once in `ips` are returned, in order of
    first appearance; every entry of a repeated 'sortkey' is dropped.
    """
    by_sortkey = collections.defaultdict(list)
    for ip in ips:
        by_sortkey[ip['sortkey']].append(ip)
    # Only the groups are needed, so iterate the values directly.
    return [group[0] for group in by_sortkey.values() if len(group) == 1]
123131
124- def lookup_asn (net , ip ):
125- '''
126- Look up the asn for an IP (4 or 6) address by querying cymru.com, or None
127- if it could not be found.
128- '''
132+ def lookup_asn (net : str , ip : str ) -> Union [int , None ]:
133+ """ Look up the asn for an `ip` address by querying cymru.com
134+ on network `net` (e.g. ipv4 or ipv6).
135+
136+ Returns an integer ASN or None if it could not be found.
137+ """
129138 try :
130139 if net == 'ipv4' :
131140 ipaddr = ip
@@ -147,20 +156,33 @@ def lookup_asn(net, ip):
147156 return None
148157
149158# Based on Greg Maxwell's seed_filter.py
150- def filterbyasn (ips , max_per_asn , max_per_net ):
159+ def filterbyasn (ips : List [Dict ], max_per_asn : Dict , max_per_net : int ) -> List [Dict ]:
160+ """ Prunes `ips` by
161+ (a) trimming ips to have at most `max_per_net` ips from each net (e.g. ipv4, ipv6); and
162+ (b) trimming ips to have at most `max_per_asn` ips from each asn in each net.
163+ """
151164 # Sift out ips by type
152165 ips_ipv46 = [ip for ip in ips if ip ['net' ] in ['ipv4' , 'ipv6' ]]
153166 ips_onion = [ip for ip in ips if ip ['net' ] == 'onion' ]
154167
155168 # Filter IPv46 by ASN, and limit to max_per_net per network
156169 result = []
157- net_count = collections .defaultdict (int )
158- asn_count = collections .defaultdict (int )
159- for ip in ips_ipv46 :
170+ net_count : Dict [str , int ] = collections .defaultdict (int )
171+ asn_count : Dict [int , int ] = collections .defaultdict (int )
172+
173+ for i , ip in enumerate (ips_ipv46 ):
174+ if i % 10 == 0 :
175+ # give progress update
176+ print (f"{ i :6d} /{ len (ips_ipv46 )} [{ 100 * i / len (ips_ipv46 ):04.1f} %]\r " , file = sys .stderr , end = '' , flush = True )
177+
160178 if net_count [ip ['net' ]] == max_per_net :
179+ # do not add this ip as we already have too many
180+ # ips from this network
161181 continue
162182 asn = lookup_asn (ip ['net' ], ip ['ip' ])
163- if asn is None or asn_count [asn ] == max_per_asn :
183+ if asn is None or asn_count [asn ] == max_per_asn [ip ['net' ]]:
184+ # do not add this ip as we already have too many
185+ # ips from this ASN on this network
164186 continue
165187 asn_count [asn ] += 1
166188 net_count [ip ['net' ]] += 1
@@ -170,54 +192,55 @@ def filterbyasn(ips, max_per_asn, max_per_net):
170192 result .extend (ips_onion [0 :max_per_net ])
171193 return result
172194
def ip_stats(ips: List[Union[Dict, None]]) -> str:
    """Format per-network entry counts of `ips` as a fixed-width string.

    Counts entries by their 'net' value and returns the counts for
    'ipv4', 'ipv6' and 'onion' (in that order), each right-aligned in a
    6-character column and separated by single spaces. `None` entries
    (lines that failed to parse) are ignored.
    """
    # Counter yields 0 for networks that never occur, so missing nets
    # format as zero rather than raising KeyError.
    hist: Dict[str, int] = collections.Counter(
        ip['net'] for ip in ips if ip is not None
    )
    return f"{hist['ipv4']:6d} {hist['ipv6']:6d} {hist['onion']:6d}"
180203
181204def main ():
182205 lines = sys .stdin .readlines ()
183206 ips = [parseline (line ) for line in lines ]
184207
185208 print ('\x1b [7m IPv4 IPv6 Onion Pass \x1b [0m' , file = sys .stderr )
186- print ('%s Initial' % ( ip_stats (ips )) , file = sys .stderr )
209+ print (f' { ip_stats (ips ):s } Initial' , file = sys .stderr )
187210 # Skip entries with invalid address.
188211 ips = [ip for ip in ips if ip is not None ]
189- print ('%s Skip entries with invalid address' % ( ip_stats ( ips )) , file = sys .stderr )
212+ print (f' { ip_stats ( ips ):s } Skip entries with invalid address' , file = sys .stderr )
190213 # Skip duplicates (in case multiple seeds files were concatenated)
191214 ips = dedup (ips )
192- print ('%s After removing duplicates' % ( ip_stats ( ips )) , file = sys .stderr )
215+ print (f' { ip_stats ( ips ):s } After removing duplicates' , file = sys .stderr )
193216 # Skip entries from suspicious hosts.
194217 ips = [ip for ip in ips if ip ['ip' ] not in SUSPICIOUS_HOSTS ]
195- print ('%s Skip entries from suspicious hosts' % ( ip_stats ( ips )) , file = sys .stderr )
218+ print (f' { ip_stats ( ips ):s } Skip entries from suspicious hosts' , file = sys .stderr )
196219 # Enforce minimal number of blocks.
197220 ips = [ip for ip in ips if ip ['blocks' ] >= MIN_BLOCKS ]
198- print ('%s Enforce minimal number of blocks' % ( ip_stats ( ips )) , file = sys .stderr )
221+ print (f' { ip_stats ( ips ):s } Enforce minimal number of blocks' , file = sys .stderr )
199222 # Require service bit 1.
200223 ips = [ip for ip in ips if (ip ['service' ] & 1 ) == 1 ]
201- print ('%s Require service bit 1' % ( ip_stats ( ips )) , file = sys .stderr )
224+ print (f' { ip_stats ( ips ):s } Require service bit 1' , file = sys .stderr )
202225 # Require at least 50% 30-day uptime for clearnet, 10% for onion.
203226 req_uptime = {
204227 'ipv4' : 50 ,
205228 'ipv6' : 50 ,
206229 'onion' : 10 ,
207230 }
208231 ips = [ip for ip in ips if ip ['uptime' ] > req_uptime [ip ['net' ]]]
209- print ('%s Require minimum uptime' % ( ip_stats ( ips )) , file = sys .stderr )
232+ print (f' { ip_stats ( ips ):s } Require minimum uptime' , file = sys .stderr )
210233 # Require a known and recent user agent.
211234 ips = [ip for ip in ips if PATTERN_AGENT .match (ip ['agent' ])]
212- print ('%s Require a known and recent user agent' % ( ip_stats ( ips )) , file = sys .stderr )
235+ print (f' { ip_stats ( ips ):s } Require a known and recent user agent' , file = sys .stderr )
213236 # Sort by availability (and use last success as tie breaker)
214237 ips .sort (key = lambda x : (x ['uptime' ], x ['lastsuccess' ], x ['ip' ]), reverse = True )
215238 # Filter out hosts with multiple bitcoin ports, these are likely abusive
216239 ips = filtermultiport (ips )
217- print ('%s Filter out hosts with multiple bitcoin ports' % ( ip_stats ( ips )) , file = sys .stderr )
240+ print (f' { ip_stats ( ips ):s } Filter out hosts with multiple bitcoin ports' , file = sys .stderr )
218241 # Look up ASNs and limit results, both per ASN and globally.
219242 ips = filterbyasn (ips , MAX_SEEDS_PER_ASN , NSEEDS )
220- print ('%s Look up ASNs and limit results per ASN and per net' % ( ip_stats ( ips )) , file = sys .stderr )
243+ print (f' { ip_stats ( ips ):s } Look up ASNs and limit results per ASN and per net' , file = sys .stderr )
221244 # Sort the results by IP address (for deterministic output).
222245 ips .sort (key = lambda x : (x ['net' ], x ['sortkey' ]))
223246 for ip in ips :
0 commit comments