Skip to content

Commit 69f2bf9

Browse files
laanwj authored and knst committed
Merge bitcoin#24818: net: improve and address issues in makeseeds.py
c457fb1 improve clarity and up max ipv6 ASNs (Baas)

Pull request description:

This PR attempts to address some of the areas of improvement raised in bitcoin#17020. Concretely, my proposed change is fairly minor but makes the following changes to [`makeseeds.py`](https://github.com/bitcoin/bitcoin/blob/master/contrib/seeds/makeseeds.py):

- Increase max seeds per ASN for IPv6 to 10 as recommended [here](bitcoin#16999 (comment)), while keeping max seeds per ASN for IPv4 at 2.
- Bump `MIN_BLOCKS` to 730000.
- Improved script clarity: added function types and more docs to functions, added progress indicator when performing ASN lookup, and changed string formatting to better align with [bitcoin python style guidelines](https://github.com/bitcoin/bitcoin/blob/master/test/functional/README.md#style-guidelines)

With the different ASN limits for IPv4 and IPv6, and the new minimum block requirement, the current stats look like:

```
  IPv4   IPv6  Onion Pass
470689  73238      0 Initial
470689  73238      0 Skip entries with invalid address
470689  73238      0 After removing duplicates
470688  73238      0 Skip entries from suspicious hosts
  6098   1676      0 Enforce minimal number of blocks
  5252   1443      0 Require service bit 1
  3812    898      0 Require minimum uptime
  3738    877      0 Require a known and recent user agent
  3715    869      0 Filter out hosts with multiple bitcoin ports
   512    512      0 Look up ASNs and limit results per ASN and per net
```

The new ASN max seeds of 10 allows for 512 IPv6 addresses to be included, up from the ~150 that was filtered by the previous version. While there is more to do for bitcoin#17020, these changes I think are fairly isolated from the rest and should make it a bit easier for others to get up to speed with what the functions in the script do.

ACKs for top commit:
  laanwj: Concept and code review ACK c457fb1

Tree-SHA512: 3ed67868443cc50544e23b27e2341758c3a8866997b0dba47b137032d5e1a13428855daaeed682626ed471542b44435635178d54848a2cd6fe73777679428032
1 parent 4f8df9d commit 69f2bf9

File tree

1 file changed

+51
-25
lines changed

1 file changed

+51
-25
lines changed

contrib/seeds/makeseeds.py

Lines changed: 51 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -13,11 +13,19 @@
1313
import collections
1414
import json
1515
import multiprocessing
16+
from typing import List, Dict, Union
1617

1718
NSEEDS=512
1819

1920
MAX_SEEDS_PER_ASN=4
2021

22+
MAX_SEEDS_PER_ASN = {
23+
'ipv4': 4,
24+
'ipv6': 10,
25+
}
26+
27+
MIN_BLOCKS = 2_300_000
28+
2129
# These are hosts that have been observed to be behaving strangely (e.g.
2230
# aggressively connecting to every node).
2331
with open("suspicious_hosts.txt", mode="r", encoding="utf-8") as f:
@@ -28,7 +36,11 @@
2836
PATTERN_IPV6 = re.compile(r"^\[([0-9a-z:]+)\]:(\d+)$")
2937
PATTERN_ONION = re.compile(r"^([a-z2-7]{56}\.onion):(\d+)$")
3038

31-
def parseip(ip_in):
39+
40+
def parseip(ip_in: str) -> Union[dict, None]:
41+
""" Parses a line from `seeds_main.txt` into a dictionary of details for that line.
42+
or `None`, if the line could not be parsed.
43+
"""
3244
m = PATTERN_IPV4.match(ip_in)
3345
ip = None
3446
if m is None:
@@ -70,32 +82,33 @@ def parseip(ip_in):
7082
"sortkey": sortkey
7183
}
7284

73-
def filtermulticollateralhash(mns):
85+
def filtermulticollateralhash(mns : List[Dict]) -> List[Dict]:
7486
'''Filter out MNs sharing the same collateral hash'''
7587
hist = collections.defaultdict(list)
7688
for mn in mns:
7789
hist[mn['collateralHash']].append(mn)
7890
return [mn for mn in mns if len(hist[mn['collateralHash']]) == 1]
7991

80-
def filtermulticollateraladdress(mns):
92+
def filtermulticollateraladdress(mns : List[Dict]) -> List[Dict]:
8193
'''Filter out MNs sharing the same collateral address'''
8294
hist = collections.defaultdict(list)
8395
for mn in mns:
8496
hist[mn['collateralAddress']].append(mn)
8597
return [mn for mn in mns if len(hist[mn['collateralAddress']]) == 1]
8698

87-
def filtermultipayoutaddress(mns):
99+
def filtermultipayoutaddress(mns : List[Dict]) -> List[Dict]:
88100
'''Filter out MNs sharing the same payout address'''
89101
hist = collections.defaultdict(list)
90102
for mn in mns:
91103
hist[mn['state']['payoutAddress']].append(mn)
92104
return [mn for mn in mns if len(hist[mn['state']['payoutAddress']]) == 1]
93105

94-
def resolveasn(resolver, ip):
95-
'''
96-
Look up the asn for an IP (4 or 6) address by querying cymry.com, or None
97-
if it could not be found.
98-
'''
106+
def resolveasn(resolver, ip : Dict) -> Union[int, None]:
107+
""" Look up the asn for an `ip` address by querying cymru.com
108+
on network `net` (e.g. ipv4 or ipv6).
109+
110+
Returns an integer ASN or None if it could not be found.
111+
"""
99112
try:
100113
if ip['net'] == 'ipv4':
101114
ipaddr = ip['ip']
@@ -117,13 +130,16 @@ def resolveasn(resolver, ip):
117130
return None
118131

119132
# Based on Greg Maxwell's seed_filter.py
120-
def filterbyasn(ips, max_per_asn, max_per_net):
133+
def filterbyasn(ips: List[Dict], max_per_asn: Dict, max_per_net: int) -> List[Dict]:
134+
""" Prunes `ips` by
135+
(a) trimming ips to have at most `max_per_net` ips from each net (e.g. ipv4, ipv6); and
136+
(b) trimming ips to have at most `max_per_asn` ips from each asn in each net.
137+
"""
121138
# Sift out ips by type
122139
ips_ipv46 = [ip for ip in ips if ip['net'] in ['ipv4', 'ipv6']]
123140
ips_onion = [ip for ip in ips if ip['net'] == 'onion']
124141

125142
my_resolver = dns.resolver.Resolver()
126-
127143
pool = multiprocessing.Pool(processes=16)
128144

129145
# OpenDNS servers
@@ -134,13 +150,22 @@ def filterbyasn(ips, max_per_asn, max_per_net):
134150

135151
# Filter IPv46 by ASN, and limit to max_per_net per network
136152
result = []
137-
net_count = collections.defaultdict(int)
138-
asn_count = collections.defaultdict(int)
153+
net_count: Dict[str, int] = collections.defaultdict(int)
154+
asn_count: Dict[int, int] = collections.defaultdict(int)
155+
139156
for i, ip in enumerate(ips_ipv46):
157+
if i % 10 == 0:
158+
# give progress update
159+
print(f"{i:6d}/{len(ips_ipv46)} [{100*i/len(ips_ipv46):04.1f}%]\r", file=sys.stderr, end='', flush=True)
160+
140161
if net_count[ip['net']] == max_per_net:
162+
# do not add this ip as we already have too many
163+
# ips from this network
141164
continue
142165
asn = asns[i].get()
143-
if asn is None or asn_count[asn] == max_per_asn:
166+
if asn is None or asn_count[asn] == max_per_asn[ip['net']]:
167+
# do not add this ip as we already have too many
168+
# ips from this ASN on this network
144169
continue
145170
asn_count[asn] += 1
146171
net_count[ip['net']] += 1
@@ -150,13 +175,14 @@ def filterbyasn(ips, max_per_asn, max_per_net):
150175
result.extend(ips_onion[0:max_per_net])
151176
return result
152177

153-
def ip_stats(ips):
154-
hist = collections.defaultdict(int)
178+
def ip_stats(ips: List[Dict]) -> str:
179+
""" Format and return pretty string from `ips`. """
180+
hist: Dict[str, int] = collections.defaultdict(int)
155181
for ip in ips:
156182
if ip is not None:
157183
hist[ip['net']] += 1
158184

159-
return '%6d %6d %6d' % (hist['ipv4'], hist['ipv6'], hist['onion'])
185+
return f"{hist['ipv4']:6d} {hist['ipv6']:6d} {hist['onion']:6d}"
160186

161187
def main():
162188
# This expects a json as outputted by "protx list valid 1"
@@ -171,19 +197,19 @@ def main():
171197
with open(sys.argv[2], 'r', encoding="utf8") as f:
172198
onions = f.read().split('\n')
173199

174-
print(f"Total mns: {len(mns)}", file=sys.stderr)
200+
print(f'Total mns: {len(mns)}', file=sys.stderr)
175201
# Skip PoSe banned MNs
176202
mns = [mn for mn in mns if mn['state']['PoSeBanHeight'] == -1]
177-
print(f"After skip entries from PoSe banned MNs: {len(mns)}", file=sys.stderr)
203+
print(f'After skip entries from PoSe banned MNs: {len(mns)}', file=sys.stderr)
178204
# Skip MNs with < 10000 confirmations
179205
mns = [mn for mn in mns if mn['confirmations'] >= 10000]
180-
print(f"After skip MNs with less than 10000 confirmations: {len(mns)}", file=sys.stderr)
206+
print(f'After skip MNs with less than 10000 confirmations: {len(mns)}', file=sys.stderr)
181207

182208
# Filter out MNs which are definitely from the same person/operator
183209
mns = filtermulticollateralhash(mns)
184210
mns = filtermulticollateraladdress(mns)
185211
mns = filtermultipayoutaddress(mns)
186-
print(f"After removing duplicates: {len(mns)}", file=sys.stderr)
212+
print(f'After removing duplicates: {len(mns)}', file=sys.stderr)
187213

188214
# Extract IPs
189215
ips = [parseip(mn['state']['addresses'][0]) for mn in mns]
@@ -193,14 +219,14 @@ def main():
193219
ips.append(parsed)
194220

195221
print('\x1b[7m IPv4 IPv6 Onion Pass \x1b[0m', file=sys.stderr)
196-
print('%s Initial' % (ip_stats(ips)), file=sys.stderr)
222+
print(f'{ip_stats(ips):s} Initial', file=sys.stderr)
197223
# Skip entries with invalid address.
198-
mns = [ip for ip in ips if ip is not None]
199-
print('%s Skip entries with invalid address' % (ip_stats(mns)), file=sys.stderr)
224+
ips = [ip for ip in ips if ip is not None]
225+
print(f'{ip_stats(ips):s} Skip entries with invalid address', file=sys.stderr)
200226

201227
# Look up ASNs and limit results, both per ASN and globally.
202228
ips = filterbyasn(ips, MAX_SEEDS_PER_ASN, NSEEDS)
203-
print('%s Look up ASNs and limit results per ASN and per net' % (ip_stats(ips)), file=sys.stderr)
229+
print(f'{ip_stats(ips):s} Look up ASNs and limit results per ASN and per net', file=sys.stderr)
204230
# Sort the results by IP address (for deterministic output).
205231
ips.sort(key=lambda x: (x['net'], x['sortkey']), reverse=True)
206232

0 commit comments

Comments
 (0)