- from urllib.parse import urljoin, urlparse
  from itertools import product
+ from urllib.parse import urljoin
+ from urllib.parse import urlparse
  import csv
  import posixpath


- def resolveComponents(url):
+ def remove_dot_segments(url):
      """
-     >>> resolveComponents('http://www.example.com/foo/bar/../../baz/bux/')
-     'http://www.example.com/baz/bux/'
-     >>> resolveComponents('http://www.example.com/some/path/../file.ext')
-     'http://www.example.com/some/file.ext'
+     >>> remove_dot_segments('https://www.example.com/foo/bar/../../baz/bux/')
+     'https://www.example.com/baz/bux/'
+     >>> remove_dot_segments('https://www.example.com/some/path/../file.ext')
+     'https://www.example.com/some/file.ext'
      """
      parsed = urlparse(url)
      new_path = posixpath.normpath(parsed.path)
      if parsed.path.endswith('/'):
-         # Compensate for issue1707768
+         # Fix missing trailing slash.
+         # https://bugs.python.org/issue1707768
          new_path += '/'
      if new_path.startswith('//'):
          new_path = new_path[1:]
      cleaned = parsed._replace(path=new_path)
      return cleaned.geturl()


- first_authorities = ['http://example.com@user:pass:7152', 'https://example.com']
- second_authorities = ['', 'https://www.example.org', 'http://example.com@user:pass:1111',
-                       'file://example.com', 'file://']
- first_paths = ['', '/', '/foobar/bazz', 'foobar/bazz/']
- second_paths = ['', '/', '/foo/bar', 'foo/bar/', './foo/../bar', 'foo/./.././bar']
+ first_authorities = [
+     'http://example.com@user:pass:7152',
+     'https://example.com',
+ ]
+ second_authorities = [
+     '',
+     'https://www.example.org',
+     'http://example.com@user:pass:1111',
+     'file://example.com',
+     'file://',
+ ]
+ first_paths = [
+     '',
+     '/',
+     '/foobar/bazz',
+     'foobar/bazz/',
+ ]
+ second_paths = [
+     '',
+     '/',
+     '/foo/bar',
+     'foo/bar/',
+     './foo/../bar',
+     'foo/./.././bar',
+ ]
  first_queries = ['', '?a=1', '?a=647&b=s564']
  second_queries = ['', '?a=sdf', '?a=cvb&b=987']
  fragments = ['', '#foo', '#bar']

  with open('urls.csv', 'wt') as f:
      csvwriter = csv.writer(f, quotechar='"', quoting=csv.QUOTE_ALL)
      csvwriter.writerow(['first_url', 'second_url', 'expected'])
-     counter = 1
      for first_domain, second_domain in product(first_authorities, second_authorities):
          for first_path, second_path in product(first_paths, second_paths):
              for first_query, second_query in product(first_queries, second_queries):
@@ -47,4 +68,5 @@ def resolveComponents(url):
                      second_path = '/' + second_path
                  second_url = second_domain + second_path + second_query + second_fragment
                  if first_url != second_url:
-                     csvwriter.writerow([first_url, second_url, resolveComponents(urljoin(first_url, second_url))])
+                     expected_url = remove_dot_segments(urljoin(first_url, second_url))
+                     csvwriter.writerow([first_url, second_url, expected_url])
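
The diff does not show what consumes the generated urls.csv, so the harness below is only a sketch of how a consumer of the fixture might look: it reads each row back with csv.DictReader and compares the output of some URL-joining implementation against the 'expected' column. The name my_urljoin is a placeholder for whatever implementation is under test; it is not part of the script above.

import csv

def check_against_fixture(my_urljoin, path='urls.csv'):
    # Replay every (first_url, second_url) pair through the implementation
    # under test and collect mismatches against the 'expected' column.
    failures = []
    with open(path, newline='') as f:
        for row in csv.DictReader(f):
            got = my_urljoin(row['first_url'], row['second_url'])
            if got != row['expected']:
                failures.append((row['first_url'], row['second_url'], got, row['expected']))
    return failures

As a sanity check, passing urllib's own join wrapped the same way the fixture was built, e.g. check_against_fixture(lambda a, b: remove_dot_segments(urljoin(a, b))), should report no failures, since that is exactly how the expected column was produced.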