22# Copyright (c) Microsoft Corporation. All rights reserved.
33# Licensed under the MIT License.
44# -------------------------------------------------------------------------
5+
56# Finds all HTTP URLs found in the NimbusML repository
67# Converts all valid HTTP links to HTTPS
# Usage: python3 findHttpURLs.py [PATH_TO_NimbusML_REPOSITORY]
89# Output: Report_AlterableUrls_FindHttpURLs.csv, [Report_NonAlterableUrls_FindHttpURLs.csv, Report_InvalidUrls_FindHttpURLs.csv]
910
1011# Required non-standard pip library: urlextract
1314import os
1415import requests
1516import csv
17+ import collections
1618from urlextract import URLExtract
1719
def addToDictionary(dict, key, value):
    """Record *value* under *key*, keeping each key's list free of duplicates.

    Args:
        dict: Mapping of key -> list of values. It is modified in place.
            (The name shadows the builtin ``dict``; it is kept so that
            existing keyword callers keep working.)
        key: Entry to add the value to; created with an empty list if absent.
        value: Value to append, unless the key's list already contains it.

    Returns:
        The same mapping object that was passed in.
    """
    # setdefault creates the list on first sight of the key and returns the
    # existing list otherwise, so one lookup covers both cases.
    values = dict.setdefault(key, [])
    if value not in values:
        values.append(value)
    return dict
27+
1828def findHttpUrls (searchRootDirectory ):
29+ alterableUrlsStore = {}
30+ nonAlterableUrlsStore = {}
31+ invalidUrlsStore = {}
1932 extractor = URLExtract ()
2033 lengthOfOriginalRootPath = - 1
2134 for root , _ , files in os .walk (searchRootDirectory , onerror = None ):
2235 if lengthOfOriginalRootPath == - 1 :
2336 lengthOfOriginalRootPath = len (root )
2437 for filename in files :
2538 absoluteFilePath = os .path .join (root , filename )
26- relativeFilePath = '.' + absoluteFilePath [lengthOfOriginalRootPath - 1 :]
39+ relativeFilePath = '.' + absoluteFilePath [lengthOfOriginalRootPath :]
2740 try :
2841 with open (absoluteFilePath , "rb" ) as f :
2942 data = f .read ()
@@ -43,47 +56,45 @@ def findHttpUrls(searchRootDirectory):
4356 try :
4457 newRequest = requests .get (changedSelectedUrl )
4558 if newRequest .status_code == 200 :
46- alterableUrlsStore . append ([ selectedUrl , relativeFilePath ] )
59+ alterableUrlsStore = addToDictionary ( alterableUrlsStore , relativeFilePath , selectedUrl )
4760 else :
48- nonAlterableUrlsStore . append ([ selectedUrl , relativeFilePath ] )
61+ nonAlterableUrlsStore = addToDictionary ( nonAlterableUrlsStore , relativeFilePath , selectedUrl )
4962 except :
50- nonAlterableUrlsStore . append ([ selectedUrl , relativeFilePath ] )
63+ nonAlterableUrlsStore = addToDictionary ( nonAlterableUrlsStore , relativeFilePath , selectedUrl )
5164 else :
52- invalidUrlsStore . append ([ selectedUrl , relativeFilePath ] )
65+ invalidUrlsStore = addToDictionary ( invalidUrlsStore , relativeFilePath , selectedUrl )
5366 except ConnectionError :
54- invalidUrlsStore . append ([ selectedUrl , relativeFilePath ] )
67+ invalidUrlsStore = addToDictionary ( invalidUrlsStore , relativeFilePath , selectedUrl )
5568 except (IOError , OSError ):
5669 pass
57- return
70+ makeReports ( alterableUrlsStore , nonAlterableUrlsStore , invalidUrlsStore )
5871
def _writeUrlReport(reportPath, urlsByFile):
    """Write one tab-separated report with a (url, relativeFilepath) row per URL."""
    # newline='' lets the csv module control line endings itself; the explicit
    # encoding keeps non-ASCII paths/URLs from failing on non-UTF-8 locales.
    with open(reportPath, mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["url", "relativeFilepath"])
        for fileKey, urls in urlsByFile.items():
            writer.writerows([url, fileKey] for url in urls)


def makeReports(alterableUrlsStore, nonAlterableUrlsStore, invalidUrlsStore):
    """Write the three URL reports into the current working directory.

    Each store maps a relative file path to the list of URLs recorded for
    that file. Every report starts with a ``url``/``relativeFilepath`` header
    row and is written even when its store is empty.

    Args:
        alterableUrlsStore: Written to Report_AlterableUrls_FindHttpURLs.csv.
        nonAlterableUrlsStore: Written to Report_NonAlterableUrls_FindHttpURLs.csv.
        invalidUrlsStore: Written to Report_InvalidUrls_FindHttpURLs.csv.
    """
    reports = (
        ('Report_AlterableUrls_FindHttpURLs.csv', alterableUrlsStore),
        ('Report_NonAlterableUrls_FindHttpURLs.csv', nonAlterableUrlsStore),
        ('Report_InvalidUrls_FindHttpURLs.csv', invalidUrlsStore),
    )
    for reportPath, urlsByFile in reports:
        _writeUrlReport(reportPath, urlsByFile)
    return
7792
def main():
    """Command-line entry point: scan the repository path given as argv[1].

    Prints a usage line and exits with status 1 when no path argument is
    supplied; otherwise hands the path to findHttpUrls.
    """
    # Imported locally because no top-level `import sys` is visible among the
    # file's imports, yet this function needs sys.argv and sys.exit.
    import sys

    if len(sys.argv) < 2:
        print("Usage: python3 findHttpURLs.py [PATH_TO_NimbusML_REPOSITORY]")
        # sys.exit rather than the bare exit(): the latter is injected by the
        # `site` module and is not guaranteed to exist in every interpreter setup.
        sys.exit(1)
    findHttpUrls(sys.argv[1])
83- makeReports ()
84-
85- alterableUrlsStore = []
86- invalidUrlsStore = []
87- nonAlterableUrlsStore = []
98+
8899if __name__ == "__main__" :
89100 main ()
0 commit comments