Skip to content
This repository was archived by the owner on Nov 16, 2023. It is now read-only.

Commit 038262f

Browse files
committed
Update FindHttpURLs.py and ChangeHttpURLsToHttps.py
1 parent 168deb5 commit 038262f

File tree

2 files changed

+42
-29
lines changed

2 files changed

+42
-29
lines changed

src/python/tools/ChangeHttpURLsToHttps.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@
22
# Copyright (c) Microsoft Corporation. All rights reserved.
33
# Licensed under the MIT License.
44
# -------------------------------------------------------------------------
5+
56
# Converts all valid HTTP links to HTTPS, where the fed
67
# HTTP links are found in Report_AlterableUrls_FindHttpURLs.csv, which
78
# is generated by FindHttpURLs.py
8-
# usage: python3 ChangeHttpURLsToHttps.py [PATH_TO_Report_FindHttpURLs.txt] [PATH_TO_ROOT_OF_NIMBUSML_DIRECTORY]
9+
# usage: python3 ChangeHttpURLsToHttps.py [PATH_TO_Report_AlterableUrls_FindHttpURLs.csv] [PATH_TO_ROOT_OF_NIMBUSML_DIRECTORY]
910
# output: Report_ReplaceHttpsURLs.txt
1011

1112
import sys
@@ -14,15 +15,16 @@
1415

1516
def changeUrls(pathToReportCsv, pathToRootDirectory):
1617
with open(pathToReportCsv, newline='') as csvFile:
17-
csv_reader = csv.reader(csvFile, delimiter=',')
18+
csv_reader = csv.reader(csvFile, delimiter='\t')
1819
line_count = 0
1920
for row in csv_reader:
2021
if line_count == 0:
2122
line_count += 1
2223
else:
2324
#URL: row[0]
2425
#relativePath: row[1]
25-
absolutePath = pathToRootDirectory[:-1]+row[1]
26+
print(row[1])
27+
absolutePath = pathToRootDirectory+row[1]
2628
fullText = open(absolutePath).read()
2729
fullText = fullText.replace(row[0], row[0].replace('http', 'https'))
2830
f = open(absolutePath, 'w')
@@ -34,7 +36,7 @@ def changeUrls(pathToReportCsv, pathToRootDirectory):
3436

3537
def main():
    """Command-line entry point for ChangeHttpURLsToHttps.py.

    Expects two positional arguments: the path to the tab-separated report
    of alterable URLs (Report_AlterableUrls_FindHttpURLs.csv, produced by
    FindHttpURLs.py) and the path to the root of the repository the report
    was generated from. Prints a usage message and exits with status 1 when
    fewer than two arguments are supplied.
    """
    if len(sys.argv) < 3:
        # The script name must match the file on disk exactly; on
        # case-sensitive filesystems a lower-cased name cannot be run.
        print("Usage: python3 ChangeHttpURLsToHttps.py [PATH_TO_Report_AlterableUrls_FindHttpURLs.csv] [PATH_TO_ORIGINAL_NIMBUSML_DIRECTORY]")
        # sys.exit is always available; the bare ``exit`` helper is injected
        # by the ``site`` module and is meant for interactive sessions.
        sys.exit(1)
    changeUrls(sys.argv[1], sys.argv[2])
4042

src/python/tools/FindHttpURLs.py

Lines changed: 36 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@
22
# Copyright (c) Microsoft Corporation. All rights reserved.
33
# Licensed under the MIT License.
44
# -------------------------------------------------------------------------
5+
56
# Finds all HTTP URLs found in the NimbusML repository
67
# Reports which HTTP links can be converted to HTTPS (the conversion itself is done by ChangeHttpURLsToHttps.py)
7-
# Usage: python3 FindHttpURLs.py [PATH_TO_NimbusML_REPOSITORY]
8+
# Usage: python3 FindHttpURLs.py [PATH_TO_NimbusML_REPOSITORY]
89
# Output: Report_AlterableUrls_FindHttpURLs.csv, [Report_NonAlterableUrls_FindHttpURLs.csv, Report_InvalidUrls_FindHttpURLs.csv]
910

1011
# Required non-standard pip library: urlextract
@@ -13,17 +14,29 @@
1314
import os
1415
import requests
1516
import csv
17+
import collections
1618
from urlextract import URLExtract
1719

20+
def addToDictionary(dict, key, value):
    """Record ``value`` under ``key`` in ``dict``, ignoring duplicates.

    Each key maps to a list of values in first-seen order. A value that is
    already present in the key's list is not appended again.

    Args:
        dict: Mapping of key -> list of values; it is modified in place.
        key: Key to file the value under; created on first use.
        value: Value to add to the key's list.

    Returns:
        The same mapping object that was passed in.
    """
    # NOTE: the parameter name shadows the builtin ``dict``; it is kept
    # unchanged so existing callers are not affected.
    values = dict.setdefault(key, [])
    if value not in values:
        values.append(value)
    return dict
27+
1828
def findHttpUrls(searchRootDirectory):
29+
alterableUrlsStore = {}
30+
nonAlterableUrlsStore = {}
31+
invalidUrlsStore = {}
1932
extractor = URLExtract()
2033
lengthOfOriginalRootPath = -1
2134
for root, _, files in os.walk(searchRootDirectory, onerror=None):
2235
if lengthOfOriginalRootPath == -1:
2336
lengthOfOriginalRootPath = len(root)
2437
for filename in files:
2538
absoluteFilePath = os.path.join(root, filename)
26-
relativeFilePath = '.' + absoluteFilePath[lengthOfOriginalRootPath-1:]
39+
relativeFilePath = '.' + absoluteFilePath[lengthOfOriginalRootPath:]
2740
try:
2841
with open(absoluteFilePath, "rb") as f:
2942
data = f.read()
@@ -43,47 +56,45 @@ def findHttpUrls(searchRootDirectory):
4356
try:
4457
newRequest = requests.get(changedSelectedUrl)
4558
if newRequest.status_code == 200:
46-
alterableUrlsStore.append([selectedUrl, relativeFilePath])
59+
alterableUrlsStore = addToDictionary(alterableUrlsStore, relativeFilePath, selectedUrl)
4760
else:
48-
nonAlterableUrlsStore.append([selectedUrl, relativeFilePath])
61+
nonAlterableUrlsStore = addToDictionary(nonAlterableUrlsStore, relativeFilePath, selectedUrl)
4962
except:
50-
nonAlterableUrlsStore.append([selectedUrl, relativeFilePath])
63+
nonAlterableUrlsStore = addToDictionary(nonAlterableUrlsStore, relativeFilePath, selectedUrl)
5164
else:
52-
invalidUrlsStore.append([selectedUrl, relativeFilePath])
65+
invalidUrlsStore = addToDictionary(invalidUrlsStore, relativeFilePath, selectedUrl)
5366
except ConnectionError:
54-
invalidUrlsStore.append([selectedUrl, relativeFilePath])
67+
invalidUrlsStore = addToDictionary(invalidUrlsStore, relativeFilePath, selectedUrl)
5568
except (IOError, OSError):
5669
pass
57-
return
70+
makeReports(alterableUrlsStore, nonAlterableUrlsStore, invalidUrlsStore)
5871

59-
def makeReports():
60-
fieldnames = ['filepath', 'url']
72+
def makeReports(alterableUrlsStore, nonAlterableUrlsStore, invalidUrlsStore):
    """Write the three URL reports into the current working directory.

    Each argument maps a relative file path to the list of URLs found in
    that file. One tab-separated report is written per store:

    * Report_AlterableUrls_FindHttpURLs.csv
    * Report_NonAlterableUrls_FindHttpURLs.csv
    * Report_InvalidUrls_FindHttpURLs.csv

    Existing files with those names are overwritten. A report is written
    even when its store is empty (it then contains only the header row).

    Args:
        alterableUrlsStore: file path -> URLs for the first report.
        nonAlterableUrlsStore: file path -> URLs for the second report.
        invalidUrlsStore: file path -> URLs for the third report.
    """
    # All three reports share one writer so a store can never be paired
    # with the wrong file by a copy-paste slip.
    _writeUrlReport('Report_AlterableUrls_FindHttpURLs.csv', alterableUrlsStore)
    _writeUrlReport('Report_NonAlterableUrls_FindHttpURLs.csv', nonAlterableUrlsStore)
    _writeUrlReport('Report_InvalidUrls_FindHttpURLs.csv', invalidUrlsStore)


def _writeUrlReport(reportPath, urlsByFile):
    """Write one report: a header row, then one ``url, file`` row per URL."""
    # newline='' lets the csv module control line endings itself.
    with open(reportPath, mode='w', newline='') as csv_file:
        # Tab-delimited because URLs may legitimately contain commas.
        writer = csv.writer(csv_file, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["url", "relativeFilepath"])
        for fileKey, urls in urlsByFile.items():
            writer.writerows([url, fileKey] for url in urls)
7792

7893
def main():
    """Command-line entry point for FindHttpURLs.py.

    Expects one positional argument: the path to the repository to scan.
    Prints a usage message and exits with status 1 when it is missing;
    otherwise hands the path to ``findHttpUrls``.
    """
    if len(sys.argv) < 2:
        # The script name must match the file on disk exactly; on
        # case-sensitive filesystems a lower-cased name cannot be run.
        print("Usage: python3 FindHttpURLs.py [PATH_TO_NimbusML_REPOSITORY]")
        # sys.exit is always available; the bare ``exit`` helper is injected
        # by the ``site`` module and is meant for interactive sessions.
        sys.exit(1)
    findHttpUrls(sys.argv[1])
83-
makeReports()
84-
85-
alterableUrlsStore = []
86-
invalidUrlsStore = []
87-
nonAlterableUrlsStore = []
98+
8899
if __name__ == "__main__":
89100
main()

0 commit comments

Comments
 (0)