зеркало из https://github.com/microsoft/NimbusML.git
Update FindHttpURLs.py and ChangeHttpURLsToHttps.py
This commit is contained in:
Родитель
168deb5320
Коммит
038262f175
|
@ -2,10 +2,11 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
# Converts all valid HTTP links to HTTPS, where the fed
|
||||
# HTTP links are found in Report_AlterableUrls_FindHttpURLs.csv, which
|
||||
# is generated by FindHttpURLs.py
|
||||
# usage: python3 ChangeHttpURLsToHttps.py [PATH_TO_Report_FindHttpURLs.txt] [PATH_TO_ROOT_OF_NIMBUSML_DIRECTORY]
|
||||
# usage: python3 changeHttpURLsToHttps.py [PATH_TO_Report_FindHttpURLs.txt] [PATH_TO_ROOT_OF_NIMBUSML_DIRECTORY]
|
||||
# output: Report_ReplaceHttpsURLs.txt
|
||||
|
||||
import sys
|
||||
|
@ -14,7 +15,7 @@ import csv
|
|||
|
||||
def changeUrls(pathToReportCsv, pathToRootDirectory):
|
||||
with open(pathToReportCsv, newline='') as csvFile:
|
||||
csv_reader = csv.reader(csvFile, delimiter=',')
|
||||
csv_reader = csv.reader(csvFile, delimiter='\t')
|
||||
line_count = 0
|
||||
for row in csv_reader:
|
||||
if line_count == 0:
|
||||
|
@ -22,7 +23,8 @@ def changeUrls(pathToReportCsv, pathToRootDirectory):
|
|||
else:
|
||||
#URL: row[0]
|
||||
#relativePath: row[1]
|
||||
absolutePath = pathToRootDirectory[:-1]+row[1]
|
||||
print(row[1])
|
||||
absolutePath = pathToRootDirectory+row[1]
|
||||
fullText = open(absolutePath).read()
|
||||
fullText = fullText.replace(row[0], row[0].replace('http', 'https'))
|
||||
f = open(absolutePath, 'w')
|
||||
|
@ -34,7 +36,7 @@ def changeUrls(pathToReportCsv, pathToRootDirectory):
|
|||
|
||||
def main():
    """Command-line entry point for the HTTP-to-HTTPS rewriting script.

    Expects two command-line arguments: the path to the report file and the
    path to the repository root. When fewer are given, prints the usage
    message and exits with status 1; otherwise forwards both to changeUrls.
    """
    if len(sys.argv) < 3:
        print("Usage: python3 changeHttpURLsToHttps.py [PATH_TO_Report_FindHttpURLs.txt] [PATH_TO_ORIGINAL_NIMBUSML_DIRECTORY]")
        # sys.exit is always available; the bare `exit` helper is only
        # injected by the site module and is meant for interactive use.
        sys.exit(1)
    changeUrls(sys.argv[1], sys.argv[2])
|
||||
|
||||
|
|
|
@ -2,9 +2,10 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
# Finds all HTTP URLs found in the NimbusML repository
|
||||
# Converts all valid HTTP links to HTTPS
|
||||
# Usage: python3 FindHttpURLs.py [PATH_TO_NimbusML_REPOSITORY]
|
||||
# Usage: python3 findHttpURLs.py [PATH_TO_NimbusML_REPOSITORY]
|
||||
# Output: Report_AlterableUrls_FindHttpURLs.csv, [Report_NonAlterableUrls_FindHttpURLs.csv, Report_InvalidUrls_FindHttpURLs.csv]
|
||||
|
||||
# Required non-standard pip library: urlextract
|
||||
|
@ -13,9 +14,21 @@ import sys
|
|||
import os
|
||||
import requests
|
||||
import csv
|
||||
import collections
|
||||
from urlextract import URLExtract
|
||||
|
||||
def addToDictionary(dict, key, value):
    """Record ``value`` in the list stored under ``key``, without duplicates.

    A missing key gets a new single-element list; an existing key has
    ``value`` appended only if it is not already in its list.

    Args:
        dict: mapping from a key to a list of values; it is modified in place.
            (The name shadows the builtin but is kept so existing callers
            are unaffected.)
        key: the key under which to record the value.
        value: the value to record.

    Returns:
        The same mapping object that was passed in.
    """
    # setdefault covers the "first value for this key" case in one lookup.
    values = dict.setdefault(key, [])
    if value not in values:
        values.append(value)
    return dict
|
||||
|
||||
def findHttpUrls(searchRootDirectory):
|
||||
alterableUrlsStore = {}
|
||||
nonAlterableUrlsStore = {}
|
||||
invalidUrlsStore = {}
|
||||
extractor = URLExtract()
|
||||
lengthOfOriginalRootPath = -1
|
||||
for root, _, files in os.walk(searchRootDirectory, onerror=None):
|
||||
|
@ -23,7 +36,7 @@ def findHttpUrls(searchRootDirectory):
|
|||
lengthOfOriginalRootPath = len(root)
|
||||
for filename in files:
|
||||
absoluteFilePath = os.path.join(root, filename)
|
||||
relativeFilePath = '.' + absoluteFilePath[lengthOfOriginalRootPath-1:]
|
||||
relativeFilePath = '.' + absoluteFilePath[lengthOfOriginalRootPath:]
|
||||
try:
|
||||
with open(absoluteFilePath, "rb") as f:
|
||||
data = f.read()
|
||||
|
@ -43,47 +56,45 @@ def findHttpUrls(searchRootDirectory):
|
|||
try:
|
||||
newRequest = requests.get(changedSelectedUrl)
|
||||
if newRequest.status_code == 200:
|
||||
alterableUrlsStore.append([selectedUrl, relativeFilePath])
|
||||
alterableUrlsStore = addToDictionary(alterableUrlsStore, relativeFilePath, selectedUrl)
|
||||
else:
|
||||
nonAlterableUrlsStore.append([selectedUrl, relativeFilePath])
|
||||
nonAlterableUrlsStore = addToDictionary(nonAlterableUrlsStore, relativeFilePath, selectedUrl)
|
||||
except:
|
||||
nonAlterableUrlsStore.append([selectedUrl, relativeFilePath])
|
||||
nonAlterableUrlsStore = addToDictionary(nonAlterableUrlsStore, relativeFilePath, selectedUrl)
|
||||
else:
|
||||
invalidUrlsStore.append([selectedUrl, relativeFilePath])
|
||||
invalidUrlsStore = addToDictionary(invalidUrlsStore, relativeFilePath, selectedUrl)
|
||||
except ConnectionError:
|
||||
invalidUrlsStore.append([selectedUrl, relativeFilePath])
|
||||
invalidUrlsStore = addToDictionary(invalidUrlsStore, relativeFilePath, selectedUrl)
|
||||
except (IOError, OSError):
|
||||
pass
|
||||
return
|
||||
makeReports(alterableUrlsStore, nonAlterableUrlsStore, invalidUrlsStore)
|
||||
|
||||
def _writeUrlReport(reportPath, urlsStore):
    """Write one tab-delimited report with a row for every (url, file) pair."""
    with open(reportPath, mode='w', newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["url", "relativeFilepath"])
        for fileKey, urls in urlsStore.items():
            for url in urls:
                writer.writerow([url, fileKey])


def makeReports(alterableUrlsStore, nonAlterableUrlsStore, invalidUrlsStore):
    """Write the three URL reports into the current working directory.

    Each store maps a relative file path to the list of URLs found in that
    file. Every report starts with the header row ``url, relativeFilepath``
    followed by one row per URL, and each store is written to its own file:

      * Report_AlterableUrls_FindHttpURLs.csv    <- alterableUrlsStore
      * Report_NonAlterableUrls_FindHttpURLs.csv <- nonAlterableUrlsStore
      * Report_InvalidUrls_FindHttpURLs.csv      <- invalidUrlsStore

    Args:
        alterableUrlsStore: dict of relative file path -> list of URLs.
        nonAlterableUrlsStore: dict of relative file path -> list of URLs.
        invalidUrlsStore: dict of relative file path -> list of URLs.
    """
    _writeUrlReport('Report_AlterableUrls_FindHttpURLs.csv', alterableUrlsStore)
    _writeUrlReport('Report_NonAlterableUrls_FindHttpURLs.csv', nonAlterableUrlsStore)
    _writeUrlReport('Report_InvalidUrls_FindHttpURLs.csv', invalidUrlsStore)
    return
|
||||
|
||||
def main():
    """Command-line entry point for the HTTP URL finder script.

    Expects one command-line argument: the path to the repository to scan.
    When it is missing, prints the usage message and exits with status 1;
    otherwise passes the path to findHttpUrls.
    """
    if len(sys.argv) < 2:
        print("Usage: python3 findHttpURLs.py [PATH_TO_NimbusML_REPOSITORY]")
        # sys.exit is always available; the bare `exit` helper is only
        # injected by the site module and is meant for interactive use.
        sys.exit(1)
    # findHttpUrls invokes makeReports itself with the stores it collected,
    # so no separate makeReports call is made here.
    findHttpUrls(sys.argv[1])
|
||||
|
||||
# Run main() only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
Загрузка…
Ссылка в новой задаче