Update FindHttpURLs.py and ChangeHttpURLsToHttps.py

This commit is contained in:
Mustafa Bal 2019-10-22 13:05:11 -07:00
Родитель 168deb5320
Коммит 038262f175
2 изменённых файлов: 42 добавлений и 29 удалений

Просмотреть файл

@ -2,10 +2,11 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# -------------------------------------------------------------------------
# Converts to HTTPS every valid HTTP link listed in
# Report_AlterableUrls_FindHttpURLs.csv, the report that
# is generated by FindHttpURLs.py
# usage: python3 ChangeHttpURLsToHttps.py [PATH_TO_Report_FindHttpURLs.txt] [PATH_TO_ROOT_OF_NIMBUSML_DIRECTORY]
# usage: python3 changeHttpURLsToHttps.py [PATH_TO_Report_FindHttpURLs.txt] [PATH_TO_ROOT_OF_NIMBUSML_DIRECTORY]
# output: Report_ReplaceHttpsURLs.txt
import sys
@ -14,7 +15,7 @@ import csv
def changeUrls(pathToReportCsv, pathToRootDirectory):
with open(pathToReportCsv, newline='') as csvFile:
csv_reader = csv.reader(csvFile, delimiter=',')
csv_reader = csv.reader(csvFile, delimiter='\t')
line_count = 0
for row in csv_reader:
if line_count == 0:
@ -22,7 +23,8 @@ def changeUrls(pathToReportCsv, pathToRootDirectory):
else:
#URL: row[0]
#relativePath: row[1]
absolutePath = pathToRootDirectory[:-1]+row[1]
print(row[1])
absolutePath = pathToRootDirectory+row[1]
fullText = open(absolutePath).read()
fullText = fullText.replace(row[0], row[0].replace('http', 'https'))
f = open(absolutePath, 'w')
@ -34,7 +36,7 @@ def changeUrls(pathToReportCsv, pathToRootDirectory):
def main():
    """Command-line entry point for the HTTP-to-HTTPS conversion script.

    Expects two positional arguments in ``sys.argv``:
      1. the path to the report produced by the URL finder script;
      2. the path to the root of the directory tree whose files are rewritten.

    Prints a single usage message and exits with status 1 when fewer than
    two arguments are supplied; otherwise hands both paths to ``changeUrls``.
    """
    if len(sys.argv) < 3:
        # Only one usage line is printed; the script name matches the
        # current spelling used in the file header.
        print("Usage: python3 changeHttpURLsToHttps.py [PATH_TO_Report_FindHttpURLs.txt] [PATH_TO_ORIGINAL_NIMBUSML_DIRECTORY]")
        # sys.exit is always available, unlike the interactive `exit` helper
        # that the `site` module injects.
        sys.exit(1)
    changeUrls(sys.argv[1], sys.argv[2])

Просмотреть файл

@ -2,9 +2,10 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# -------------------------------------------------------------------------
# Finds all HTTP URLs in the NimbusML repository
# and reports which of them can be converted to HTTPS
# Usage: python3 FindHttpURLs.py [PATH_TO_NimbusML_REPOSITORY]
# Usage: python3 findHttpURLs.py [PATH_TO_NimbusML_REPOSITORY]
# Output: Report_AlterableUrls_FindHttpURLs.csv, [Report_NonAlterableUrls_FindHttpURLs.csv, Report_InvalidUrls_FindHttpURLs.csv]
# Required non-standard pip library: urlextract
@ -13,9 +14,21 @@ import sys
import os
import requests
import csv
import collections
from urlextract import URLExtract
def addToDictionary(dict, key, value):
    """Record ``value`` under ``key`` without storing duplicates.

    The mapping holds one list per key. A missing key gets a new list
    containing ``value``; an existing key gets ``value`` appended only if it
    is not already in that key's list.

    Args:
        dict: mapping of key -> list of values; modified in place. (The name
            shadows the builtin but is kept so existing callers still work.)
        key: the key to file the value under.
        value: the value to record.

    Returns:
        The same mapping object that was passed in.
    """
    # setdefault fetches the existing list or installs a fresh one in a
    # single lookup, replacing the manual "key not in dict" branch.
    values = dict.setdefault(key, [])
    if value not in values:
        values.append(value)
    return dict
def findHttpUrls(searchRootDirectory):
alterableUrlsStore = {}
nonAlterableUrlsStore = {}
invalidUrlsStore = {}
extractor = URLExtract()
lengthOfOriginalRootPath = -1
for root, _, files in os.walk(searchRootDirectory, onerror=None):
@ -23,7 +36,7 @@ def findHttpUrls(searchRootDirectory):
lengthOfOriginalRootPath = len(root)
for filename in files:
absoluteFilePath = os.path.join(root, filename)
relativeFilePath = '.' + absoluteFilePath[lengthOfOriginalRootPath-1:]
relativeFilePath = '.' + absoluteFilePath[lengthOfOriginalRootPath:]
try:
with open(absoluteFilePath, "rb") as f:
data = f.read()
@ -43,47 +56,45 @@ def findHttpUrls(searchRootDirectory):
try:
newRequest = requests.get(changedSelectedUrl)
if newRequest.status_code == 200:
alterableUrlsStore.append([selectedUrl, relativeFilePath])
alterableUrlsStore = addToDictionary(alterableUrlsStore, relativeFilePath, selectedUrl)
else:
nonAlterableUrlsStore.append([selectedUrl, relativeFilePath])
nonAlterableUrlsStore = addToDictionary(nonAlterableUrlsStore, relativeFilePath, selectedUrl)
except:
nonAlterableUrlsStore.append([selectedUrl, relativeFilePath])
nonAlterableUrlsStore = addToDictionary(nonAlterableUrlsStore, relativeFilePath, selectedUrl)
else:
invalidUrlsStore.append([selectedUrl, relativeFilePath])
invalidUrlsStore = addToDictionary(invalidUrlsStore, relativeFilePath, selectedUrl)
except ConnectionError:
invalidUrlsStore.append([selectedUrl, relativeFilePath])
invalidUrlsStore = addToDictionary(invalidUrlsStore, relativeFilePath, selectedUrl)
except (IOError, OSError):
pass
return
makeReports(alterableUrlsStore, nonAlterableUrlsStore, invalidUrlsStore)
def _writeUrlReport(reportPath, urlsByFile):
    """Write one tab-separated report with a header and a row per (url, file)."""
    with open(reportPath, mode='w', newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["url", "relativeFilepath"])
        for fileKey, urls in urlsByFile.items():
            writer.writerows([url, fileKey] for url in urls)


def makeReports(alterableUrlsStore, nonAlterableUrlsStore, invalidUrlsStore):
    """Write the three URL reports into the current working directory.

    Each store maps a relative file path to the list of URLs found in that
    file. Every report is a tab-delimited file whose first row is
    ``url``/``relativeFilepath`` followed by one row per stored URL.

    Args:
        alterableUrlsStore: written to Report_AlterableUrls_FindHttpURLs.csv.
        nonAlterableUrlsStore: written to
            Report_NonAlterableUrls_FindHttpURLs.csv.
        invalidUrlsStore: written to Report_InvalidUrls_FindHttpURLs.csv.

    Returns:
        None.
    """
    # Each report is paired with its own store, so no report can end up
    # being filled from the alterable store by mistake.
    reports = (
        ('Report_AlterableUrls_FindHttpURLs.csv', alterableUrlsStore),
        ('Report_NonAlterableUrls_FindHttpURLs.csv', nonAlterableUrlsStore),
        ('Report_InvalidUrls_FindHttpURLs.csv', invalidUrlsStore),
    )
    for reportPath, store in reports:
        _writeUrlReport(reportPath, store)
def main():
    """Command-line entry point for the HTTP URL finder script.

    Expects one positional argument in ``sys.argv``: the path to the
    repository root to scan. Prints a single usage message and exits with
    status 1 when it is missing; otherwise passes it to ``findHttpUrls``.
    """
    if len(sys.argv) < 2:
        print("Usage: python3 findHttpURLs.py [PATH_TO_NimbusML_REPOSITORY]")
        # sys.exit is always available, unlike the interactive `exit` helper
        # that the `site` module injects.
        sys.exit(1)
    # findHttpUrls itself hands its stores to makeReports, so no separate
    # (argument-less) makeReports() call is made here.
    findHttpUrls(sys.argv[1])
# Legacy module-level stores. findHttpUrls creates its own local dictionaries
# under the same names and passes those to makeReports, so these lists are
# not the data the reports are written from.
alterableUrlsStore = []
invalidUrlsStore = []
nonAlterableUrlsStore = []

# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()