From 038262f175f08cd13c6feff036822d55dcd88947 Mon Sep 17 00:00:00 2001
From: Mustafa Bal
Date: Tue, 22 Oct 2019 13:05:11 -0700
Subject: [PATCH] Update FindHttpURLs.py and ChangeHttpURLsToHttps.py

---
 src/python/tools/ChangeHttpURLsToHttps.py | 10 ++--
 src/python/tools/FindHttpURLs.py          | 61 +++++++++++++----------
 2 files changed, 42 insertions(+), 29 deletions(-)

diff --git a/src/python/tools/ChangeHttpURLsToHttps.py b/src/python/tools/ChangeHttpURLsToHttps.py
index aa99e56..6616783 100644
--- a/src/python/tools/ChangeHttpURLsToHttps.py
+++ b/src/python/tools/ChangeHttpURLsToHttps.py
@@ -2,10 +2,11 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.
 # -------------------------------------------------------------------------
+
 # Converts all valid HTTP links to HTTPS, where the fed
 # HTTP links are found in Report_AlterableUrls_FindHttpURLs.csv, which
 # is generated by FindHttpURLs.py
-# usage: python3 ChangeHttpURLsToHttps.py [PATH_TO_Report_FindHttpURLs.txt] [PATH_TO_ROOT_OF_NIMBUSML_DIRECTORY]
+# usage: python3 ChangeHttpURLsToHttps.py [PATH_TO_Report_AlterableUrls_FindHttpURLs.csv] [PATH_TO_ROOT_OF_NIMBUSML_DIRECTORY]
 # output: Report_ReplaceHttpsURLs.txt
 
 import sys
@@ -14,7 +15,7 @@ import csv
 
 def changeUrls(pathToReportCsv, pathToRootDirectory):
     with open(pathToReportCsv, newline='') as csvFile:
-        csv_reader = csv.reader(csvFile, delimiter=',')
+        csv_reader = csv.reader(csvFile, delimiter='\t')
         line_count = 0
         for row in csv_reader:
             if line_count == 0:
@@ -22,7 +23,8 @@
             else:
                 #URL: row[0]
                 #relativePath: row[1]
-                absolutePath = pathToRootDirectory[:-1]+row[1]
+                print(row[1])
+                absolutePath = pathToRootDirectory+row[1]
                 fullText = open(absolutePath).read()
                 fullText = fullText.replace(row[0], row[0].replace('http', 'https'))
                 f = open(absolutePath, 'w')
@@ -34,7 +36,7 @@ def changeUrls(pathToReportCsv, pathToRootDirectory):
 
 def main():
     if len(sys.argv) < 3:
-        print("Usage: python3 ChangeHttpURLsToHttps.py [PATH_TO_Report_FindHttpURLs.txt] [PATH_TO_ORIGINAL_NIMBUSML_DIRECTORY]")
+        print("Usage: python3 ChangeHttpURLsToHttps.py [PATH_TO_Report_AlterableUrls_FindHttpURLs.csv] [PATH_TO_ORIGINAL_NIMBUSML_DIRECTORY]")
         exit(1)
     changeUrls(sys.argv[1], sys.argv[2])
 
diff --git a/src/python/tools/FindHttpURLs.py b/src/python/tools/FindHttpURLs.py
index 3f9d85c..15f9bce 100644
--- a/src/python/tools/FindHttpURLs.py
+++ b/src/python/tools/FindHttpURLs.py
@@ -2,9 +2,10 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.
 # -------------------------------------------------------------------------
+
 # Finds all HTTP URLs found in the NimbusML repository
 # Converts all valid HTTP links to HTTPS
 # Usage: python3 FindHttpURLs.py [PATH_TO_NimbusML_REPOSITORY]
 # Output: Report_AlterableUrls_FindHttpURLs.csv, [Report_NonAlterableUrls_FindHttpURLs.csv, Report_InvalidUrls_FindHttpURLs.csv]
 # Required non-standard pip library: urlextract
 
@@ -13,9 +14,21 @@ import sys
 import os
 import requests
 import csv
+import collections
 from urlextract import URLExtract
 
+def addToDictionary(dict, key, value):
+    if key not in dict:
+        dict[key] = [value]
+    else:
+        if value not in dict[key]:
+            dict[key].append(value)
+    return dict
+
 def findHttpUrls(searchRootDirectory):
+    alterableUrlsStore = {}
+    nonAlterableUrlsStore = {}
+    invalidUrlsStore = {}
     extractor = URLExtract()
     lengthOfOriginalRootPath = -1
     for root, _, files in os.walk(searchRootDirectory, onerror=None):
@@ -23,7 +36,7 @@
             lengthOfOriginalRootPath = len(root)
         for filename in files:
             absoluteFilePath = os.path.join(root, filename)
-            relativeFilePath = '.' + absoluteFilePath[lengthOfOriginalRootPath-1:]
+            relativeFilePath = '.' + absoluteFilePath[lengthOfOriginalRootPath:]
             try:
                 with open(absoluteFilePath, "rb") as f:
                     data = f.read()
@@ -43,47 +56,45 @@
                     try:
                         newRequest = requests.get(changedSelectedUrl)
                         if newRequest.status_code == 200:
-                            alterableUrlsStore.append([selectedUrl, relativeFilePath])
+                            alterableUrlsStore = addToDictionary(alterableUrlsStore, relativeFilePath, selectedUrl)
                         else:
-                            nonAlterableUrlsStore.append([selectedUrl, relativeFilePath])
+                            nonAlterableUrlsStore = addToDictionary(nonAlterableUrlsStore, relativeFilePath, selectedUrl)
                     except:
-                        nonAlterableUrlsStore.append([selectedUrl, relativeFilePath])
+                        nonAlterableUrlsStore = addToDictionary(nonAlterableUrlsStore, relativeFilePath, selectedUrl)
                 else:
-                    invalidUrlsStore.append([selectedUrl, relativeFilePath])
+                    invalidUrlsStore = addToDictionary(invalidUrlsStore, relativeFilePath, selectedUrl)
             except ConnectionError:
-                invalidUrlsStore.append([selectedUrl, relativeFilePath])
+                invalidUrlsStore = addToDictionary(invalidUrlsStore, relativeFilePath, selectedUrl)
             except (IOError, OSError):
                 pass
-    return
+    makeReports(alterableUrlsStore, nonAlterableUrlsStore, invalidUrlsStore)
 
-def makeReports():
-    fieldnames = ['filepath', 'url']
+def makeReports(alterableUrlsStore, nonAlterableUrlsStore, invalidUrlsStore):
     with open('Report_AlterableUrls_FindHttpURLs.csv', mode='w', newline='') as csv_file:
-        writer = csv.writer(csv_file, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
+        writer = csv.writer(csv_file, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
         writer.writerow(["url", "relativeFilepath"])
-        for pair in alterableUrlsStore:
-            writer.writerow([pair[0], pair[1]])
+        for fileKey in alterableUrlsStore:
+            for url in alterableUrlsStore[fileKey]:
+                writer.writerow([url, fileKey])
     with open('Report_NonAlterableUrls_FindHttpURLs.csv', mode='w', newline='') as csv_file:
-        writer = csv.writer(csv_file, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
+        writer = csv.writer(csv_file, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
         writer.writerow(["url", "relativeFilepath"])
-        for pair in alterableUrlsStore:
-            writer.writerow([pair[0], pair[1]])
+        for fileKey in nonAlterableUrlsStore:
+            for url in nonAlterableUrlsStore[fileKey]:
+                writer.writerow([url, fileKey])
     with open('Report_InvalidUrls_FindHttpURLs.csv', mode='w', newline='') as csv_file:
-        writer = csv.writer(csv_file, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
+        writer = csv.writer(csv_file, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
         writer.writerow(["url", "relativeFilepath"])
-        for pair in alterableUrlsStore:
-            writer.writerow([pair[0], pair[1]])
+        for fileKey in invalidUrlsStore:
+            for url in invalidUrlsStore[fileKey]:
+                writer.writerow([url, fileKey])
     return
 
 def main():
     if len(sys.argv) < 2:
         print("Usage: python3 FindHttpURLs.py [PATH_TO_NimbusML_REPOSITORY]")
         exit(1)
     findHttpUrls(sys.argv[1])
-    makeReports()
-
-alterableUrlsStore = []
-invalidUrlsStore = []
-nonAlterableUrlsStore = []
+
 if __name__ == "__main__":
     main()
\ No newline at end of file
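
A note on the rewrite step, with a rough sketch that is not part of the patch above. ChangeHttpURLsToHttps.py still builds the target path by plain string concatenation (so it depends on whether the root argument carries a trailing slash) and upgrades the scheme with row[0].replace('http', 'https'), which would also rewrite any other "http" substring inside the URL. Assuming the report rows keep the (url, relativeFilePath) layout written by FindHttpURLs.py, a more defensive helper could look like the following; the name rewriteUrlInFile and its parameters are illustrative only and do not exist in the repository.

import os

def rewriteUrlInFile(rootDirectory, relativeFilePath, httpUrl):
    # Join the root with the './...' path reported by FindHttpURLs.py;
    # os.path.join works whether or not rootDirectory ends with a slash.
    absolutePath = os.path.join(rootDirectory, relativeFilePath)
    if not httpUrl.startswith('http://'):
        return
    # Upgrade only the scheme prefix, leaving any other 'http' text in the URL intact.
    httpsUrl = 'https://' + httpUrl[len('http://'):]
    with open(absolutePath) as f:
        fullText = f.read()
    with open(absolutePath, 'w') as f:
        f.write(fullText.replace(httpUrl, httpsUrl))

The with blocks also close the file handles, whereas the open(absolutePath).read() call in changeUrls() relies on garbage collection to do so. On the FindHttpURLs.py side, the new addToDictionary helper shadows the built-in name dict, and the added import collections is not used in the hunks shown; if grouping URLs by file is all that is wanted, a collections.defaultdict would cover it, for example (again only a sketch):

from collections import defaultdict

alterableUrlsStore = defaultdict(set)                  # relative path -> set of URLs found there
alterableUrlsStore[relativeFilePath].add(selectedUrl)  # replaces addToDictionary(...)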