Update FindHttpURLs.py and ChangeHttpURLsToHttps.py

2019-10-22 13:05:11 -07:00 · 2019-10-22 13:05:11 -07:00 · 038262f175
--- a/src/python/tools/ChangeHttpURLsToHttps.py
+++ b/src/python/tools/ChangeHttpURLsToHttps.py
@@ -2,10 +2,11 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.
 # -------------------------------------------------------------------------
+
 # Converts all valid HTTP links to HTTPS, where the fed
 # HTTP links are found in Report_AlterableUrls_FindHttpURLs.csv, which
 # is generated by FindHttpURLs.py
-# usage: python3 ChangeHttpURLsToHttps.py [PATH_TO_Report_FindHttpURLs.txt] [PATH_TO_ROOT_OF_NIMBUSML_DIRECTORY]
+# usage: python3 changeHttpURLsToHttps.py [PATH_TO_Report_FindHttpURLs.txt] [PATH_TO_ROOT_OF_NIMBUSML_DIRECTORY]
 # output: Report_ReplaceHttpsURLs.txt

 import sys
@@ -14,7 +15,7 @@ import csv

 def changeUrls(pathToReportCsv, pathToRootDirectory):
    with open(pathToReportCsv, newline='') as csvFile:
-        csv_reader = csv.reader(csvFile, delimiter=',')
+        csv_reader = csv.reader(csvFile, delimiter='\t')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
@@ -22,7 +23,8 @@ def changeUrls(pathToReportCsv, pathToRootDirectory):
            else:
                #URL: row[0]
                #relativePath: row[1]
-                absolutePath = pathToRootDirectory[:-1]+row[1]
+                print(row[1])
+                absolutePath = pathToRootDirectory+row[1]
                fullText = open(absolutePath).read()
                fullText = fullText.replace(row[0], row[0].replace('http', 'https'))
                f = open(absolutePath, 'w')
@@ -34,7 +36,7 @@ def changeUrls(pathToReportCsv, pathToRootDirectory):

 def main():
    if len(sys.argv) < 3:
-        print("Usage: python3 ChangeHttpURLsToHttps.py [PATH_TO_Report_FindHttpURLs.txt] [PATH_TO_ORIGINAL_NIMBUSML_DIRECTORY]")
+        print("Usage: python3 changeHttpURLsToHttps.py [PATH_TO_Report_FindHttpURLs.txt] [PATH_TO_ORIGINAL_NIMBUSML_DIRECTORY]")
        exit(1)
    changeUrls(sys.argv[1], sys.argv[2])

--- a/src/python/tools/FindHttpURLs.py
+++ b/src/python/tools/FindHttpURLs.py
@@ -2,9 +2,10 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.
 # -------------------------------------------------------------------------
+
 # Finds all HTTP URLs found in the NimbusML repository
 # Converts all valid HTTP links to HTTPS
-# Usage: python3 FindHttpURLs.py [PATH_TO_NimbusML_REPOSITORY]
+# Usage: python3 findHttpURLs.py [PATH_TO_NimbusML_REPOSITORY]
 # Output: Report_AlterableUrls_FindHttpURLs.csv, [Report_NonAlterableUrls_FindHttpURLs.csv, Report_InvalidUrls_FindHttpURLs.csv]

 # Required non-standard pip library: urlextract
@@ -13,9 +14,21 @@ import sys
 import os
 import requests
 import csv
+import collections
 from urlextract import URLExtract

+def addToDictionary(dict, key, value):
+    if key not in dict:
+        dict[key] = [value]
+    else:
+        if value not in dict[key]:
+            dict[key].append(value)
+    return dict
+
 def findHttpUrls(searchRootDirectory):
+    alterableUrlsStore = {}
+    nonAlterableUrlsStore = {}
+    invalidUrlsStore = {}
    extractor = URLExtract()
    lengthOfOriginalRootPath = -1
    for root, _, files in os.walk(searchRootDirectory, onerror=None):
@@ -23,7 +36,7 @@ def findHttpUrls(searchRootDirectory):
             lengthOfOriginalRootPath = len(root)
        for filename in files:
            absoluteFilePath = os.path.join(root, filename)
-            relativeFilePath = '.' + absoluteFilePath[lengthOfOriginalRootPath-1:]
+            relativeFilePath = '.' + absoluteFilePath[lengthOfOriginalRootPath:]
            try:
                with open(absoluteFilePath, "rb") as f:
                    data = f.read()
@@ -43,47 +56,45 @@ def findHttpUrls(searchRootDirectory):
                                try:
                                    newRequest = requests.get(changedSelectedUrl)
                                    if newRequest.status_code == 200:
-                                        alterableUrlsStore.append([selectedUrl, relativeFilePath])
+                                        alterableUrlsStore = addToDictionary(alterableUrlsStore, relativeFilePath, selectedUrl)
                                    else:
-                                        nonAlterableUrlsStore.append([selectedUrl, relativeFilePath])
+                                        nonAlterableUrlsStore = addToDictionary(nonAlterableUrlsStore, relativeFilePath, selectedUrl)
                                except:
-                                    nonAlterableUrlsStore.append([selectedUrl, relativeFilePath])
+                                    nonAlterableUrlsStore = addToDictionary(nonAlterableUrlsStore, relativeFilePath, selectedUrl)
                            else:
-                                invalidUrlsStore.append([selectedUrl, relativeFilePath])
+                                invalidUrlsStore = addToDictionary(invalidUrlsStore, relativeFilePath, selectedUrl)
                        except ConnectionError:
-                            invalidUrlsStore.append([selectedUrl, relativeFilePath])
+                            invalidUrlsStore = addToDictionary(invalidUrlsStore, relativeFilePath, selectedUrl)
            except (IOError, OSError):
                pass
-    return
+    makeReports(alterableUrlsStore, nonAlterableUrlsStore, invalidUrlsStore)

-def makeReports():
-    fieldnames = ['filepath', 'url']
+def makeReports(alterableUrlsStore, nonAlterableUrlsStore, invalidUrlsStore):
    with open('Report_AlterableUrls_FindHttpURLs.csv', mode='w', newline='') as csv_file:
-        writer = csv.writer(csv_file, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
+        writer = csv.writer(csv_file, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["url", "relativeFilepath"])
-        for pair in alterableUrlsStore:
-            writer.writerow([pair[0], pair[1]])
+        for fileKey in alterableUrlsStore:
+            for url in alterableUrlsStore[fileKey]:
+                writer.writerow([url, fileKey])
    with open('Report_NonAlterableUrls_FindHttpURLs.csv', mode='w', newline='') as csv_file:
-        writer = csv.writer(csv_file, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
+        writer = csv.writer(csv_file, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["url", "relativeFilepath"])
-        for pair in alterableUrlsStore:
-            writer.writerow([pair[0], pair[1]])
+        for fileKey in nonAlterableUrlsStore:
+            for url in nonAlterableUrlsStore[fileKey]:
+                writer.writerow([url, fileKey])
    with open('Report_InvalidUrls_FindHttpURLs.csv', mode='w', newline='') as csv_file:
-        writer = csv.writer(csv_file, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
+        writer = csv.writer(csv_file, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["url", "relativeFilepath"])
-        for pair in alterableUrlsStore:
-            writer.writerow([pair[0], pair[1]])
+        for fileKey in invalidUrlsStore:
+            for url in invalidUrlsStore[fileKey]:
+                writer.writerow([url, fileKey])
    return 

 def main():
    if len(sys.argv) < 2:
-        print("Usage: python3 FindHttpURLs.py [PATH_TO_NimbusML_REPOSITORY]")
+        print("Usage: python3 findHttpURLs.py [PATH_TO_NimbusML_REPOSITORY]")
        exit(1)
    findHttpUrls(sys.argv[1])
-    makeReports()
-
-alterableUrlsStore = []
-invalidUrlsStore = []
-nonAlterableUrlsStore = []
+    
 if __name__ == "__main__":
    main()