Add scripts for converting kr content to static html

This commit is contained in:
Jeff Klukas 2018-10-16 13:49:10 -04:00
Родитель 73191b0028
Коммит 92bd6284d4
5 изменённых файлов: 95 добавлений и 0 удалений

14
script/README.md Normal file
Просмотреть файл

@ -0,0 +1,14 @@
Scripts for converting content. This mostly exists for historical context,
as the tools here were used for a one-time conversion from Airbnb's
Knowledge Repo to a static site generator.
The conversion can be run as follows:
```
python3 -m venv venv
source venv/bin/activate
pip install beautifulsoup4 PyYAML
./script/convert-knowledge-metadata.sh
./script/scrape-kr.sh
./script/remove-nav-all.sh
```

Просмотреть файл

@ -0,0 +1,18 @@
#!/bin/bash
# Pull YAML frontmatter from knowledge.md files and convert to docere report.json.
set -euo pipefail

# Move to the root of the git repository.
cd "$(git rev-parse --show-toplevel)"

# Iterate over *.kp directories; NUL-delimited so paths containing whitespace
# survive the loop intact.
while IFS= read -r -d '' d; do
  echo "processing $d"
  # Truncate the created_at timestamp to a quoted date, rename the key, then
  # parse the first YAML document of the frontmatter and emit it as JSON.
  # safe_load_all avoids executing arbitrary YAML tags in the input.
  sed -e 's/\(201.-..-..\).*/"\1"/' -e 's/created_at/publish_date/' "$d/knowledge.md" \
    | python -c 'import sys, yaml, json; json.dump(next(yaml.safe_load_all(sys.stdin)), sys.stdout, indent=4)' \
    > "$d/report.json"
  #jupyter nbconvert "$d"/orig_src/*.ipynb
  #sed '/^---$/,/^---$/d' "$d"/orig_src/*.html > "$d"/index.html
  #rm "$d"/orig_src/*.html
done < <(find "$PWD" -name '*.kp' -type d -print0)

6
script/remove-nav-all.sh Executable file
Просмотреть файл

@ -0,0 +1,6 @@
#!/bin/bash
# Run remove-nav.py over every scraped *.kp page, writing the cleaned
# output next to the original as index.html.
set -euo pipefail

# NUL-delimited iteration handles directory names containing whitespace.
while IFS= read -r -d '' d; do
  echo "d is $d"
  ./script/remove-nav.py "$d/rendered_from_kr.html" > "$d/index.html"
done < <(find "$PWD" -name '*.kp' -type d -print0)

44
script/remove-nav.py Executable file
Просмотреть файл

@ -0,0 +1,44 @@
#!/usr/bin/env python
# Takes the path to an HTML file (as scraped from Knowledge Repo) as its only
# argument and "staticizes" it by removing the navigation chrome that required
# the KR database and rewriting root-relative static asset paths to absolute
# URLs. Prints the cleaned HTML to stdout.
import sys

from bs4 import BeautifulSoup

# Host that still serves the /static/ assets referenced by the scraped pages.
STATIC_HOST = 'https://reports.telemetry.mozilla.org'

with open(sys.argv[1]) as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

# Remove navigation/interaction chrome that only works against the KR server.
for div in soup.find_all("div", {'class': 'navbar'}):
    div.decompose()
for div in soup.find_all("div", {'class': 'btn-group'}):
    div.decompose()
for button in soup.find_all("button"):
    button.decompose()
for div in soup.find_all("div", {'class': 'footer'}):
    div.decompose()
# Pageview stats live one level inside their container; drop the container.
for div in soup.find_all("div", {'id': 'pageview_stats'}):
    div.parent.decompose()
for span in soup.find_all("span", {'class': 'tags'}):
    span.decompose()

# The comment form sits six levels above its textarea; guard against pages
# that have no comment form at all (find() returns None in that case).
textarea = soup.find("textarea")
if textarea is not None:
    container = textarea
    for _ in range(6):
        container = container.parent
    container.decompose()

# Rewrite root-relative static asset references to absolute URLs so they
# resolve without the KR server.
for item in soup.find_all(src=True):
    if item['src'].startswith('/static/'):
        item['src'] = STATIC_HOST + item['src']
for item in soup.find_all(href=True):
    if item['href'].startswith('/static/'):
        item['href'] = STATIC_HOST + item['href']

print(soup)

13
script/scrape-kr.sh Executable file
Просмотреть файл

@ -0,0 +1,13 @@
#!/bin/bash
# Scrape existing rendered Knowledge Repo pages and store as static html.
set -euo pipefail

# Move to the root of the git repository.
cd "$(git rev-parse --show-toplevel)"

# Iterate over weekly *.kp directories; NUL-delimited to survive odd names.
while IFS= read -r -d '' d; do
  echo "$d"
  # Strip only the leading './' prefix to build the post URL
  # (the old ${d/.\//} form replaced the first './' found anywhere).
  url="http://reports.telemetry.mozilla.org/post/${d#./}"
  echo "$url"
  # --fail: don't save server error pages as if they were report content.
  curl --fail "$url" -o "$d/rendered_from_kr.html"
done < <(find . -name '*week?.kp' -type d -print0)