diff --git a/script/README.md b/script/README.md
new file mode 100644
index 0000000..71ad985
--- /dev/null
+++ b/script/README.md
@@ -0,0 +1,14 @@
+Scripts for converting content. This mostly exists for historical context
+as the tools here were used for a one-time conversion from Airbnb's
+Knowledge Repo to a static site generator.
+
+The conversion can be run as follows:
+
+```
+python3 -m venv venv
+source venv/bin/activate
+pip install beautifulsoup4 PyYAML
+./script/convert-knowledge-metadata.sh
+./script/scrape-kr.sh
+./script/remove-nav-all.sh
+```
diff --git a/script/convert-knowledge-metadata.sh b/script/convert-knowledge-metadata.sh
new file mode 100755
index 0000000..49fe644
--- /dev/null
+++ b/script/convert-knowledge-metadata.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+# Pull yaml frontmatter from knowledge.md files and convert to docere report.json.
+
+# Move to the root of the git repository.
+cd "$(git rev-parse --show-toplevel)"
+
+for d in $(find "$PWD" -name '*.kp' -type d); do
+    echo "processing $d"
+    cat "$d"/knowledge.md \
+        | sed 's/\(201.-..-..\).*/"\1"/' \
+        | sed 's/created_at/publish_date/' \
+        | python -c 'import sys, yaml, json; json.dump(next(yaml.safe_load_all(sys.stdin)), sys.stdout, indent=4)' \
+        > "$d"/report.json
+    #jupyter nbconvert "$d"/orig_src/*.ipynb
+    #sed '/^---$/,/^---$/d' "$d"/orig_src/*.html > "$d"/index.html
+    #rm "$d"/orig_src/*.html
+done
diff --git a/script/remove-nav-all.sh b/script/remove-nav-all.sh
new file mode 100755
index 0000000..1626c8b
--- /dev/null
+++ b/script/remove-nav-all.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+for d in $(find "$PWD" -name '*.kp' -type d); do
+    echo "d is $d"
+    ./script/remove-nav.py "$d/rendered_from_kr.html" > "$d/index.html"
+done
diff --git a/script/remove-nav.py b/script/remove-nav.py
new file mode 100755
index 0000000..bbc1b6f
--- /dev/null
+++ b/script/remove-nav.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+
+# Takes the path of an HTML file scraped from Knowledge Repo and "staticizes" it
+# by removing the navigation chrome that required the KR database and modifying
+# paths to static assets; the rewritten document is printed to stdout.
+
+import sys
+
+from bs4 import BeautifulSoup
+
+with open(sys.argv[1]) as f:
+    soup = BeautifulSoup(f.read(), 'html.parser')
+
+for div in soup.find_all("div", {'class':'navbar'}):
+    div.decompose()
+
+for div in soup.find_all("div", {'class':'btn-group'}):
+    div.decompose()
+
+for div in soup.find_all("button"):
+    div.decompose()
+
+for div in soup.find_all("div", {'class':'footer'}):
+    div.decompose()
+
+for div in soup.find_all("div", {'id':'pageview_stats'}):
+    div.parent.decompose()
+
+for span in soup.find_all("span", {'class':'tags'}):
+    span.decompose()
+
+soup.find("textarea").parent.parent.parent.parent.parent.parent.decompose()
+
+for item in soup.find_all(src=True):
+    src = item['src'] or ""
+    if src.startswith('/static/'):
+        item['src'] = 'https://reports.telemetry.mozilla.org' + src
+
+for item in soup.find_all(href=True):
+    href = item['href'] or ""
+    if href.startswith('/static/'):
+        item['href'] = 'https://reports.telemetry.mozilla.org' + href
+
+print(soup)
diff --git a/script/scrape-kr.sh b/script/scrape-kr.sh
new file mode 100755
index 0000000..86a4254
--- /dev/null
+++ b/script/scrape-kr.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# Scrape existing rendered Knowledge Repo pages and store as static html.
+
+# Move to the root of the git repository.
+cd "$(git rev-parse --show-toplevel)"
+
+for d in $(find . -name '*week?.kp' -type d); do
+    echo "$d"
+    url="http://reports.telemetry.mozilla.org/post/${d/.\//}"
+    echo "$url"
+    curl "$url" -o "$d/rendered_from_kr.html"
+done