Add scripts for converting kr content to static html

This commit is contained in:
Jeff Klukas 2018-10-16 13:49:10 -04:00
Родитель 73191b0028
Коммит 92bd6284d4
5 изменённых файлов: 95 добавлений и 0 удалений

14
script/README.md Normal file
Просмотреть файл

@ -0,0 +1,14 @@
Scripts for converting content. This mostly exists for historical context,
as the tools here were used for a one-time conversion from Airbnb's
Knowledge Repo to a static site generator.
The conversion can be run as follows:
```
python3 -m venv venv
source venv/bin/activate
pip install beautifulsoup4 PyYAML
./script/convert-knowledge-metadata.sh
./script/scrape-kr.sh
./script/remove-nav-all.sh
```

Просмотреть файл

@ -0,0 +1,18 @@
#!/bin/bash
# Pull YAML frontmatter from knowledge.md files and convert to docere report.json.
set -euo pipefail

# Move to the root of the git repository.
cd "$(git rev-parse --show-toplevel)"

# Iterate over *.kp directories; NUL-delimited so paths containing whitespace
# survive the loop intact.
while IFS= read -r -d '' d; do
  echo "processing $d"
  # Truncate the created_at timestamp to a quoted date, rename the key, then
  # parse the first YAML document of the frontmatter and emit it as JSON.
  # safe_load_all avoids executing arbitrary YAML tags in the input.
  sed -e 's/\(201.-..-..\).*/"\1"/' -e 's/created_at/publish_date/' "$d/knowledge.md" \
    | python -c 'import sys, yaml, json; json.dump(next(yaml.safe_load_all(sys.stdin)), sys.stdout, indent=4)' \
    > "$d/report.json"
  #jupyter nbconvert "$d"/orig_src/*.ipynb
  #sed '/^---$/,/^---$/d' "$d"/orig_src/*.html > "$d"/index.html
  #rm "$d"/orig_src/*.html
done < <(find "$PWD" -name '*.kp' -type d -print0)

6
script/remove-nav-all.sh Executable file
Просмотреть файл

@ -0,0 +1,6 @@
#!/bin/bash
# Run remove-nav.py over every scraped *.kp page, writing the cleaned
# output next to the original as index.html.
set -euo pipefail

# NUL-delimited iteration handles directory names containing whitespace.
while IFS= read -r -d '' d; do
  echo "d is $d"
  ./script/remove-nav.py "$d/rendered_from_kr.html" > "$d/index.html"
done < <(find "$PWD" -name '*.kp' -type d -print0)

44
script/remove-nav.py Executable file
Просмотреть файл

@ -0,0 +1,44 @@
#!/usr/bin/env python
# Takes the path to an HTML file (as scraped from Knowledge Repo) as its only
# argument and "staticizes" it by removing the navigation chrome that required
# the KR database and rewriting root-relative static asset paths to absolute
# URLs. Prints the cleaned HTML to stdout.
import sys

from bs4 import BeautifulSoup

# Host that still serves the /static/ assets referenced by the scraped pages.
STATIC_HOST = 'https://reports.telemetry.mozilla.org'

with open(sys.argv[1]) as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

# Remove navigation/interaction chrome that only works against the KR server.
for div in soup.find_all("div", {'class': 'navbar'}):
    div.decompose()
for div in soup.find_all("div", {'class': 'btn-group'}):
    div.decompose()
for button in soup.find_all("button"):
    button.decompose()
for div in soup.find_all("div", {'class': 'footer'}):
    div.decompose()
# Pageview stats live one level inside their container; drop the container.
for div in soup.find_all("div", {'id': 'pageview_stats'}):
    div.parent.decompose()
for span in soup.find_all("span", {'class': 'tags'}):
    span.decompose()

# The comment form sits six levels above its textarea; guard against pages
# that have no comment form at all (find() returns None in that case).
textarea = soup.find("textarea")
if textarea is not None:
    container = textarea
    for _ in range(6):
        container = container.parent
    container.decompose()

# Rewrite root-relative static asset references to absolute URLs so they
# resolve without the KR server.
for item in soup.find_all(src=True):
    if item['src'].startswith('/static/'):
        item['src'] = STATIC_HOST + item['src']
for item in soup.find_all(href=True):
    if item['href'].startswith('/static/'):
        item['href'] = STATIC_HOST + item['href']

print(soup)

13
script/scrape-kr.sh Executable file
Просмотреть файл

@ -0,0 +1,13 @@
#!/bin/bash
# Scrape existing rendered Knowledge Repo pages and store as static html.
set -euo pipefail

# Move to the root of the git repository.
cd "$(git rev-parse --show-toplevel)"

# Iterate over weekly *.kp directories; NUL-delimited to survive odd names.
while IFS= read -r -d '' d; do
  echo "$d"
  # Strip only the leading './' prefix to build the post URL
  # (the old ${d/.\//} form replaced the first './' found anywhere).
  url="http://reports.telemetry.mozilla.org/post/${d#./}"
  echo "$url"
  # --fail: don't save server error pages as if they were report content.
  curl --fail "$url" -o "$d/rendered_from_kr.html"
done < <(find . -name '*week?.kp' -type d -print0)