Add scripts for converting kr content to static html
Parent: 73191b0028
Commit: 92bd6284d4
@@ -0,0 +1,14 @@
Scripts for converting content. This mostly exists for historical context,
as the tools here were used for a one-time conversion from Airbnb's
Knowledge Repo to a static site generator.

The conversion can be run as follows:

```
python3 -m venv venv
source venv/bin/activate
pip install beautifulsoup4 PyYAML
./script/convert-knowledge-metadata.sh
./script/scrape-kr.sh
./script/remove-nav-all.sh
```
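
After all three scripts have run, a report directory they processed should end up looking roughly like this (a sketch; the directory name is made up, and knowledge.md is the original Knowledge Repo source already in the repository):

```
$ ls some-report.kp/
knowledge.md            # original KR post source (already present)
report.json             # written by convert-knowledge-metadata.sh
rendered_from_kr.html   # written by scrape-kr.sh
index.html              # written by remove-nav-all.sh
```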
@@ -0,0 +1,18 @@
#!/bin/bash

# Pull YAML frontmatter from knowledge.md files and convert to docere report.json.

# Move to the root of the git repository.
cd "$(git rev-parse --show-toplevel)"

for d in $(find "$PWD" -name '*.kp' -type d); do
    echo "processing $d"
    cat "$d"/knowledge.md \
        | sed 's/\(201.-..-..\).*/"\1"/' \
        | sed 's/created_at/publish_date/' \
        | python -c 'import sys, yaml, json; json.dump(next(yaml.safe_load_all(sys.stdin)), sys.stdout, indent=4)' \
        > "$d"/report.json
    #jupyter nbconvert "$d"/orig_src/*.ipynb
    #sed '/^---$/,/^---$/d' "$d"/orig_src/*.html > "$d"/index.html
    #rm "$d"/orig_src/*.html
done
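
To make the sed/python pipeline above concrete, here is a rough sketch of the transformation it performs, run on hypothetical frontmatter. The field names follow typical Knowledge Repo posts but the values are invented, and it assumes the virtualenv from the README is active:

```
cat > /tmp/knowledge.md <<'EOF'
---
title: Example report
authors:
- jdoe
created_at: 2018-01-15 00:00
---
EOF
sed 's/\(201.-..-..\).*/"\1"/' /tmp/knowledge.md \
    | sed 's/created_at/publish_date/' \
    | python -c 'import sys, yaml, json; json.dump(next(yaml.safe_load_all(sys.stdin)), sys.stdout, indent=4)'
# Prints roughly:
# {
#     "title": "Example report",
#     "authors": [
#         "jdoe"
#     ],
#     "publish_date": "2018-01-15"
# }
```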
@@ -0,0 +1,6 @@
#!/bin/bash

for d in $(find "$PWD" -name '*.kp' -type d); do
    echo "d is $d"
    ./script/remove-nav.py "$d/rendered_from_kr.html" > "$d/index.html"
done
@@ -0,0 +1,44 @@
#!/usr/bin/env python

# Takes the path of an HTML page scraped from Knowledge Repo and "staticizes" it
# by removing the navigation chrome that required the KR database and modifying
# paths to static assets.

import sys

from bs4 import BeautifulSoup

with open(sys.argv[1]) as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

for div in soup.find_all("div", {'class': 'navbar'}):
    div.decompose()

for div in soup.find_all("div", {'class': 'btn-group'}):
    div.decompose()

for button in soup.find_all("button"):
    button.decompose()

for div in soup.find_all("div", {'class': 'footer'}):
    div.decompose()

for div in soup.find_all("div", {'id': 'pageview_stats'}):
    div.parent.decompose()

for span in soup.find_all("span", {'class': 'tags'}):
    span.decompose()

soup.find("textarea").parent.parent.parent.parent.parent.parent.decompose()

for item in soup.find_all(src=True):
    src = item['src'] or ""
    if src.startswith('/static/'):
        item['src'] = 'https://reports.telemetry.mozilla.org' + src

for item in soup.find_all(href=True):
    href = item['href'] or ""
    if href.startswith('/static/'):
        item['href'] = 'https://reports.telemetry.mozilla.org' + href

print(soup)
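
As a quick sanity check of what remove-nav.py strips and rewrites, it can be run on a tiny hand-made fragment. The markup below is invented rather than real KR output, and the six nested divs are only there because the script unconditionally removes the sixth ancestor of the comment textarea (which real KR pages provide); it also assumes the virtualenv from the README is active and that the script is run from the repository root:

```
cat > /tmp/sample.html <<'EOF'
<div class="navbar">KR navigation</div>
<div><div><div><div><div><div><textarea>comment box</textarea></div></div></div></div></div></div>
<img src="/static/images/plot.png">
<p>Report body</p>
EOF
./script/remove-nav.py /tmp/sample.html
# Prints roughly:
# <img src="https://reports.telemetry.mozilla.org/static/images/plot.png"/>
# <p>Report body</p>
```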
@@ -0,0 +1,13 @@
#!/bin/bash

# Scrape existing rendered Knowledge Repo pages and store as static HTML.

# Move to the root of the git repository.
cd "$(git rev-parse --show-toplevel)"

for d in $(find . -name '*week?.kp' -type d); do
    echo "$d"
    url="http://reports.telemetry.mozilla.org/post/${d/.\//}"
    echo "$url"
    curl "$url" -o "$d/rendered_from_kr.html"
done
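
For reference, the ${d/.\//} expansion in the url line just strips the leading ./ that find prints, so a directory name maps to a post URL like this (the name below is made up):

```
d="./examples/some-analysis-week3.kp"
echo "http://reports.telemetry.mozilla.org/post/${d/.\//}"
# -> http://reports.telemetry.mozilla.org/post/examples/some-analysis-week3.kp
```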