This commit is contained in:
hoosteeno 2013-08-09 16:19:44 -06:00
Коммит 2d9a2a7c04
4 изменённых файлов: 80 добавлений и 0 удалений

5
.gitignore поставляемый Normal file
Просмотреть файл

@ -0,0 +1,5 @@
website-archive.mozilla.org/
www.mozilla.org/
.DS_Store
.swp
venv

1
README.txt Normal file
Просмотреть файл

@ -0,0 +1 @@
wget -e robots=off -w 1 --mirror -p --adjust-extension --no-parent --convert-links --no-host-directories -P www.mozilla.org/devpreview_releasenotes http://www.mozilla.org/projects/devpreview/releasenotes/

72
process_files.py Normal file
Просмотреть файл

@ -0,0 +1,72 @@
import sys
import os
import argparse
import fnmatch
from datetime import datetime
from bs4 import BeautifulSoup
from bs4 import Comment
# recursive directory pattern search
def locate(pattern, root):
for path, dirs, files in os.walk(os.path.abspath(root)):
for filename in fnmatch.filter(files, pattern):
yield os.path.join(path, filename)
# verify the argument is a valid dir
def readable_dir(prospective_dir):
if not os.path.isdir(prospective_dir):
raise argparse.ArgumentTypeError(
"{0} is not a valid path".format(prospective_dir)
)
if os.access(prospective_dir, os.R_OK):
return prospective_dir
else:
raise argparse.ArgumentTypeError(
"{0} is not a readable dir".format(prospective_dir)
)
# parse arguments
parser = argparse.ArgumentParser(description='Process a directory of content.')
parser.add_argument('directory',
help='directory to process',
type=readable_dir,
action='store')
args = parser.parse_args()
# define strings to replace, strings to insert
css = '''
body { padding-top: 25px; }
#archived { margin: 0; padding: 5px; position: absolute; top: 0; left: 0; height: 25px; width: 100%; z-index: 1000; text-align: center; font: bold 1.143em/1 Arial, Calibri, Helvetica, "Helvetica Neue"; color: #f5f3ed; background-color: #4d5151; }
#archived a { color: #fff; }
#archived a:hover { color: #fff; text-decoration: underline; }
'''
text = 'You are viewing information archived from Mozilla.org on %s.' % (
datetime.utcnow().strftime("%Y-%m-%d")
)
# process every file
for filename in locate("*.html", args.directory):
with open(filename, "r") as f:
soup = BeautifulSoup(f)
if (len(soup.select('#archived')) == 0):
print 'Processing %s' % (filename)
# get rid of search form
for s in soup.select('#quick-search'):
s.replace_with(Comment('search removed'))
# add styles for notification block
style = soup.new_tag('style', type='text/css')
style.append(css)
soup.head.append(style)
# add notification block
div = soup.new_tag('div', id='archived')
div.append(text)
soup.body.insert(0, div)
with open(filename, "w") as f:
f.write(str(soup))

2
requirements.txt Normal file
Просмотреть файл

@ -0,0 +1,2 @@
beautifulsoup4==4.2.1
wsgiref==0.1.2