From 907aed8854215ebb38e905754d5cfae454e4107f Mon Sep 17 00:00:00 2001 From: Paul McLanahan Date: Tue, 15 Jul 2014 14:58:22 -0400 Subject: [PATCH] Update title handling on import. --- import_html.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/import_html.py b/import_html.py index 2c53dc1..6402a5e 100755 --- a/import_html.py +++ b/import_html.py @@ -21,9 +21,8 @@ from pyquery import PyQuery as pq BASE_PATH = Path(__file__).resolve().parent -TITLE_RE = re.compile('\$html_title = [\'"](.*)[\'"];') +TITLE_RE = re.compile('\$html_title = [\'"]MFSA (\d{4}-\d{2,4}):?\s+(.*?)[\'"];') DIE_PHP = re.compile(r'<\?.*?\?>', re.DOTALL) -MFSA_ID_RE = re.compile(r'\d{4}-\d{2,4}') config = {} @@ -35,7 +34,7 @@ def die_php_die(file_path): contents = fh.read() m = TITLE_RE.search(contents) - return m.group(1), DIE_PHP.sub('', contents) + return m.group(1), m.group(2), DIE_PHP.sub('', contents) def extract_metadata(doc): @@ -67,11 +66,7 @@ def extract_metadata(doc): metadata[curr_key][-1] += ' ' metadata[curr_key][-1] += etree.tostring(el) - if doc.eq(0).is_('h1'): - doc = pq(doc[2:]) - else: - doc = pq(doc[1:]) - return metadata, doc + return metadata, pq(doc[doc.index(doc('p')[0]) + 1:]) def slugify(value): @@ -110,7 +105,8 @@ def process_announce(): announce_path = config['input_path'] / 'announce' counter = 0 for announcement in announce_path.glob('*/mfsa*.html'): - title, html = die_php_die(announcement) + id, title, html = die_php_die(announcement) + title = title.replace(r"\'", "'") doc = pq(html) if doc('#main-content'): # it's the old style @@ -120,8 +116,9 @@ def process_announce(): doc = pq(doc.children()[2:]) metadata, doc = extract_metadata(doc) - metadata['page_title'] = [title] - metadata['mfsa_id'] = [MFSA_ID_RE.search(title).group(0)] + if 'title' not in metadata: + metadata['title'] = [title] + metadata['mfsa_id'] = [id] write_file(announcement, metadata, unicode(doc)) counter += 1