Update title handling on import.
This commit is contained in:
Родитель
ca161d0bc2
Коммит
907aed8854
|
@ -21,9 +21,8 @@ from pyquery import PyQuery as pq
|
||||||
|
|
||||||
|
|
||||||
BASE_PATH = Path(__file__).resolve().parent
|
BASE_PATH = Path(__file__).resolve().parent
|
||||||
TITLE_RE = re.compile('\$html_title = [\'"](.*)[\'"];')
|
TITLE_RE = re.compile('\$html_title = [\'"]MFSA (\d{4}-\d{2,4}):?\s+(.*?)[\'"];')
|
||||||
DIE_PHP = re.compile(r'<\?.*?\?>', re.DOTALL)
|
DIE_PHP = re.compile(r'<\?.*?\?>', re.DOTALL)
|
||||||
MFSA_ID_RE = re.compile(r'\d{4}-\d{2,4}')
|
|
||||||
|
|
||||||
config = {}
|
config = {}
|
||||||
|
|
||||||
|
@ -35,7 +34,7 @@ def die_php_die(file_path):
|
||||||
contents = fh.read()
|
contents = fh.read()
|
||||||
|
|
||||||
m = TITLE_RE.search(contents)
|
m = TITLE_RE.search(contents)
|
||||||
return m.group(1), DIE_PHP.sub('', contents)
|
return m.group(1), m.group(2), DIE_PHP.sub('', contents)
|
||||||
|
|
||||||
|
|
||||||
def extract_metadata(doc):
|
def extract_metadata(doc):
|
||||||
|
@ -67,11 +66,7 @@ def extract_metadata(doc):
|
||||||
metadata[curr_key][-1] += ' '
|
metadata[curr_key][-1] += ' '
|
||||||
metadata[curr_key][-1] += etree.tostring(el)
|
metadata[curr_key][-1] += etree.tostring(el)
|
||||||
|
|
||||||
if doc.eq(0).is_('h1'):
|
return metadata, pq(doc[doc.index(doc('p')[0]) + 1:])
|
||||||
doc = pq(doc[2:])
|
|
||||||
else:
|
|
||||||
doc = pq(doc[1:])
|
|
||||||
return metadata, doc
|
|
||||||
|
|
||||||
|
|
||||||
def slugify(value):
|
def slugify(value):
|
||||||
|
@ -110,7 +105,8 @@ def process_announce():
|
||||||
announce_path = config['input_path'] / 'announce'
|
announce_path = config['input_path'] / 'announce'
|
||||||
counter = 0
|
counter = 0
|
||||||
for announcement in announce_path.glob('*/mfsa*.html'):
|
for announcement in announce_path.glob('*/mfsa*.html'):
|
||||||
title, html = die_php_die(announcement)
|
id, title, html = die_php_die(announcement)
|
||||||
|
title = title.replace(r"\'", "'")
|
||||||
doc = pq(html)
|
doc = pq(html)
|
||||||
if doc('#main-content'):
|
if doc('#main-content'):
|
||||||
# it's the old style
|
# it's the old style
|
||||||
|
@ -120,8 +116,9 @@ def process_announce():
|
||||||
doc = pq(doc.children()[2:])
|
doc = pq(doc.children()[2:])
|
||||||
|
|
||||||
metadata, doc = extract_metadata(doc)
|
metadata, doc = extract_metadata(doc)
|
||||||
metadata['page_title'] = [title]
|
if 'title' not in metadata:
|
||||||
metadata['mfsa_id'] = [MFSA_ID_RE.search(title).group(0)]
|
metadata['title'] = [title]
|
||||||
|
metadata['mfsa_id'] = [id]
|
||||||
write_file(announcement, metadata, unicode(doc))
|
write_file(announcement, metadata, unicode(doc))
|
||||||
counter += 1
|
counter += 1
|
||||||
|
|
||||||
|
|
Загрузка…
Ссылка в новой задаче