Merge pull request #11 from mmmavis/issue-10-parse-data-from-new-sheet-template

Parse data from the new Google Spreadsheet template
This commit is contained in:
Mavis Ou 2016-04-22 20:17:35 -07:00
Parent 3918cd8bef e921154e30
Commit 69279b0e99
2 changed files with 97 additions and 38 deletions

3
.gitignore vendored Normal file
View file

@ -0,0 +1,3 @@
venv
.env
*.json

View file

@ -12,8 +12,8 @@ from oauth2client.client import SignedJwtAssertionCredentials
GITHUB_CONFIG = {
'TOKEN': os.environ['GITHUB_TOKEN'],
'REPO_OWNER': 'mozilla',
'REPO_NAME': 'mozfest-schedule-app',
'REPO_OWNER': os.environ['REPO_OWNER'],
'REPO_NAME': os.environ['REPO_NAME'],
'TARGET_FILE': 'sessions.json',
'TARGET_BRANCHES': ['gh-pages',],
}
@ -29,10 +29,11 @@ GOOGLE_API_CONFIG = {
GOOGLE_SPREADSHEET_KEY = os.environ['GOOGLE_SPREADSHEET_KEY'] or ''
FETCH_MULTIPLE_WORKSHEETS = True
WORKSHEETS_TO_FETCH = ['For All Participants', 'Italian', 'Japanese']
WORKSHEETS_TO_SKIP = ['Template', '(backup) original imported data']
MAKE_LOCAL_JSON = False
COMMIT_JSON_TO_GITHUB = True
MAKE_LOCAL_JSON = True
COMMIT_JSON_TO_GITHUB = False
def authenticate_with_google():
@ -58,26 +59,82 @@ def open_google_spreadsheet():
def fetch_data(multiple_sheets=False, worksheets_to_skip=None):
    '''
    Fetch timeblock and session records from the Google spreadsheet.

    `worksheets_to_skip` is retained for backward compatibility with existing
    callers (worksheets are now selected by whitelist, so it is unused).
    Using `None` instead of `[]` avoids the shared-mutable-default pitfall.

    Returns a dict with two keys:
    * 'timeblocks': records from the '* Timeblock Values' worksheet
    * 'sessions': records from the worksheets named in WORKSHEETS_TO_FETCH
    '''
    spreadsheet = open_google_spreadsheet()
    data = {
        'timeblocks': fetch_worksheets(spreadsheet, multiple_sheets, ['* Timeblock Values']),
        'sessions': fetch_worksheets(spreadsheet, multiple_sheets, WORKSHEETS_TO_FETCH),
    }
    return data
def fetch_worksheets(spreadsheet, multiple_sheets=False, worksheets_to_fetch=None):
    '''
    Return all records from the given Google spreadsheet.

    If `multiple_sheets` is false, only the first worksheet is read.
    Otherwise, every worksheet whose title appears in `worksheets_to_fetch`
    is read and its records are concatenated in sheet order.
    '''
    # `None` default instead of a shared mutable `[]` default argument.
    if worksheets_to_fetch is None:
        worksheets_to_fetch = []
    if not multiple_sheets:
        # Return data from the first worksheet in the Google spreadsheet.
        worksheet = spreadsheet.get_worksheet(0)
        data = worksheet.get_all_records(empty2zero=False)
    else:
        # Return data from all worksheets named in `worksheets_to_fetch`.
        data = []
        worksheet_list = [
            sheet for sheet in spreadsheet.worksheets()
            if sheet.title in worksheets_to_fetch
        ]
        for worksheet in worksheet_list:
            data.extend(worksheet.get_all_records(empty2zero=False))
    return data
def transform_data(data):
def slugify_timeblock(timeblock):
    '''
    Return a "slugified" key for a timeblock label.

    Everything from the first '(' onward is dropped, the remainder is
    lower-cased, spaces and colons become hyphens, and commas and
    asterisks are removed. E.g. "Saturday, 10:00 (...)" -> "saturday-10-00".
    '''
    label = timeblock.split('(')[0].strip().lower()
    for old, new in ((' ', '-'), (',', ''), (':', '-'), ('*', '')):
        label = label.replace(old, new)
    return label
def transform_timeblock_data(data):
    '''
    Transform raw timeblock worksheet rows into publishable records.

    For each row this:
    * coerces all values to unicode strings (dropping falsy column names)
    * drops rows whose `order` value is not numeric (blank rows or
      inline instructions in the spreadsheet)
    * removes the `reserved for everyone` key when its value is empty
    * renames the `Auto Generated. Do Not Modify.` column to a
      slugified `key`

    Returns a list of transformed dicts; skipped rows are omitted.
    '''
    def _transform_response_item(item):
        # Make sure all values are strings; drop columns with falsy names.
        transformed = {k: unicode(v) for k, v in item.iteritems() if k}

        # Skip rows whose `order` is blank or provides instructions.
        # Catch only ValueError (was a bare `except`, which hid real bugs).
        if 'order' in transformed:
            try:
                int(transformed['order'])
            except ValueError:
                return None

        # Drop `reserved for everyone` entirely when its value is empty.
        if not transformed.get('reserved for everyone'):
            transformed.pop('reserved for everyone', None)

        # Transform `Auto Generated. Do Not Modify.` column into a `key` key.
        if 'Auto Generated. Do Not Modify.' in transformed:
            transformed['key'] = slugify_timeblock(
                transformed.pop('Auto Generated. Do Not Modify.', '')
            )
        return transformed

    # NOTE(review): the original also re-ran the transformer over a
    # `cloned_data` list that was never populated in this function
    # (copied from the session transform); that dead path is removed.
    transformed_data = [_transform_response_item(item) for item in data]
    return [item for item in transformed_data if item]
def transform_session_data(data):
'''
Transforms data and filters individual schedule items for fields we want
to publish. Currently, this:
@ -89,7 +146,7 @@ def transform_data(data):
* removes any rows that don't have a numeric `id`
* creates a concatenated `facilitators` key
* removes invalid pathway labels that were used for GitHub workflow
* creates a `scheduleblock` key based on data in `time` column
* creates a `timeblock` key based on data in `time` column
* creates Saturday and Sunday versions of sessions marked 'all-weekend'
* infers a `day` and `start` key based on data in `time` column
* prepends `location` with the word 'Floor'
@ -98,10 +155,6 @@ def transform_data(data):
# make sure vars are strings
_transformed_item = {k: unicode(v) for k, v in item.iteritems() if k}
# don't need `proposalSpreadsheetRowNumber` for schedule app
if 'proposalSpreadsheetRowNumber' in _transformed_item:
del _transformed_item['proposalSpreadsheetRowNumber']
# transform `name` column name into `title` key
# and skip rows that represent pathways, or have no name
if 'name' in _transformed_item:
@ -113,8 +166,8 @@ def transform_data(data):
# transform `githubIssueNumber` column name into `id` key
# (and skip rows without a valid id)
if 'githubIssueNumber' in _transformed_item:
_transformed_item['id'] = _transformed_item.pop('githubIssueNumber', '')
if 'id' in _transformed_item:
_transformed_item['id'] = _transformed_item.pop('id', '')
# remove rows with `id` that is blank or provides instructions
try:
@ -127,7 +180,7 @@ def transform_data(data):
name_list = []
name_detail_list = []
for key in _transformed_item.keys():
if key.startswith('facilitator_'):
if key.startswith('facilitator'):
name_list.append(_transformed_item[key].split(",")[0])
name_detail_list.append(_transformed_item.pop(key))
_transformed_item['facilitators'] = ', '.join(filter(None, name_list))
@ -135,45 +188,45 @@ def transform_data(data):
# remove invalid pathway labels that were used for GitHub workflow
pathway_skip_keywords = ['accepted','consideration','stipend','sample']
pathway_list = _transformed_item['pathways'].split(',')
pathway_list = _transformed_item['tags'].split(',')
pathway_list = [
name for name in pathway_list if not set(pathway_skip_keywords).intersection(set(name.lower().split()))
]
_transformed_item['pathways'] = ','.join(pathway_list)
_transformed_item['tags'] = ','.join(pathway_list)
# create `scheduleblock` key based on `time`
time_data = _transformed_item.pop('time', '').split('(')
# "slugified" version of scheduleblock
scheduleblock = time_data[0].strip()
scheduleblock = scheduleblock.lower().replace(' ','-')
_transformed_item['scheduleblock'] = scheduleblock
# create `timeblock` key based on `timeblock`
time_data = _transformed_item.pop('timeblock', '')
timeblock = slugify_timeblock(time_data)
_transformed_item['timeblock'] = timeblock
# infer session day
if 'saturday' in _transformed_item['scheduleblock']:
if 'saturday' in _transformed_item['timeblock']:
_transformed_item['day'] = 'Saturday'
if 'sunday' in _transformed_item['scheduleblock']:
if 'sunday' in _transformed_item['timeblock']:
_transformed_item['day'] = 'Sunday'
if 'all-s' in _transformed_item['scheduleblock']:
_transformed_item['start'] = 'All Day'
# infer start time
# if 'all-s' in _transformed_item['timeblock']:
# _transformed_item['start'] = 'All Day'
# start time
if len(time_data) > 1:
start_time = time_data[1].strip('()').split(' ')[0]
start_time = time_data.split('(')
start_time = start_time[len(start_time)-1].strip(')')[-5:] # return the last 5 character
try:
# attempt to coerce to 12-hour format
d = datetime.strptime(start_time, "%H:%M")
start_time = d.strftime("%I:%M %p")
except:
start_time = ''
pass
_transformed_item['start'] = start_time
# create Saturday and Sunday versions of sessions marked 'all-weekend'
if 'weekend' in _transformed_item['scheduleblock']:
if 'weekend' in _transformed_item['timeblock']:
_transformed_item['start'] = 'All Weekend'
if 'clone_flag' in item:
_transformed_item['scheduleblock'] = 'all-sunday'
_transformed_item['timeblock'] = 'all-sunday'
_transformed_item['day'] = 'Sunday'
_transformed_item['start'] = 'All Day'
else:
_transformed_item['scheduleblock'] = 'all-saturday'
_transformed_item['timeblock'] = 'all-saturday'
_transformed_item['day'] = 'Saturday'
_transformed_item['start'] = 'All Day'
# create a cloned version for Sunday
@ -182,8 +235,8 @@ def transform_data(data):
cloned_data.append(cloned_item)
# prepend `location` with the word 'Floor'
if _transformed_item['location'] and not _transformed_item['location'].startswith('Floor'):
_transformed_item['location'] = 'Floor {0}'.format(_transformed_item['location'])
# if _transformed_item['location'] and not _transformed_item['location'].startswith('Floor'):
# _transformed_item['location'] = 'Floor {0}'.format(_transformed_item['location'])
# if we've triggered the skip flag anywhere, drop this record
if skip:
@ -266,9 +319,12 @@ def commit_json(data, target_config=GITHUB_CONFIG, commit=COMMIT_JSON_TO_GITHUB)
def update_schedule():
data = fetch_data(multiple_sheets=FETCH_MULTIPLE_WORKSHEETS, worksheets_to_skip=WORKSHEETS_TO_SKIP)
#print 'Fetched the data ...'
print 'Fetched the data ...'
data = transform_data(data)
data = {
'timeblocks': transform_timeblock_data(data['timeblocks']),
'sessions': transform_session_data(data['sessions'])
}
#print 'Prepped the data ...'
session_json = make_json(data, store_locally=MAKE_LOCAL_JSON)