This commit is contained in:
wcbeard 2020-08-10 10:50:07 -04:00
Parent 98026d1189
Commit c3332a9743
5 changed files: 110 additions and 5 deletions

Dockerfile
View file

@@ -29,10 +29,10 @@ RUN conda update anaconda
# RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && apt-get update -y && apt-get install google-cloud-sdk -y
COPY env_r.yaml requirements_dev.txt /tmp/
COPY env_r.yaml /tmp/
RUN conda env update -n base -f /tmp/env_r.yaml
# RUN pip install -r /tmp/requirements.txt
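# requirements_dev.txt is copied in its own layer below, presumably so that
# editing it does not invalidate the cached conda env layer above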
COPY requirements_dev.txt /tmp/
RUN pip install -r /tmp/requirements_dev.txt
WORKDIR /sreg
@@ -43,4 +43,4 @@ WORKDIR /sreg
# RUN echo "project_id = moz-fx-ds-283" > /root/.bigqueryrc
# RUN echo "project_id = moz-fx-data-shared-prod" > /root/.bigqueryrc
# CMD /bin/bash /webrender_intel_win10_nightly/run.sh
CMD /bin/bash etl.sh

README.md
View file

@@ -1,6 +1,26 @@
slow_regressions
==============================
# Shell aliases
```sh
alias db="docker build -t ds_546_prod ."
alias dr="docker run -v=$HOME/.config/gcloud:/root/.config/gcloud -v ~/repos/sreg:/sreg -it ds_546_prod /bin/bash"
function da () {
    # grab the ID of the running container (assumes only one is up)
    export CONTAINER=$(docker container ls | pcregrep -o "^[a-z0-9]+")
    docker exec -it "$CONTAINER" /bin/bash
}
```
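Here `db` builds the image, `dr` starts a container with gcloud credentials and the repo mounted at `/sreg`, and `da` attaches a shell to the container that is already running.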
# Status
Check which dates are present in (or missing from) the uploaded tables:

`python -m slow_regressions.utils.diagnose_etl dates`
# Ipython
```
@@ -8,13 +28,28 @@ slow_regressions
```
# Workflow
## etl.sh layout
The script chains these steps in order (combined into one sketch below):

- `slow_regressions.load_raw_test_data etl`
  - Downloads summary data from
    `moz-fx-data-derived-datasets.taskclusteretl.perfherder` to a
    temp table
- `python -m slow_regressions.etl load_brms --brms_dir='/sreg/data/' --date="$yesterday"`
- `time Rscript slow_regressions/model/smod.r "/sreg/data/$yesterday/"`
- `python -m slow_regressions.etl upload_model_data --subdate="$yesterday" --model_data_dir='/sreg/data/'`
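Put together, a full run inside the container looks roughly like this. It is a sketch assembled from the step list above and the `etl.sh` excerpt later in this commit; only the final `upload_model_data` step is taken from the list alone.

```sh
yesterday=$(python -m slow_regressions.load_raw_test_data yesterday)

# Download raw taskcluster data, upload summaries
python -m slow_regressions.load_raw_test_data etl --start_date=0 --end_date="$yesterday"

# Download summarized taskcluster data, save locally, then fit the model
python -m slow_regressions.etl load_brms --brms_dir='/sreg/data/' --date="$yesterday"
time Rscript slow_regressions/model/smod.r "/sreg/data/$yesterday/"

# Upload model output
python -m slow_regressions.etl upload_model_data \
    --subdate="$yesterday" --model_data_dir='/sreg/data/'
```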
## Upload test summaries
```python
gtd.extract_upload_test_data(bq_query=bq.bq_query2, start_date=6)
```
```python
```sh
python -m slow_regressions.load_raw_test_data etl --start_date=0
```

etl.sh (8 changed lines)
View file

@@ -2,7 +2,13 @@
yesterday=$(python -m slow_regressions.load_raw_test_data yesterday)
echo "Running for $yesterday"
python -m slow_regressions.load_raw_test_data etl --start_date=0 --end_date="$yesterday"
# RAW_DATA_START_DATE='2020-07-07'
RAW_DATA_START_DATE=0
# Download raw taskcluster data, upload summaries
python -m slow_regressions.load_raw_test_data etl --start_date=$RAW_DATA_START_DATE --end_date="$yesterday"
# Download summarized taskcluster data, save locally
python -m slow_regressions.etl load_brms --brms_dir='/sreg/data/' --date="$yesterday"
time Rscript slow_regressions/model/smod.r "/sreg/data/$yesterday/"

slow_regressions/utils/bq_utils.py
View file

@@ -225,6 +225,24 @@ def pull_existing_dates(
    return bq_query(q).iloc[:, 0]


def find_missing(test_dates, lastn=None):
    """Return the dates missing from `test_dates`, up through yesterday."""
    test_dates = pd.to_datetime(pd.Series(sorted(test_dates)))
    slen = lambda x: len(set(x))  # noqa: E731
    start = min(test_dates)
    # Every day from the earliest test date up to, but excluding, today
    days_till_yesterday = pd.date_range(start, dt.date.today())[:-1]
    missing = set(days_till_yesterday) - set(pd.to_datetime(test_dates))
    print(
        f"table has {slen(test_dates)}/{slen(days_till_yesterday)} "
        "possible dates"
    )
    res = pd.Series(sorted(missing), dtype=np.dtype("<M8[ns]"))
    if not lastn:
        return res
    return res[-lastn:]
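# Example with hypothetical inputs: if today were 2020-08-10, then
#   find_missing(["2020-08-01", "2020-08-03"])
# would print "table has 2/9 possible dates" and return the seven
# missing days (2020-08-02, 2020-08-04, ..., 2020-08-09) as a
# datetime64[ns] Series.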


def filter_existing_dates(
    df, bq_loc, convert_to_date=False, date_field="date"
):

slow_regressions/utils/diagnose_etl.py
View file

@@ -0,0 +1,46 @@
import datetime as dt

from fire import Fire
import pandas as pd  # type: ignore
import numpy as np  # type: ignore

import slow_regressions.utils.bq_utils as bq


def make_date_query(sql_loc, dt_field=None):
    """Build a query listing the distinct dates present in table `sql_loc`."""
    if dt_field:
        date_def = f"date({dt_field}) as"
    else:
        date_def = ""
    q = f"""
    select
      distinct {date_def} date
    from {sql_loc}
    order by date
    """
    return q
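# For example, with a hypothetical table name,
#   make_date_query("`proj.ds.tbl`", dt_field="time")
# renders (modulo whitespace) as:
#   select distinct date(time) as date from `proj.ds.tbl` order by date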


def check_table_dates(bq_locs=bq.bq_locs):
    # global test_dates, samp_dates, inp_dates
    q_test = make_date_query(bq_locs.test.sql, dt_field="time")
    q_samps = make_date_query(bq_locs.samples.sql, dt_field="")
    q_inp = make_date_query(bq_locs.input_data.sql, dt_field="")

    test_dates = bq.bq_query2(q_test)
    samp_dates = bq.bq_query2(q_samps)
    inp_dates = bq.bq_query2(q_inp)

    print(f"\n\ntest table: {bq_locs.test.sql}")
    print(bq.find_missing(test_dates.date, lastn=20))

    print(f"\n\nsamp_dates table: {bq_locs.samples.sql}")
    print(bq.find_missing(samp_dates.date, lastn=20))

    print(f"\n\nq_inp table: {bq_locs.input_data.sql}")
    print(bq.find_missing(inp_dates.date, lastn=20))


if __name__ == "__main__":
    Fire({"dates": check_table_dates})
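# CLI usage via Fire (as referenced in the README):
#   python -m slow_regressions.utils.diagnose_etl dates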