documentation updates
Parent: 98026d1189
Commit: c3332a9743

@@ -29,10 +29,10 @@ RUN conda update anaconda
# RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && apt-get update -y && apt-get install google-cloud-sdk -y

COPY env_r.yaml requirements_dev.txt /tmp/
COPY env_r.yaml /tmp/
RUN conda env update -n base -f /tmp/env_r.yaml

# RUN pip install -r /tmp/requirements.txt
COPY requirements_dev.txt /tmp/
RUN pip install -r /tmp/requirements_dev.txt

WORKDIR /sreg

@@ -43,4 +43,4 @@ WORKDIR /sreg
# RUN echo "project_id = moz-fx-ds-283" > /root/.bigqueryrc
# RUN echo "project_id = moz-fx-data-shared-prod" > /root/.bigqueryrc

# CMD /bin/bash /webrender_intel_win10_nightly/run.sh
CMD /bin/bash etl.sh

README.md | 37
@@ -1,6 +1,26 @@
slow_regressions
==============================

# Shell aliases

```sh
alias db="docker build -t ds_546_prod ."
alias dr="docker run -v=$HOME/.config/gcloud:/root/.config/gcloud -v ~/repos/sreg:/sreg -it ds_546_prod /bin/bash"


function da () {
  export CONTAINER=`docker container ls | pcregrep -o "^[a-z0-9]+"`
  docker exec -it $CONTAINER /bin/bash
}
```


# Status

`python -m slow_regressions.utils.diagnose_etl dates`
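
For orientation, a minimal sketch of what this command does: the `dates` subcommand is wired up via Fire to `check_table_dates` in the new `diagnose_etl` module added later in this diff, so the same check can also be run from Python (the module path here is assumed from the command above):

```python
# Rough non-CLI equivalent of `python -m slow_regressions.utils.diagnose_etl dates`.
# check_table_dates() prints, for each of the test/samples/input tables, how many
# of the possible dates are present and the most recent missing ones.
from slow_regressions.utils.diagnose_etl import check_table_dates

check_table_dates()
```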


# Ipython

```
@@ -8,13 +28,28 @@ slow_regressions
```

# Workflow

## etl.sh layout

- `slow_regressions.load_raw_test_data etl`
  - Downloads summary data from
    `moz-fx-data-derived-datasets.taskclusteretl.perfherder` to a
    temp table
- `python -m slow_regressions.etl load_brms --brms_dir='/sreg/data/' --date="$yesterday"`
- `time Rscript slow_regressions/model/smod.r "/sreg/data/$yesterday/"`
- `python -m slow_regressions.etl upload_model_data \
  --subdate="$yesterday" --model_data_dir='/sreg/data/'`


## Upload test summaries

```python
gtd.extract_upload_test_data(bq_query=bq.bq_query2, start_date=6)
```

```python
```sh
python -m slow_regressions.load_raw_test_data etl --start_date=0
```

etl.sh | 8
@@ -2,7 +2,13 @@
yesterday=$(python -m slow_regressions.load_raw_test_data yesterday)
echo "Running for $yesterday"

python -m slow_regressions.load_raw_test_data etl --start_date=0 --end_date="$yesterday"
# RAW_DATA_START_DATE='2020-07-07'
RAW_DATA_START_DATE=0

# Download raw taskcluster data, upload summaries
python -m slow_regressions.load_raw_test_data etl --start_date=$RAW_DATA_START_DATE --end_date="$yesterday"

# Download summarized taskcluster data, save locally
python -m slow_regressions.etl load_brms --brms_dir='/sreg/data/' --date="$yesterday"

time Rscript slow_regressions/model/smod.r "/sreg/data/$yesterday/"

@@ -225,6 +225,24 @@ def pull_existing_dates(
    return bq_query(q).iloc[:, 0]


def find_missing(test_dates, lastn=None):
    test_dates = pd.to_datetime(pd.Series(sorted(test_dates)))
    slen = lambda x: len(set(x))  # noqa: E731
    start = min(test_dates)

    days_till_yesterday = pd.date_range(start, dt.date.today())[:-1]
    msg = set(days_till_yesterday) - set(pd.to_datetime(test_dates))

    print(
        f"table has {slen(test_dates)}/{slen(days_till_yesterday)} "
        "possible dates"
    )
    res = pd.Series(sorted(msg), dtype=np.dtype("<M8[ns]"))
    if not lastn:
        return res
    return res[-lastn:]
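
A quick usage sketch for the new helper, with made-up dates; `find_missing` appears to live in `slow_regressions.utils.bq_utils`, judging by how the new diagnose module below imports and calls it:

```python
from slow_regressions.utils.bq_utils import find_missing

# Given the dates that actually have data, report which days between the
# earliest of them and yesterday are absent; keep only the 5 most recent gaps.
missing = find_missing(["2021-01-01", "2021-01-03"], lastn=5)
print(missing)  # pandas Series of missing datetimes
```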


def filter_existing_dates(
    df, bq_loc, convert_to_date=False, date_field="date"
):

@@ -0,0 +1,46 @@
import datetime as dt

from fire import Fire
import pandas as pd  # type: ignore
import numpy as np  # type: ignore

import slow_regressions.utils.bq_utils as bq


def make_date_query(sql_loc, dt_field=None):
    if dt_field:
        date_def = f"date({dt_field}) as"
    else:
        date_def = ""

    q = f"""
    select
      distinct {date_def} date
    from {sql_loc}
    order by date
    """
    return q
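
As an illustration (the table path below is a placeholder, not one used by the project), the helper renders a query along these lines:

```python
# `proj.dataset.table` is a hypothetical BigQuery location.
q = make_date_query("`proj.dataset.table`", dt_field="time")
print(q)
# roughly:
#   select
#     distinct date(time) as date
#   from `proj.dataset.table`
#   order by date
```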


def check_table_dates(bq_locs=bq.bq_locs):
    # global test_dates, samp_dates, inp_dates
    q_test = make_date_query(bq_locs.test.sql, dt_field="time")
    q_samps = make_date_query(bq_locs.samples.sql, dt_field="")
    q_inp = make_date_query(bq_locs.input_data.sql, dt_field="")

    test_dates = bq.bq_query2(q_test)
    samp_dates = bq.bq_query2(q_samps)
    inp_dates = bq.bq_query2(q_inp)

    print(f"\n\ntest table: {bq_locs.test.sql}")
    print(bq.find_missing(test_dates.date, lastn=20))

    print(f"\n\nsamp_dates table: {bq_locs.samples.sql}")
    print(bq.find_missing(samp_dates.date, lastn=20))

    print(f"\n\nq_inp table: {bq_locs.input_data.sql}")
    print(bq.find_missing(inp_dates.date, lastn=20))


if __name__ == "__main__":
    Fire({"dates": check_table_dates})