DENG-1019: Cleaned backfill validations (#5248)

* Cleaned backfill validations
This commit is contained in:
Winnie Chan 2024-03-22 14:54:13 -07:00 коммит произвёл GitHub
Родитель c0a4a85151
Коммит 325d982f31
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
6 изменённых файлов: 238 добавлений и 220 удалений

Просмотреть файл

@ -93,36 +93,54 @@ class Backfill:
@entry_date.validator
def validate_entry_date(self, attribute, value):
"""Check that provided entry date is valid."""
"""Check that provided entry date is not in the future."""
if date.today() < value:
raise ValueError(f"Backfill entry {value} can't be in the future.")
@start_date.validator
def validate_start_date(self, attribute, value):
"""Check that provided start date is valid."""
"""Check that provided start date is before end date and entry date."""
if self.end_date < value or self.entry_date < value:
raise ValueError(f"Invalid start date: {value}.")
@end_date.validator
def validate_end_date(self, attribute, value):
"""Check that provided end date is valid."""
if value < self.start_date or self.entry_date < self.end_date:
"""Check that provided end date is after start date and before entry date."""
if value < self.start_date or value > self.entry_date:
raise ValueError(f"Invalid end date: {value}.")
@excluded_dates.validator
def validate_excluded_dates(self, attribute, value):
"""Check that provided excluded dates are valid."""
"""Check that provided excluded dates are between start and end dates, are sorted and contain no duplicates."""
if not all(map(lambda e: self.start_date < e < self.end_date, value)):
raise ValueError(f"Invalid excluded dates: {value}.")
if not value == sorted(value):
raise ValueError(
f"Existing backfill entry with excluded dates not sorted: {value}."
)
if not len(value) == len(set(value)):
raise ValueError(
f"Existing backfill entry with duplicate excluded dates: {value}."
)
@watchers.validator
def validate_watchers(self, attribute, value):
"""Check that provided watchers are valid."""
"""Check that provided watchers are valid emails or Github identity with no duplicates."""
if not value or not all(
map(lambda e: e and is_email_or_github_identity(e), value)
):
raise ValueError(f"Invalid email or Github identity for watchers: {value}.")
if len(value) != len(set(value)):
raise ValueError(f"Duplicate watcher in ({value}).")
@reason.validator
def validate_reason(self, attribute, value):
    """Check that provided reason is not empty.

    Args:
        attribute: Passed in by the validator hook; not used by this check.
        value: The reason supplied for the backfill entry.

    Raises:
        ValueError: If ``value`` is falsy (for example ``None`` or ``""``).
    """
    if not value:
        raise ValueError("Reason in backfill entry should not be empty.")
@status.validator
def validate_status(self, attribute, value):
"""Check that provided status is valid."""

Просмотреть файл

@ -6,20 +6,15 @@ from typing import List
from ..backfill.parse import DEFAULT_REASON, DEFAULT_WATCHER, Backfill, BackfillStatus
def validate_duplicate_entry_dates(entry_1: Backfill, entry_2: Backfill) -> None:
def validate_duplicate_entry_dates(
backfill_entry: Backfill, backfills: list[Backfill]
) -> None:
"""Check if backfill entries have the same entry dates."""
if entry_1.entry_date == entry_2.entry_date:
raise ValueError(f"Duplicate backfill with entry date: {entry_1.entry_date}.")
def validate_overlap_dates(entry_1: Backfill, entry_2: Backfill) -> None:
"""Check overlap dates between two backfill entries."""
if max(entry_1.start_date, entry_2.start_date) <= min(
entry_1.end_date, entry_2.end_date
):
raise ValueError(
f"Existing backfill entry with overlap dates from: {entry_2.entry_date}."
)
for b in backfills:
if backfill_entry.entry_date == b.entry_date:
raise ValueError(
f"Duplicate backfill with entry date: {backfill_entry.entry_date}."
)
def validate_excluded_dates(entry: Backfill) -> None:
@ -34,25 +29,23 @@ def validate_excluded_dates(entry: Backfill) -> None:
)
def validate_reason(entry: Backfill) -> None:
"""Check if backfill reason is the same as default or empty."""
if not entry.reason or entry.reason == DEFAULT_REASON:
raise ValueError(f"Invalid Reason: {entry.reason}.")
def validate_default_reason(entry: Backfill) -> None:
    """Reject a backfill entry that still carries the default reason.

    Raises:
        ValueError: If ``entry.reason`` equals ``DEFAULT_REASON``.
    """
    reason = entry.reason
    if reason != DEFAULT_REASON:
        return
    raise ValueError(f"Default reason found: {reason}.")
def validate_watchers(entry: Backfill) -> None:
"""Check if backfill watcher is the same as default or duplicated."""
if DEFAULT_WATCHER in entry.watchers or len(entry.watchers) != len(
set(entry.watchers)
):
raise ValueError(f"Duplicate or default watcher in ({entry.watchers}).")
def validate_default_watchers(entry: Backfill) -> None:
    """Reject a backfill entry whose watchers include the default watcher.

    Raises:
        ValueError: If ``DEFAULT_WATCHER`` is one of ``entry.watchers``.
    """
    watchers = entry.watchers
    if DEFAULT_WATCHER not in watchers:
        return
    raise ValueError(f"Default watcher found: ({watchers}).")
def validate_entries_are_sorted(backfills: List[Backfill]) -> None:
"""Check if list of backfill entries are sorted."""
entry_dates = [backfill.entry_date for backfill in backfills]
"""Check if list of backfill entries are sorted by entry dates."""
entry_dates = [b.entry_date for b in backfills]
if not entry_dates == sorted(entry_dates, reverse=True):
raise ValueError("Backfill entries are not sorted")
raise ValueError("Backfill entries are not sorted by entry dates")
def validate_file(file: Path) -> None:
@ -61,18 +54,26 @@ def validate_file(file: Path) -> None:
validate_entries(backfills)
def validate_duplicate_entry_with_initiate_status(
    backfill_entry: Backfill, backfills: list
) -> None:
    """Ensure an Initiate entry is not accompanied by another Initiate entry.

    The check only applies when ``backfill_entry`` itself has Initiate status;
    in that case every entry in ``backfills`` is inspected.

    Raises:
        ValueError: If ``backfill_entry`` and at least one entry in
            ``backfills`` both have ``BackfillStatus.INITIATE`` status.
    """
    if backfill_entry.status != BackfillStatus.INITIATE:
        return

    if any(other.status == BackfillStatus.INITIATE for other in backfills):
        raise ValueError(
            "Backfill entries cannot contain more than one entry with Initiate status"
        )
def validate_entries(backfills: list) -> None:
"""Validate a list of backfill entries."""
for i, backfill_entry_1 in enumerate(backfills):
validate_watchers(backfill_entry_1)
validate_reason(backfill_entry_1)
validate_excluded_dates(backfill_entry_1)
# validate against other entries with initiate status
if backfill_entry_1.status == BackfillStatus.INITIATE:
for backfill_entry_2 in backfills[i + 1 :]:
if backfill_entry_2.status == BackfillStatus.INITIATE:
validate_duplicate_entry_dates(backfill_entry_1, backfill_entry_2)
validate_overlap_dates(backfill_entry_1, backfill_entry_2)
for i, backfill_entry in enumerate(backfills):
validate_default_watchers(backfill_entry)
validate_default_reason(backfill_entry)
validate_duplicate_entry_dates(backfill_entry, backfills[i + 1 :])
validate_excluded_dates(backfill_entry)
validate_duplicate_entry_with_initiate_status(
backfill_entry, backfills[i + 1 :]
)
validate_entries_are_sorted(backfills)

Просмотреть файл

@ -29,9 +29,8 @@ from ..backfill.utils import (
validate_metadata_workgroups,
)
from ..backfill.validate import (
validate_duplicate_entry_dates,
validate_duplicate_entry_with_initiate_status,
validate_file,
validate_overlap_dates,
)
from ..cli.query import backfill as query_backfill
from ..cli.query import deploy
@ -126,10 +125,7 @@ def create(
status=BackfillStatus.INITIATE,
)
for existing_entry in existing_backfills:
validate_duplicate_entry_dates(new_entry, existing_entry)
if existing_entry.status == BackfillStatus.INITIATE:
validate_overlap_dates(new_entry, existing_entry)
validate_duplicate_entry_with_initiate_status(new_entry, existing_backfills)
existing_backfills.insert(0, new_entry)

Просмотреть файл

@ -80,9 +80,9 @@ class TestParseBackfill(object):
assert "Invalid" in str(e.value)
assert "watchers" in str(e.value)
def test_no_watchers(self):
def test_empty_watchers_should_fail(self):
with pytest.raises(ValueError) as e:
invalid_watchers = [""]
invalid_watchers = []
Backfill(
TEST_BACKFILL_1.entry_date,
TEST_BACKFILL_1.start_date,
@ -96,6 +96,21 @@ class TestParseBackfill(object):
assert "Invalid" in str(e.value)
assert "watchers" in str(e.value)
def test_empty_reason_should_fail(self):
    """Constructing a Backfill with ``None`` as the reason raises ValueError."""
    invalid_reason = None
    constructor_args = (
        TEST_BACKFILL_1.entry_date,
        TEST_BACKFILL_1.start_date,
        TEST_BACKFILL_1.end_date,
        TEST_BACKFILL_1.excluded_dates,
        invalid_reason,
        TEST_BACKFILL_1.watchers,
        TEST_BACKFILL_1.status,
    )

    # Only the constructor call is expected to raise.
    with pytest.raises(ValueError) as e:
        Backfill(*constructor_args)

    assert "Reason in backfill entry should not be empty" in str(e.value)
def test_multiple_watchers(self):
valid_watchers = TEST_BACKFILL_1.watchers + [
"test2@example.org",
@ -116,6 +131,21 @@ class TestParseBackfill(object):
"test3@example.org",
]
def test_duplicate_watchers_should_fail(self):
    """Constructing a Backfill with a repeated watcher raises ValueError."""
    duplicate_watchers = [DEFAULT_WATCHER, DEFAULT_WATCHER]
    constructor_args = (
        TEST_BACKFILL_1.entry_date,
        TEST_BACKFILL_1.start_date,
        TEST_BACKFILL_1.end_date,
        TEST_BACKFILL_1.excluded_dates,
        TEST_BACKFILL_1.reason,
        duplicate_watchers,
        TEST_BACKFILL_1.status,
    )

    # Only the constructor call is expected to raise.
    with pytest.raises(ValueError) as e:
        Backfill(*constructor_args)

    assert "Duplicate watcher" in str(e.value)
def test_all_status(self):
valid_status = [status.value for status in BackfillStatus]
for i, status in enumerate(BackfillStatus):
@ -257,6 +287,38 @@ class TestParseBackfill(object):
assert "Invalid excluded dates" in str(e.value)
def test_excluded_dates_duplicates_should_fail(self):
    """Constructing a Backfill with a repeated excluded date raises ValueError."""
    invalid_excluded_dates = [date(2021, 2, 3), date(2021, 2, 3)]
    constructor_args = (
        TEST_BACKFILL_1.entry_date,
        TEST_BACKFILL_1.start_date,
        TEST_BACKFILL_1.end_date,
        invalid_excluded_dates,
        TEST_BACKFILL_1.reason,
        TEST_BACKFILL_1.watchers,
        TEST_BACKFILL_1.status,
    )

    # Only the constructor call is expected to raise.
    with pytest.raises(ValueError) as e:
        Backfill(*constructor_args)

    assert "duplicate excluded dates" in str(e.value)
def test_excluded_dates_not_sorted_should_fail(self):
    """Constructing a Backfill with descending excluded dates raises ValueError."""
    invalid_excluded_dates = [date(2021, 2, 4), date(2021, 2, 3)]
    constructor_args = (
        TEST_BACKFILL_1.entry_date,
        TEST_BACKFILL_1.start_date,
        TEST_BACKFILL_1.end_date,
        invalid_excluded_dates,
        TEST_BACKFILL_1.reason,
        TEST_BACKFILL_1.watchers,
        TEST_BACKFILL_1.status,
    )

    # Only the constructor call is expected to raise.
    with pytest.raises(ValueError) as e:
        Backfill(*constructor_args)

    assert "excluded dates not sorted" in str(e.value)
def test_invalid_status(self):
with pytest.raises(AttributeError):
invalid_status = "invalid_status"

Просмотреть файл

@ -1,17 +1,21 @@
from datetime import date
from pathlib import Path
import pytest
from bigquery_etl.backfill.parse import BACKFILL_FILE, DEFAULT_REASON, Backfill
from bigquery_etl.backfill.parse import (
BACKFILL_FILE,
DEFAULT_REASON,
DEFAULT_WATCHER,
Backfill,
BackfillStatus,
)
from bigquery_etl.backfill.validate import (
validate_default_reason,
validate_default_watchers,
validate_duplicate_entry_dates,
validate_entries,
validate_entries_are_sorted,
validate_excluded_dates,
validate_file,
validate_overlap_dates,
validate_reason,
)
from tests.backfill.test_parse_backfill import TEST_BACKFILL_1, TEST_BACKFILL_2
@ -22,59 +26,15 @@ VALID_WATCHER = "test@example.org"
class TestValidateBackfill(object):
def test_duplicate_entry_dates_pass(self):
validate_duplicate_entry_dates(TEST_BACKFILL_1, TEST_BACKFILL_2)
def test_duplicate_entry_dates_fail(self):
def test_entries_duplicate_entry_dates_should_fail(self):
backfills = [TEST_BACKFILL_1, TEST_BACKFILL_1]
with pytest.raises(ValueError) as e:
validate_duplicate_entry_dates(TEST_BACKFILL_1, TEST_BACKFILL_1)
validate_duplicate_entry_dates(TEST_BACKFILL_1, backfills)
assert "Duplicate backfill" in str(e.value)
assert "Duplicate backfill with entry date" in str(e.value)
def test_overlap_dates_pass(self):
validate_overlap_dates(TEST_BACKFILL_1, TEST_BACKFILL_2)
def test_overlap_dates_fail(self):
with pytest.raises(ValueError) as e:
validate_overlap_dates(TEST_BACKFILL_1, TEST_BACKFILL_1)
assert "overlap dates" in str(e.value)
def test_excluded_dates_duplicates(self):
invalid_excluded_dates = [date(2021, 2, 3), date(2021, 2, 3)]
invalid_backfill = Backfill(
TEST_BACKFILL_1.entry_date,
TEST_BACKFILL_1.start_date,
TEST_BACKFILL_1.end_date,
invalid_excluded_dates,
TEST_BACKFILL_1.reason,
TEST_BACKFILL_1.watchers,
TEST_BACKFILL_1.status,
)
with pytest.raises(ValueError) as e:
validate_excluded_dates(invalid_backfill)
assert "duplicate excluded dates" in str(e.value)
def test_excluded_dates_not_sorted(self):
invalid_excluded_dates = [date(2021, 2, 4), date(2021, 2, 3)]
invalid_backfill = Backfill(
TEST_BACKFILL_1.entry_date,
TEST_BACKFILL_1.start_date,
TEST_BACKFILL_1.end_date,
invalid_excluded_dates,
TEST_BACKFILL_1.reason,
TEST_BACKFILL_1.watchers,
TEST_BACKFILL_1.status,
)
with pytest.raises(ValueError) as e:
validate_excluded_dates(invalid_backfill)
assert "excluded dates not sorted" in str(e.value)
def test_valid_reason_pass(self):
def test_valid_reason_should_pass(self):
valid_backfill = Backfill(
TEST_BACKFILL_1.entry_date,
TEST_BACKFILL_1.start_date,
@ -85,40 +45,31 @@ class TestValidateBackfill(object):
TEST_BACKFILL_1.status,
)
validate_reason(valid_backfill)
validate_default_reason(valid_backfill)
def test_reason_default_fail(self):
invalid_reason = DEFAULT_REASON
def test_validate_default_reason_should_fail(self):
invalid_backfill = Backfill(
TEST_BACKFILL_1.entry_date,
TEST_BACKFILL_1.start_date,
TEST_BACKFILL_1.end_date,
TEST_BACKFILL_1.excluded_dates,
invalid_reason,
DEFAULT_REASON,
TEST_BACKFILL_1.watchers,
TEST_BACKFILL_1.status,
)
with pytest.raises(ValueError) as e:
validate_reason(invalid_backfill)
validate_default_reason(invalid_backfill)
assert "Invalid Reason" in str(e.value)
assert "Default reason" in str(e.value)
def test_validate_default_watcher_should_fail(self):
TEST_BACKFILL_1.watchers = [DEFAULT_WATCHER]
def test_reason_empty_fail(self):
invalid_reason = ""
invalid_backfill = Backfill(
TEST_BACKFILL_1.entry_date,
TEST_BACKFILL_1.start_date,
TEST_BACKFILL_1.end_date,
TEST_BACKFILL_1.excluded_dates,
invalid_reason,
TEST_BACKFILL_1.watchers,
TEST_BACKFILL_1.status,
)
with pytest.raises(ValueError) as e:
validate_reason(invalid_backfill)
validate_default_watchers(TEST_BACKFILL_1)
assert "Invalid Reason" in str(e.value)
assert "Default watcher" in str(e.value)
def test_entries_sorted(self):
backfills = [TEST_BACKFILL_2, TEST_BACKFILL_1]
@ -131,12 +82,26 @@ class TestValidateBackfill(object):
assert "Backfill entries are not sorted" in str(e.value)
def test_validate_entries_pass(self):
def test_validate_entries_duplicate_entry_with_initiate_status_should_fail(self):
    """validate_entries rejects the two fixtures once reason/watchers are valid."""
    # NOTE(review): this assigns attributes on the imported TEST_BACKFILL_1 /
    # TEST_BACKFILL_2 objects rather than on copies, so the changes presumably
    # remain visible to any test that runs afterwards - confirm this is intended.
    for fixture in (TEST_BACKFILL_1, TEST_BACKFILL_2):
        fixture.watchers = [VALID_WATCHER]
        fixture.reason = VALID_REASON

    with pytest.raises(ValueError) as e:
        validate_entries([TEST_BACKFILL_2, TEST_BACKFILL_1])

    expected_message = (
        "Backfill entries cannot contain more than one entry with Initiate status"
    )
    assert expected_message in str(e.value)
def test_validate_entries(self):
    """validate_entries accepts the fixtures when the second one is Complete."""
    # NOTE(review): this assigns attributes on the imported TEST_BACKFILL_1 /
    # TEST_BACKFILL_2 objects rather than on copies, so the changes (including
    # the status override) presumably remain visible to later tests - confirm.
    for fixture in (TEST_BACKFILL_1, TEST_BACKFILL_2):
        fixture.watchers = [VALID_WATCHER]
        fixture.reason = VALID_REASON
    TEST_BACKFILL_2.status = BackfillStatus.COMPLETE.value

    validate_entries([TEST_BACKFILL_2, TEST_BACKFILL_1])
def test_validate_file(self):

Просмотреть файл

@ -352,7 +352,7 @@ class TestBackfill:
[date(2021, 2, 3)],
VALID_REASON,
[VALID_WATCHER],
DEFAULT_STATUS,
BackfillStatus.COMPLETE,
)
backfill_entry_2 = Backfill(
@ -392,6 +392,66 @@ class TestBackfill:
assert backfills[1] == backfill_entry_1
assert backfills[0] == backfill_entry_2
def test_create_backfill_with_exsting_entry_with_initiate_status_should_fail(
    self, runner
):
    """The create command fails when the backfill file already has an entry
    with the default status."""
    with runner.isolated_filesystem():
        SQL_DIR = "sql/moz-fx-data-shared-prod/test/test_query_v1"
        os.makedirs(SQL_DIR)
        query_dir = Path(SQL_DIR)

        # Minimal query layout: the query, its metadata, and the dataset
        # metadata one directory above the query directory.
        (query_dir / "query.sql").write_text("SELECT 1")
        (query_dir / "metadata.yaml").write_text(yaml.dump(TABLE_METADATA_CONF))
        (query_dir.parent / "dataset_metadata.yaml").write_text(
            yaml.dump(DATASET_METADATA_CONF)
        )

        # Seed the backfill file with one entry using DEFAULT_STATUS.
        backfill_entry_1 = Backfill(
            date(2021, 5, 3),
            date(2021, 1, 3),
            date(2021, 5, 3),
            [date(2021, 2, 3)],
            VALID_REASON,
            [VALID_WATCHER],
            DEFAULT_STATUS,
        )
        backfill_file = query_dir / BACKFILL_FILE
        backfill_file.write_text(backfill_entry_1.to_yaml())

        assert BACKFILL_FILE in os.listdir(SQL_DIR)

        backfills = Backfill.entries_from_file(backfill_file)
        assert backfills[0] == backfill_entry_1

        cli_args = [
            "moz-fx-data-shared-prod.test.test_query_v1",
            "--start_date=2023-03-01",
            "--end_date=2023-03-10",
        ]
        result = runner.invoke(create, cli_args)

        assert result.exit_code == 1
        expected_message = (
            "Backfill entries cannot contain more than one entry with Initiate status"
        )
        assert expected_message in str(result.exception)
def test_validate_backfill(self, runner):
with runner.isolated_filesystem():
SQL_DIR = "sql/moz-fx-data-shared-prod/test/test_query_v1"
@ -491,7 +551,7 @@ class TestBackfill:
],
)
assert result.exit_code == 1
assert "Invalid Reason" in result.output
assert "Default reason" in result.output
def test_validate_backfill_empty_reason(self, runner):
with runner.isolated_filesystem():
@ -529,7 +589,10 @@ class TestBackfill:
],
)
assert result.exit_code == 1
assert "Invalid Reason" in result.output
assert (
"Reason in backfill entry should not be empty"
in result.exception.args[0]
)
def test_validate_backfill_invalid_watcher(self, runner):
with runner.isolated_filesystem():
@ -613,7 +676,7 @@ class TestBackfill:
],
)
assert result.exit_code == 1
assert "Duplicate or default watcher" in result.output
assert "Duplicate watcher" in result.exception.args[0]
def test_validate_backfill_invalid_status(self, runner):
with runner.isolated_filesystem():
@ -844,7 +907,7 @@ class TestBackfill:
],
)
assert result.exit_code == 1
assert "duplicate excluded dates" in result.output
assert "duplicate excluded dates" in result.exception.args[0]
def test_validate_backfill_invalid_excluded_dates_not_sorted(self, runner):
with runner.isolated_filesystem():
@ -890,7 +953,7 @@ class TestBackfill:
],
)
assert result.exit_code == 1
assert "excluded dates not sorted" in result.output
assert "excluded dates not sorted" in result.exception.args[0]
def test_validate_backfill_entries_not_sorted(self, runner):
with runner.isolated_filesystem():
@ -922,7 +985,7 @@ class TestBackfill:
" reason: test_reason\n"
" watchers:\n"
" - test@example.org\n"
" status: Initiate\n"
" status: Complete\n"
)
assert BACKFILL_FILE in os.listdir(SQL_DIR)
@ -936,93 +999,6 @@ class TestBackfill:
assert result.exit_code == 1
assert "entries are not sorted" in result.output
def test_validate_backfill_overlap_dates(self, runner):
with runner.isolated_filesystem():
SQL_DIR = "sql/moz-fx-data-shared-prod/test/test_query_v1"
os.makedirs(SQL_DIR)
with open(
"sql/moz-fx-data-shared-prod/test/test_query_v1/query.sql", "w"
) as f:
f.write("SELECT 1")
with open(
"sql/moz-fx-data-shared-prod/test/test_query_v1/metadata.yaml",
"w",
) as f:
f.write(yaml.dump(TABLE_METADATA_CONF))
with open(
"sql/moz-fx-data-shared-prod/test/dataset_metadata.yaml", "w"
) as f:
f.write(yaml.dump(DATASET_METADATA_CONF))
backfill_file = Path(SQL_DIR) / BACKFILL_FILE
backfill_file.write_text(
BACKFILL_YAML_TEMPLATE + "\n"
"2021-05-03:\n"
" start_date: 2021-01-03\n"
" end_date: 2021-05-03\n"
" reason: test_reason\n"
" watchers:\n"
" - test@example.org\n"
" status: Initiate\n"
)
assert BACKFILL_FILE in os.listdir(SQL_DIR)
result = runner.invoke(
validate,
[
"moz-fx-data-shared-prod.test.test_query_v1",
],
)
assert result.exit_code == 1
assert "overlap dates" in result.output
def test_validate_backfill_overlap_dates_not_drafting_status(self, runner):
with runner.isolated_filesystem():
SQL_DIR = "sql/moz-fx-data-shared-prod/test/test_query_v1"
os.makedirs(SQL_DIR)
with open(
"sql/moz-fx-data-shared-prod/test/test_query_v1/query.sql", "w"
) as f:
f.write("SELECT 1")
with open(
"sql/moz-fx-data-shared-prod/test/test_query_v1/metadata.yaml",
"w",
) as f:
f.write(yaml.dump(TABLE_METADATA_CONF))
with open(
"sql/moz-fx-data-shared-prod/test/dataset_metadata.yaml", "w"
) as f:
f.write(yaml.dump(DATASET_METADATA_CONF))
backfill_file = Path(SQL_DIR) / BACKFILL_FILE
backfill_file.write_text(
BACKFILL_YAML_TEMPLATE + "\n"
"2021-05-03:\n"
" start_date: 2021-01-03\n"
" end_date: 2021-05-03\n"
" reason: test_reason\n"
" watchers:\n"
" - test@example.org\n"
" status: Complete\n"
)
assert BACKFILL_FILE in os.listdir(SQL_DIR)
result = runner.invoke(
validate,
[
"moz-fx-data-shared-prod.test.test_query_v1",
],
)
assert result.exit_code == 0
def test_backfill_info_one_table_all_status(self, runner):
with runner.isolated_filesystem():
SQL_DIR = "sql/moz-fx-data-shared-prod/test/test_query_v1"