[AIRFLOW-4085] FileSensor now takes glob patterns for `filepath` (#5358)

This commit is contained in:
Zacharya 2019-09-04 13:02:28 +03:00 коммит произвёл Ash Berlin-Taylor
Родитель ba9e521e71
Коммит c9e2d04fde
3 изменённых файлов: 66 добавлений и 11 удалений

Просмотреть файл

@ -39,6 +39,9 @@ assists users migrating to a new version.
<!-- END doctoc generated TOC please keep comment here to allow auto update -->
## Airflow Master
### Changes to FileSensor
FileSensor is now takes a glob pattern, not just a filename. If the filename you are looking for has `*`, `?`, or `[` in it then you should replace these with `[*]`, `[?]`, and `[[]`.
### Change dag loading duration metric name
Change DAG file loading duration metric from
`dag.loading-duration.<dag_id>` to `dag.loading-duration.<dag_file>`. This is to

Просмотреть файл

@ -19,7 +19,7 @@
#
import os
import stat
from glob import glob
from airflow.contrib.hooks.fs_hook import FSHook
from airflow.sensors.base_sensor_operator import BaseSensorOperator
@ -37,7 +37,7 @@ class FileSensor(BaseSensorOperator):
connection id
:type fs_conn_id: str
:param filepath: File or folder name (relative to
the base path set within the connection)
the base path set within the connection), can be a glob.
:type fs_conn_id: str
"""
template_fields = ('filepath',)
@ -58,14 +58,12 @@ class FileSensor(BaseSensorOperator):
basepath = hook.get_path()
full_path = os.path.join(basepath, self.filepath)
self.log.info('Poking for file %s', full_path)
try:
if stat.S_ISDIR(os.stat(full_path).st_mode):
for _, _, files in os.walk(full_path):
if len(files):
return True
else:
# full_path was a file directly
for path in glob(full_path):
if os.path.isfile(path):
return True
except OSError:
return False
for _, _, files in os.walk(full_path):
if len(files) > 0:
return True
return False

Просмотреть файл

@ -21,6 +21,7 @@
import unittest
import shutil
import tempfile
import os.path
from airflow import models, DAG
from airflow.exceptions import AirflowSensorTimeout
@ -78,6 +79,7 @@ class TestFileSensor(unittest.TestCase):
fs_conn_id='fs_default',
dag=self.dag,
timeout=0,
poke_interval=1
)
task._hook = self.hook
try:
@ -95,6 +97,7 @@ class TestFileSensor(unittest.TestCase):
fs_conn_id='fs_default',
dag=self.dag,
timeout=0,
poke_interval=1
)
task._hook = self.hook
try:
@ -134,6 +137,57 @@ class TestFileSensor(unittest.TestCase):
task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
ignore_ti_state=True)
def test_wildcard_file(self):
suffix = '.txt'
with tempfile.NamedTemporaryFile(suffix=suffix) as tmp:
fileglob = os.path.join(os.path.dirname(tmp.name), '*' + suffix)
task = FileSensor(
task_id='test',
filepath=fileglob,
fs_conn_id='fs_default',
dag=self.dag,
timeout=0,
)
task._hook = self.hook
task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
ignore_ti_state=True)
def test_subdirectory_not_empty(self):
suffix = '.txt'
dir_ = tempfile.mkdtemp()
subdir = tempfile.mkdtemp(dir=dir_)
with tempfile.NamedTemporaryFile(suffix=suffix, dir=subdir):
task = FileSensor(
task_id='test',
filepath=dir_,
fs_conn_id='fs_default',
dag=self.dag,
timeout=0,
)
task._hook = self.hook
task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
ignore_ti_state=True)
shutil.rmtree(dir_)
def test_subdirectory_empty(self):
dir_ = tempfile.mkdtemp()
tempfile.mkdtemp(dir=dir_)
task = FileSensor(
task_id='test',
filepath=dir_,
fs_conn_id='fs_default',
dag=self.dag,
timeout=0,
poke_interval=1
)
task._hook = self.hook
with self.assertRaises(AirflowSensorTimeout):
task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
ignore_ti_state=True)
shutil.rmtree(dir_)
if __name__ == '__main__':
unittest.main()