diff --git a/.gitignore b/.gitignore
index 9c0f4ca4..991a0fb0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,3 +21,6 @@ Session.vim
 
 # sftp configuration file
 sftp-config.json
+
+# Python
+__pycache__
diff --git a/Dockerfile b/Dockerfile
index 24920d5e..55caedff 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -12,7 +12,7 @@ ENV DEBIAN_FRONTEND noninteractive
 ENV TERM linux
 
 # Airflow
-ARG AIRFLOW_VERSION=1.8.2
+ARG AIRFLOW_VERSION=1.9.0
 ARG AIRFLOW_HOME=/usr/local/airflow
 
 # Define en_US.
@@ -57,7 +57,7 @@ RUN set -ex \
     && pip install ndg-httpsclient \
     && pip install pyasn1 \
     && pip install apache-airflow[crypto,celery,postgres,hive,jdbc]==$AIRFLOW_VERSION \
-    && pip install celery[redis]==3.1.17 \
+    && pip install celery[redis]==4.0.2 \
     && apt-get purge --auto-remove -yqq $buildDeps \
     && apt-get clean \
     && rm -rf \
diff --git a/circle.yml b/circle.yml
index 5981fe23..cd93ab2f 100644
--- a/circle.yml
+++ b/circle.yml
@@ -12,4 +12,4 @@ test:
   pre:
     - sleep 5
   override:
-    - docker run puckel/docker-airflow version |grep '1.8.2'
+    - docker run puckel/docker-airflow version |grep '1.9.0'
diff --git a/config/airflow.cfg b/config/airflow.cfg
index 6d313f03..6da94249 100644
--- a/config/airflow.cfg
+++ b/config/airflow.cfg
@@ -12,18 +12,26 @@ dags_folder = /usr/local/airflow/dags
 base_log_folder = /usr/local/airflow/logs
 
 # Airflow can store logs remotely in AWS S3 or Google Cloud Storage. Users
-# must supply a remote location URL (starting with either 's3://...' or
-# 'gs://...') and an Airflow connection id that provides access to the storage
+# must supply an Airflow connection id that provides access to the storage
 # location.
-remote_base_log_folder =
 remote_log_conn_id =
-# Use server-side encryption for logs stored in S3
 encrypt_s3_logs = False
-# DEPRECATED option for remote log storage, use remote_base_log_folder instead!
-s3_log_folder =
+
+# Logging level
+logging_level = INFO
+
+# Logging class
+# Specify the class that will specify the logging configuration
+# This class has to be on the python classpath
+# logging_config_class = my.path.default_local_settings.LOGGING_CONFIG
+logging_config_class =
+
+# Log format
+log_format = [%%(asctime)s] {{%%(filename)s:%%(lineno)d}} %%(levelname)s - %%(message)s
+simple_log_format = %%(asctime)s %%(levelname)s - %%(message)s
 
 # The executor class that airflow should use. Choices include
-# SequentialExecutor, LocalExecutor, CeleryExecutor
+# SequentialExecutor, LocalExecutor, CeleryExecutor, DaskExecutor
 executor = CeleryExecutor
 
 # The SqlAlchemy connection string to the metadata database.
@@ -89,6 +97,18 @@ security =
 # values at runtime)
 unit_test_mode = False
 
+# Name of handler to read task instance logs.
+# Default to use file task handler.
+task_log_reader = file.task
+
+# Whether to enable pickling for xcom (note that this is insecure and allows for
+# RCE exploits). This will be deprecated in Airflow 2.0 (be forced to False).
+enable_xcom_pickling = True
+
+# When a task is killed forcefully, this is the amount of time in seconds that
+# it has to cleanup after it is sent a SIGTERM, before it is SIGKILLED
+killed_task_cleanup_time = 60
+
 [cli]
 # In what way should the cli access the API. The LocalClient will use the
 # database directly, while the json_client will use the api running on the
@@ -168,6 +188,10 @@ filter_by_owner = False
 # in order to user the ldapgroup mode.
 owner_mode = user
 
+# Default DAG view. Valid values are:
+# tree, graph, duration, gantt, landing_times
+dag_default_view = tree
+
 # Default DAG orientation. Valid values are:
 # LR (Left->Right), TB (Top->Bottom), RL (Right->Left), BT (Bottom->Top)
 dag_orientation = LR
@@ -184,6 +208,9 @@ log_fetch_timeout_sec = 5
 # DAGs by default
 hide_paused_dags_by_default = False
 
+# Consistent page size across all listing views in the UI
+page_size = 100
+
 [email]
 email_backend = airflow.utils.email.send_email_smtp
 
@@ -198,7 +225,7 @@ smtp_ssl = False
 # smtp_user = airflow
 # smtp_password = airflow
 smtp_port = 25
-smtp_mail_from = airflow@airflow.com
+smtp_mail_from = airflow@example.com
 
 [celery]
 # This section only applies if you are using the CeleryExecutor in
@@ -238,6 +265,19 @@ flower_port = 5555
 # Default queue that tasks get assigned to and that worker listen on.
 default_queue = default
 
+# Import path for celery configuration options
+celery_config_options = airflow.config_templates.default_celery.DEFAULT_CELERY_CONFIG
+
+# No SSL
+celery_ssl_active = False
+
+[dask]
+# This section only applies if you are using the DaskExecutor in
+# [core] section above
+
+# The IP address and port of the Dask cluster's scheduler.
+cluster_address = 127.0.0.1:8786
+
 [scheduler]
 # Task instances listen for external kill signal (when you clear tasks
 # from the CLI or the UI), this defines the frequency at which they should
@@ -276,6 +316,11 @@ scheduler_zombie_task_threshold = 300
 # DAG definition (catchup)
 catchup_by_default = True
 
+# This changes the batch size of queries in the scheduling main loop.
+# This depends on query length limits and how long you are willing to hold locks.
+# 0 for no limit
+max_tis_per_query = 0
+
 # Statsd (https://github.com/etsy/statsd) integration settings
 statsd_on = False
 statsd_host = localhost
@@ -283,12 +328,25 @@ statsd_port = 8125
 statsd_prefix = airflow
 
 # The scheduler can run multiple threads in parallel to schedule dags.
-# This defines how many threads will run. However airflow will never
-# use more threads than the amount of cpu cores available.
+# This defines how many threads will run.
 max_threads = 2
 
 authenticate = False
 
+[ldap]
+# set this to ldaps://<your.ldap.server>:<port>
+uri =
+user_filter = objectClass=*
+user_name_attr = uid
+group_member_attr = memberOf
+superuser_filter =
+data_profiler_filter =
+bind_user = cn=Manager,dc=example,dc=com
+bind_password = insecure
+basedn = dc=example,dc=com
+cacert = /etc/ca/ldap_ca.crt
+search_scope = LEVEL
+
 [mesos]
 # Mesos master address which MesosExecutor will connect to.
 master = localhost:5050
diff --git a/docker-compose-CeleryExecutor.yml b/docker-compose-CeleryExecutor.yml
index 95025601..303fae25 100644
--- a/docker-compose-CeleryExecutor.yml
+++ b/docker-compose-CeleryExecutor.yml
@@ -16,7 +16,7 @@ services:
         #     - ./pgdata:/var/lib/postgresql/data/pgdata
 
     webserver:
-        image: puckel/docker-airflow:1.8.2
+        image: puckel/docker-airflow:1.9.0
         restart: always
         depends_on:
             - postgres
@@ -41,7 +41,7 @@
             retries: 3
 
     flower:
-        image: puckel/docker-airflow:1.8.2
+        image: puckel/docker-airflow:1.9.0
         restart: always
         depends_on:
             - redis
@@ -53,7 +53,7 @@
         command: flower
 
    scheduler:
-        image: puckel/docker-airflow:1.8.2
+        image: puckel/docker-airflow:1.9.0
         restart: always
         depends_on:
             - webserver
@@ -70,7 +70,7 @@
         command: scheduler
 
    worker:
-        image: puckel/docker-airflow:1.8.2
+        image: puckel/docker-airflow:1.9.0
         restart: always
         depends_on:
             - scheduler
diff --git a/docker-compose-LocalExecutor.yml b/docker-compose-LocalExecutor.yml
index d05c06dd..fbad6319 100644
--- a/docker-compose-LocalExecutor.yml
+++ b/docker-compose-LocalExecutor.yml
@@ -8,7 +8,7 @@ services:
             - POSTGRES_DB=airflow
 
    webserver:
-        image: puckel/docker-airflow:1.8.2
+        image: puckel/docker-airflow:1.9.0
         restart: always
         depends_on:
             - postgres
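
A note on the new `logging_config_class` option added to config/airflow.cfg above: it names a Python module attribute on the classpath, and the dict it points at is loaded as a standard `logging.config.dictConfig` configuration. The sketch below shows what such a module might look like; the `my.path.default_local_settings` module path is just the hypothetical one from the commented example in the config, and the single console handler is a minimal assumption, not the full logging config Airflow 1.9 ships.

# my/path/default_local_settings.py -- hypothetical path from the commented
# example in airflow.cfg; only the LOGGING_CONFIG name matters.
#
# Minimal dictConfig-style sketch: one console handler reusing the log_format
# from airflow.cfg (with the ini-style %% escapes collapsed to single %).

LOG_FORMAT = '[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s'

LOGGING_CONFIG = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'airflow': {'format': LOG_FORMAT},
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'formatter': 'airflow',
            'stream': 'ext://sys.stdout',
        },
    },
    'loggers': {
        'airflow': {
            'handlers': ['console'],
            'level': 'INFO',  # mirrors logging_level = INFO in airflow.cfg
            'propagate': False,
        },
    },
    'root': {
        'handlers': ['console'],
        'level': 'INFO',
    },
}

With that module on the PYTHONPATH, setting `logging_config_class = my.path.default_local_settings.LOGGING_CONFIG` in the [core] section should make the webserver, scheduler and workers pick up this configuration instead of the built-in default.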