apache · potiuk · Jul 27, 2022 · Jul 25, 2022 · Jul 27, 2022
diff --git a/airflow/providers/apache/hive/CHANGELOG.rst b/airflow/providers/apache/hive/CHANGELOG.rst
@@ -24,6 +24,17 @@
 Changelog
 ---------
 
+Breaking Changes
+~~~~~~~~~~~~~~~~
+
+* The ``hql`` parameter in ``get_records`` of ``HiveServer2Hook`` has been renamed to ``sql`` to match the
+  ``get_records`` signature of ``DbApiHook``. If you passed it positionally, nothing changes for you,
+  but if you passed it as a keyword argument, you need to rename it.
+* The ``hive_conf`` parameter of ``get_records`` has been renamed to ``parameters`` and is now the second
+  parameter, to match the ``get_records`` signature of ``DbApiHook``. You need to rename it if you used it.
+* The ``schema`` parameter of ``get_records`` is now an optional keyword argument passed through
+  ``**kwargs`` (default ``'default'``), so that the signature matches ``get_records`` of ``DbApiHook``.
+
 3.1.0
 .....
 

diff --git a/airflow/providers/apache/hive/hooks/hive.py b/airflow/providers/apache/hive/hooks/hive.py
@@ -24,7 +24,7 @@
 import warnings
 from collections import OrderedDict
 from tempfile import NamedTemporaryFile, TemporaryDirectory
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, Iterable, List, Mapping, Optional, Union
 
 import pandas
 import unicodecsv as csv
@@ -857,15 +857,15 @@ def get_conn(self, schema: Optional[str] = None) -> Any:
 
     def _get_results(
         self,
-        hql: Union[str, List[str]],
+        sql: Union[str, List[str]],
         schema: str = 'default',
         fetch_size: Optional[int] = None,
-        hive_conf: Optional[Dict[Any, Any]] = None,
+        hive_conf: Optional[Union[Iterable, Mapping]] = None,
     ) -> Any:
         from pyhive.exc import ProgrammingError
 
-        if isinstance(hql, str):
-            hql = [hql]
+        if isinstance(sql, str):
+            sql = [sql]
         previous_description = None
         with contextlib.closing(self.get_conn(schema)) as conn, contextlib.closing(conn.cursor()) as cur:
 
@@ -882,7 +882,7 @@ def _get_results(
                 for k, v in env_context.items():
                     cur.execute(f"set {k}={v}")
 
-            for statement in hql:
+            for statement in sql:
                 cur.execute(statement)
                 # we only get results of statements that returns
                 lowered_statement = statement.lower().strip()
@@ -911,29 +911,29 @@ def _get_results(
 
     def get_results(
         self,
-        hql: str,
+        sql: Union[str, List[str]],
         schema: str = 'default',
         fetch_size: Optional[int] = None,
-        hive_conf: Optional[Dict[Any, Any]] = None,
+        hive_conf: Optional[Union[Iterable, Mapping]] = None,
     ) -> Dict[str, Any]:
         """
         Get results of the provided hql in target schema.
 
-        :param hql: hql to be executed.
+        :param sql: hql to be executed.
         :param schema: target schema, default to 'default'.
         :param fetch_size: max size of result to fetch.
         :param hive_conf: hive_conf to execute alone with the hql.
         :return: results of hql execution, dict with data (list of results) and header
         :rtype: dict
         """
-        results_iter = self._get_results(hql, schema, fetch_size=fetch_size, hive_conf=hive_conf)
+        results_iter = self._get_results(sql, schema, fetch_size=fetch_size, hive_conf=hive_conf)
         header = next(results_iter)
         results = {'data': list(results_iter), 'header': header}
         return results
 
     def to_csv(
         self,
-        hql: str,
+        sql: str,
         csv_filepath: str,
         schema: str = 'default',
         delimiter: str = ',',
@@ -945,7 +945,7 @@ def to_csv(
         """
         Execute hql in target schema and write results to a csv file.
 
-        :param hql: hql to be executed.
+        :param sql: hql to be executed.
         :param csv_filepath: filepath of csv to write results into.
         :param schema: target schema, default to 'default'.
         :param delimiter: delimiter of the csv file, default to ','.
@@ -955,7 +955,7 @@ def to_csv(
         :param hive_conf: hive_conf to execute alone with the hql.
 
         """
-        results_iter = self._get_results(hql, schema, fetch_size=fetch_size, hive_conf=hive_conf)
+        results_iter = self._get_results(sql, schema, fetch_size=fetch_size, hive_conf=hive_conf)
         header = next(results_iter)
         message = None
 
@@ -982,14 +982,14 @@ def to_csv(
         self.log.info("Done. Loaded a total of %s rows.", i)
 
     def get_records(
-        self, hql: str, schema: str = 'default', hive_conf: Optional[Dict[Any, Any]] = None
+        self, sql: Union[str, List[str]], parameters: Optional[Union[Iterable, Mapping]] = None, **kwargs
     ) -> Any:
         """
-        Get a set of records from a Hive query.
+        Get a set of records from a Hive query. You can optionally pass a 'schema' kwarg
+        which specifies the target schema and defaults to 'default'.
 
-        :param hql: hql to be executed.
-        :param schema: target schema, default to 'default'.
-        :param hive_conf: hive_conf to execute alone with the hql.
+        :param sql: hql to be executed.
+        :param parameters: optional configuration passed to get_results
         :return: result of hive execution
         :rtype: list
 
@@ -998,19 +998,20 @@ def get_records(
         >>> len(hh.get_records(sql))
         100
         """
-        return self.get_results(hql, schema=schema, hive_conf=hive_conf)['data']
+        schema = kwargs['schema'] if 'schema' in kwargs else 'default'
+        return self.get_results(sql, schema=schema, hive_conf=parameters)['data']
 
     def get_pandas_df(  # type: ignore
         self,
-        hql: str,
+        sql: str,
         schema: str = 'default',
         hive_conf: Optional[Dict[Any, Any]] = None,
         **kwargs,
     ) -> pandas.DataFrame:
         """
         Get a pandas dataframe from a Hive query
 
-        :param hql: hql to be executed.
+        :param sql: hql to be executed.
         :param schema: target schema, default to 'default'.
         :param hive_conf: hive_conf to execute alone with the hql.
         :param kwargs: (optional) passed into pandas.DataFrame constructor
@@ -1025,6 +1026,6 @@ def get_pandas_df(  # type: ignore
 
         :return: pandas.DateFrame
         """
-        res = self.get_results(hql, schema=schema, hive_conf=hive_conf)
+        res = self.get_results(sql, schema=schema, hive_conf=hive_conf)
         df = pandas.DataFrame(res['data'], columns=[c[0] for c in res['header']], **kwargs)
         return df
diff --git a/airflow/providers/apache/hive/operators/hive_stats.py b/airflow/providers/apache/hive/operators/hive_stats.py
@@ -138,7 +138,7 @@ def execute(self, context: "Context") -> None:
 
         presto = PrestoHook(presto_conn_id=self.presto_conn_id)
         self.log.info('Executing SQL check: %s', sql)
-        row = presto.get_first(hql=sql)
+        row = presto.get_first(sql)
         self.log.info("Record: %s", row)
         if not row:
             raise AirflowException("The query returned None")

diff --git a/airflow/providers/apache/hive/transfers/hive_to_mysql.py b/airflow/providers/apache/hive/transfers/hive_to_mysql.py
@@ -111,7 +111,7 @@ def execute(self, context: 'Context'):
                 mysql = self._call_preoperator()
                 mysql.bulk_load(table=self.mysql_table, tmp_file=tmp_file.name)
         else:
-            hive_results = hive.get_records(self.sql, hive_conf=hive_conf)
+            hive_results = hive.get_records(self.sql, parameters=hive_conf)
             mysql = self._call_preoperator()
             mysql.insert_rows(table=self.mysql_table, rows=hive_results)
 

diff --git a/airflow/providers/apache/hive/transfers/hive_to_samba.py b/airflow/providers/apache/hive/transfers/hive_to_samba.py
@@ -68,7 +68,7 @@ def execute(self, context: 'Context'):
         with NamedTemporaryFile() as tmp_file:
             self.log.info("Fetching file from Hive")
             hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id)
-            hive.to_csv(hql=self.hql, csv_filepath=tmp_file.name, hive_conf=context_to_airflow_vars(context))
+            hive.to_csv(self.hql, csv_filepath=tmp_file.name, hive_conf=context_to_airflow_vars(context))
             self.log.info("Pushing to samba")
             samba = SambaHook(samba_conn_id=self.samba_conn_id)
             samba.push_from_local(self.destination_filepath, tmp_file.name)
diff --git a/airflow/providers/apache/pinot/hooks/pinot.py b/airflow/providers/apache/pinot/hooks/pinot.py
@@ -275,7 +275,9 @@ def get_uri(self) -> str:
         endpoint = conn.extra_dejson.get('endpoint', 'query/sql')
         return f'{conn_type}://{host}/{endpoint}'
 
-    def get_records(self, sql: str, parameters: Optional[Union[Iterable, Mapping]] = None) -> Any:
+    def get_records(
+        self, sql: Union[str, List[str]], parameters: Optional[Union[Iterable, Mapping]] = None, **kwargs
+    ) -> Any:
         """
         Executes the sql and returns a set of records.
 
@@ -287,7 +289,9 @@ def get_records(self, sql: str, parameters: Optional[Union[Iterable, Mapping]] =
             cur.execute(sql)
             return cur.fetchall()
 
-    def get_first(self, sql: str, parameters: Optional[Union[Iterable, Mapping]] = None) -> Any:
+    def get_first(
+        self, sql: Union[str, List[str]], parameters: Optional[Union[Iterable, Mapping]] = None
+    ) -> Any:
         """
         Executes the sql and returns the first resulting row.
 

@@ -15,6 +15,11 @@
     specific language governing permissions and limitations
     under the License.
 
+.. NOTE TO CONTRIBUTORS:
+    Please only add notes to the Changelog just below the "Changelog" header when there are breaking changes
+    and you want to explain to users how they are supposed to deal with them.
+    The changelog is updated and maintained semi-automatically by the release manager.
+
 
 Changelog
 ---------

@@ -181,7 +181,12 @@ def get_pandas_df_by_chunks(self, sql, parameters=None, *, chunksize, **kwargs):
         with closing(self.get_conn()) as conn:
             yield from psql.read_sql(sql, con=conn, params=parameters, chunksize=chunksize, **kwargs)
 
-    def get_records(self, sql, parameters=None):
+    def get_records(
+        self,
+        sql: Union[str, List[str]],
+        parameters: Optional[Union[Iterable, Mapping]] = None,
+        **kwargs: dict,
+    ):
         """
         Executes the sql and returns a set of records.
 
@@ -197,7 +202,7 @@ def get_records(self, sql, parameters=None):
                     cur.execute(sql)
                 return cur.fetchall()
 
-    def get_first(self, sql, parameters=None):
+    def get_first(self, sql: Union[str, List[str]], parameters=None):
         """
         Executes the sql and returns the first resulting row.
 

diff --git a/airflow/providers/exasol/hooks/exasol.py b/airflow/providers/exasol/hooks/exasol.py
@@ -77,7 +77,12 @@ def get_pandas_df(self, sql: str, parameters: Optional[dict] = None, **kwargs) -
             df = conn.export_to_pandas(sql, query_params=parameters, **kwargs)
             return df
 
-    def get_records(self, sql: str, parameters: Optional[dict] = None) -> List[Union[dict, Tuple[Any, ...]]]:
+    def get_records(
+        self,
+        sql: Union[str, List[str]],
+        parameters: Optional[Union[Iterable, Mapping]] = None,
+        **kwargs: dict,
+    ) -> List[Union[dict, Tuple[Any, ...]]]:
         """
         Executes the sql and returns a set of records.
 
@@ -89,7 +94,7 @@ def get_records(self, sql: str, parameters: Optional[dict] = None) -> List[Union
             with closing(conn.execute(sql, parameters)) as cur:
                 return cur.fetchall()
 
-    def get_first(self, sql: str, parameters: Optional[dict] = None) -> Optional[Any]:
+    def get_first(self, sql: Union[str, List[str]], parameters: Optional[dict] = None) -> Optional[Any]:
         """
         Executes the sql and returns the first resulting row.
 

diff --git a/airflow/providers/google/cloud/hooks/cloud_sql.py b/airflow/providers/google/cloud/hooks/cloud_sql.py
@@ -426,11 +426,11 @@ def __init__(
         self.sql_proxy_was_downloaded = False
         self.sql_proxy_version = sql_proxy_version
         self.download_sql_proxy_dir = None
-        self.sql_proxy_process = None  # type: Optional[Popen]
+        self.sql_proxy_process: Optional[Popen] = None
         self.instance_specification = instance_specification
         self.project_id = project_id
         self.gcp_conn_id = gcp_conn_id
-        self.command_line_parameters = []  # type:  List[str]
+        self.command_line_parameters: List[str] = []
         self.cloud_sql_proxy_socket_directory = self.path_prefix
         self.sql_proxy_path = (
             sql_proxy_binary_path if sql_proxy_binary_path else self.path_prefix + "_cloud_sql_proxy"
@@ -705,28 +705,28 @@ def __init__(
         self.gcp_cloudsql_conn_id = gcp_cloudsql_conn_id
         self.cloudsql_connection = self.get_connection(self.gcp_cloudsql_conn_id)
         self.extras = self.cloudsql_connection.extra_dejson
-        self.project_id = self.extras.get('project_id', default_gcp_project_id)  # type: Optional[str]
-        self.instance = self.extras.get('instance')  # type: Optional[str]
-        self.database = self.cloudsql_connection.schema  # type: Optional[str]
-        self.location = self.extras.get('location')  # type: Optional[str]
-        self.database_type = self.extras.get('database_type')  # type: Optional[str]
-        self.use_proxy = self._get_bool(self.extras.get('use_proxy', 'False'))  # type: bool
-        self.use_ssl = self._get_bool(self.extras.get('use_ssl', 'False'))  # type: bool
-        self.sql_proxy_use_tcp = self._get_bool(self.extras.get('sql_proxy_use_tcp', 'False'))  # type: bool
-        self.sql_proxy_version = self.extras.get('sql_proxy_version')  # type: Optional[str]
-        self.sql_proxy_binary_path = self.extras.get('sql_proxy_binary_path')  # type: Optional[str]
-        self.user = self.cloudsql_connection.login  # type: Optional[str]
-        self.password = self.cloudsql_connection.password  # type: Optional[str]
-        self.public_ip = self.cloudsql_connection.host  # type: Optional[str]
-        self.public_port = self.cloudsql_connection.port  # type: Optional[int]
-        self.sslcert = self.extras.get('sslcert')  # type: Optional[str]
-        self.sslkey = self.extras.get('sslkey')  # type: Optional[str]
-        self.sslrootcert = self.extras.get('sslrootcert')  # type: Optional[str]
+        self.project_id = self.extras.get('project_id', default_gcp_project_id)
+        self.instance = self.extras.get('instance')
+        self.database = self.cloudsql_connection.schema
+        self.location = self.extras.get('location')
+        self.database_type = self.extras.get('database_type')
+        self.use_proxy = self._get_bool(self.extras.get('use_proxy', 'False'))
+        self.use_ssl = self._get_bool(self.extras.get('use_ssl', 'False'))
+        self.sql_proxy_use_tcp = self._get_bool(self.extras.get('sql_proxy_use_tcp', 'False'))
+        self.sql_proxy_version = self.extras.get('sql_proxy_version')
+        self.sql_proxy_binary_path = self.extras.get('sql_proxy_binary_path')
+        self.user = self.cloudsql_connection.login
+        self.password = self.cloudsql_connection.password
+        self.public_ip = self.cloudsql_connection.host
+        self.public_port = self.cloudsql_connection.port
+        self.sslcert = self.extras.get('sslcert')
+        self.sslkey = self.extras.get('sslkey')
+        self.sslrootcert = self.extras.get('sslrootcert')
         # Port and socket path and db_hook are automatically generated
         self.sql_proxy_tcp_port = None
-        self.sql_proxy_unique_path = None  # type: Optional[str]
-        self.db_hook = None  # type: Optional[Union[PostgresHook, MySqlHook]]
-        self.reserved_tcp_socket = None  # type: Optional[socket.socket]
+        self.sql_proxy_unique_path: Optional[str] = None
+        self.db_hook: Optional[Union[PostgresHook, MySqlHook]] = None
+        self.reserved_tcp_socket: Optional[socket.socket] = None
         # Generated based on clock + clock sequence. Unique per host (!).
         # This is important as different hosts share the database
         self.db_conn_id = str(uuid.uuid1())
@@ -828,18 +828,18 @@ def _generate_connection_uri(self) -> str:
         if not self.database_type:
             raise ValueError("The database_type should be set")
 
-        database_uris = CONNECTION_URIS[self.database_type]  # type: Dict[str, Dict[str, str]]
+        database_uris = CONNECTION_URIS[self.database_type]
         ssl_spec = None
         socket_path = None
         if self.use_proxy:
-            proxy_uris = database_uris['proxy']  # type: Dict[str, str]
+            proxy_uris = database_uris['proxy']
             if self.sql_proxy_use_tcp:
                 format_string = proxy_uris['tcp']
             else:
                 format_string = proxy_uris['socket']
                 socket_path = f"{self.sql_proxy_unique_path}/{self._get_instance_socket_name()}"
         else:
-            public_uris = database_uris['public']  # type: Dict[str, str]
+            public_uris = database_uris['public']
             if self.use_ssl:
                 format_string = public_uris['ssl']
                 ssl_spec = {'cert': self.sslcert, 'key': self.sslkey, 'ca': self.sslrootcert}
@@ -876,7 +876,7 @@ def _generate_connection_uri(self) -> str:
         return connection_uri
 
     def _get_instance_socket_name(self) -> str:
-        return self.project_id + ":" + self.location + ":" + self.instance  # type: ignore
+        return self.project_id + ":" + self.location + ":" + self.instance
 
     def _get_sqlproxy_instance_specification(self) -> str:
         instance_specification = self._get_instance_socket_name()
@@ -921,10 +921,13 @@ def get_database_hook(self, connection: Connection) -> Union[PostgresHook, MySql
         that uses proxy or connects directly to the Google Cloud SQL database.
         """
         if self.database_type == 'postgres':
-            self.db_hook = PostgresHook(connection=connection, schema=self.database)
+            db_hook: Union[PostgresHook, MySqlHook] = PostgresHook(
+                connection=connection, schema=self.database
+            )
         else:
-            self.db_hook = MySqlHook(connection=connection, schema=self.database)
-        return self.db_hook
+            db_hook = MySqlHook(connection=connection, schema=self.database)
+        self.db_hook = db_hook
+        return db_hook
 
     def cleanup_database_hook(self) -> None:
         """Clean up database hook after it was used."""

diff --git a/airflow/providers/presto/CHANGELOG.rst b/airflow/providers/presto/CHANGELOG.rst
@@ -24,6 +24,12 @@
 Changelog
 ---------
 
+Breaking changes
+~~~~~~~~~~~~~~~~
+
+The deprecated ``hql`` parameter has been removed from the ``get_records``, ``get_first``, ``get_pandas_df``
+and ``run`` methods of the ``PrestoHook``.
+
 3.1.0
 .....