From f2760e0776d1fabb6999611169aac8864b47904c Mon Sep 17 00:00:00 2001
From: Zain Patel <30357972+mzjp2@users.noreply.github.com>
Date: Tue, 24 Mar 2020 11:00:08 +0000
Subject: [PATCH] [KED-1497] Add in bandit for security scanning as a
 pre-commit hook (#505)

---
 .pre-commit-config.yaml                      |  6 ++++++
 kedro/extras/datasets/pandas/gbq_dataset.py  |  2 +-
 .../extras/datasets/pickle/pickle_dataset.py |  2 +-
 .../datasets/spark/spark_hive_dataset.py     | 23 ++++++++++++-----------
 test_requirements.txt                        |  1 +
 tools/ipython/ipython_loader.py              |  2 +-
 6 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index e240ae1cc3..49dfb6f65d 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -106,3 +106,9 @@ repos:
         types: [file, python]
         exclude: ^kedro/template/
         entry: isort
+      - id: bandit
+        name: "Bandit security check"
+        language: system
+        types: [file, python]
+        exclude: ^kedro/template/|^tests/
+        entry: bandit -ll
diff --git a/kedro/extras/datasets/pandas/gbq_dataset.py b/kedro/extras/datasets/pandas/gbq_dataset.py
index 3284a587b1..b8e75dee13 100644
--- a/kedro/extras/datasets/pandas/gbq_dataset.py
+++ b/kedro/extras/datasets/pandas/gbq_dataset.py
@@ -142,7 +142,7 @@ def _describe(self) -> Dict[str, Any]:
         )
 
     def _load(self) -> pd.DataFrame:
-        sql = "select * from {}.{}".format(self._dataset, self._table_name)
+        sql = "select * from {}.{}".format(self._dataset, self._table_name)  # nosec
         return pd.read_gbq(
             sql,
             project_id=self._project_id,
diff --git a/kedro/extras/datasets/pickle/pickle_dataset.py b/kedro/extras/datasets/pickle/pickle_dataset.py
index d3a1843a8c..7d40114e89 100644
--- a/kedro/extras/datasets/pickle/pickle_dataset.py
+++ b/kedro/extras/datasets/pickle/pickle_dataset.py
@@ -154,7 +154,7 @@ def _load(self) -> Any:
         load_path = get_filepath_str(self._get_load_path(), self._protocol)
 
         with self._fs.open(load_path, mode="rb") as fs_file:
-            return pickle.loads(fs_file.read(), **self._load_args)
+            return pickle.loads(fs_file.read(), **self._load_args)  # nosec
 
     def _save(self, data: Any) -> None:
         save_path = get_filepath_str(self._get_save_path(), self._protocol)
diff --git a/kedro/extras/datasets/spark/spark_hive_dataset.py b/kedro/extras/datasets/spark/spark_hive_dataset.py
index be22a3dcc5..0fe43ea58e 100644
--- a/kedro/extras/datasets/spark/spark_hive_dataset.py
+++ b/kedro/extras/datasets/spark/spark_hive_dataset.py
@@ -75,15 +75,16 @@ def __init__(
 
     def __enter__(self):
         self._data.createOrReplaceTempView("tmp")
+
         self._spark_session.sql(
-            "create table {stage_database_name}.{stage_table_name} as select * from tmp".format(
-                stage_database_name=self._stage_database_name,
+            "create table {stage_db_name}.{stage_table_name} as select * from tmp".format(  # nosec
+                stage_db_name=self._stage_database_name,
                 stage_table_name=self._stage_table_name,
             )
         ).take(1)
         self.staged_data = self._spark_session.sql(
-            "select * from {stage_database_name}.{stage_table_name}".format(
-                stage_database_name=self._stage_database_name,
+            "select * from {stage_db_name}.{stage_table_name}".format(  # nosec
+                stage_db_name=self._stage_database_name,
                 stage_table_name=self._stage_table_name,
             )
         )
@@ -91,8 +92,8 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         self._spark_session.sql(
-            "drop table {stage_database_name}.{stage_table_name}".format(
-                stage_database_name=self._stage_database_name,
+            "drop table {stage_db_name}.{stage_table_name}".format(
+                stage_db_name=self._stage_database_name,
                 stage_table_name=self._stage_table_name,
             )
         )
 
 
@@ -208,12 +209,12 @@ def _get_spark() -> SparkSession:
     def _create_empty_hive_table(self, data):
         data.createOrReplaceTempView("tmp")
         self._get_spark().sql(
-            "create table {database}.{table} select * from tmp limit 1".format(
+            "create table {database}.{table} select * from tmp limit 1".format(  # nosec
                 table=self._table, database=self._database
             )
         )
         self._get_spark().sql(
-            "truncate table {database}.{table}".format(
+            "truncate table {database}.{table}".format(  # nosec
                 database=self._database, table=self._table
             )
         )
@@ -226,7 +227,7 @@ def _load(self) -> DataFrame:
             )
         )
         return self._get_spark().sql(
-            "select * from {database}.{table}".format(
+            "select * from {database}.{table}".format(  # nosec
                 database=self._database, table=self._table
             )
         )
@@ -246,7 +247,7 @@ def _save(self, data: DataFrame) -> None:
     def _insert_save(self, data: DataFrame) -> None:
         data.createOrReplaceTempView("tmp")
         self._get_spark().sql(
-            "insert into {database}.{table} select {columns} from tmp".format(
+            "insert into {database}.{table} select {columns} from tmp".format(  # nosec
                 database=self._database,
                 table=self._table,
                 columns=", ".join(self._table_columns),
@@ -282,7 +283,7 @@ def _overwrite_save(self, data: DataFrame) -> None:
         self._get_spark().sql(
-            "truncate table {database}.{table}".format(
+            "truncate table {database}.{table}".format(  # nosec
                 database=self._database, table=self._table
             )
         )
         self._insert_save(data)
 
diff --git a/test_requirements.txt b/test_requirements.txt
index 96f037abce..ed6bf77cd3 100644
--- a/test_requirements.txt
+++ b/test_requirements.txt
@@ -2,6 +2,7 @@
 azure-storage-blob>=1.1.0, <2.0
 azure-storage-file>=1.1.0, <2.0
 azure-storage-queue>=1.1.0, <2.0
+bandit>=1.6.2, <2.0
 behave==1.2.6
 biopython>=1.73, <2.0
 black==v19.10.b0; python_version >= '3.6'
diff --git a/tools/ipython/ipython_loader.py b/tools/ipython/ipython_loader.py
index 6ae65dbe04..8d07710068 100644
--- a/tools/ipython/ipython_loader.py
+++ b/tools/ipython/ipython_loader.py
@@ -108,7 +108,7 @@ def run_startup_scripts(startup_dir: pathlib.Path):
             compiled = compile(
                 script.read_text(encoding="utf-8"), str(script), "exec"
             )
-            exec(compiled, globals())  # pylint: disable=exec-used
+            exec(compiled, globals())  # pylint: disable=exec-used  # nosec
         except Exception as err:  # pylint: disable=broad-except
             logging.error(
                 "Startup script `%s` failed:\n%s: %s",
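
To exercise the new hook locally, a minimal sketch (assuming pre-commit and
bandit>=1.6.2 are installed, e.g. via pip install -r test_requirements.txt):

    # Run only the bandit hook across the repository, as CI would:
    pre-commit run bandit --all-files

    # Or invoke bandit directly, mirroring the hook's "entry: bandit -ll";
    # -ll restricts output to findings of medium severity and above:
    bandit -ll kedro/extras/datasets/pickle/pickle_dataset.py

The per-line "# nosec" comments silence individual bandit findings (B301 for
pickle.loads, B608 for the hand-built SQL strings, B102 for exec) on the
flagged line only, which is why each one sits on the exact line that bandit
would otherwise report.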