From 480d75500c023af4b7215b086aff51f4c00f5e96 Mon Sep 17 00:00:00 2001 From: Mikhail Bautin Date: Tue, 24 Aug 2021 08:41:24 -0700 Subject: [PATCH] [#9804] Add Python type annotations to the Spark test runner Summary: Add Python type annotations to the Spark test runner. Also use the codecheck tool to verify a subset of Python scripts during the build. Test Plan: Jenkins: build type: release Reviewers: jharveysmith, steve.varnau Reviewed By: steve.varnau Subscribers: ybase Differential Revision: https://phabricator.dev.yugabyte.com/D12701 --- bin/run_codecheck | 9 ++ build-support/common-test-env.sh | 4 +- build-support/run_tests_on_spark.py | 183 +++++++++++++---------- build-support/split_long_command_line.py | 27 ++-- build-support/validate_build_root.py | 2 +- codecheck.ini | 47 ++++++ python/yb/yb_dist_tests.py | 115 +++++++++----- requirements.txt | 2 + requirements_frozen.txt | 26 ++-- tox.ini | 16 ++ 10 files changed, 282 insertions(+), 149 deletions(-) create mode 100755 bin/run_codecheck create mode 100644 codecheck.ini create mode 100644 tox.ini diff --git a/bin/run_codecheck b/bin/run_codecheck new file mode 100755 index 000000000000..a89167bf9998 --- /dev/null +++ b/bin/run_codecheck @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +set -euo pipefail +. "${BASH_SOURCE%/*}/../build-support/common-build-env.sh" + +activate_virtualenv +set_pythonpath + +codecheck "$@" diff --git a/build-support/common-test-env.sh b/build-support/common-test-env.sh index 1ab71f02e6f1..059aafbda06e 100644 --- a/build-support/common-test-env.sh +++ b/build-support/common-test-env.sh @@ -1850,11 +1850,13 @@ run_python_doctest() { run_python_tests() { activate_virtualenv + check_python_script_syntax ( export PYTHONPATH=$YB_SRC_ROOT/python run_python_doctest + log "Invoking the codecheck tool" + python3 -m codecheck ) - check_python_script_syntax } should_run_java_test_methods_separately() { diff --git a/build-support/run_tests_on_spark.py b/build-support/run_tests_on_spark.py index df991aff1dd5..41abb0e288bf 100755 --- a/build-support/run_tests_on_spark.py +++ b/build-support/run_tests_on_spark.py @@ -64,14 +64,13 @@ import sys import time import traceback -import tempfile import errno import signal from datetime import datetime from collections import defaultdict -from typing import List +from typing import List, Dict, Set, Tuple, Optional, Any, cast BUILD_SUPPORT_DIR = os.path.dirname(os.path.realpath(__file__)) @@ -86,12 +85,12 @@ TIME_SEC_TO_START_RUNNING_TEST = 5 * 60 -def wait_for_path_to_exist(target_path): +def wait_for_path_to_exist(target_path: str) -> None: if os.path.exists(target_path): return waited_for_sec = 0 start_time_sec = time.time() - printed_msg_at_sec = 0 + printed_msg_at_sec: float = 0.0 MSG_PRINT_INTERVAL_SEC = 5.0 TIMEOUT_SEC = 120 while not os.path.exists(target_path): @@ -114,8 +113,7 @@ def wait_for_path_to_exist(target_path): sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'python')) from yb import yb_dist_tests # noqa from yb import command_util # noqa -from yb.common_util import set_to_comma_sep_str, get_bool_env_var, is_macos # noqa -from yb.yb_dist_tests import TestDescriptor # noqa +from yb.common_util import set_to_comma_sep_str, is_macos # noqa # Special Jenkins environment variables. They are propagated to tasks running in a distributed way # on Spark. @@ -177,21 +175,21 @@ def wait_for_path_to_exist(target_path): # Global variables. Some of these are used on the remote worker side. 
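# global_conf_dict is built on the driver by set_global_conf_for_spark_jobs() and read back on
# Spark workers by initialize_remote_task(); propagated_env_vars carries the Jenkins environment
# variables listed above to tasks running remotely.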
verbose = False g_spark_master_url_override = None -propagated_env_vars = {} +propagated_env_vars: Dict[str, str] = {} global_conf_dict = None spark_context = None archive_sha256sum = None -def configure_logging(): +def configure_logging() -> None: log_level = logging.INFO logging.basicConfig( level=log_level, format="[%(filename)s:%(lineno)d] %(asctime)s %(levelname)s: %(message)s") -def is_pid_running(pid): - import psutil +def is_pid_running(pid: int) -> bool: + import psutil # type: ignore try: process = psutil.Process(pid) return process.status() != psutil.STATUS_ZOMBIE @@ -200,7 +198,7 @@ def is_pid_running(pid): return False -def delete_if_exists_log_errors(file_path): +def delete_if_exists_log_errors(file_path: str) -> None: if os.path.exists(file_path): try: if os.path.isdir(file_path): @@ -211,20 +209,20 @@ def delete_if_exists_log_errors(file_path): logging.error("Error deleting file %s: %s", file_path, os_error) -def log_heading(msg): +def log_heading(msg: str) -> None: logging.info('\n%s\n%s\n%s' % ('-' * 80, msg, '-' * 80)) # Initializes the spark context. The details list will be incorporated in the Spark application # name visible in the Spark web UI. -def init_spark_context(details=[]): +def init_spark_context(details: List[str] = []) -> None: global spark_context if spark_context: return log_heading("Initializing Spark context") - global_conf = yb_dist_tests.global_conf - build_type = yb_dist_tests.global_conf.build_type - from pyspark import SparkContext + global_conf = yb_dist_tests.get_global_conf() + build_type = global_conf.build_type + from pyspark import SparkContext # type: ignore SparkContext.setSystemProperty('spark.task.maxFailures', str(SPARK_TASK_MAX_FAILURES)) spark_master_url = g_spark_master_url_override @@ -232,7 +230,7 @@ def init_spark_context(details=[]): if is_macos(): logging.info("This is macOS, using the macOS Spark cluster") spark_master_url = SPARK_URLS['macos'] - elif yb_dist_tests.global_conf.build_type in ['asan', 'tsan']: + elif build_type in ['asan', 'tsan']: logging.info("Using a separate Spark cluster for ASAN and TSAN tests") spark_master_url = SPARK_URLS['linux_asan_tsan'] else: @@ -267,23 +265,23 @@ def init_spark_context(details=[]): log_heading("Initialized Spark context") -def set_global_conf_for_spark_jobs(): +def set_global_conf_for_spark_jobs() -> None: global global_conf_dict - global_conf_dict = vars(yb_dist_tests.global_conf) + global_conf_dict = vars(yb_dist_tests.get_global_conf()) -def get_bash_path(): +def get_bash_path() -> str: if sys.platform == 'darwin': return '/usr/local/bin/bash' return '/bin/bash' -def parallel_run_test(test_descriptor_str): +def parallel_run_test(test_descriptor_str: str) -> yb_dist_tests.TestResult: """ This is invoked in parallel to actually run tests. """ global_conf = initialize_remote_task() - from yb import yb_dist_tests, command_util + from yb import yb_dist_tests wait_for_path_to_exist(global_conf.build_root) @@ -321,7 +319,7 @@ def parallel_run_test(test_descriptor_str): # We could use "run_program" here, but it collects all the output in memory, which is not # ideal for a large amount of test log output. The "tee" part also makes the output visible in # the standard error of the Spark task as well, which is sometimes helpful for debugging. 
- def run_test(): + def run_test() -> Tuple[int, float]: start_time_sec = time.time() error_log_dir_path = os.path.dirname(os.path.abspath(test_descriptor.error_output_path)) if not os.path.isdir(error_log_dir_path): @@ -468,22 +466,24 @@ def run_test(): os.umask(old_umask) -def get_bash_shebang(): +def get_bash_shebang() -> str: # Prefer /usr/local/bin/bash as we install Bash 4+ there on macOS. if os.path.exists('/usr/local/bin/bash'): return '/usr/local/bin/bash' return '/usr/bin/env bash' -def initialize_remote_task(): +# This is executed on a Spark executor as part of running a task. +def initialize_remote_task() -> yb_dist_tests.GlobalTestConfig: configure_logging() + assert global_conf_dict is not None global_conf = yb_dist_tests.set_global_conf_from_dict(global_conf_dict) global_conf.set_env_on_spark_worker(propagated_env_vars) if not global_conf.archive_for_workers: return global_conf - from pyspark import SparkFiles + from pyspark import SparkFiles # type: ignore archive_name = os.path.basename(SparkFiles.get(global_conf.archive_for_workers)) expected_archive_sha256sum = global_conf.archive_sha256sum assert expected_archive_sha256sum is not None @@ -576,7 +576,7 @@ def initialize_remote_task(): return global_conf -def parallel_list_test_descriptors(rel_test_path): +def parallel_list_test_descriptors(rel_test_path: str) -> List[str]: """ This is invoked in parallel to list all individual tests within our C++ test programs. Without this, listing all gtest tests across 330 test programs might take about 5 minutes on TSAN and 2 @@ -623,8 +623,8 @@ def parallel_list_test_descriptors(rel_test_path): # BloomStatsTestWithIter/1 # GetParam() = (true, false) # BloomStatsTestWithIter/2 # GetParam() = (false, false) - current_test = None - test_descriptors = [] + current_test: Optional[str] = None + test_descriptors: List[str] = [] test_descriptor_prefix = rel_test_path + yb_dist_tests.TEST_DESCRIPTOR_SEPARATOR for line in prog_result.stdout.split("\n"): if ('Starting tracking the heap' in line or 'Dumping heap profile to' in line): @@ -632,6 +632,7 @@ def parallel_list_test_descriptors(rel_test_path): line = line.rstrip() trimmed_line = HASH_COMMENT_RE.sub('', line.strip()).strip() if line.startswith(' '): + assert current_test is not None test_descriptors.append(test_descriptor_prefix + current_test + trimmed_line) else: current_test = trimmed_line @@ -639,7 +640,7 @@ def parallel_list_test_descriptors(rel_test_path): return test_descriptors -def get_username(): +def get_username() -> str: try: return os.getlogin() except OSError as ex: @@ -652,7 +653,7 @@ def get_username(): user_from_env = os.getenv('USER') if user_from_env: return user_from_env - id_output = subprocess.check_output('id').strip() + id_output = subprocess.check_output('id').strip().decode('utf-8') ID_OUTPUT_RE = re.compile(r'^uid=\d+[(]([^)]+)[)]\s.*') match = ID_OUTPUT_RE.match(id_output) if match: @@ -663,38 +664,47 @@ def get_username(): raise ex -def get_mac_shared_nfs(path): +def get_mac_shared_nfs(path: str) -> str: LOCAL_PATH = "/Volumes/share" if not path.startswith(LOCAL_PATH): raise ValueError("Local path %s does not start with expected prefix '%s'.\n" % (path, LOCAL_PATH)) relpath = path[len(LOCAL_PATH):] - return "/Volumes/net/v1/" + os.environ.get('YB_BUILD_HOST') + relpath + yb_build_host = os.environ.get('YB_BUILD_HOST') + if yb_build_host is None: + raise ValueError("The YB_BUILD_HOST environment variable is not set") + return "/Volumes/net/v1/" + yb_build_host + relpath -def get_jenkins_job_name(): - 
return os.environ.get('JOB_NAME', None)
+def get_jenkins_job_name() -> Optional[str]:
+    return os.environ.get('JOB_NAME')
 
 
-def get_jenkins_job_name_path_component():
+def get_jenkins_job_name_path_component() -> str:
     jenkins_job_name = get_jenkins_job_name()
     if jenkins_job_name:
         return "job_" + jenkins_job_name
-    else:
-        return "unknown_jenkins_job"
+    return "unknown_jenkins_job"
 
 
-def get_report_parent_dir(report_base_dir):
+
+def get_report_parent_dir(report_base_dir: str) -> str:
     """
     @return a directory to store build report, relative to the given base directory. Path
             components are based on build type, Jenkins job name, etc.
     """
-    return os.path.join(report_base_dir,
-                        yb_dist_tests.global_conf.build_type,
-                        get_jenkins_job_name_path_component())
-
-
-def save_json_to_paths(short_description, json_data, output_paths, should_gzip=False):
+    global_conf = yb_dist_tests.get_global_conf()
+    return os.path.join(
+        report_base_dir,
+        global_conf.build_type,
+        get_jenkins_job_name_path_component())
+
+
+def save_json_to_paths(
+        short_description: str,
+        json_data: Any,
+        output_paths: List[str],
+        should_gzip: bool = False) -> None:
     """
     Saves the given JSON-friendly data structure to the list of paths (exact copy at each path),
     optionally gzipping the output.
@@ -710,17 +720,21 @@ def save_json_to_paths(short_description, json_data, output_paths, should_gzip=F
         final_output_path = output_path + ('.gz' if should_gzip else '')
         logging.info("Saving {} to {}".format(short_description, final_output_path))
         if should_gzip:
-            with gzip.open(final_output_path, 'wb') as output_file:
-                output_file.write(json_data_str.encode('utf-8'))
+            with gzip.open(final_output_path, 'wb') as output_file_gzip:
+                output_file_gzip.write(json_data_str.encode('utf-8'))
         else:
-            with open(final_output_path, 'w') as output_file:
-                output_file.write(json_data_str)
+            with open(final_output_path, 'w') as output_file_plain:
+                output_file_plain.write(json_data_str)
 
 
-def save_report(report_base_dir, results, total_elapsed_time_sec, spark_succeeded,
-                save_to_build_dir=False):
+def save_report(
+        report_base_dir: str,
+        results: List[yb_dist_tests.TestResult],
+        total_elapsed_time_sec: float,
+        spark_succeeded: bool,
+        save_to_build_dir: bool = False) -> None:
     historical_report_path = None
-    global_conf = yb_dist_tests.global_conf
+    global_conf = yb_dist_tests.get_global_conf()
 
     if report_base_dir:
         historical_report_parent_dir = get_report_parent_dir(report_base_dir)
@@ -742,12 +756,12 @@ def save_report(report_base_dir, results, total_elapsed_time_sec, spark_succeede
 
         historical_report_path = os.path.join(
             historical_report_parent_dir,
-            '{}_{}__user_{}__build_{}.json'.format(
+            '{}.json'.format('_'.join([
                 global_conf.build_type,
                 time.strftime('%Y-%m-%dT%H_%M_%S'),
                 username,
                 get_jenkins_job_name_path_component(),
-                os.environ.get('BUILD_ID', 'unknown')))
+                os.environ.get('BUILD_ID', 'unknown')])))
 
     test_reports_by_descriptor = {}
     for result in results:
@@ -789,7 +803,7 @@ def save_report(report_base_dir, results, total_elapsed_time_sec, spark_succeede
         save_json_to_paths('short build report', report, [short_report_path], should_gzip=False)
 
 
-def is_one_shot_test(rel_binary_path):
+def is_one_shot_test(rel_binary_path: str) -> bool:
     if rel_binary_path in ONE_SHOT_TESTS:
         return True
     for non_gtest_test in ONE_SHOT_TESTS:
@@ -798,13 +812,14 @@
         return False
 
 
-def collect_cpp_tests(cpp_test_program_filter: List[str]) -> List[yb_dist_tests.TestDescriptor]:
+def collect_cpp_tests(
+
cpp_test_program_filter_list: List[str]) -> List[yb_dist_tests.TestDescriptor]: """ Collect C++ test programs to run. - @param cpp_test_program_filter: a list of C++ test program names to be used as a filter + @param cpp_test_program_filter_list: a list of C++ test program names to be used as a filter """ - global_conf = yb_dist_tests.global_conf + global_conf = yb_dist_tests.get_global_conf() logging.info("Collecting the list of C++ test programs (locally; not a Spark job)") start_time_sec = time.time() build_root_realpath = os.path.realpath(global_conf.build_root) @@ -835,8 +850,8 @@ def collect_cpp_tests(cpp_test_program_filter: List[str]) -> List[yb_dist_tests. logging.info("Collected %d test programs in %.2f sec" % ( len(test_programs), elapsed_time_sec)) - if cpp_test_program_filter: - cpp_test_program_filter = set(cpp_test_program_filter) + if cpp_test_program_filter_list: + cpp_test_program_filter = set(cpp_test_program_filter_list) unfiltered_test_programs = test_programs # test_program contains test paths relative to the root directory (including directory @@ -891,8 +906,9 @@ def collect_cpp_tests(cpp_test_program_filter: List[str]) -> List[yb_dist_tests. # Use fewer "slices" (tasks) than there are test programs, in hope to get some batching. num_slices = (len(test_programs) + 1) / 2 + assert spark_context is not None all_test_descriptor_lists = run_spark_action( - lambda: spark_context.parallelize( + lambda: spark_context.parallelize( # type: ignore test_programs, numSlices=num_slices).map(parallel_list_test_descriptors).collect() ) elapsed_time_sec = time.time() - start_time_sec @@ -916,21 +932,22 @@ def collect_cpp_tests(cpp_test_program_filter: List[str]) -> List[yb_dist_tests. return [yb_dist_tests.TestDescriptor(s) for s in test_descriptor_strs] -def is_writable(dir_path): +def is_writable(dir_path: str) -> bool: return os.access(dir_path, os.W_OK) -def is_parent_dir_writable(file_path): +def is_parent_dir_writable(file_path: str) -> bool: return is_writable(os.path.dirname(file_path)) -def fatal_error(msg): +def fatal_error(msg: str) -> None: logging.error("Fatal: " + msg) raise RuntimeError(msg) -def get_java_test_descriptors(): - java_test_list_path = os.path.join(yb_dist_tests.global_conf.build_root, 'java_test_list.txt') +def get_java_test_descriptors() -> List[yb_dist_tests.TestDescriptor]: + java_test_list_path = os.path.join( + yb_dist_tests.get_global_conf().build_root, 'java_test_list.txt') if not os.path.exists(java_test_list_path): raise IOError( "Java test list not found at '%s'. 
Please run ./yb_build.sh --collect-java-tests to " @@ -949,7 +966,7 @@ def get_java_test_descriptors(): return java_test_descriptors -def collect_tests(args): +def collect_tests(args: argparse.Namespace) -> List[yb_dist_tests.TestDescriptor]: test_conf = {} if args.test_conf: with open(args.test_conf) as test_conf_file: @@ -967,7 +984,7 @@ def collect_tests(args): cpp_test_descriptors = [] if args.run_cpp_tests: cpp_test_programs = test_conf.get('cpp_test_programs') - cpp_test_descriptors = collect_cpp_tests(cpp_test_programs) + cpp_test_descriptors = collect_cpp_tests(cast(List[str], cpp_test_programs)) java_test_descriptors = [] if args.run_java_tests: @@ -992,7 +1009,7 @@ def collect_tests(args): return test_descriptors -def load_test_list(test_list_path): +def load_test_list(test_list_path: str) -> List[yb_dist_tests.TestDescriptor]: logging.info("Loading the list of tests to run from %s", test_list_path) test_descriptors = [] with open(test_list_path, 'r') as input_file: @@ -1003,7 +1020,7 @@ def load_test_list(test_list_path): return test_descriptors -def propagate_env_vars(): +def propagate_env_vars() -> None: num_propagated = 0 for env_var_name in JENKINS_ENV_VARS: if env_var_name in os.environ: @@ -1019,8 +1036,8 @@ def propagate_env_vars(): logging.info("Number of propagated environment variables: %s", num_propagated) -def run_spark_action(action): - import py4j +def run_spark_action(action: Any) -> Any: + import py4j # type: ignore try: results = action() except py4j.protocol.Py4JJavaError: @@ -1030,7 +1047,7 @@ def run_spark_action(action): return results -def main(): +def main() -> None: parser = argparse.ArgumentParser( description='Run tests on Spark.') parser.add_argument('--verbose', action='store_true', @@ -1138,11 +1155,12 @@ def main(): fatal_error("File specified by --test_list does not exist or is not a file: '{}'".format( test_list_path)) + global_conf = yb_dist_tests.get_global_conf() if ('YB_MVN_LOCAL_REPO' not in os.environ and args.run_java_tests and args.send_archive_to_workers): os.environ['YB_MVN_LOCAL_REPO'] = os.path.join( - yb_dist_tests.global_conf.build_root, 'm2_repository') + global_conf.build_root, 'm2_repository') logging.info("Automatically setting YB_MVN_LOCAL_REPO to %s", os.environ['YB_MVN_LOCAL_REPO']) @@ -1162,10 +1180,12 @@ def main(): # This needs to be done before Spark context initialization, which will happen as we try to # collect all gtest tests in all C++ test programs. if args.send_archive_to_workers: - archive_exists = os.path.exists(yb_dist_tests.global_conf.archive_for_workers) + archive_exists = ( + global_conf.archive_for_workers is not None and + os.path.exists(global_conf.archive_for_workers)) if args.recreate_archive_for_workers or not archive_exists: - archive_sha_path = os.path.join(yb_dist_tests.global_conf.yb_src_root, - 'extracted_from_archive.sha256') + archive_sha_path = os.path.join( + global_conf.yb_src_root, 'extracted_from_archive.sha256') if os.path.exists(archive_sha_path): os.remove(archive_sha_path) @@ -1174,8 +1194,9 @@ def main(): yb_dist_tests.compute_archive_sha256sum() # Local host may also be worker, so leave expected checksum here after archive created. + assert global_conf.archive_sha256sum is not None with open(archive_sha_path, 'w') as archive_sha: - archive_sha.write(yb_dist_tests.global_conf.archive_sha256sum) + archive_sha.write(global_conf.archive_sha256sum) else: yb_dist_tests.compute_archive_sha256sum() @@ -1233,7 +1254,7 @@ def main(): # Randomize test order to avoid any kind of skew. 
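    # Otherwise tests would be submitted in collection order, e.g. with all tests from one test
    # program adjacent to each other.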
random.shuffle(test_descriptors) - test_names_rdd = spark_context.parallelize( + test_names_rdd = spark_context.parallelize( # type: ignore [test_descriptor.descriptor_str for test_descriptor in test_descriptors], numSlices=total_num_tests) @@ -1248,8 +1269,8 @@ def main(): logging.info("Tests are done, set of exit codes: %s, tentative global exit code: %s", sorted(test_exit_codes), global_exit_code) - num_tests_by_language = defaultdict(int) - failures_by_language = defaultdict(int) + num_tests_by_language: Dict[str, int] = defaultdict(int) + failures_by_language: Dict[str, int] = defaultdict(int) failed_test_desc_strs = [] had_errors_copying_artifacts = False for result in results: diff --git a/build-support/split_long_command_line.py b/build-support/split_long_command_line.py index c1a12521d706..808bb47d0c10 100755 --- a/build-support/split_long_command_line.py +++ b/build-support/split_long_command_line.py @@ -18,17 +18,18 @@ import sys -line_length = 0 -buffer = '' -for line in sys.stdin: - for c in line.rstrip(): - if c.isspace() and line_length > 80: - print(buffer + c + '\\\\') - buffer = '' - line_length = 0 - else: - buffer += c - line_length += 1 +if __name__ == '__main__': + line_length = 0 + buffer = '' + for line in sys.stdin: + for c in line.rstrip(): + if c.isspace() and line_length > 80: + print(buffer + c + '\\\\') + buffer = '' + line_length = 0 + else: + buffer += c + line_length += 1 -if buffer: - sys.stdout.write(buffer) + if buffer: + sys.stdout.write(buffer) diff --git a/build-support/validate_build_root.py b/build-support/validate_build_root.py index 2fb06556dd65..14e361636e0e 100755 --- a/build-support/validate_build_root.py +++ b/build-support/validate_build_root.py @@ -25,7 +25,7 @@ if not build_root.startswith(internal_build_root_parent_dir) and \ not build_root.startswith(external_build_root_parent_dir): - print ("Build root '{}' is not within either '{}' or '{}'".format( + print("Build root '{}' is not within either '{}' or '{}'".format( build_root, internal_build_root_parent_dir, internal_build_root_parent_dir), file=sys.stderr) diff --git a/codecheck.ini b/codecheck.ini new file mode 100644 index 000000000000..123e638b32fb --- /dev/null +++ b/codecheck.ini @@ -0,0 +1,47 @@ +# Copyright (c) Yugabyte, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations +# under the License. + +[default] +mypy_config = mypy.ini + +[checks] + +# Only keep Python-related checks on for now, and run them for a limited subset of files. +mypy = on +compile = on +pycodestyle = on +doctest = on +import = on +unittest = on +shellcheck = off + +[files] + +# TODO: add codecheck support for a list of plain file paths (not regexes) and use it here. 
+included_regex_list = + ^python/yb/__init__[.]py$ + ^python/yb/command_util[.]py$ + ^python/yb/library_packager[.]py$ + ^python/yb/linuxbrew[.]py$ + ^python/yb/common_util[.]py$ + ^python/yb/tool_base[.]py$ + ^python/yb/compile_commands[.]py$ + ^python/yb/run_pvs_studio_analyzer[.]py$ + ^python/yb/build_postgres[.]py$ + ^python/yb/os_detection[.]py$ + ^python/yb/thirdparty_tool[.]py$ + ^python/yb/yb_dist_tests[.]py$ + ^build-support/is_same_path[.]py$ + ^build-support/kill_long_running_minicluster_daemons[.]py$ + ^build-support/split_long_command_line[.]py$ + ^build-support/validate_build_root[.]py$ + ^build-support/run_tests_on_spark[.]py$ diff --git a/python/yb/yb_dist_tests.py b/python/yb/yb_dist_tests.py index 4acc41a3087d..14c711e0414a 100644 --- a/python/yb/yb_dist_tests.py +++ b/python/yb/yb_dist_tests.py @@ -24,6 +24,7 @@ import tempfile import atexit import glob +import argparse from functools import total_ordering @@ -31,7 +32,7 @@ from yb.common_util import get_build_type_from_build_root, \ get_compiler_type_from_build_root, \ is_macos # nopep8 - +from typing import Optional, List, Set, Dict, cast # This is used to separate relative binary path from gtest_filter for C++ tests in what we call # a "test descriptor" (a string that identifies a particular test). @@ -46,7 +47,7 @@ TEST_DESCRIPTOR_ATTEMPT_INDEX_RE = re.compile( r'^(.*)' + TEST_DESCRIPTOR_ATTEMPT_PREFIX + r'(\d+)$') -global_conf = None +global_conf: Optional['GlobalTestConfig'] = None CLOCK_SYNC_WAIT_LOGGING_INTERVAL_SEC = 10 @@ -69,7 +70,15 @@ class TestDescriptor: - A Java test class source path (including .java/.scala extension) relative to the "java" directory in the YugabyteDB source tree. """ - def __init__(self, descriptor_str): + descriptor_str_without_attempt_index: str + attempt_index: int + is_jvm_based: bool + language: str + args_for_run_test: str + error_output_path: str + + def __init__(self, descriptor_str: str) -> None: + assert global_conf is not None self.descriptor_str = descriptor_str attempt_index_match = TEST_DESCRIPTOR_ATTEMPT_INDEX_RE.match(descriptor_str) @@ -104,14 +113,23 @@ def __init__(self, descriptor_str): else: # The "test descriptors string " is the Java source file path relative to the "java" # directory. - mvn_module, package_and_class_with_slashes = JAVA_TEST_DESCRIPTOR_RE.match( - self.descriptor_str_without_attempt_index).groups() + java_descriptor_match = JAVA_TEST_DESCRIPTOR_RE.match( + self.descriptor_str_without_attempt_index) + if java_descriptor_match is None: + raise ValueError( + f"Java/Scala test descriptor {self.descriptor_str_without_attempt_index} " + f"could not be parsed using the regular expression " + f"{JAVA_TEST_DESCRIPTOR_RE}") + + mvn_module, package_and_class_with_slashes = java_descriptor_match.groups() package_and_class = package_and_class_with_slashes.replace('/', '.') self.args_for_run_test = "{} {}".format(mvn_module, package_and_class) output_file_name = package_and_class else: self.language = 'C++' + test_name: Optional[str] + # This is a C++ test. 
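            # The descriptor is either just a relative test binary path, or a binary path and a
            # gtest test name joined by TEST_DESCRIPTOR_SEPARATOR.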
if TEST_DESCRIPTOR_SEPARATOR in self.descriptor_str_without_attempt_index: rel_test_binary, test_name = self.descriptor_str_without_attempt_index.split( @@ -135,28 +153,30 @@ def __init__(self, descriptor_str): self.error_output_path = os.path.join( global_conf.build_root, 'yb-test-logs', output_file_name + '__error.log') - def __str__(self): + def __str__(self) -> str: if self.attempt_index == 1: return self.descriptor_str_without_attempt_index - return "{}{}{}".format[ + return ''.join([ self.descriptor_str_without_attempt_index, TEST_DESCRIPTOR_ATTEMPT_PREFIX, - self.attempt_index - ] + str(self.attempt_index) + ]) - def str_for_file_name(self): + def str_for_file_name(self) -> str: return str(self).replace('/', '__').replace(':', '_') - def __eq__(self, other): - return self.descriptor_str == other.descriptor_str + def __eq__(self, other: object) -> bool: + other_descriptor = cast(TestDescriptor, other) + return self.descriptor_str == other_descriptor.descriptor_str - def __ne__(self, other): + def __ne__(self, other: object) -> bool: return not (self == other) - def __lt__(self, other): - return self.descriptor_str < other.descriptor_str + def __lt__(self, other: object) -> bool: + other_descriptor = cast(TestDescriptor, other) + return self.descriptor_str < other_descriptor.descriptor_str - def with_attempt_index(self, attempt_index): + def with_attempt_index(self, attempt_index: int) -> 'TestDescriptor': assert attempt_index >= 1 copied = copy.copy(self) copied.attempt_index = attempt_index @@ -167,14 +187,23 @@ def with_attempt_index(self, attempt_index): class GlobalTestConfig: - def __init__(self, - build_root, - build_type, - yb_src_root, - archive_for_workers, - rel_build_root, - archive_sha256sum, - compiler_type): + build_root: str + build_type: str + yb_src_root: str + archive_for_workers: Optional[str] + rel_build_root: str + archive_sha256sum: Optional[str] + compiler_type: str + + def __init__( + self, + build_root: str, + build_type: str, + yb_src_root: str, + archive_for_workers: Optional[str], + rel_build_root: str, + archive_sha256sum: Optional[str], + compiler_type: str) -> None: self.build_root = os.path.abspath(build_root) self.build_type = build_type self.yb_src_root = yb_src_root @@ -183,10 +212,11 @@ def __init__(self, self.archive_sha256sum = archive_sha256sum self.compiler_type = compiler_type - def get_run_test_script_path(self): + def get_run_test_script_path(self) -> str: return os.path.join(self.yb_src_root, 'build-support', 'run-test.sh') - def set_env_on_spark_worker(self, propagated_env_vars={}): + def set_env_on_spark_worker( + self, propagated_env_vars: Dict[str, str] = {}) -> None: """ Used on the distributed worker side (inside functions that run on Spark) to configure the necessary environment. @@ -209,7 +239,7 @@ def set_env_on_spark_worker(self, propagated_env_vars={}): 'num_errors_copying_artifacts']) -def set_global_conf_from_args(args): +def set_global_conf_from_args(args: argparse.Namespace) -> GlobalTestConfig: build_root = os.path.realpath(args.build_root) # This module is expected to be under python/yb. @@ -260,7 +290,7 @@ def set_global_conf_from_args(args): return global_conf -def set_global_conf_from_dict(global_conf_dict): +def set_global_conf_from_dict(global_conf_dict: Dict[str, str]) -> GlobalTestConfig: """ This is used in functions that run on Spark. We use a dictionary to pass the configuration from the main program to distributed workers. 
@@ -268,9 +298,9 @@ def set_global_conf_from_dict(global_conf_dict): global global_conf try: global_conf = GlobalTestConfig(**global_conf_dict) - except TypeError as ex: - raise TypeError("Cannot set global configuration from dictionary %s: %s" % ( - repr(global_conf_dict), ex.message)) + except Exception as ex: + logging.exception("Cannot set global configuration from dictionary %s" % global_conf_dict) + raise ex return global_conf @@ -311,7 +341,7 @@ def set_global_conf_from_dict(global_conf_dict): ] -def find_rel_java_paths_to_archive(yb_src_root): +def find_rel_java_paths_to_archive(yb_src_root: str) -> List[str]: paths = [] for ent in [False, True]: path_components = [] @@ -364,7 +394,8 @@ def validate_mvn_local_repo(mvn_local_repo: str) -> None: logging.info(f"All Maven plugin patterns were found in local repo {mvn_local_repo}") -def create_archive_for_workers(): +def create_archive_for_workers() -> None: + assert global_conf is not None dest_path = global_conf.archive_for_workers if dest_path is None: return @@ -448,12 +479,12 @@ def create_archive_for_workers(): # These SHA256-related functions are duplicated in download_and_extract_archive.py, because that # script should not depend on any Python modules. -def validate_sha256sum(checksum_str): +def validate_sha256sum(checksum_str: str) -> None: if not re.match(r'^[0-9a-f]{64}$', checksum_str): raise ValueError("Invalid SHA256 checksum: '%s', expected 64 hex characters", checksum_str) -def compute_sha256sum(file_path): +def compute_sha256sum(file_path: str) -> str: cmd_line = None if sys.platform.startswith('linux'): cmd_line = ['sha256sum', file_path] @@ -467,30 +498,32 @@ def compute_sha256sum(file_path): return checksum_str -def compute_archive_sha256sum(): +def compute_archive_sha256sum() -> None: + assert global_conf is not None if global_conf.archive_for_workers is not None: global_conf.archive_sha256sum = compute_sha256sum(global_conf.archive_for_workers) logging.info("SHA256 checksum of archive %s: %s" % ( global_conf.archive_for_workers, global_conf.archive_sha256sum)) -def to_real_nfs_path(path): +def to_real_nfs_path(path: str) -> str: assert path.startswith('/'), "Expecting the path to be absolute: %s" % path path = os.path.abspath(path) return '/real_%s' % path[1:] -def get_tmp_filename(prefix='', suffix='', auto_remove=False): +def get_tmp_filename(prefix: str = '', suffix: str = '', auto_remove: bool = False) -> str: fd, file_path = tempfile.mkstemp(prefix=prefix, suffix=suffix) os.close(fd) os.remove(file_path) if auto_remove: - def cleanup(): + def cleanup() -> None: if os.path.exists(file_path): os.remove(file_path) atexit.register(cleanup) return file_path -if __name__ == '__main__': - main() +def get_global_conf() -> GlobalTestConfig: + assert global_conf is not None + return global_conf diff --git a/requirements.txt b/requirements.txt index 964b3201a83d..1240fd20408f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,5 @@ semantic-version six wheel yugabyte_pycommon +codecheck +pycodestyle \ No newline at end of file diff --git a/requirements_frozen.txt b/requirements_frozen.txt index 1de95d9882c9..7e0ada846654 100644 --- a/requirements_frozen.txt +++ b/requirements_frozen.txt @@ -1,33 +1,35 @@ bashlex==0.15 boto==2.49.0 -certifi==2020.12.5 -cffi==1.14.5 -chardet==4.0.0 +certifi==2021.5.30 +cffi==1.14.6 +charset-normalizer==2.0.4 click==8.0.1 +codecheck==1.1.2 compiledb==0.10.1 Deprecated==1.2.12 -distro==1.5.0 +distro==1.6.0 downloadutil==1.0.2 -idna==2.10 -mypy==0.812 +idna==3.2 +mypy==0.910 
mypy-extensions==0.4.3 overrides==6.1.0 -packaging==20.9 +packaging==21.0 psutil==5.8.0 +pycodestyle==2.7.0 pycparser==2.20 PyGithub==1.55 PyJWT==2.1.0 PyNaCl==1.4.0 pyparsing==2.4.7 -requests==2.25.1 -ruamel.yaml==0.17.4 -ruamel.yaml.clib==0.2.2 +requests==2.26.0 +ruamel.yaml==0.17.13 +ruamel.yaml.clib==0.2.6 semantic-version==2.8.5 shutilwhich==1.1.0 six==1.16.0 -typed-ast==1.4.3 +toml==0.10.2 typing-extensions==3.10.0.0 typing-utils==0.1.0 -urllib3==1.26.5 +urllib3==1.26.6 wrapt==1.12.1 yugabyte-pycommon==1.9.15 diff --git a/tox.ini b/tox.ini new file mode 100644 index 000000000000..487442e7efce --- /dev/null +++ b/tox.ini @@ -0,0 +1,16 @@ +# Copyright (c) Yugabyte, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations +# under the License. + +# Pycodestyle looks at tox.ini to get the project-specific style. +# https://pycodestyle.pycqa.org/en/latest/intro.html +[pycodestyle] +max-line-length = 100