From 2cbde0e6fd08b8716225a51d4e11e305b2ba61f0 Mon Sep 17 00:00:00 2001
From: Sanketh Varamballi
Date: Tue, 20 Sep 2022 17:35:01 -0400
Subject: [PATCH] added static typing to data_utils.py

---
 dataprofiler/data_readers/data_utils.py | 63 +++++++++++++------------
 1 file changed, 33 insertions(+), 30 deletions(-)

diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py
index 6e7abeec0..1708b072c 100644
--- a/dataprofiler/data_readers/data_utils.py
+++ b/dataprofiler/data_readers/data_utils.py
@@ -1,10 +1,12 @@
 """Contains functions for data readers."""
 import json
+from logging import Logger
 import re
+from typing import Any, Dict, Generator, Iterator, List, Optional, Tuple, Union, cast
 import urllib
 from builtins import next
 from collections import OrderedDict
-from io import BytesIO, TextIOWrapper
+from io import BytesIO, StringIO, TextIOWrapper
 
 import dateutil
 import pandas as pd
@@ -15,10 +17,10 @@
 from .. import dp_logging
 from .filepath_or_buffer import FileOrBufferHandler, is_stream_buffer  # NOQA
 
-logger = dp_logging.get_child_logger(__name__)
+logger: Logger = dp_logging.get_child_logger(__name__)
 
 
-def data_generator(data_list):
+def data_generator(data_list: List[str]) -> Generator[str, None, None]:
     """
     Take a list and return a generator on the list.
 
@@ -31,7 +33,7 @@
         yield item
 
 
-def generator_on_file(file_object):
+def generator_on_file(file_object: Union[StringIO, BytesIO]) -> Generator[Union[str, bytes], None, None]:
     """
     Take a file and return a generator that returns lines.
 
@@ -49,7 +51,7 @@
     file_object.close()
 
 
-def convert_int_to_string(x):
+def convert_int_to_string(x: int) -> str:
     """
     Convert the given input to string.
 
@@ -69,7 +71,7 @@
     return str(x)
 
 
-def unicode_to_str(data, ignore_dicts=False):
+def unicode_to_str(data: Union[str, List, Dict], ignore_dicts: bool=False) -> Union[str, List, Dict]:
     """
     Convert data to string representation if it is a unicode string.
 
@@ -99,7 +101,7 @@
     return data
 
 
-def json_to_dataframe(json_lines, selected_columns=None, read_in_string=False):
+def json_to_dataframe(json_lines: List[Dict], selected_columns: Optional[List[str]]=None, read_in_string: bool=False) -> Tuple[pd.DataFrame, pd.Series]:
     """
     Take list of json objects and return dataframe representing json list.
 
@@ -137,7 +139,7 @@
     return df, original_df_dtypes
 
 
-def read_json_df(data_generator, selected_columns=None, read_in_string=False):
+def read_json_df(data_generator: Generator, selected_columns: Optional[List[str]]=None, read_in_string: bool=False) -> Tuple[Iterator[pd.DataFrame], pd.Series]:
     """
     Return an iterator that returns a chunk of data as dataframe in each call.
 
@@ -187,10 +189,10 @@
         k += 1
     if not lines and k:
         raise ValueError("No JSON data could be read from these data.")
-    return json_to_dataframe(lines, selected_columns, read_in_string)
+    return json_to_dataframe(cast(List[Dict], lines), selected_columns, read_in_string)
 
 
-def read_json(data_generator, selected_columns=None, read_in_string=False):
+def read_json(data_generator: Generator, selected_columns: Optional[List[str]]=None, read_in_string: bool=False) -> List[Dict]:
     """
     Return the lines of a json.
 
@@ -239,17 +241,17 @@ def read_json(data_generator, selected_columns=None, read_in_string=False):
         k += 1
     if not lines and k:
         raise ValueError("No JSON data could be read from these data.")
-    return lines
+    return cast(List[Dict], lines)
 
 
 def read_csv_df(
-    file_path,
-    delimiter,
-    header,
-    selected_columns=[],
-    read_in_string=False,
-    encoding="utf-8",
-):
+    file_path: Union[str, BytesIO, TextIOWrapper],
+    delimiter: str,
+    header: int,
+    selected_columns: List[str]=[],
+    read_in_string: bool=False,
+    encoding: str="utf-8",
+) -> pd.DataFrame:
     """
     Read a CSV file in chunks and return dataframe in form of iterator.
 
@@ -299,13 +301,14 @@
 
     # if the buffer was wrapped, detach it before returning
     if is_buf_wrapped:
+        assert isinstance(file_path, TextIOWrapper)
         file_path.detach()
     fo.close()
 
     return data
 
 
-def read_parquet_df(file_path, selected_columns=None, read_in_string=False):
+def read_parquet_df(file_path: str, selected_columns: Optional[List[str]]=None, read_in_string: bool=False) -> Tuple[pd.DataFrame, pd.Series]:
     """
     Return an iterator that returns one row group each time.
 
@@ -349,7 +352,7 @@
     return data, original_df_dtypes
 
 
-def read_text_as_list_of_strs(file_path, encoding=None):
+def read_text_as_list_of_strs(file_path: str, encoding: Optional[str]=None) -> List[str]:
     """
     Return list of strings relative to the chunk size.
 
@@ -367,7 +370,7 @@
     return data
 
 
-def detect_file_encoding(file_path, buffer_size=1024, max_lines=20):
+def detect_file_encoding(file_path: str, buffer_size: int=1024, max_lines: int=20) -> str:
     """
     Determine encoding of files within initial `max_lines` of length `buffer_size`.
 
@@ -456,7 +459,7 @@ def _decode_is_valid(encoding):
     return encoding.lower()
 
 
-def detect_cell_type(cell):
+def detect_cell_type(cell: str) -> str:
     """
     Detect the cell type (int, float, etc).
 
@@ -488,7 +491,7 @@
     return cell_type
 
 
-def get_delimiter_regex(delimiter=",", quotechar=","):
+def get_delimiter_regex(delimiter: str=",", quotechar: str=",") -> re.Pattern[str]:
     """
     Build regex for delimiter checks.
 
@@ -518,7 +521,7 @@
     return re.compile(delimiter_regex + quotechar_regex)
 
 
-def find_nth_loc(string=None, search_query=None, n=0, ignore_consecutive=True):
+def find_nth_loc(string: Optional[str]=None, search_query: Optional[str]=None, n: int=0, ignore_consecutive: bool=True) -> Tuple[int, int]:
     """
     Search string via search_query and return nth index in which query occurs.
 
@@ -565,8 +568,8 @@
 
 
 def load_as_str_from_file(
-    file_path, file_encoding=None, max_lines=10, max_bytes=65536, chunk_size_bytes=1024
-):
+    file_path: str, file_encoding: Optional[str]=None, max_lines: int=10, max_bytes: int=65536, chunk_size_bytes: int=1024
+) -> str:
     """
     Load data from a csv file up to a specific line OR byte_size.
 
@@ -602,7 +605,7 @@
 
     # Return either the last index of sample_lines OR
     # the index of the newline char that matches remaining_lines
-    search_query_value = "\n"
+    search_query_value: Union[str, bytes] = "\n"
     if isinstance(sample_lines, bytes):
         search_query_value = b"\n"
 
@@ -611,7 +614,7 @@
     while start_loc < len_sample_lines - 1 and total_occurrences < max_lines:
         loc, occurrence = find_nth_loc(
             sample_lines[start_loc:],
-            search_query=search_query_value,
+            search_query=cast(str, search_query_value),  # TODO: make sure find_nth_loc() works with search_query as bytes
            n=remaining_lines,
         )
 
@@ -629,7 +632,7 @@
     return data_as_str
 
 
-def is_valid_url(url_as_string):
+def is_valid_url(url_as_string: Any) -> bool:
     """
     Determine whether a given string is a valid URL.
 
@@ -646,7 +649,7 @@
     return all([result.scheme, result.netloc])
 
 
-def url_to_bytes(url_as_string, options):
+def url_to_bytes(url_as_string: str, options: Dict) -> BytesIO:
     """
     Read in URL and converts it to a byte stream.
 
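
For anyone reviewing the typing work, here is a small, illustrative sketch (not part of the patch) that exercises a few of the newly annotated helpers. It assumes the dataprofiler package is installed so that `dataprofiler.data_readers.data_utils` is importable; running mypy over the patched file, e.g. `mypy dataprofiler/data_readers/data_utils.py`, is the more direct check.

# sketch_typed_usage.py -- illustrative only, not included in this patch
from io import StringIO

from dataprofiler.data_readers import data_utils

# data_generator is now annotated List[str] -> Generator[str, None, None]
gen = data_utils.data_generator(["a", "b", "c"])
first: str = next(gen)

# generator_on_file accepts in-memory buffers (Union[StringIO, BytesIO])
lines = list(data_utils.generator_on_file(StringIO("line1\nline2\n")))

# find_nth_loc now advertises a Tuple[int, int] return:
# (index of the nth match, number of occurrences found)
loc, occurrences = data_utils.find_nth_loc("a\nb\nc\n", search_query="\n", n=2)

# is_valid_url takes Any and narrows to bool
assert data_utils.is_valid_url("https://example.com/data.csv")

One caveat worth noting: `re.Pattern[str]` (the return annotation on get_delimiter_regex) is only subscriptable at runtime on Python 3.9+, so on older interpreters the module would need `from __future__ import annotations` for that hint to import cleanly.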