From 2cbde0e6fd08b8716225a51d4e11e305b2ba61f0 Mon Sep 17 00:00:00 2001
From: Sanketh Varamballi
Date: Tue, 20 Sep 2022 17:35:01 -0400
Subject: [PATCH] added static typing to data_utils.py

---
 dataprofiler/data_readers/data_utils.py | 63 +++++++++++++------------
 1 file changed, 33 insertions(+), 30 deletions(-)

diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py
index 6e7abeec0..1708b072c 100644
--- a/dataprofiler/data_readers/data_utils.py
+++ b/dataprofiler/data_readers/data_utils.py
@@ -1,10 +1,12 @@
 """Contains functions for data readers."""
 import json
+from logging import Logger
 import re
+from typing import Any, Dict, Generator, Iterator, List, Optional, Tuple, Union, cast
 import urllib
 from builtins import next
 from collections import OrderedDict
-from io import BytesIO, TextIOWrapper
+from io import BytesIO, StringIO, TextIOWrapper
 
 import dateutil
 import pandas as pd
@@ -15,10 +17,10 @@
 from .. import dp_logging
 from .filepath_or_buffer import FileOrBufferHandler, is_stream_buffer  # NOQA
 
-logger = dp_logging.get_child_logger(__name__)
+logger: Logger = dp_logging.get_child_logger(__name__)
 
 
-def data_generator(data_list):
+def data_generator(data_list: List[str]) -> Generator[str, None, None]:
     """
     Take a list and return a generator on the list.
 
@@ -31,7 +33,7 @@
         yield item
 
 
-def generator_on_file(file_object):
+def generator_on_file(file_object: Union[StringIO, BytesIO]) -> Generator[Union[str, bytes], None, None]:
     """
     Take a file and return a generator that returns lines.
 
@@ -49,7 +51,7 @@
     file_object.close()
 
 
-def convert_int_to_string(x):
+def convert_int_to_string(x: int) -> str:
     """
     Convert the given input to string.
 
@@ -69,7 +71,7 @@
     return str(x)
 
 
-def unicode_to_str(data, ignore_dicts=False):
+def unicode_to_str(data: Union[str, List, Dict], ignore_dicts: bool=False) -> Union[str, List, Dict]:
     """
     Convert data to string representation if it is a unicode string.
 
@@ -99,7 +101,7 @@
     return data
 
 
-def json_to_dataframe(json_lines, selected_columns=None, read_in_string=False):
+def json_to_dataframe(json_lines: List[Dict], selected_columns: Optional[List[str]]=None, read_in_string: bool=False) -> Tuple[pd.DataFrame, pd.Series]:
     """
     Take list of json objects and return dataframe representing json list.
 
@@ -137,7 +139,7 @@
     return df, original_df_dtypes
 
 
-def read_json_df(data_generator, selected_columns=None, read_in_string=False):
+def read_json_df(data_generator: Generator, selected_columns: Optional[List[str]]=None, read_in_string: bool=False) -> Tuple[Iterator[pd.DataFrame], pd.Series]:
     """
     Return an iterator that returns a chunk of data as dataframe in each call.
 
@@ -187,10 +189,10 @@
         k += 1
     if not lines and k:
         raise ValueError("No JSON data could be read from these data.")
-    return json_to_dataframe(lines, selected_columns, read_in_string)
+    return json_to_dataframe(cast(List[Dict], lines), selected_columns, read_in_string)
 
 
-def read_json(data_generator, selected_columns=None, read_in_string=False):
+def read_json(data_generator: Generator, selected_columns: Optional[List[str]]=None, read_in_string: bool=False) -> List[Dict]:
     """
     Return the lines of a json.
 
@@ -239,17 +241,17 @@ def read_json(data_generator, selected_columns=None, read_in_string=False):
         k += 1
     if not lines and k:
         raise ValueError("No JSON data could be read from these data.")
-    return lines
+    return cast(List[Dict], lines)
 
 
 def read_csv_df(
-    file_path,
-    delimiter,
-    header,
-    selected_columns=[],
-    read_in_string=False,
-    encoding="utf-8",
-):
+    file_path: Union[str, BytesIO, TextIOWrapper],
+    delimiter: str,
+    header: int,
+    selected_columns: List[str]=[],
+    read_in_string: bool=False,
+    encoding: str="utf-8",
+) -> pd.DataFrame:
     """
     Read a CSV file in chunks and return dataframe in form of iterator.
 
@@ -299,13 +301,14 @@
 
     # if the buffer was wrapped, detach it before returning
     if is_buf_wrapped:
+        assert isinstance(file_path, TextIOWrapper)
         file_path.detach()
     fo.close()
 
     return data
 
 
-def read_parquet_df(file_path, selected_columns=None, read_in_string=False):
+def read_parquet_df(file_path: str, selected_columns: Optional[List[str]]=None, read_in_string: bool=False) -> Tuple[pd.DataFrame, pd.Series]:
     """
     Return an iterator that returns one row group each time.
 
@@ -349,7 +352,7 @@
     return data, original_df_dtypes
 
 
-def read_text_as_list_of_strs(file_path, encoding=None):
+def read_text_as_list_of_strs(file_path: str, encoding: Optional[str]=None) -> List[str]:
     """
     Return list of strings relative to the chunk size.
 
@@ -367,7 +370,7 @@
     return data
 
 
-def detect_file_encoding(file_path, buffer_size=1024, max_lines=20):
+def detect_file_encoding(file_path: str, buffer_size: int=1024, max_lines: int=20) -> str:
     """
     Determine encoding of files within initial `max_lines` of length `buffer_size`.
 
@@ -456,7 +459,7 @@ def _decode_is_valid(encoding):
     return encoding.lower()
 
 
-def detect_cell_type(cell):
+def detect_cell_type(cell: str) -> str:
     """
     Detect the cell type (int, float, etc).
 
@@ -488,7 +491,7 @@
     return cell_type
 
 
-def get_delimiter_regex(delimiter=",", quotechar=","):
+def get_delimiter_regex(delimiter: str=",", quotechar: str=",") -> re.Pattern[str]:
     """
     Build regex for delimiter checks.
 
@@ -518,7 +521,7 @@
     return re.compile(delimiter_regex + quotechar_regex)
 
 
-def find_nth_loc(string=None, search_query=None, n=0, ignore_consecutive=True):
+def find_nth_loc(string: Optional[str]=None, search_query: Optional[str]=None, n: int=0, ignore_consecutive: bool=True) -> Tuple[int, int]:
     """
     Search string via search_query and return nth index in which query occurs.
 
@@ -565,8 +568,8 @@
 
 
 def load_as_str_from_file(
-    file_path, file_encoding=None, max_lines=10, max_bytes=65536, chunk_size_bytes=1024
-):
+    file_path: str, file_encoding: Optional[str]=None, max_lines: int=10, max_bytes: int=65536, chunk_size_bytes: int=1024
+) -> str:
     """
     Load data from a csv file up to a specific line OR byte_size.
 
@@ -602,7 +605,7 @@
 
     # Return either the last index of sample_lines OR
     # the index of the newline char that matches remaining_lines
-    search_query_value = "\n"
+    search_query_value: Union[str, bytes] = "\n"
     if isinstance(sample_lines, bytes):
         search_query_value = b"\n"
 
@@ -611,7 +614,7 @@
     while start_loc < len_sample_lines - 1 and total_occurrences < max_lines:
         loc, occurrence = find_nth_loc(
             sample_lines[start_loc:],
-            search_query=search_query_value,
+            search_query=cast(str, search_query_value),  # TODO: make sure find_nth_loc() works with search_query as bytes
            n=remaining_lines,
         )
 
@@ -629,7 +632,7 @@
     return data_as_str
 
 
-def is_valid_url(url_as_string):
+def is_valid_url(url_as_string: Any) -> bool:
     """
     Determine whether a given string is a valid URL.
 
@@ -646,7 +649,7 @@
     return all([result.scheme, result.netloc])
 
 
-def url_to_bytes(url_as_string, options):
+def url_to_bytes(url_as_string: str, options: Dict) -> BytesIO:
     """
     Read in URL and converts it to a byte stream.
 
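
For anyone reviewing the typing work, here is a small, illustrative sketch (not part of the patch) that exercises a few of the newly annotated helpers. It assumes the dataprofiler package is installed so that `dataprofiler.data_readers.data_utils` is importable; running mypy over the patched file, e.g. `mypy dataprofiler/data_readers/data_utils.py`, is the more direct check.

# sketch_typed_usage.py -- illustrative only, not included in this patch
from io import StringIO

from dataprofiler.data_readers import data_utils

# data_generator is now annotated List[str] -> Generator[str, None, None]
gen = data_utils.data_generator(["a", "b", "c"])
first: str = next(gen)

# generator_on_file accepts in-memory buffers (Union[StringIO, BytesIO])
lines = list(data_utils.generator_on_file(StringIO("line1\nline2\n")))

# find_nth_loc now advertises a Tuple[int, int] return:
# (index of the nth match, number of occurrences found)
loc, occurrences = data_utils.find_nth_loc("a\nb\nc\n", search_query="\n", n=2)

# is_valid_url takes Any and narrows to bool
assert data_utils.is_valid_url("https://example.com/data.csv")

One caveat worth noting: `re.Pattern[str]` (the return annotation on get_delimiter_regex) is only subscriptable at runtime on Python 3.9+, so on older interpreters the module would need `from __future__ import annotations` for that hint to import cleanly.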