-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
239 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,239 @@ | ||
import pandas as pd | ||
import ast | ||
import csv | ||
from contextlib import redirect_stdout | ||
from io import StringIO | ||
import unicodedata | ||
import re | ||
import sqlite3 | ||
from typing import List, Tuple, Any, Union | ||
import numpy as np | ||
from datetime import datetime | ||
|
||
################################### | ||
### Dataframe related functions ### | ||
################################### | ||
|
||
def remove_merged_suffixes(df):
    """
    Strip " .<digits>" suffixes from cell values that duplicate another value.

    A cell whose text looks like "<base> .<n>" is replaced by "<base>" when
    "<base>" already appears in another row of the same column, or when
    "<base>" equals the column label.

    Cells are addressed by position rather than by label, so frames with
    duplicate index labels or duplicate column labels are handled correctly.

    Parameters:
    df (pd.DataFrame): The dataframe to clean. It is modified in place.
    Returns:
    pd.DataFrame: The same dataframe object, after cleaning.
    """
    # "<anything> .<digits>" -- the captured group is the base value
    suffix_pattern = re.compile(r'^(.*) \.\d+$')

    for col_pos, col in enumerate(df.columns):
        for row_pos in range(len(df)):
            match = suffix_pattern.match(str(df.iat[row_pos, col_pos]))
            if not match:
                continue

            base_value = match.group(1).strip()

            # Look for the base value in every *other* row of this column.
            # The column is re-read here so that replacements made earlier in
            # this pass are taken into account.
            column_values = df.iloc[:, col_pos].tolist()
            seen_elsewhere = any(
                isinstance(other, str) and other == base_value
                for other_pos, other in enumerate(column_values)
                if other_pos != row_pos
            )

            if seen_elsewhere or base_value == col:
                df.iat[row_pos, col_pos] = base_value
    return df
|
||
def markdown_to_df(markdown_string): | ||
""" | ||
Parse a markdown table to a pandas dataframe. | ||
Parameters: | ||
markdown_string (str): The markdown table string. | ||
Returns: | ||
pd.DataFrame: The parsed markdown table as a pandas dataframe. | ||
""" | ||
|
||
# Split the markdown string into lines | ||
lines = markdown_string.strip().split("\n") | ||
|
||
# strip leading/trailing '|' | ||
lines = [line.strip('|') for line in lines] | ||
|
||
# Check if the markdown string is empty or only contains the header and delimiter | ||
if len(lines) < 2: | ||
raise ValueError("Markdown string should contain at least a header, delimiter and one data row.") | ||
|
||
# Check if the markdown string contains the correct delimiter for a table | ||
if not set(lines[1].strip()) <= set(['-', '|', ' ', ':']): | ||
# means the second line is not a delimiter line | ||
# we do nothing | ||
pass | ||
# Remove the delimiter line | ||
else: | ||
del lines[1] | ||
|
||
# Join the lines back into a single string, and use StringIO to make it file-like | ||
markdown_file_like = StringIO("\n".join(lines)) | ||
|
||
# Use pandas to read the "file", assuming the first row is the header and the separator is '|' | ||
df = pd.read_csv(markdown_file_like, sep='|', skipinitialspace=True, quoting=csv.QUOTE_NONE) | ||
|
||
# Strip whitespace from column names and values | ||
df.columns = df.columns.str.strip() | ||
|
||
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x) | ||
|
||
# normalize unicode characters | ||
df = df.map(lambda x: unicodedata.normalize("NFKD", x) if isinstance(x, str) else x) | ||
|
||
return df | ||
|
||
def print_partial_markdown(df, keep: int = 3):
    """
    Render the first and last `keep` rows of a dataframe as a markdown table.

    Despite its name, this function returns the markdown instead of printing.

    Parameters:
    df (pd.DataFrame): The dataframe to render.
    keep (int): Number of rows to show from each end.
    Returns:
    str: Markdown table with a "..." line between the head and tail rows.
        When the dataframe has at most 2 * keep rows the whole table is
        returned without a separator, since head and tail would overlap.
    """
    # Head and tail would overlap: show every row once, nothing is elided
    if len(df) <= 2 * keep:
        return df.to_markdown(index=True)

    head_rows = df.head(keep)
    combined_markdown = pd.concat([head_rows, df.tail(keep)]).to_markdown(index=True)

    # The head-only rendering has as many lines as header + head rows occupy
    # in the combined table, so its length is where the separator goes.
    separator_index = len(head_rows.to_markdown(index=True).split('\n'))

    markdown_lines = combined_markdown.split('\n')
    markdown_lines.insert(separator_index, '...')
    return '\n'.join(markdown_lines)
|
||
def convert_cells_to_numbers(df): | ||
# Helper function to remove commas and try to convert to numeric | ||
def to_numeric(cell): | ||
if isinstance(cell, str): # Check if the cell is of string type | ||
no_comma = cell.replace(',', '') # Remove commas | ||
# Check if the string without commas can be a float | ||
try: | ||
float(no_comma) | ||
return pd.to_numeric(no_comma, errors='coerce') | ||
except ValueError: | ||
return cell # If it can't be a number, return the original cell | ||
return pd.to_numeric(cell, errors='coerce') | ||
|
||
|
||
# Apply the function to each cell in the dataframe | ||
return df.map(to_numeric) | ||
|
||
def infer_dtype(df):
    """
    Attempt to convert columns in a DataFrame to a more appropriate data type.

    Each column is first tried as numeric. Columns that are still text are
    then tried as datetime, and the datetime version is adopted only if
    every non-null value parsed. Text columns that are not dates are
    therefore left unchanged instead of being turned into NaT.

    :param df: Input DataFrame (modified in place)
    :return: The same DataFrame with updated dtypes
    """
    import warnings

    for col in df.columns:
        # Numeric conversion is all-or-nothing: keep the column on failure
        try:
            df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            pass

        column = df[col]
        is_text = column.dtype == 'object' or isinstance(column.dtype, pd.StringDtype)
        if not is_text:
            continue

        try:
            with warnings.catch_warnings():
                # Probing non-date text makes pandas warn about format inference
                warnings.simplefilter("ignore", UserWarning)
                parsed = pd.to_datetime(column, errors='coerce')
        except Exception:
            # Best-effort probe: any parsing failure leaves the column as is
            continue

        # Adopt the datetime column only when no value was lost to NaT
        if parsed.notna().sum() == column.notna().sum():
            df[col] = parsed

    return df
|
||
# def parse_code_from_string(input_string): | ||
# """ | ||
# Parse executable code from a string. | ||
|
||
# Parameters: | ||
# input_string (str): The input string. | ||
|
||
# Returns: | ||
# str: The parsed code. | ||
# """ | ||
|
||
# # Removes `, whitespace & python from start | ||
# input_string = re.sub(r"^(\s|`)*(?i:python)?\s*", "", input_string, flags=re.IGNORECASE) | ||
# # Removes whitespace & ` from end | ||
# input_string = re.sub(r"(\s|`)*$", "", input_string) | ||
|
||
# return input_string | ||
# Optional language tag after an opening fence: either an explicit python tag,
# or any single word that is alone on the fence line. Requiring the newline
# for unknown words keeps the first word of inline code from being eaten.
_FENCE_TAG = r"(?:(?:python3?|py)\b\s*|[\w+-]*[ \t]*\r?\n)?"
_CLOSED_FENCE_RE = re.compile(r"```" + _FENCE_TAG + r"(.*?)```", re.DOTALL | re.IGNORECASE)
_OPEN_FENCE_RE = re.compile(r"```" + _FENCE_TAG + r"(.*)", re.DOTALL | re.IGNORECASE)
_INLINE_CODE_RE = re.compile(r"`(.*?)`", re.DOTALL)


def parse_code_from_string(input_string):
    """
    Parse executable code from a string, handling various markdown-like code block formats.

    The first pattern that matches wins:
    1. a block wrapped in triple backticks, with an optional language tag;
    2. an opening triple-backtick fence that is never closed (everything
       after the fence is taken as code);
    3. a span wrapped in single backticks;
    4. otherwise the whole input.

    Parameters:
    input_string (str): The input string.
    Returns:
    str: The parsed code, stripped of surrounding whitespace.
    """
    for pattern in (_CLOSED_FENCE_RE, _OPEN_FENCE_RE, _INLINE_CODE_RE):
        match = pattern.search(input_string)
        if match:
            return match.group(1).strip()

    # Default return if no code block patterns are matched
    return input_string.strip()
|
||
|
||
def python_repl_ast(code, custom_globals=None, custom_locals=None, memory=None):
    """
    Run command with own globals/locals and returns anything printed.

    All statements are executed in order. If the last statement is an
    expression, it is evaluated and the text form of its value (when not
    None) is appended to whatever was printed.

    SECURITY: `code` is run with exec/eval and full access to this module's
    globals. Never pass untrusted input.

    Parameters:
    code (str): The code to execute.
    custom_globals (dict): The globals to use; they override module globals.
    custom_locals (dict): The locals to use; entries in `memory` override them.
    memory (dict): The state/memory to retain between invocations. It is
        updated in place after a successful run.
    Returns:
    tuple: (str: The output of the code, dict: updated memory). If the code
        raises, the output is "<ExceptionType>: <message>" and memory is
        returned without being updated.
    """
    if memory is None:
        memory = {}

    exec_globals = {**globals(), **(custom_globals or {})}
    exec_locals = {**(custom_locals or {}), **memory}

    captured = StringIO()
    try:
        statements = ast.parse(code).body

        # Decide from the syntax tree whether the last statement has a value.
        # Running it through eval and retrying with exec on failure would
        # execute a raising expression twice.
        last = statements[-1] if statements else None
        ends_with_expression = isinstance(last, ast.Expr)
        to_exec = statements[:-1] if ends_with_expression else statements

        result = None
        # Capture stdout for the whole run, not just the last statement
        with redirect_stdout(captured):
            if to_exec:
                module = ast.Module(to_exec, type_ignores=[])
                exec(compile(module, "<string>", "exec"), exec_globals, exec_locals)
            if ends_with_expression:
                expression = ast.Expression(last.value)
                result = eval(compile(expression, "<string>", "eval"), exec_globals, exec_locals)

        output = captured.getvalue()
        if result is not None:
            output += str(result)

        # Update memory with new variable states
        memory.update(exec_locals)

        # Return any output captured during execution along with the updated memory
        return output, memory

    except Exception as e:
        return "{}: {}".format(type(e).__name__, str(e)), memory