-
Notifications
You must be signed in to change notification settings - Fork 1
/
csv_data.py
148 lines (125 loc) · 4.39 KB
/
csv_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# Copyright 2024 Google LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""CSV data formatting and extracting.
Contains all the CSV functions for reading the data and extracting
the metrics and dimensions.
"""
import csv
from datetime import datetime
import logging
import pandas as pd
def get_csv_data(file) -> dict:
"""Gets the data from an uploaded CSV file.
Args:
file: The file of the CSV file.
Returns:
The data from the CSV data as a dictionary.
"""
try:
fstring = file.read().decode("utf8")
return [
{k: v.replace(",", "") for k, v in row.items()}
for row in csv.DictReader(
fstring.splitlines(), skipinitialspace=True, quoting=csv.QUOTE_ALL
)
]
except ValueError as e:
logging.exception(e)
err = {"error": f"Error: {e}"}
return err
def get_csv_columns_by_type(data: dict) -> tuple[list, str, list]:
"""Gets the CSV or Sheet columns data and returns a list of values for the
date column,
the numerical columns and the text based columns.
Args:
data: The raw CSV data from the file upload or reading of the Google sheet.
Returns:
List of dates, list of numerical columns and list of text based columns as
a Tuple.
"""
keys_list = list(data[0].keys())
date_column = ""
string_columns = []
for key in keys_list:
if "date" in key.lower():
date_column = key
keys_list.remove(key)
break
for rows in data:
for key in keys_list:
if key != date_column:
if rows.get(key) == "":
rows.update({key: "0"})
if not str(rows.get(key)).replace(".", "").isnumeric():
string_columns.append(key)
keys_list.remove(key)
return keys_list, date_column, string_columns
def csv_get_date_range(data: dict, date_column: str,
convert: bool) -> tuple[str, str]:
"""Gets the range of the dates for the start of and end of date values.
Args:
data: The raw CSV data from the file upload or reading of the Google sheet.
date_column: The column the dates are in.
convert: Converts date format from Connect Benchmarks to standard format if
true.
Returns:
Returns the minimum and maximum date.
"""
date_list = []
for item in data:
if date_column in item:
if convert:
date_obj = datetime.strptime(item[date_column], "%b %d %Y")
item_to_add = date_obj.strftime("%Y-%m-%d")
else:
item_to_add = item[date_column]
date_list.append(item_to_add)
return str(min(date_list)), str(max(date_list))
def csv_merge_data(csv_data: dict, csv_settings: dict, raw_data: dict, pre_tag: str) -> dict:
"""Merges the CSV and Google sheets data with the raw data object.
Args:
csv_data: The raw CSV data from the file upload or reading of the Google
sheet.
csv_settings: The settings that have been defined by the user (date ranges,
event date, target event etc.).
raw_data: The overall raw data from other datasources to be merged with the
CSV/sheet data.
Returns:
The raw data with all CSV/sheet data merged.
"""
for item in csv_data:
if item[csv_settings["date_column"]] in raw_data:
for col in csv_settings["metrics"]:
raw_data[item[csv_settings["date_column"]]].update(
{f"{pre_tag}_{col}": item[col]}
)
else:
for col in csv_settings["metrics"]:
if item[csv_settings["date_column"]] in raw_data:
raw_data[item[csv_settings["date_column"]]].update(
{f"{pre_tag}_{col}": item[col]}
)
else:
raw_data[item[csv_settings["date_column"]]] = {
f"{pre_tag}_{col}": item[col]
}
return raw_data
def get_date_from_str(date) -> str:
"""Converts the Connect Benchmark date format to the yyyy-mm-dd.
Args:
date: CB date in the MMM dd yyyy format.
Returns:
The date in the yyyy-mm-dd format.
"""
return datetime.strptime(date, "%b %d %Y").strftime("%Y-%m-%d")