-
Notifications
You must be signed in to change notification settings - Fork 6
/
05c_pandas_json.py
68 lines (32 loc) · 2 KB
/
05c_pandas_json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env python
# coding: utf-8
# ## JSON files
#
# A JSON file is a file that stores simple data structures and objects in JavaScript Object Notation (JSON) format, which is a standard data interchange format. It is primarily used for transmitting data between a web application and a server. JSON files are lightweight, text-based, human-readable, and can be edited using a text editor.
# In[1]:
import pandas as pd
from pathlib import Path
# In[2]:
# Base directory used to locate the data files.  Resolving the quoted string
# '__file__' (a literal, not the __file__ variable) only ever produced the
# current working directory, so that is requested explicitly here.
dir_file = Path.cwd().resolve()
# In[13]:
# Input files live under <base>/data/Storico meteo and are named
# meteo-<year>-*.json.
dir_data = dir_file / 'data' / 'Storico meteo'
year = 2012
fns = dir_data.glob(f'meteo-{year}-*.json')
# Sort the matches so that ff[0] is deterministic; ff is empty when the
# directory is missing or nothing matches.
ff = sorted(fns)
# In[14]:
# DO NOT RUN THIS CELL
# It parses the whole JSON-lines file ff[0] into a single DataFrame in one
# call; skip or comment out the statement below when executing this file top
# to bottom.
# A raw cell magic line is not valid Python, so the timing request is written
# as an explicit run_cell_magic call, like the other timed cells in this file.
get_ipython().run_cell_magic('time', '', "df = pd.read_json(ff[0], lines=True)")
# In[15]:
# Bare expression: shows the DataFrame as the notebook cell's output (it has
# no visible effect when this file is executed as a plain script).
df
# In[16]:
# Timed cell: read ff[0] in chunks of 10000 rows, reduce every chunk to a
# date-indexed frame with a single 'w_speed' column, then concatenate the
# chunks into `df`.  Requires an IPython session (get_ipython) and the names
# `pd` and `ff` defined earlier in this file.
get_ipython().run_cell_magic('time', '', """\
reader = pd.read_json(ff[0], lines=True, chunksize=10000)  # chunksize is the number of rows per chunk
df_list = []
for c in reader:
    # `columns=` already selects the axis, so no separate axis argument is needed.
    c.drop(columns=['version', 'ident', 'network'], inplace=True)
    c.set_index('date', inplace=True)
    # Build exactly one value per row: rows whose first data entry has no
    # 'B05001' variable yield None instead of being skipped, so the list
    # always has the same length as the chunk and the assignment cannot fail
    # with a length mismatch.
    value_speed = [x[0]['vars'].get('B05001', {}).get('v') for x in c.data.values]
    c['w_speed'] = value_speed
    c.drop(columns=['data'], inplace=True)
    df_list.append(c)
df = pd.concat(df_list)
""")
# ### Dask
#
#
# Parallelize any Python code with Dask Futures, letting you scale any function and for loop, and giving you control and power in any situation.
# In[32]:
import dask.dataframe as dd
# In[128]:
# Timed cell: load ff[0] as a Dask DataFrame split into blocks, then walk the
# partitions one at a time and collect the 'v' value of variable 'B05001' from
# the first entry of every row's `data` field.  The result, `list_var`, is a
# list holding one list of values per partition.  Requires an IPython session
# (get_ipython) and the names `dd` and `ff` defined earlier in this file.
get_ipython().run_cell_magic('time', '', """\
ddf = dd.read_json(ff[0], blocksize=5000000)  # blocksize is size in bytes of each block
# `columns=` already selects the axis, so no separate axis argument is needed.
ddf = ddf.drop(columns=['version', 'ident', 'network'])
list_var = []
# The index is not called `np`, which would shadow the usual NumPy alias in
# the notebook namespace.
for part_idx in range(ddf.npartitions):
    ddfp = ddf.partitions[part_idx]
    ddfp = ddfp.set_index('date')
    # compute() materialises this partition's `data` column; rows without a
    # 'B05001' variable contribute None instead of raising KeyError.
    value_speed = [x[0]['vars'].get('B05001', {}).get('v') for x in ddfp.data.compute().values]
    list_var.append(value_speed)
""")
# #### Parallelization!