-
Notifications
You must be signed in to change notification settings - Fork 4
/
process_json.py
63 lines (50 loc) · 1.91 KB
/
process_json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# coding=utf-8
import json
import pandas
import gzip
from itertools import combinations
from collections import defaultdict
import pandas as pd
def parse_date(date):
"""
Given a date in a format like:
1 MAY 2016 - 00:30
Return:
2016-05-01
"""
months = {"ene": "01", "feb": "02", "mar": "03", "abr": "04", "may": "05",
"jun": "06", "jul": "07", "ago": "08", "sep": "09", "oct": "10",
"nov": "11", "dic": "12"}
pieces = date.lower().split(" ")
new_date = pieces[2] + "-" + months[pieces[1]] + "-" + ("%02d" % int(pieces[0]))
return(new_date)
def get_tag_pairs(tags):
"""Returns all the possible 2-tag combination"""
tags = sorted([x.lower() for x in tags])
return([x for x in combinations(tags, 2)])
def get_relations(opeds, tag):
"""
Returns a count of the relations used with a particular tag.
"""
count = defaultdict(int)
tags = [x["tags"] for x in opeds if tag in x["tags"]]
for group in tags:
for t in group:
count[t] += 1
return count
if __name__ == "__main__":
with gzip.open("/home/chema/Dropbox/data/cubazuela/elpais_opeds_full.json.gz", "r") as f:
opeds = json.load(f)
# Change dates and filter tags
processed_opeds = []
filter_tags = (u"opinión", )
for oped in opeds:
oped["date"] = parse_date(oped["date"])
oped["tags"] = [x.lower() for x in oped["tags"] if x.lower() not in filter_tags]
#tag_pairs = get_tag_pairs(tags)
#for tag_pair in tag_pairs:
# processed_opeds.append({"date": date, "title": oped["title"], "url": oped["url"],
# "tag_pair_1": tag_pair[0], "tag_pair_2": tag_pair[1]})
# Convert to Pandas object and dump to CSV file
#processed_opeds = pd.DataFrame(processed_opeds)
#processed_opeds.to_csv("/tmp/elpais_opeds.csv", index = False, encoding = "utf8")