Skip to content

Commit

Permalink
fix unicode decoding issues in influence, with better parser
Browse files Browse the repository at this point in the history
  • Loading branch information
sbenthall committed Oct 21, 2024
1 parent 42568c2 commit 68af135
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 10 deletions.
13 changes: 10 additions & 3 deletions bigbang/analysis/influence.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import bigbang.parse as parse
import bigbang.analysis.utils as utils

import email.header

import matplotlib.pyplot as plt
import pandas as pd

Expand Down Expand Up @@ -45,10 +47,13 @@ def normalize_senders_by_domain(row):
if dd.loc[row["domain"]]["category"] in good_categories:
return lookup_stakeholder_by_domain(row["domain"])
else:
return parse.clean_from(row["From"])
cleaned = parse.clean_from(row["From"])

return cleaned
except Exception as e:
print(e)
return parse.clean_from(row["From"])
cleaned = parse.clean_from(row["From"])
print(row["From"], " --> ", cleaned)
return cleaned


def is_affiliation(domain):
Expand All @@ -73,6 +78,8 @@ def augment(arx):
arx.data["domain"] = arx.data["From"].apply(utils.extract_domain)
arx.data["sender_cat"] = arx.data.apply(normalize_senders_by_domain, axis=1)

# TODO test for garbage here?


def aggregate_activity(aarx, top_n):
"""
Expand Down
26 changes: 19 additions & 7 deletions bigbang/parse.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import email
import email.header
import html2text
import logging
import re
Expand All @@ -11,9 +11,9 @@
# Precompiled regular expressions shared by the parsing helpers in this module.
re_cache = {
    # The text "From ", any characters, then four digits immediately before a newline.
    "top_exp": re.compile(r"From .*\d\d\d\d\n"),
    # An angle-bracketed, whitespace-free token containing "@", e.g. "<id@host>".
    "msg_id": re.compile(r"<\S*@\S*>"),
    # An optionally quoted display name (group 1) followed by "<address>" (group 2).
    # The name class admits "=" and "?" so "=?charset?q?...?=" encoded words match.
    "from_header_1" : re.compile(r"\"?([\w \-\=\?]*[\w\=])\"? <([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)>")
}


def split_references(refs):
    """
    Return a list of every substring of ``refs`` that matches the
    ``msg_id`` pattern in ``re_cache`` (an angle-bracketed token
    containing "@"), in order of appearance.
    """
    return re_cache["msg_id"].findall(refs)

Expand All @@ -34,22 +34,34 @@ def clean_from(m_from):
"""
Return a person's name extracted from 'From' field
of email, based on heuristics.
TODO: fix the unclear name of this method
"""

cleaned = m_from

try:
if "(" in m_from:
cleaned = m_from[m_from.index("(") + 1 : m_from.rindex(")")]
elif "<" in m_from:
# if m_from.index("<") > -1:
cleaned = m_from[0 : m_from.index("<") - 1]

except ValueError:
warnings.warn("%s is hard to clean" % (m_from))

cleaned = cleaned.strip('"')

match = re.search(re_cache["from_header_1"], cleaned)

if match:
name = match.group(1)

part, charset = email.header.decode_header(name)[0]

if charset is None:
charset = 'utf-8'
if isinstance(part, bytes):
part = part.decode(charset)

cleaned = part

return cleaned


Expand All @@ -67,6 +79,7 @@ def clean_name(name):
Returns None if the name portion is missing anything name-like. Otherwise, returns the cleaned name.
"""
name = email.header.decode_header(name)[0][0]

# we see these specific strings due to parsing issues in email somewhere
name = name.replace("unknown charset", " ")
Expand All @@ -82,7 +95,6 @@ def clean_name(name):
# do we need to also catch email archives that use anti-spam measures?
# like: .replace(' at ','@')

# TODO: decode or collapse RFC 2047 encoded-words, like '=?utf-8?q?carlos_gonz=c3=a1lez-cadenas?=' ?

name = name.strip() # remove leading and trailing whitespace

Expand Down

0 comments on commit 68af135

Please sign in to comment.