fix unicode decoding issues in influence, with better parser

datactive · Oct 21, 2024 · 68af135 · 68af135
1 parent 42568c2
commit 68af135
Show file tree

Hide file tree

Showing 2 changed files with 29 additions and 10 deletions.
diff --git a/bigbang/analysis/influence.py b/bigbang/analysis/influence.py
@@ -3,6 +3,8 @@
 import bigbang.parse as parse
 import bigbang.analysis.utils as utils
 
+import email.header
+
 import matplotlib.pyplot as plt
 import pandas as pd
 
@@ -45,10 +47,13 @@ def normalize_senders_by_domain(row):
         if dd.loc[row["domain"]]["category"] in good_categories:
             return lookup_stakeholder_by_domain(row["domain"])
         else:
-            return parse.clean_from(row["From"])
+            cleaned = parse.clean_from(row["From"])
+
+            return cleaned
     except Exception as e:
-        print(e)
-        return parse.clean_from(row["From"])
+        cleaned = parse.clean_from(row["From"])
+        print(row["From"], " --> ", cleaned)
+        return cleaned
 
 
 def is_affiliation(domain):
@@ -73,6 +78,8 @@ def augment(arx):
     arx.data["domain"] = arx.data["From"].apply(utils.extract_domain)
     arx.data["sender_cat"] = arx.data.apply(normalize_senders_by_domain, axis=1)
 
+    # TODO test for garbage here?
+
 
 def aggregate_activity(aarx, top_n):
     """

diff --git a/bigbang/parse.py b/bigbang/parse.py
@@ -1,4 +1,4 @@
-import email
+import email.header
 import html2text
 import logging
 import re
@@ -11,9 +11,9 @@
 re_cache = {
     "top_exp": re.compile(r"From .*\d\d\d\d\n"),
     "msg_id": re.compile(r"<\S*@\S*>"),
+    "from_header_1" : re.compile(r"\"?([\w \-\=\?]*[\w\=])\"? <([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)>")
 }
 
-
 def split_references(refs):
     return re_cache["msg_id"].findall(refs)
 
@@ -34,22 +34,34 @@ def clean_from(m_from):
     """
     Return a person's name extracted from 'From' field
     of email, based on heuristics.
+    
+    TODO: fix the unclear name of this method
     """
 
     cleaned = m_from
 
     try:
         if "(" in m_from:
             cleaned = m_from[m_from.index("(") + 1 : m_from.rindex(")")]
-        elif "<" in m_from:
-            # if m_from.index("<") > -1:
-            cleaned = m_from[0 : m_from.index("<") - 1]
-
     except ValueError:
         warnings.warn("%s is hard to clean" % (m_from))
 
     cleaned = cleaned.strip('"')
 
+    match = re.search(re_cache["from_header_1"], cleaned)
+
+    if match:
+        name = match.group(1)
+
+        part, charset = email.header.decode_header(name)[0]
+
+        if charset is None:
+            charset = 'utf-8'
+        if isinstance(part, bytes):
+            part = part.decode(charset)
+
+        cleaned = part
+
     return cleaned
 
 
@@ -67,6 +79,7 @@ def clean_name(name):
 
     Returns None if the name portion is missing anything name-like. Otherwise, returns the cleaned name.
     """
+    name = email.header.decode_header(name)[0][0]
 
     # we see these specific strings due to parsing issues in email somewhere
     name = name.replace("unknown charset", " ")
@@ -82,7 +95,6 @@ def clean_name(name):
     # do we need to also catch email archives that use anti-spam measures?
     # like: .replace(' at ','@')
 
-    # TODO: decode or collapse rfc2231 encodings, like '=?utf-8?q?carlos_gonz=c3=a1lez-cadenas?=' ?
 
     name = name.strip()  # remove leading and trailing whitespace