Skip to content

Ete4 #209

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 28 commits into
base: main
Choose a base branch
from
Open

Ete4 #209

Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
efb588e
chg: Prepare migration to ete4
Rafiot Aug 10, 2023
83386c4
chg: Install ete4 for testing
Rafiot Aug 10, 2023
73eff23
Merge branch 'main' into ete4
Rafiot Aug 25, 2023
900d341
chg: Bump deps
Rafiot Sep 6, 2023
263fc8c
Merge branch 'main' into ete4
Rafiot Sep 6, 2023
cbe9442
chg: proper numpy for python 3.8
Rafiot Sep 6, 2023
96a81ac
Merge branch 'main' into ete4
Rafiot Oct 12, 2023
80f2600
Merge branch 'main' into ete4
Rafiot Nov 5, 2023
55feff4
Merge branch 'main' into ete4
Rafiot Jan 5, 2024
7230d81
fix: Broken logic in test.
Rafiot Jan 5, 2024
3e58149
chg: Bump deps
Rafiot Feb 4, 2024
2ffebbb
Merge branch 'main' into ete4
Rafiot Feb 4, 2024
863beff
fix: install ete4 in actions
Rafiot Feb 4, 2024
81a4490
chg: Bump deps
Rafiot Mar 22, 2024
ba4cea0
chg: properly use ete4
Rafiot Mar 22, 2024
3613384
chg: fix merge
Rafiot Mar 22, 2024
1e0376f
chg: Bump deps
Rafiot Apr 30, 2024
d266950
Merge branch 'main' into ete4
Rafiot Jun 21, 2024
0e35282
chg: Bump deps
Rafiot Aug 4, 2024
fcaf0c1
Merge branch 'main' into ete4
Rafiot Sep 1, 2024
f802114
Merge branch 'main' into ete4
Rafiot Mar 29, 2025
6e2dfa2
chg: Replace features by props
Rafiot Mar 29, 2025
17c92fe
chg: rename add_feature -> add_prop
Rafiot Mar 30, 2025
561852e
fix: properly set contains_rendered_urlnode property
Rafiot Apr 10, 2025
3c2f362
Merge branch 'main' into ete4
Rafiot Apr 29, 2025
d42ee31
chg: Fix tests
Rafiot Apr 29, 2025
a270f54
chg: Bump deps
Rafiot May 10, 2025
3fd74e4
chg: Bump deps
Rafiot May 10, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/pytests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ name: Pytest

on:
push:
branches: [ main ]
branches: [ main, ete4 ]
pull_request:
branches: [ main ]

Expand Down
75 changes: 38 additions & 37 deletions har2tree/har2tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,20 +313,20 @@ def __init__(self, har_path: Path, capture_uuid: str):
# Generate cookies lookup tables
# All the initial cookies sent with the initial request given to splash
self.initial_cookies: dict[str, dict[str, Any]] = {}
if hasattr(self._nodes_list[0], 'cookies_sent'):
if 'cookies_sent' in self._nodes_list[0].props:
self.initial_cookies = {key: cookie for key, cookie in self._nodes_list[0].cookies_sent.items()}

# Dictionary of all cookies received during the capture
self.cookies_received: dict[str, list[tuple[str, URLNode, bool]]] = defaultdict(list)
for n in self._nodes_list:
if hasattr(n, 'cookies_received'):
if 'cookies_received' in n.props:
for domain, c_received, is_3rd_party in n.cookies_received:
self.cookies_received[c_received].append((domain, n, is_3rd_party))

# Dictionary of all cookies sent during the capture
self.cookies_sent: dict[str, list[URLNode]] = defaultdict(list)
for n in self._nodes_list:
if hasattr(n, 'cookies_sent'):
if 'cookies_sent' in n.props:
for c_sent in n.cookies_sent.keys():
self.cookies_sent[c_sent].append(n)

Expand All @@ -342,7 +342,7 @@ def __init__(self, har_path: Path, capture_uuid: str):
self.locally_created_not_sent: dict[str, dict[str, Any]] = self.locally_created.copy()
# Cross reference the source of the cookie
for n in self._nodes_list:
if hasattr(n, 'cookies_sent'):
if 'cookies_sent' in n.props:
for c_sent in n.cookies_sent:
# Remove cookie from list if sent during the capture.
self.locally_created_not_sent.pop(c_sent, None)
Expand All @@ -359,7 +359,7 @@ def __init__(self, har_path: Path, capture_uuid: str):

# Add context if urls are found in external_ressources
for n in self._nodes_list:
if hasattr(n, 'external_ressources'):
if 'external_ressources' in n.props:
for type_ressource, urls in n.external_ressources.items():
for url in urls:
if url not in self.all_url_requests:
Expand All @@ -369,39 +369,39 @@ def __init__(self, har_path: Path, capture_uuid: str):
# If the body of the response was empty, skip.
continue
if type_ressource == 'img':
node.add_feature('image', True)
node.add_prop('image', True)

if type_ressource == 'script':
node.add_feature('js', True)
node.add_prop('js', True)

if type_ressource == 'video':
node.add_feature('video', True)
node.add_prop('video', True)

if type_ressource == 'audio':
node.add_feature('audio', True)
node.add_prop('audio', True)

if type_ressource == 'iframe':
node.add_feature('iframe', True)
node.add_prop('iframe', True)

if type_ressource == 'embed': # FIXME other icon?
node.add_feature('octet_stream', True)
node.add_prop('octet_stream', True)

if type_ressource == 'source': # FIXME: Can be audio, video, or picture
node.add_feature('octet_stream', True)
node.add_prop('octet_stream', True)

# NOTE: the URL is probably not a CSS
# if type_ressource == 'link': # FIXME: Probably a css?
# node.add_feature('css', True)
# node.add_prop('css', True)

if type_ressource == 'object': # FIXME: Same as embed, but more things
node.add_feature('octet_stream', True)
node.add_prop('octet_stream', True)

self.url_tree = self._nodes_list.pop(0)

@property
def initial_referer(self) -> str | None:
'''The referer passed to the first URL in the tree'''
if hasattr(self.url_tree, 'referer'):
if 'referer' in self.url_tree.props:
return self.url_tree.referer
return None

Expand Down Expand Up @@ -442,7 +442,7 @@ def stats(self) -> dict[str, Any]:
@property
def redirects(self) -> list[str]:
"""List of redirects for this tree"""
return [a.name for a in reversed(self.rendered_node.get_ancestors())] + [self.rendered_node.name]
return [a.name for a in reversed(list(self.rendered_node.ancestors()))] + [self.rendered_node.name]

@property
def root_referer(self) -> str | None:
Expand Down Expand Up @@ -472,7 +472,7 @@ def build_all_hashes(self, algorithm: str='sha1') -> dict[str, list[URLNode]]:
h = hashlib.new(algorithm)
h.update(urlnode.body.getbuffer())
to_return[h.hexdigest()].append(urlnode)
if hasattr(urlnode, 'embedded_ressources'):
if 'embedded_ressources' in urlnode.props:
for _mimetype, blobs in urlnode.embedded_ressources.items():
for blob in blobs:
h = hashlib.new(algorithm)
Expand Down Expand Up @@ -515,22 +515,22 @@ def _load_url_entries(self) -> None:

n = URLNode(capture_uuid=self.har.capture_uuid, name=unquote_plus(url_entry['request']['url']))
n.load_har_entry(url_entry, list(self.all_url_requests.keys()))
if hasattr(n, 'redirect_url'):
if 'redirect_url' in n.props:
self.all_redirects.append(n.redirect_url)

if hasattr(n, 'initiator_url'):
if 'initiator_url' in n.props:
# The HAR file was created by chrome/chromium and we got the _initiator key
self.all_initiator_url[n.initiator_url].append(n.name)

if url_entry['startedDateTime'] in self.har.pages_start_times:
for page in self.har.pages_start_times[url_entry['startedDateTime']]:
if hasattr(n, 'pageref') and page['id'] == n.pageref:
if 'pageref' in n.props and page['id'] == n.pageref:
# This node is the root entry of a page. Can be used as a fallback when we build the tree
self.pages_root[n.pageref] = n.uuid
break

# NOTE 2021-05-28: Ignore referer for first entry
if hasattr(n, 'referer') and i > 0:
if 'referer' in n.props and i > 0:
# NOTE 2021-05-14: referer to self are a real thing: url -> POST to self
if n.name != n.referer or ('method' in n.request and n.request['method'] == 'POST'):
self.all_referer[n.referer].append(n.name)
Expand All @@ -544,27 +544,26 @@ def _load_url_entries(self) -> None:
for page in pages:
if page['id'] not in self.pages_root:
for node in self._nodes_list:
if not hasattr(node, 'pageref'):
if 'pageref' not in node.props:
# 2022-11-19: No pageref for this node in the HAR file,
# this is weird but we need it as a fallback.
node.add_feature('pageref', page['id'])
node.add_prop('pageref', page['id'])
if node.pageref == page['id']:
self.pages_root[node.pageref] = node.uuid
break

def get_host_node_by_uuid(self, uuid: str) -> HostNode:
"""Returns the node with this UUID from the HostNode tree"""
return self.hostname_tree.search_nodes(uuid=uuid)[0]
return self.hostname_tree.get_first_by_feature('uuid', uuid, expect_missing=False)

def get_url_node_by_uuid(self, uuid: str) -> URLNode:
"""Returns the node with this UUID from the URLNode tree"""
return self.url_tree.search_nodes(uuid=uuid)[0]
return self.url_tree.get_first_by_feature('uuid', uuid, expect_missing=False)

@property
def rendered_node(self) -> URLNode:
node = self.url_tree.search_nodes(name=self.har.final_redirect)
if node:
return node[0]
if node := self.url_tree.get_first_by_feature('name', self.har.final_redirect, expect_missing=True):
return node

browser_errors = ['chrome-error', 'about:blank']
if self.har.final_redirect and not any(self.har.final_redirect.startswith(r) for r in browser_errors):
Expand All @@ -574,7 +573,7 @@ def rendered_node(self) -> URLNode:
pass
# Just try to get the best guess: first node after JS/HTTP redirects
curnode = self.url_tree
while hasattr(curnode, 'redirect') and curnode.redirect:
while 'redirect' in curnode.props and curnode.redirect:
for child in curnode.children:
if child.name == curnode.redirect_url:
curnode = child
Expand Down Expand Up @@ -614,7 +613,7 @@ def make_hostname_tree(self, root_nodes_url: URLNode | list[URLNode], root_node_

child_node_hostname.add_url(child_node_url)

if not child_node_url.is_leaf():
if not child_node_url.is_leaf:
sub_roots[child_node_hostname].append(child_node_url)

for child_node_hostname, child_nodes_url in sub_roots.items():
Expand Down Expand Up @@ -654,13 +653,13 @@ def make_tree(self) -> URLNode:

@trace_make_subtree_fallback
def _make_subtree_fallback(self, node: URLNode, dev_debug: bool=False) -> None:
if hasattr(node, 'referer'):
if 'referer' in node.props:
# 2022-04-28: the node has a referer, but for some reason, it could't be attached to the tree
# Probable reason: the referer is a part of the URL (hostname)
# FIXME: this is a very dirty fix, but I'm not sure we can do it any better
if (referer_hostname := urlparse(node.referer).hostname):
# the referer has a hostname
if (nodes_with_hostname := self.url_tree.search_nodes(hostname=referer_hostname)):
if nodes_with_hostname := list(self.url_tree.search_nodes(hostname=referer_hostname)):
# the hostname has at least a node in the tree
for node_with_hostname in nodes_with_hostname:
if not node_with_hostname.empty_response:
Expand All @@ -685,14 +684,14 @@ def _make_subtree_fallback(self, node: URLNode, dev_debug: bool=False) -> None:
if dev_debug:
self.logger.warning(f'Failed to attach URLNode in the normal process, attaching node to page {node.pageref} - Node: {page_root_node.uuid} - {page_root_node.name}.')
self._make_subtree(page_root_node, [node])
elif self.url_tree.search_nodes(name=self.har.final_redirect):
elif final_redirect := self.url_tree.get_first_by_feature('name', self.har.final_redirect, expect_missing=True):
# Generally, when we have a bunch of redirects, they do not branch out before the final landing page
# *but* it is not always the case: some intermediary redirects will have calls to 3rd party pages.
# Hopefully, this last case was taken care of in the branch above.
# In this branch, we get the landing page after the redirects (if any), and attach the node to it.
if dev_debug:
self.logger.warning(f'Failed to attach URLNode in the normal process, attaching node to final redirect: {self.har.final_redirect}.')
self._make_subtree(self.url_tree.search_nodes(name=self.har.final_redirect)[0], [node])
self._make_subtree(final_redirect, [node])
elif 'pages' in self.har.har['log']:
# No luck, the node is root for this pageref, let's attach it to the prior page in the list, or the very first node (tree root)
page_before = self.har.har['log']['pages'][0]
Expand Down Expand Up @@ -745,7 +744,7 @@ def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=Non
for unode in unodes:
# NOTE: as we're calling the method recursively, a node containing URLs in its external_ressources will attach
# the the subnodes to itself, even if the subnodes have a different referer. It will often be correct, but not always.
if hasattr(unode, 'redirect') and not hasattr(unode, 'redirect_to_nothing'):
if 'redirect' in unode.props and 'redirect_to_nothing' not in unode.props:
# If the subnode has a redirect URL set, we get all the requests matching this URL
# One may think the entry related to this redirect URL has a referer to the parent. One would be wrong.
# URL 1 has a referer, and redirects to URL 2. URL 2 has the same referer as URL 1.
Expand Down Expand Up @@ -787,7 +786,9 @@ def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=Non
# The URL (unode.name) is in the list of known urls initiating calls
for u in self.all_initiator_url[unode.name]:
matching_urls = [url_node for url_node in self.all_url_requests[u]
if url_node in self._nodes_list and hasattr(url_node, 'initiator_url') and url_node.initiator_url == unode.name]
if url_node in self._nodes_list
and 'initiator_url' in url_node.props
and url_node.initiator_url == unode.name]
self._nodes_list = [node for node in self._nodes_list if node not in matching_urls]
if dev_debug:
self.logger.warning(f'Found via initiator from {unode.name} to {matching_urls}.')
Expand All @@ -810,7 +811,7 @@ def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=Non
self._nodes_list = [node for node in self._nodes_list if node != url_node]
self._make_subtree(unode, [url_node])

if hasattr(unode, 'external_ressources'):
if 'external_ressources' in unode.props:
# the url loads external things, and some of them have no referer....
for external_tag, links in unode.external_ressources.items():
for link in links:
Expand Down
Loading