15 changes: 14 additions & 1 deletion bci/browser/binary/binary.py
@@ -102,8 +102,21 @@ def is_available_locally(self):
def is_available_online(self):
return self.state.has_online_binary()

@abstractmethod
def download_binary(self):
if self.is_available_locally():
logger.debug(f'Binary for {self.state} was already downloaded ({self.get_bin_path()})')
else:
binary_urls = self.state.get_online_binary_urls()
binary_dst_folder = os.path.dirname(self.get_potential_bin_path())
util.download_and_extract(binary_urls, binary_dst_folder)
self.configure_binary()

@abstractmethod
def configure_binary(self):
"""
Configures the browser binary.
This method is idempotent.
"""
pass

def is_built(self):
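Note: this change turns download_binary into a template method on the Binary base class. It resolves candidate URLs from the state, hands the download and extraction to util.download_and_extract, and finishes by calling the vendor-specific configure_binary hook, which must be idempotent because it also runs when the binary was already downloaded. A minimal, hypothetical sketch of a vendor subclass (ExampleBinary and its chmod step are illustrative only; the other abstract members are omitted):

import os

from bci import cli
from bci.browser.binary.binary import Binary


class ExampleBinary(Binary):
    def configure_binary(self) -> None:
        # Idempotent: safe to run repeatedly on an already-configured folder.
        binary_folder = os.path.dirname(self.get_potential_bin_path())
        cli.execute_and_return_status(f'chmod -R a+x {binary_folder}')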
44 changes: 6 additions & 38 deletions bci/browser/binary/vendors/chromium.py
@@ -1,14 +1,11 @@
import logging
import os
import re
import shutil
import zipfile

import requests

from bci import cli, util
from bci.browser.binary.artisanal_manager import ArtisanalBuildManager
from bci.browser.binary.binary import Binary
from bci.database.mongo.binary_cache import BinaryCache
from bci.version_control.states.state import State

logger = logging.getLogger(__name__)
@@ -19,7 +16,6 @@


class ChromiumBinary(Binary):

def __init__(self, state: State):
super().__init__(state)

@@ -38,41 +34,12 @@ def browser_name(self) -> str:
def bin_folder_path(self) -> str:
return BIN_FOLDER_PATH

# def get_full_version(self, version: int):
# if re.match(r'[0-9]+\.[0-9]+\.[0-9]+', version):
# return version + ".0"
# if re.match(r'[0-9]+', version):
# return self.repo.get_release_tag(version)
# if re.match(r'[0-9]{2}', version):
# return self.full_versions[version] + ".0"
# raise AttributeError("Could not convert version '%i' to full version" % version)
# return self.repo.get_release_tag(version)

# Downloadable binaries

def download_binary(self):
if self.is_available_locally():
logger.debug(f'Binary for {self.state} was already downloaded ({self.get_bin_path()})')
return
binary_url = self.state.get_online_binary_url()
logger.info(f'Downloading binary for {self.state} from \'{binary_url}\'')
zip_file_path = f'/tmp/{self.state.name}/archive.zip'
if os.path.exists(os.path.dirname(zip_file_path)):
shutil.rmtree(os.path.dirname(zip_file_path))
os.makedirs(os.path.dirname(zip_file_path))
with requests.get(binary_url, stream=True) as req:
with open(zip_file_path, 'wb') as file:
shutil.copyfileobj(req.raw, file)
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
zip_ref.extractall(os.path.dirname(zip_file_path))
bin_path = self.get_potential_bin_path()
os.makedirs(os.path.dirname(bin_path), exist_ok=True)
unzipped_folder_path = os.path.join(os.path.dirname(zip_file_path), "chrome-linux")
self.__remove_unnecessary_files(unzipped_folder_path)
util.safe_move_dir(unzipped_folder_path, os.path.dirname(bin_path))
cli.execute_and_return_status("chmod -R a+x %s" % os.path.dirname(bin_path))
# Remove temporary files in /tmp/COMMIT_POS
shutil.rmtree(os.path.dirname(zip_file_path))
def configure_binary(self):
binary_folder = os.path.dirname(self.get_potential_bin_path())
self.__remove_unnecessary_files(binary_folder)
cli.execute_and_return_status(f'chmod -R a+x {binary_folder}')

def __remove_unnecessary_files(self, binary_folder_path: str) -> None:
"""
@@ -90,6 +57,7 @@ def _get_version(self) -> str:
if bin_path := self.get_bin_path():
output = cli.execute_and_return_output(command, cwd=os.path.dirname(bin_path))
else:
BinaryCache.remove_binary_files(self.state)
raise AttributeError(f'Could not get binary path for {self.state}')
match = re.match(r'Chromium (?P<version>[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)', output)
if match:
40 changes: 8 additions & 32 deletions bci/browser/binary/vendors/firefox.py
@@ -1,12 +1,8 @@
import logging
import os
import re
import shutil
import tarfile

import requests

from bci import cli, util
from bci import cli
from bci.browser.binary.artisanal_manager import ArtisanalBuildManager
from bci.browser.binary.binary import Binary
from bci.version_control.states.state import State
@@ -19,7 +15,6 @@


class FirefoxBinary(Binary):

def __init__(self, state: State):
super().__init__(state)

@@ -35,36 +30,17 @@ def browser_name(self) -> str:
def bin_folder_path(self) -> str:
return BIN_FOLDER_PATH

def download_binary(self):
if self.is_available_locally():
logger.debug(f'Binary for {self.state} was already downloaded ({self.get_bin_path()})')
return
binary_url = self.state.get_online_binary_url()
logger.debug(f'Downloading binary for {self.state} from \'{binary_url}\'')
tar_file_path = f'/tmp/{self.state.name}/archive.tar.bz2'
if os.path.exists(os.path.dirname(tar_file_path)):
shutil.rmtree(os.path.dirname(tar_file_path))
os.makedirs(os.path.dirname(tar_file_path))
with requests.get(binary_url, stream=True) as req:
with open(tar_file_path, 'wb') as file:
shutil.copyfileobj(req.raw, file)
with tarfile.open(tar_file_path, "r:bz2") as tar_ref:
tar_ref.extractall(os.path.dirname(tar_file_path))
bin_path = self.get_potential_bin_path()
os.makedirs(os.path.dirname(bin_path), exist_ok=True)
unzipped_folder_path = os.path.join(os.path.dirname(tar_file_path), "firefox")
util.safe_move_dir(unzipped_folder_path, os.path.dirname(bin_path))
cli.execute_and_return_status("chmod -R a+x %s" % os.path.dirname(bin_path))
cli.execute_and_return_status("chmod -R a+w %s" % os.path.dirname(bin_path))
# Remove temporary files in /tmp/COMMIT_POS
shutil.rmtree(os.path.dirname(tar_file_path))
def configure_binary(self) -> None:
binary_folder = os.path.dirname(self.get_potential_bin_path())
cli.execute_and_return_status(f'chmod -R a+x {binary_folder}')
cli.execute_and_return_status(f'chmod -R a+w {binary_folder}')
# Add policies.json to prevent updates (this measure is effective from version 60)
# https://github.com/mozilla/policy-templates/blob/master/README.md
# (For earlier versions, the prefs.js file is used)
distributions_path = os.path.join(os.path.dirname(bin_path), "distribution")
distributions_path = os.path.join(binary_folder, 'distribution')
os.makedirs(distributions_path, exist_ok=True)
policies_path = os.path.join(distributions_path, "policies.json")
with open(policies_path, "a") as file:
policies_path = os.path.join(distributions_path, 'policies.json')
with open(policies_path, 'a') as file:
file.write('{ "policies": { "DisableAppUpdate": true } }')

def _get_version(self):
4 changes: 4 additions & 0 deletions bci/database/mongo/binary_cache.py
@@ -135,6 +135,10 @@ def store_file(file_path: str) -> None:
elapsed_time = time.time() - start_time
logger.debug(f'Stored binary in {elapsed_time:.2f}s')

@staticmethod
def remove_binary_files(state: State) -> None:
BinaryCache.__remove_revision_binary_files(state.type, state.index)

@staticmethod
def __count_cached_binaries(state_type: Optional[str] = None) -> int:
"""
79 changes: 73 additions & 6 deletions bci/util.py
@@ -6,12 +6,15 @@
import logging
import os
import shutil
import tarfile
import time
import zipfile
from typing import Optional
from urllib.parse import urlparse

import requests

LOGGER = logging.getLogger(__name__)
logger = logging.getLogger(__name__)


def safe_move_file(src_path, dst_path):
@@ -37,6 +40,7 @@ def safe_move_dir(src_path, dst_path):
safe_move_dir(new_src_path, new_dst_path)
else:
raise AttributeError("Something went wrong")
shutil.rmtree(src_path)


def copy_folder(src_path, dst_path):
@@ -83,30 +87,93 @@ def read_web_report(file_name):


def request_html(url: str):
LOGGER.debug(f"Requesting {url}")
logger.debug(f"Requesting {url}")
resp = requests.get(url, timeout=60)
if resp.status_code >= 400:
raise PageNotFound(f"Could not connect to url '{url}'")
return resp.content


def request_json(url: str):
LOGGER.debug(f"Requesting {url}")
logger.debug(f"Requesting {url}")
resp = requests.get(url, timeout=60)
if resp.status_code >= 400:
raise PageNotFound(f"Could not connect to url '{url}'")
LOGGER.debug('Request completed')
logger.debug('Request completed')
return resp.json()


def request_final_url(url: str) -> str:
LOGGER.debug(f"Requesting {url}")
logger.debug(f"Requesting {url}")
resp = requests.get(url, timeout=60)
if resp.status_code >= 400:
raise PageNotFound(f"Could not connect to url '{url}'")
LOGGER.debug('Request completed')
logger.debug('Request completed')
return resp.url


def download_and_extract(urls: list[str], dst_folder_path: str) -> bool:
"""
Downloads an archive from the first reachable URL in the given list and extracts it to dst_folder_path.
This function currently supports zip, tar.bz2 and tar.xz archives.

:return bool: True if an archive was successfully downloaded and extracted, otherwise False.
"""
for url in urls:
logger.debug(f"Attempting to download archive from '{url}'")
tmp_file_name = urlparse(url).path.split('/')[-1]
tmp_file_path = os.path.join('/tmp', tmp_file_name)
if os.path.exists(tmp_file_path):
os.remove(tmp_file_path)
with requests.get(url, stream=True) as req:
if req.status_code != 200:
continue
with open(tmp_file_path, 'wb') as file:
shutil.copyfileobj(req.raw, file)
_, file_extension = os.path.splitext(tmp_file_path)

logger.debug(f"Extracting downloaded archive '{tmp_file_path}'")
match file_extension:
case '.zip':
unzip(tmp_file_path, dst_folder_path)
case '.bz2':
untar(tmp_file_path, dst_folder_path)
case '.xz':
untar(tmp_file_path, dst_folder_path)
case _:
AttributeError(f"File extension {file_extension} is not supported.")
os.remove(tmp_file_path)
return True
return False


def unzip(src_archive_path: str, dst_folder_path: str) -> None:
with zipfile.ZipFile(src_archive_path, 'r') as zip:
members = zip.namelist()
top_dirs_and_files = {name.split('/')[0] for name in members}
# If there is a single top-level directory, we move all contents up.
if len(top_dirs_and_files) == 1:
parent_folder_path = os.path.dirname(dst_folder_path)
zip.extractall(parent_folder_path)
safe_move_dir(os.path.join(parent_folder_path, top_dirs_and_files.pop()), dst_folder_path)
else:
os.makedirs(dst_folder_path, exist_ok=True)
zip.extractall(dst_folder_path)


def untar(src_archive_path: str, dst_folder_path: str) -> None:
os.makedirs(dst_folder_path, exist_ok=True)
# We do not inspect the contents first like in unzip, because that is a very costly operation for tar archives.
with tarfile.open(src_archive_path, 'r:*') as tar:
tar.extractall(dst_folder_path)
members = os.listdir(dst_folder_path)
top_dirs_and_files = {name.split('/')[0] for name in members}
# If there is a single top-level directory, we move all contents up.
if len(top_dirs_and_files) == 1:
safe_move_dir(os.path.join(dst_folder_path, members.pop()), dst_folder_path + '_2')
shutil.rmtree(dst_folder_path)
safe_move_dir(dst_folder_path + '_2', dst_folder_path)


class PageNotFound(Exception):
pass
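Note: a minimal usage sketch of the new helper, assuming FirefoxVersion(100) can be constructed standalone; the destination path below is purely illustrative. download_and_extract tries each candidate URL in order, picks the extraction routine by file extension, and unzip/untar flatten a single top-level archive directory into the destination folder:

from bci import util
from bci.version_control.states.versions.firefox import FirefoxVersion

state = FirefoxVersion(100)
urls = state.get_online_binary_urls()  # .tar.bz2 first, then the .tar.xz fallback
dst_folder = '/tmp/example-binaries/firefox/100'  # illustrative destination
if util.download_and_extract(urls, dst_folder):
    print(f'Extracted a Firefox binary into {dst_folder}')
else:
    print('None of the candidate URLs yielded a usable archive')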
10 changes: 5 additions & 5 deletions bci/version_control/states/revisions/chromium.py
@@ -22,16 +22,16 @@ def has_online_binary(self) -> bool:
if cached_binary_available_online is not None:
return cached_binary_available_online
url = f'https://www.googleapis.com/storage/v1/b/chromium-browser-snapshots/o/Linux_x64%2F{self._revision_nb}%2Fchrome-linux.zip'
req = requests.get(url)
has_binary_online = req.status_code == 200
response = requests.get(url, stream=True)
has_binary_online = response.status_code == 200
MongoDB().store_binary_availability_online_cache('chromium', self, has_binary_online)
return has_binary_online

def get_online_binary_url(self):
return (
def get_online_binary_urls(self) -> list[str]:
return [(
'https://www.googleapis.com/download/storage/v1/b/chromium-browser-snapshots/o/%s%%2F%s%%2Fchrome-%s.zip?alt=media'
% ('Linux_x64', self._revision_nb, 'linux')
)
)]

def _fetch_missing_data(self) -> None:
"""
4 changes: 2 additions & 2 deletions bci/version_control/states/revisions/firefox.py
@@ -26,14 +26,14 @@ def browser_name(self) -> str:
def has_online_binary(self) -> bool:
return RevisionCache.firefox_has_binary_for(revision_nb=self.revision_nb, revision_id=self._revision_id)

def get_online_binary_url(self) -> str:
def get_online_binary_urls(self) -> list[str]:
result = RevisionCache.firefox_get_binary_info(self._revision_id)
if result is None:
raise AttributeError(f"Could not find binary url for '{self._revision_id}")
binary_base_url = result['files_url']
app_version = result['app_version']
binary_url = f'{binary_base_url}firefox-{app_version}.en-US.linux-x86_64.tar.bz2'
return binary_url
return [binary_url]

def get_previous_and_next_state_with_binary(self) -> tuple[State, State]:
previous_revision_nb, next_revision_nb = RevisionCache.firefox_get_previous_and_next_revision_nb_with_binary(
5 changes: 4 additions & 1 deletion bci/version_control/states/state.py
@@ -139,7 +139,10 @@ def has_online_binary(self) -> bool:
pass

@abstractmethod
def get_online_binary_url(self) -> str:
def get_online_binary_urls(self) -> list[str]:
"""
Returns a list of URLs where the associated binary can potentially be downloaded from.
"""
pass

def has_available_binary(self) -> bool:
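Note: the contract change from a single URL to a list lets a state advertise fallback mirrors or archive formats, which util.download_and_extract then tries in order. A hypothetical illustration of a subclass honoring the new signature (ExampleState and its URLs are made up; the remaining abstract members are omitted):

from bci.version_control.states.state import State


class ExampleState(State):
    def get_online_binary_urls(self) -> list[str]:
        # Preferred archive format first, fallback format second.
        return [
            'https://example.org/builds/browser-123.tar.bz2',
            'https://example.org/builds/browser-123.tar.xz',
        ]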
6 changes: 3 additions & 3 deletions bci/version_control/states/versions/chromium.py
@@ -30,11 +30,11 @@ def has_online_binary(self):
MongoDB().store_binary_availability_online_cache('chromium', self, has_binary_online)
return has_binary_online

def get_online_binary_url(self):
return (
def get_online_binary_urls(self) -> list[str]:
return [(
'https://www.googleapis.com/download/storage/v1/b/chromium-browser-snapshots/o/%s%%2F%s%%2Fchrome-%s.zip?alt=media'
% ('Linux_x64', self._revision_nb, 'linux')
)
)]

def convert_to_revision(self) -> ChromiumRevision:
return ChromiumRevision(revision_nb=self._revision_nb)
10 changes: 6 additions & 4 deletions bci/version_control/states/versions/firefox.py
@@ -1,10 +1,9 @@
from bci.version_control.repository.online.firefox import get_release_revision_number, get_release_revision_id
from bci.version_control.repository.online.firefox import get_release_revision_id, get_release_revision_number
from bci.version_control.states.revisions.firefox import FirefoxRevision
from bci.version_control.states.versions.base import BaseVersion


class FirefoxVersion(BaseVersion):

def __init__(self, major_version: int):
super().__init__(major_version)

@@ -21,8 +20,11 @@ def browser_name(self) -> str:
def has_online_binary(self) -> bool:
return True

def get_online_binary_url(self) -> str:
return f'https://ftp.mozilla.org/pub/firefox/releases/{self.major_version}.0/linux-x86_64/en-US/firefox-{self.major_version}.0.tar.bz2'
def get_online_binary_urls(self) -> list[str]:
return [
f'https://ftp.mozilla.org/pub/firefox/releases/{self.major_version}.0/linux-x86_64/en-US/firefox-{self.major_version}.0.tar.bz2',
f'https://ftp.mozilla.org/pub/firefox/releases/{self.major_version}.0/linux-x86_64/en-US/firefox-{self.major_version}.0.tar.xz'
]

def convert_to_revision(self) -> FirefoxRevision:
return FirefoxRevision(revision_nb=self._revision_nb)