
Commit 2112dad

feat: add github actions workflows for scraperv2
Adds a reusable workflow for scraping sources, along with:
- a workflow for BitcoinTranscripts
- minor fixes identified during testing of the workflow
- removal of the old workflow for bitcointranscripts
1 parent 949c3c1 commit 2112dad

9 files changed: +123 −54 lines changed
.github/workflows/bitcointranscripts.yml

Lines changed: 7 additions & 25 deletions

@@ -1,32 +1,14 @@
-name: Bitcoin Transcripts
+name: BitcoinTranscripts
+
 on:
   schedule:
     - cron: '0 13 * * 3' # every Wednesday at 1pm UTC
   workflow_dispatch:
   repository_dispatch:

 jobs:
-  fetch:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v2
-      - uses: actions/setup-python@v2
-        with:
-          python-version: 3.9
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r requirements.txt
-      - name: Fetch data
-        run: |
-          mkdir /tmp/data
-          python bitcointranscripts/main.py
-        env:
-          ES_ENGINE: ${{ secrets.ES_ENGINE }}
-          ES_URL: ${{ secrets.ES_URL }}
-          ES_TOKEN: ${{ secrets.ES_TOKEN }}
-          DATA_DIR: /tmp/data
-          CLOUD_ID: ${{ secrets.CLOUD_ID }}
-          USER_PASSWORD: ${{ secrets.USER_PASSWORD }}
-          USERNAME: ${{ secrets.USERNAME }}
-          INDEX: ${{ secrets.INDEX }}
+  bitcointranscripts:
+    uses: ./.github/workflows/scrape-source.yml
+    with:
+      source: bitcointranscripts
+    secrets: inherit
.github/workflows/scrape-source.yml

Lines changed: 39 additions & 0 deletions

@@ -0,0 +1,39 @@
+name: Scrape Source
+
+on:
+  workflow_call:
+    inputs:
+      source:
+        required: true
+        type: string
+
+jobs:
+  scrape:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.11"
+
+      - name: Get latest release
+        id: latest_release
+        uses: pozetroninc/github-action-get-latest-release@v0.7.0
+        with:
+          repository: bitcoinsearch/scraper
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Install release
+        run: |
+          # Get the wheel filename from the release assets
+          WHEEL_URL=$(curl -s https://api.github.com/repos/bitcoinsearch/scraper/releases/latest | jq -r '.assets[] | select(.name | endswith(".whl")) | .browser_download_url')
+          pip install $WHEEL_URL
+
+      - name: Run scraper
+        run: scraper scrape --source ${{ inputs.source }}
+        env:
+          CLOUD_ID: ${{ secrets.CLOUD_ID }}
+          API_KEY: ${{ secrets.API_KEY }}
+          INDEX: ${{ secrets.INDEX }}
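For local testing outside Actions, the same "latest wheel" lookup that the Install release step performs with curl and jq can be sketched in Python. This is a hypothetical helper, not part of the repository; it assumes the requests package is installed and that the latest release of bitcoinsearch/scraper publishes a .whl asset.

import requests

def latest_wheel_url(repo: str = "bitcoinsearch/scraper") -> str:
    """Return the download URL of the .whl asset on the repo's latest release."""
    resp = requests.get(f"https://api.github.com/repos/{repo}/releases/latest", timeout=30)
    resp.raise_for_status()
    for asset in resp.json().get("assets", []):
        # Mirror the workflow's jq filter: pick the asset whose name ends with .whl
        if asset["name"].endswith(".whl"):
            return asset["browser_download_url"]
    raise RuntimeError("no .whl asset found in the latest release")

if __name__ == "__main__":
    # Pass the printed URL to `pip install`, as the workflow does.
    print(latest_wheel_url())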

README.md

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ Weekly
 - Bitcoin Talk Forum ([cron](.github/workflows/bitcointalk.yml), [source](bitcointalk))
   - only the [Development & Technical Discussion Board](https://bitcointalk.org/index.php?board=6.0)
   - only for specific authors
-- [Bitcoin Transcript](https://btctranscripts.com/) ([cron](.github/workflows/bitcointranscripts.yml), [source](bitcointranscripts))
+- [Bitcoin Transcript](https://btctranscripts.com/) ([cron](.github/workflows/bitcointranscripts.yml), [source](scraper/scrapers/bitcointranscripts.py))
 - [Bitcoin Optech](https://bitcoinops.org/) ([cron](.github/workflows/bitcoinops.yml), [source](bitcoinops))

 Additionally, for on-demand scraping tasks, we utilize a Scrapybot, details of which can be found in the [Scrapybot section](#scrapybot) below.

scraper/commands/elastic.py

Lines changed: 41 additions & 5 deletions
@@ -100,14 +100,50 @@ async def cleanup():
             click.echo(f"Index {index_name} does not exist")
             return

+        # Define query based on the cleanup type
         if test_docs_only:
-            await output.cleanup_test_documents(index_name)
-            click.echo(f"Cleaned up test documents from index {index_name}")
+            query = {"query": {"term": {"test_document": True}}}
+            operation_desc = "test documents"
         else:
-            output.es.delete_by_query(
-                index=index_name, body={"query": {"match_all": {}}}
+            query = {"query": {"match_all": {}}}
+            operation_desc = "documents"
+
+        try:
+            # First count how many documents will be affected
+            count_result = output.es.count(index=index_name, body=query)
+            doc_count = count_result["count"]
+
+            # Ask for confirmation
+            if not click.confirm(
+                f"\nWarning: {doc_count} {operation_desc} will be deleted from index '{index_name}'. Do you want to continue?"
+            ):
+                click.echo("Operation cancelled")
+                return
+
+            # Proceed with deletion
+            delete_result = output.es.delete_by_query(
+                index=index_name, body=query
             )
-            click.echo(f"Removed all documents from index {index_name}")
+
+            # Print detailed deletion results
+            click.echo("\nDeletion Results:")
+            click.echo(
+                f"Total {operation_desc} deleted: {delete_result['deleted']}"
+            )
+            click.echo(f"Total batches: {delete_result['batches']}")
+            click.echo(f"Documents that failed: {delete_result['failures']}")
+            click.echo(f"Time taken: {delete_result['took']}ms")
+
+            if delete_result.get("failures"):
+                click.echo("\nFailures encountered:")
+                for failure in delete_result["failures"]:
+                    click.echo(f"Document ID: {failure['_id']}")
+                    click.echo(f"Error: {failure.get('error')}")
+                    click.echo("---")
+
+        except Exception as e:
+            click.echo(f"Error during cleanup: {e}", err=True)
+            raise click.ClickException(str(e))

     return run_in_reactor(cleanup())
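Outside the CLI, the same count-then-confirm-then-delete flow maps directly onto the Elasticsearch Python client. A minimal standalone sketch, assuming a reachable cluster; the connection URL and index name below are placeholders, and click's confirmation prompt is replaced by a plain count check.

from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")  # placeholder connection
index_name = "scraper-test"                  # placeholder index
query = {"query": {"term": {"test_document": True}}}

# Count first so the caller knows how many documents are at stake.
doc_count = es.count(index=index_name, body=query)["count"]
print(f"{doc_count} test documents would be deleted from '{index_name}'")

if doc_count:
    result = es.delete_by_query(index=index_name, body=query)
    # delete_by_query reports totals for deleted docs, batches, failures and timing.
    print(f"deleted={result['deleted']} batches={result['batches']} "
          f"failures={len(result['failures'])} took={result['took']}ms")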

scraper/outputs/elasticsearch_output.py

Lines changed: 1 addition & 14 deletions
@@ -100,7 +100,7 @@ async def _query_runs(

         query = {
             "query": {"bool": {"must": must_clauses}},
-            "sort": [{"timestamp": {"order": "desc"}}],
+            "sort": [{"finished_at": {"order": "desc"}}],
             "size": size,
         }

@@ -128,19 +128,6 @@ async def get_recent_runs(
         """Get the most recent runs for a source."""
         return await self._query_runs(source=source, size=limit)

-    async def cleanup_test_documents(self, index_name: str):
-        """Remove all test documents from the specified index."""
-        query = {"query": {"term": {"test_document": True}}}
-        try:
-            result = self.es.delete_by_query(index=index_name, body=query)
-            logger.info(
-                f"Cleaned up {result['deleted']} test documents from index {index_name}"
-            )
-        except Exception as e:
-            logger.error(f"Error cleaning up test documents: {e}")
-            logger.exception("Full traceback:")
-            raise
-
     async def create_index_with_mapping(self, index_name: str, mapping: dict):
         """
         Create an index with a specific mapping.

scraper/processors/topic_extractor_processor.py

Lines changed: 22 additions & 2 deletions
@@ -1,6 +1,9 @@
 import json
+from pathlib import Path
 from typing import List
+from loguru import logger

+from scraper.config import get_project_root
 from scraper.models import ScrapedDocument
 from .base_processor import BaseProcessor
 from scraper.registry import processor_registry
@@ -12,8 +15,25 @@ def __init__(self):
         self.topics_list = self.load_topics()

     def load_topics(self) -> List[str]:
-        with open("scraper/processors/topics_list.json", "r") as f:
-            return json.load(f)["topics"]
+        topics_path = Path(get_project_root()) / "processors" / "topics_list.json"
+        try:
+            with open(topics_path, "r") as f:
+                return json.load(f)["topics"]
+        except FileNotFoundError:
+            logger.warning(
+                f"Topics file not found at {topics_path}. Using empty topics list."
+            )
+            return []
+        except json.JSONDecodeError:
+            logger.error(
+                f"Invalid JSON in topics file: {topics_path}. Using empty topics list."
+            )
+            return []
+        except KeyError:
+            logger.error(
+                f"Missing 'topics' key in topics file: {topics_path}. Using empty topics list."
+            )
+            return []

     async def process(self, document: ScrapedDocument) -> ScrapedDocument:
         # Placeholder logic - replace with actual topic extraction

scraper/scrapers/github.py

Lines changed: 8 additions & 2 deletions
@@ -243,10 +243,16 @@ def customize_document(
         return document_data

     def generate_id(self, file_path: str) -> str:
-        # Override this method to customize ID generation
+        """
+        Override this method in subclasses to customize ID generation.
+        """
+        # Since file_path is relative (e.g. 'tabconf/2022/file.zh.md'),
+        # we can safely use directory structure in ID generation
+        dir_path = os.path.dirname(file_path)
         file_name = os.path.basename(file_path)
+        # Keep language suffix (e.g. .zh) but remove final extension (.md)
         name_without_extension = os.path.splitext(file_name)[0]
-        return f"{self.config.name.lower()}-{slugify(name_without_extension)}"
+        return f"{self.config.name.lower()}-{slugify(dir_path)}-{slugify(name_without_extension)}"

     def get_title(self, metadata: Dict[str, Any], body: str) -> str:
         # First, check if there's a title in the metadata
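Combined with the slugify change in scraper/utils.py below, directory components now survive into the document ID, which keeps files with identical names in different folders from colliding. A minimal sketch of the resulting behaviour: the simplified slugify here only mirrors the steps documented in its docstring (the real implementation lives in scraper/utils.py), and the example path comes from the comment in the diff above.

import os
import re
import unicodedata

def slugify(value: str) -> str:
    # Simplified stand-in for scraper.utils.slugify, following its documented steps.
    value = unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode("ascii")
    value = value.replace(os.sep, "-")     # directory separators -> hyphens
    value = re.sub(r"[_\s]+", "-", value)  # spaces and underscores -> hyphens
    value = re.sub(r"[^\w-]", "", value)   # drop chars that aren't alphanumerics, underscores, or hyphens
    return value.lower().strip("-")

def generate_id(source_name: str, file_path: str) -> str:
    dir_path = os.path.dirname(file_path)
    name_without_extension = os.path.splitext(os.path.basename(file_path))[0]
    return f"{source_name.lower()}-{slugify(dir_path)}-{slugify(name_without_extension)}"

print(generate_id("BitcoinTranscripts", "tabconf/2022/file.zh.md"))
# -> bitcointranscripts-tabconf-2022-filezh (under this simplified slugify)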

scraper/sources.yaml

Lines changed: 0 additions & 4 deletions
@@ -17,10 +17,6 @@ github:
   - name: BitcoinTranscripts
     domain: https://btctranscripts.com
     url: https://github.com/bitcointranscripts/bitcointranscripts.git
-    processors:
-      - summarization
-      - topic_extractor
-      - vector_embeddings
   - name: PR-Review-Club
     domain: https://bitcoincore.reviews/
     url: https://github.com/bitcoin-core-review-club/website.git

scraper/utils.py

Lines changed: 4 additions & 1 deletion
@@ -1,3 +1,4 @@
+import os
 import re
 import unicodedata
 from typing import Tuple
@@ -9,7 +10,7 @@ def slugify(value: str) -> str:
     """
     Convert a string to a URL-friendly slug.
     - Normalize to ASCII
-    - Replace spaces and underscores with hyphens
+    - Replace spaces, underscores and directory separators with hyphens
     - Remove characters that aren't alphanumerics, underscores, or hyphens
     - Convert to lowercase
     - Strip leading and trailing hyphens
@@ -18,6 +19,8 @@ def slugify(value: str) -> str:
     value = (
         unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode("ascii")
     )
+    # Replace directory separators to hyphens
+    value = value.replace(os.sep, "-")

     # Replace spaces and underscores with hyphens, remove invalid characters
     value = re.sub(r"[_\s]+", "-", value)  # Replace spaces and underscores
