
Commit 2112dad

feat: add github actions workflows for scraperv2
Adds a reusable workflow for scraping sources, along with:
- a workflow for BitcoinTranscripts
- minor fixes identified during testing of the workflow
- removal of the old workflow for bitcointranscripts
1 parent 949c3c1 commit 2112dad

9 files changed: +123 −54 lines changed
.github/workflows/bitcointranscripts.yml

Lines changed: 7 additions & 25 deletions

@@ -1,32 +1,14 @@
-name: Bitcoin Transcripts
+name: BitcoinTranscripts
+
 on:
   schedule:
     - cron: '0 13 * * 3' # every Wednesday at 1pm UTC
   workflow_dispatch:
   repository_dispatch:

 jobs:
-  fetch:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v2
-      - uses: actions/setup-python@v2
-        with:
-          python-version: 3.9
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r requirements.txt
-      - name: Fetch data
-        run: |
-          mkdir /tmp/data
-          python bitcointranscripts/main.py
-        env:
-          ES_ENGINE: ${{ secrets.ES_ENGINE }}
-          ES_URL: ${{ secrets.ES_URL }}
-          ES_TOKEN: ${{ secrets.ES_TOKEN }}
-          DATA_DIR: /tmp/data
-          CLOUD_ID: ${{ secrets.CLOUD_ID }}
-          USER_PASSWORD: ${{ secrets.USER_PASSWORD }}
-          USERNAME: ${{ secrets.USERNAME }}
-          INDEX: ${{ secrets.INDEX }}
+  bitcointranscripts:
+    uses: ./.github/workflows/scrape-source.yml
+    with:
+      source: bitcointranscripts
+    secrets: inherit
.github/workflows/scrape-source.yml

Lines changed: 39 additions & 0 deletions

@@ -0,0 +1,39 @@
+name: Scrape Source
+
+on:
+  workflow_call:
+    inputs:
+      source:
+        required: true
+        type: string
+
+jobs:
+  scrape:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.11"
+
+      - name: Get latest release
+        id: latest_release
+        uses: pozetroninc/github-action-get-latest-release@v0.7.0
+        with:
+          repository: bitcoinsearch/scraper
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Install release
+        run: |
+          # Get the wheel filename from the release assets
+          WHEEL_URL=$(curl -s https://api.github.com/repos/bitcoinsearch/scraper/releases/latest | jq -r '.assets[] | select(.name | endswith(".whl")) | .browser_download_url')
+          pip install $WHEEL_URL
+
+      - name: Run scraper
+        run: scraper scrape --source ${{ inputs.source }}
+        env:
+          CLOUD_ID: ${{ secrets.CLOUD_ID }}
+          API_KEY: ${{ secrets.API_KEY }}
+          INDEX: ${{ secrets.INDEX }}
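For local testing outside Actions, the same "latest wheel" lookup that the Install release step performs with curl and jq can be sketched in Python. This is a hypothetical helper, not part of the repository; it assumes the requests package is installed and that the latest release of bitcoinsearch/scraper publishes a .whl asset.

import requests

def latest_wheel_url(repo: str = "bitcoinsearch/scraper") -> str:
    """Return the download URL of the .whl asset on the repo's latest release."""
    resp = requests.get(f"https://api.github.com/repos/{repo}/releases/latest", timeout=30)
    resp.raise_for_status()
    for asset in resp.json().get("assets", []):
        # Mirror the workflow's jq filter: pick the asset whose name ends with .whl
        if asset["name"].endswith(".whl"):
            return asset["browser_download_url"]
    raise RuntimeError("no .whl asset found in the latest release")

if __name__ == "__main__":
    # Pass the printed URL to `pip install`, as the workflow does.
    print(latest_wheel_url())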

README.md

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ Weekly
 - Bitcoin Talk Forum ([cron](.github/workflows/bitcointalk.yml), [source](bitcointalk))
   - only the [Development & Technical Discussion Board](https://bitcointalk.org/index.php?board=6.0)
   - only for specific authors
-- [Bitcoin Transcript](https://btctranscripts.com/) ([cron](.github/workflows/bitcointranscripts.yml), [source](bitcointranscripts))
+- [Bitcoin Transcript](https://btctranscripts.com/) ([cron](.github/workflows/bitcointranscripts.yml), [source](scraper/scrapers/bitcointranscripts.py))
 - [Bitcoin Optech](https://bitcoinops.org/) ([cron](.github/workflows/bitcoinops.yml), [source](bitcoinops))

 Additionally, for on-demand scraping tasks, we utilize a Scrapybot, details of which can be found in the [Scrapybot section](#scrapybot) below.

scraper/commands/elastic.py

Lines changed: 41 additions & 5 deletions
@@ -100,14 +100,50 @@ async def cleanup():
             click.echo(f"Index {index_name} does not exist")
             return

+        # Define query based on the cleanup type
         if test_docs_only:
-            await output.cleanup_test_documents(index_name)
-            click.echo(f"Cleaned up test documents from index {index_name}")
+            query = {"query": {"term": {"test_document": True}}}
+            operation_desc = "test documents"
         else:
-            output.es.delete_by_query(
-                index=index_name, body={"query": {"match_all": {}}}
+            query = {"query": {"match_all": {}}}
+            operation_desc = "documents"
+
+        try:
+            # First count how many documents will be affected
+            count_result = output.es.count(index=index_name, body=query)
+            doc_count = count_result["count"]
+
+            # Ask for confirmation
+            if not click.confirm(
+                f"\nWarning: {doc_count} {operation_desc} will be deleted from index '{index_name}'. Do you want to continue?"
+            ):
+                click.echo("Operation cancelled")
+                return
+
+            # Proceed with deletion
+            delete_result = output.es.delete_by_query(
+                index=index_name, body=query
             )
-            click.echo(f"Removed all documents from index {index_name}")
+
+            # Print detailed deletion results
+            click.echo("\nDeletion Results:")
+            click.echo(
+                f"Total {operation_desc} deleted: {delete_result['deleted']}"
+            )
+            click.echo(f"Total batches: {delete_result['batches']}")
+            click.echo(f"Documents that failed: {delete_result['failures']}")
+            click.echo(f"Time taken: {delete_result['took']}ms")
+
+            if delete_result.get("failures"):
+                click.echo("\nFailures encountered:")
+                for failure in delete_result["failures"]:
+                    click.echo(f"Document ID: {failure['_id']}")
+                    click.echo(f"Error: {failure.get('error')}")
+                    click.echo("---")
+
+        except Exception as e:
+            click.echo(f"Error during cleanup: {e}", err=True)
+            raise click.ClickException(str(e))

     return run_in_reactor(cleanup())
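Outside the CLI, the same count-then-confirm-then-delete flow maps directly onto the Elasticsearch Python client. A minimal standalone sketch, assuming a reachable cluster; the connection URL and index name below are placeholders, and click's confirmation prompt is replaced by a plain count check.

from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")  # placeholder connection
index_name = "scraper-test"                  # placeholder index
query = {"query": {"term": {"test_document": True}}}

# Count first so the caller knows how many documents are at stake.
doc_count = es.count(index=index_name, body=query)["count"]
print(f"{doc_count} test documents would be deleted from '{index_name}'")

if doc_count:
    result = es.delete_by_query(index=index_name, body=query)
    # delete_by_query reports totals for deleted docs, batches, failures and timing.
    print(f"deleted={result['deleted']} batches={result['batches']} "
          f"failures={len(result['failures'])} took={result['took']}ms")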

scraper/outputs/elasticsearch_output.py

Lines changed: 1 addition & 14 deletions
@@ -100,7 +100,7 @@ async def _query_runs(

         query = {
             "query": {"bool": {"must": must_clauses}},
-            "sort": [{"timestamp": {"order": "desc"}}],
+            "sort": [{"finished_at": {"order": "desc"}}],
             "size": size,
         }

@@ -128,19 +128,6 @@ async def get_recent_runs(
         """Get the most recent runs for a source."""
         return await self._query_runs(source=source, size=limit)

-    async def cleanup_test_documents(self, index_name: str):
-        """Remove all test documents from the specified index."""
-        query = {"query": {"term": {"test_document": True}}}
-        try:
-            result = self.es.delete_by_query(index=index_name, body=query)
-            logger.info(
-                f"Cleaned up {result['deleted']} test documents from index {index_name}"
-            )
-        except Exception as e:
-            logger.error(f"Error cleaning up test documents: {e}")
-            logger.exception("Full traceback:")
-            raise
-
     async def create_index_with_mapping(self, index_name: str, mapping: dict):
         """
         Create an index with a specific mapping.

scraper/processors/topic_extractor_processor.py

Lines changed: 22 additions & 2 deletions
@@ -1,6 +1,9 @@
 import json
+from pathlib import Path
 from typing import List
+from loguru import logger

+from scraper.config import get_project_root
 from scraper.models import ScrapedDocument
 from .base_processor import BaseProcessor
 from scraper.registry import processor_registry
@@ -12,8 +15,25 @@ def __init__(self):
         self.topics_list = self.load_topics()

     def load_topics(self) -> List[str]:
-        with open("scraper/processors/topics_list.json", "r") as f:
-            return json.load(f)["topics"]
+        topics_path = Path(get_project_root()) / "processors" / "topics_list.json"
+        try:
+            with open(topics_path, "r") as f:
+                return json.load(f)["topics"]
+        except FileNotFoundError:
+            logger.warning(
+                f"Topics file not found at {topics_path}. Using empty topics list."
+            )
+            return []
+        except json.JSONDecodeError:
+            logger.error(
+                f"Invalid JSON in topics file: {topics_path}. Using empty topics list."
+            )
+            return []
+        except KeyError:
+            logger.error(
+                f"Missing 'topics' key in topics file: {topics_path}. Using empty topics list."
+            )
+            return []

     async def process(self, document: ScrapedDocument) -> ScrapedDocument:
         # Placeholder logic - replace with actual topic extraction

scraper/scrapers/github.py

Lines changed: 8 additions & 2 deletions
@@ -243,10 +243,16 @@ def customize_document(
         return document_data

     def generate_id(self, file_path: str) -> str:
-        # Override this method to customize ID generation
+        """
+        Override this method in subclasses to customize ID generation.
+        """
+        # Since file_path is relative (e.g. 'tabconf/2022/file.zh.md'),
+        # we can safely use directory structure in ID generation
+        dir_path = os.path.dirname(file_path)
         file_name = os.path.basename(file_path)
+        # Keep language suffix (e.g. .zh) but remove final extension (.md)
         name_without_extension = os.path.splitext(file_name)[0]
-        return f"{self.config.name.lower()}-{slugify(name_without_extension)}"
+        return f"{self.config.name.lower()}-{slugify(dir_path)}-{slugify(name_without_extension)}"

     def get_title(self, metadata: Dict[str, Any], body: str) -> str:
         # First, check if there's a title in the metadata
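Combined with the slugify change in scraper/utils.py below, directory components now survive into the document ID, which keeps files with identical names in different folders from colliding. A minimal sketch of the resulting behaviour: the simplified slugify here only mirrors the steps documented in its docstring (the real implementation lives in scraper/utils.py), and the example path comes from the comment in the diff above.

import os
import re
import unicodedata

def slugify(value: str) -> str:
    # Simplified stand-in for scraper.utils.slugify, following its documented steps.
    value = unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode("ascii")
    value = value.replace(os.sep, "-")     # directory separators -> hyphens
    value = re.sub(r"[_\s]+", "-", value)  # spaces and underscores -> hyphens
    value = re.sub(r"[^\w-]", "", value)   # drop chars that aren't alphanumerics, underscores, or hyphens
    return value.lower().strip("-")

def generate_id(source_name: str, file_path: str) -> str:
    dir_path = os.path.dirname(file_path)
    name_without_extension = os.path.splitext(os.path.basename(file_path))[0]
    return f"{source_name.lower()}-{slugify(dir_path)}-{slugify(name_without_extension)}"

print(generate_id("BitcoinTranscripts", "tabconf/2022/file.zh.md"))
# -> bitcointranscripts-tabconf-2022-filezh (under this simplified slugify)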

scraper/sources.yaml

Lines changed: 0 additions & 4 deletions
@@ -17,10 +17,6 @@ github:
   - name: BitcoinTranscripts
     domain: https://btctranscripts.com
     url: https://github.com/bitcointranscripts/bitcointranscripts.git
-    processors:
-      - summarization
-      - topic_extractor
-      - vector_embeddings
   - name: PR-Review-Club
     domain: https://bitcoincore.reviews/
     url: https://github.com/bitcoin-core-review-club/website.git

scraper/utils.py

Lines changed: 4 additions & 1 deletion
@@ -1,3 +1,4 @@
+import os
 import re
 import unicodedata
 from typing import Tuple
@@ -9,7 +10,7 @@ def slugify(value: str) -> str:
     """
     Convert a string to a URL-friendly slug.
     - Normalize to ASCII
-    - Replace spaces and underscores with hyphens
+    - Replace spaces, underscores and directory separators with hyphens
     - Remove characters that aren't alphanumerics, underscores, or hyphens
     - Convert to lowercase
     - Strip leading and trailing hyphens
@@ -18,6 +19,8 @@ def slugify(value: str) -> str:
     value = (
         unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode("ascii")
     )
+    # Replace directory separators to hyphens
+    value = value.replace(os.sep, "-")

     # Replace spaces and underscores with hyphens, remove invalid characters
     value = re.sub(r"[_\s]+", "-", value)  # Replace spaces and underscores
