Skip to content

Commit

Permalink
global tor issue fixed
Browse files Browse the repository at this point in the history
  • Loading branch information
itsOwen committed Nov 8, 2024
1 parent f20fb86 commit 02752bd
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 53 deletions.
77 changes: 51 additions & 26 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,33 @@ FROM python:3.10-slim-bullseye
# Set the working directory in the container
WORKDIR /app

# Install system dependencies including Git and Tor
# Install system dependencies
RUN apt-get update && apt-get install -y \
wget \
gnupg \
git \
tor \
tor-geoipdb \
# Additional dependencies that might be needed
netcat-traditional \
curl \
build-essential \
python3-dev \
libffi-dev \
procps \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# Configure Tor
# Configure Tor - Simplified configuration
RUN echo "SocksPort 9050" >> /etc/tor/torrc && \
echo "ControlPort 9051" >> /etc/tor/torrc && \
echo "CookieAuthentication 1" >> /etc/tor/torrc
echo "CookieAuthentication 1" >> /etc/tor/torrc && \
echo "DataDirectory /var/lib/tor" >> /etc/tor/torrc

# Cyberscraper repo :)
# Set correct permissions for Tor
RUN chown -R debian-tor:debian-tor /var/lib/tor && \
chmod 700 /var/lib/tor

# Clone the repository
RUN git clone https://github.com/itsOwen/CyberScraper-2077.git .

# Create and activate a virtual environment
Expand All @@ -33,45 +40,63 @@ ENV PATH="/app/venv/bin:$PATH"
# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Install additional Tor-related Python packages
# Install Tor-related Python packages
RUN pip install --no-cache-dir \
PySocks \
requests[socks]

# Install playwright and its browser
RUN pip install playwright requests
RUN playwright install chromium
RUN playwright install-deps
# Specifiers are quoted: unquoted, /bin/sh parses ">=1.7.1" as a redirect to a file named "=1.7.1" and the version bound is silently dropped.
"PySocks>=1.7.1" \
"requests[socks]>=2.28.1"

# Expose ports for Streamlit and Tor
EXPOSE 8501
EXPOSE 9050
EXPOSE 9051
# Install playwright and browser
RUN pip install playwright requests && \
playwright install chromium && \
playwright install-deps

# Create a shell script to run the application
# Create run script with proper Tor startup
RUN echo '#!/bin/bash\n\
\n\
# Start Tor service\n\
echo "Starting Tor service..."\n\
service tor start\n\
\n\
# Wait for Tor to be ready\n\
echo "Waiting for Tor to be ready..."\n\
timeout 60 bash -c "until nc -z localhost 9050; do sleep 1; done"\n\
echo "Waiting for Tor to start..."\n\
for i in {1..30}; do\n\
if ps aux | grep -v grep | grep -q /usr/bin/tor; then\n\
echo "Tor process is running"\n\
if nc -z localhost 9050; then\n\
echo "Tor SOCKS port is listening"\n\
break\n\
fi\n\
fi\n\
if [ $i -eq 30 ]; then\n\
echo "Warning: Tor might not be ready, but continuing..."\n\
fi\n\
sleep 1\n\
done\n\
\n\
# Verify Tor status\n\
echo "Checking Tor service status:"\n\
service tor status\n\
\n\
# Export API key if provided\n\
if [ ! -z "$OPENAI_API_KEY" ]; then\n\
export OPENAI_API_KEY=$OPENAI_API_KEY\n\
echo "OpenAI API key configured"\n\
fi\n\
\n\
if [ ! -z "$GOOGLE_API_KEY" ]; then\n\
export GOOGLE_API_KEY=$GOOGLE_API_KEY\n\
echo "Google API key configured"\n\
fi\n\
\n\
# Check Tor connection (proxy-side DNS; bounded so startup cannot hang if Tor is not bootstrapped yet)\n\
echo "Verifying Tor connection..."\n\
curl --socks5-hostname localhost:9050 --max-time 30 -s https://check.torproject.org/api/ip || echo "Warning: Tor connectivity check failed"\n\
\n\
streamlit run main.py\n\
# Start the application with explicit host binding\n\
echo "Starting CyberScraper 2077..."\n\
streamlit run --server.address 0.0.0.0 --server.port 8501 main.py\n\
' > /app/run.sh

RUN chmod +x /app/run.sh

# Set the entrypoint to the shell script
# Expose ports
EXPOSE 8501 9050 9051

# Set the entrypoint
ENTRYPOINT ["/app/run.sh"]
34 changes: 12 additions & 22 deletions src/scrapers/tor/tor_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,32 +14,22 @@
)

class TorManager:
"""Manages Tor connection and session handling"""

# Build a manager from a TorConfig: configure the logger, then record the
# SOCKS proxy URLs that sessions created by this manager will use.
# NOTE: the default `TorConfig()` is evaluated once, when the function is
# defined, so every call that omits `config` receives the same instance.
def __init__(self, config: TorConfig = TorConfig()):
self.logger = logging.getLogger(__name__)
# Log level follows the config's debug flag.
self.logger.setLevel(logging.DEBUG if config.debug else logging.INFO)
self.config = config
self._setup_logging()
# NOTE(review): this text comes from a rendered diff without +/- markers; the
# call below looks like the pre-change line that the inline `self.proxies`
# assignment replaces - confirm against the repository before relying on it.
self._setup_proxy()
# Store proxy configuration without applying globally
self.proxies = {
# Both schemes point at the local SOCKS port taken from the config.
# NOTE(review): `socks5h` (vs `socks5`) is presumably chosen so hostnames are
# resolved by the proxy - confirm against the requests SOCKS documentation.
'http': f'socks5h://127.0.0.1:{self.config.socks_port}',
'https': f'socks5h://127.0.0.1:{self.config.socks_port}'
}

def _setup_logging(self):
    """Attach a formatted stream handler to ``self.logger``.

    ``logging.getLogger`` returns the same logger object for a given name,
    so adding a handler unconditionally stacks one more handler (and one
    more copy of every log line) each time a manager is created. The
    handler is therefore added only when the logger has none yet.
    """
    if self.logger.handlers:
        # Already configured (by an earlier instance or by the application).
        return
    handler = logging.StreamHandler()
    handler.setFormatter(
        logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    )
    self.logger.addHandler(handler)

def _setup_proxy(self):
    """Configure SOCKS proxy for Tor.

    Side effects performed here:
      * sets the PySocks default proxy to SOCKS5 at
        ``127.0.0.1:<config.socks_port>``;
      * rebinds the module-level ``socket.socket`` to ``socks.socksocket``,
        which affects every piece of code in the process that creates
        sockets through ``socket.socket`` afterwards, not only this manager;
      * stores the ``socks5h`` proxy URLs on ``self.proxies``.

    Raises:
        TorProxyError: if any of the steps above fails. The original
            exception is chained as ``__cause__`` so its traceback is kept.
    """
    try:
        socks.set_default_proxy(socks.SOCKS5, "127.0.0.1", self.config.socks_port)
        # Process-wide monkeypatch: non-Tor traffic is routed through the
        # proxy as well once this line has run.
        socket.socket = socks.socksocket
        self.proxies = {
            'http': f'socks5h://127.0.0.1:{self.config.socks_port}',
            'https': f'socks5h://127.0.0.1:{self.config.socks_port}'
        }
    except Exception as e:
        raise TorProxyError(f"Failed to setup Tor proxy: {str(e)}") from e

def get_headers(self) -> Dict[str, str]:
"""Get randomized Tor Browser-like headers"""
Expand All @@ -57,6 +47,13 @@ def get_headers(self) -> Dict[str, str]:
'Sec-Fetch-User': '?1'
}

def get_tor_session(self) -> requests.Session:
    """Create a requests session that routes through Tor.

    Returns:
        A new ``requests.Session`` whose ``proxies`` are a copy of this
        manager's proxy mapping and whose ``headers`` are whatever
        ``self.get_headers()`` returns for this call.
    """
    session = requests.Session()
    # Copy the mapping so that changing one session's proxies cannot alter
    # the manager's configuration or any other session created from it.
    session.proxies = dict(self.proxies)
    session.headers = self.get_headers()
    return session

async def verify_tor_connection(self) -> bool:
"""Verify Tor connection is working"""
try:
Expand All @@ -74,13 +71,6 @@ async def verify_tor_connection(self) -> bool:
except Exception as e:
raise TorConnectionError(f"Failed to verify Tor connection: {str(e)}")

def get_tor_session(self) -> requests.Session:
    """Create a requests session that routes through Tor.

    Returns:
        A new ``requests.Session`` whose ``proxies`` are a copy of this
        manager's proxy mapping and whose ``headers`` are whatever
        ``self.get_headers()`` returns for this call.
    """
    session = requests.Session()
    # Copy the mapping so that changing one session's proxies cannot alter
    # the manager's configuration or any other session created from it.
    session.proxies = dict(self.proxies)
    session.headers = self.get_headers()
    return session

@staticmethod
def is_onion_url(url: str) -> bool:
"""Check if the given URL is an onion service"""
Expand Down
3 changes: 2 additions & 1 deletion src/scrapers/tor/tor_scraper.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Dict, Any, List, Optional
from typing import Dict, Any
from .tor_manager import TorManager
from .tor_config import TorConfig
from .exceptions import TorException
Expand Down Expand Up @@ -31,6 +31,7 @@ async def fetch_content(self, url: str, proxy: str = None) -> str:
if not self.is_onion_url(url):
raise ValueError("Not an onion URL")

# Use Tor manager to fetch content
content = await self.tor_manager.fetch_content(url)
return content
except Exception as e:
Expand Down
15 changes: 11 additions & 4 deletions src/web_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ async def _fetch_url(self, url: str, pages: Optional[str] = None,
self.current_url = url

try:
# Check if it's an onion URL using the static method
# Check if it's an onion URL
if TorScraper.is_onion_url(url):
if progress_callback:
progress_callback("Fetching content through Tor network...")
Expand All @@ -135,10 +135,17 @@ async def _fetch_url(self, url: str, pages: Optional[str] = None,
self.current_content = content

else:
# Existing regular scraping logic
proxy = await self.proxy_manager.get_proxy()
# Regular scraping without Tor
if progress_callback:
progress_callback(f"Fetching content from {url}")

# Don't use proxy for non-onion URLs
contents = await self.playwright_scraper.fetch_content(
url, proxy, pages, url_pattern, handle_captcha
url,
proxy=None, # Explicitly set proxy to None for regular URLs
pages=pages,
url_pattern=url_pattern,
handle_captcha=handle_captcha
)
self.current_content = "\n".join(contents)

Expand Down

0 comments on commit 02752bd

Please sign in to comment.