Added file support and fixed uri typo #60

Open · wants to merge 1 commit into master
38 changes: 34 additions & 4 deletions torrent_tracker_scraper/scraper.py
@@ -10,6 +10,8 @@
 from multiprocessing import Pool
 from typing import Callable, List, Tuple
 from urllib.parse import urlparse
+from pathlib import Path
+
 
 import requests
 
@@ -83,7 +85,7 @@ def connect(self, timeout):

 class Scraper:
     def __init__(
-        self, trackers: List = [], infohashes: Tuple[List, str] = [], timeout: int = 10
+        self, trackerfile: str = "", trackers: List = [], infohashes: Tuple[List, str] = [], timeout: int = 10
     ):
         """
         Launches a scraper bound to a particular tracker

A docstring update would be good

Author: I will try to push an update to the docstring soon.
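For reference, a minimal sketch of what that docstring update could look like, in the style of the existing `:param` lines; the wording is an assumption, not the author's committed text:

```python
        # Hypothetical docstring fragment for the new parameter (wording assumed)
        :param trackerfile (str): Comma-separated list of file paths; tracker
            URLs are read from each file (one URL per line) and merged into
            the tracker list before scraping
```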

@@ -92,6 +94,7 @@ def __init__(
         :param infohashes (list): List of infohashes SHA-1 representation of the ```info``` key in the torrent file that should be parsed e.g. 95105D919C10E64AE4FA31067A8D37CCD33FE92D
         :param timeout (int): Timeout value in seconds, program exits if no response received within this period
         """
+        self.trackerfile = trackerfile
         self.trackers = trackers
         self.infohashes = infohashes
         self.timeout = timeout
@@ -115,9 +118,27 @@ def get_good_infohashes(self) -> list:
         )
         return good_infohashes
 
+    def get_trackers_viafile(self, trackers, filename):

Would you mind writing a test case to cover this? Thanks.
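A rough pytest sketch for it; this assumes `is_not_blank` skips empty lines and that `Scraper()` is constructible with no arguments, so treat it as a starting point rather than a finished test:

```python
from torrent_tracker_scraper.scraper import Scraper

def test_get_trackers_viafile_reads_nonblank_lines(tmp_path):
    # Two tracker URLs separated by a blank line that should be skipped
    tracker_file = tmp_path / "trackers.txt"
    tracker_file.write_text(
        "udp://tracker.example.com:80/announce\n"
        "\n"
        "udp://tracker.example.org:6969/announce\n"
    )

    trackers = []
    Scraper().get_trackers_viafile(trackers, str(tracker_file))

    # Blank line dropped, trailing newlines stripped
    assert trackers == [
        "udp://tracker.example.com:80/announce",
        "udp://tracker.example.org:6969/announce",
    ]
```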

+        my_file = Path(filename)
+        try:
+            my_abs_path = my_file.resolve(strict=True)
+        except FileNotFoundError as e:
+            logger.error("External tracker file not found: %s", e)
+            #raise Exception("External tracker file not found: %s" % e)
+        else:
+            file1 = open(filename, 'r')

I think we can use `Path().open()` to also open and read the file, so `file1 = my_file.open()`.

https://docs.python.org/3/library/pathlib.html#pathlib.Path.open
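A sketch of the block with that suggestion applied; the context manager is my addition, not part of the review:

```python
my_file = Path(filename)
try:
    my_file.resolve(strict=True)
except FileNotFoundError as e:
    logger.error("External tracker file not found: %s", e)
else:
    # Path.open() returns an ordinary file object; the with-block
    # closes it once the lines have been read.
    with my_file.open("r") as file1:
        for line in file1:
            if is_not_blank(line):
                trackers.append(line.rstrip())
```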

+            for line in file1.readlines():
+                if is_not_blank(line):
+                    trackers.append(line.rstrip())
+
     def get_trackers(self) -> list:
         if not self.trackers:
             trackers = list()
+            if hasattr(self, "trackerfile"):
+                if self.trackerfile != '':
+                    splitted = self.trackerfile.split(',')
+                    for filepath in splitted:
+                        self.get_trackers_viafile(trackers, filepath)
             response = requests.get("https://newtrackon.com/api/stable")
             response = io.StringIO(response.text)
             for line in response.readlines():
@@ -137,7 +158,11 @@ def _connect_request(self, transaction_id: int):
         )
         self.connection.sock.send(packet)
         # Receive a Connect Request response
-        res = self.connection.sock.recv(16)
+        try:
+            res = self.connection.sock.recv(16)
+        except ConnectionResetError as e:
+            return -1, "Connection reset error for {}: {}".format(self.connection, e)
+            #raise Exception("Connection reset error: %s" % e)
         try:
             _, response_transaction_id, connection_id = struct.unpack(">LLQ", res)
         except struct.error as e:
@@ -211,7 +236,7 @@ def scrape_tracker(self, tracker):

         logger.debug("Parsing list of infohashes [%s]", tracker.netloc)
         self.connection = Connection(tracker.hostname, tracker.port, self.timeout)
-        tracker_url = f"{tracker.scheme}//:{tracker.netloc}"
+        tracker_url = f"{tracker.scheme}://{tracker.netloc}"
         # Quit scraping if there is no connection
         if self.connection.sock is None:
             # TODO: Return info which tracker failed
@@ -226,7 +251,9 @@
         except socket.timeout as e:
             logger.error("Socket timeout for %s: %s", self.connection, e)
             return ["Socket timeout for %s: %s" % (self.connection, e)]
-
+        if response_transaction_id == -1:
+            logger.error(connection_id)
+            return [connection_id]
         if transaction_id != response_transaction_id:
             raise RuntimeError(
                 "Transaction ID doesn't match in connect request [%s]. Expected %d, got %d"
@@ -243,6 +270,9 @@
         results += _bad_infohashes
         return {"tracker": tracker_url, "results": results}
 
+    def Addtrackfile(self, filename):  # comma-separated list of files to read trackers from

I can't find references to this method anywhere; where is it used?
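If it is meant as a public way to queue up more files after construction, the intended call pattern might be something like the sketch below (an assumption on my part; nothing in this diff calls it). Note that because the body is a bare string concatenation, the caller has to supply the separating comma itself:

```python
from torrent_tracker_scraper.scraper import Scraper

scraper = Scraper(trackerfile="trackers_a.txt")
scraper.Addtrackfile(",trackers_b.txt")  # caller must include the comma
print(scraper.trackerfile)               # -> "trackers_a.txt,trackers_b.txt"
```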

+        self.trackerfile += filename
+
     def scrape(self):
         """
         Takes in an infohash or infohashes. Returns seeders, leechers and completed
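With this branch applied, end-to-end usage might look like the sketch below. The file name and infohash are placeholders, and the shape of `scrape()`'s return value is inferred from `scrape_tracker`'s `{"tracker": ..., "results": ...}` dict rather than confirmed from this diff:

```python
from torrent_tracker_scraper.scraper import Scraper

# trackers.txt holds one tracker URL per line; trackerfile itself may be a
# comma-separated list of such files, per get_trackers_viafile above.
scraper = Scraper(
    trackerfile="trackers.txt",
    infohashes=["95105D919C10E64AE4FA31067A8D37CCD33FE92D"],
    timeout=10,
)

for entry in scraper.scrape():
    print(entry["tracker"], entry["results"])
```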