Added file support and fixed uri typo #60
base: master
@@ -10,6 +10,8 @@
 from multiprocessing import Pool
 from typing import Callable, List, Tuple
 from urllib.parse import urlparse
+from pathlib import Path
 
 import requests
@@ -83,7 +85,7 @@ def connect(self, timeout):
 
 class Scraper:
     def __init__(
-        self, trackers: List = [], infohashes: Tuple[List, str] = [], timeout: int = 10
+        self, trackerfile: str = "", trackers: List = [], infohashes: Tuple[List, str] = [], timeout: int = 10
     ):
         """
         Launches a scraper bound to a particular tracker
@@ -92,6 +94,7 @@ def __init__(
         :param infohashes (list): List of infohashes SHA-1 representation of the ```info``` key in the torrent file that should be parsed e.g. 95105D919C10E64AE4FA31067A8D37CCD33FE92D
         :param timeout (int): Timeout value in seconds, program exits if no response received within this period
         """
+        self.trackerfile = trackerfile
         self.trackers = trackers
         self.infohashes = infohashes
         self.timeout = timeout
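As a quick illustration of the new constructor argument, a hypothetical call might look like the sketch below. The file name is made up, the import path is an assumption, and the infohash is the example value from the docstring above:

```python
# Hypothetical usage of the new trackerfile argument: a comma-separated
# list of file paths, each file listing one tracker URL per line.
from torrent_tracker_scraper.scraper import Scraper  # assumed import path

scraper = Scraper(
    trackerfile="trackers.txt",  # made-up file name
    infohashes=["95105D919C10E64AE4FA31067A8D37CCD33FE92D"],
    timeout=10,
)
results = scraper.scrape()
```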
@@ -115,9 +118,27 @@ def get_good_infohashes(self) -> list:
         )
         return good_infohashes
 
+    def get_trackers_viafile(self,trackers,filename):

Would you mind writing a test case to cover this? Thanks.

+        my_file = Path(filename)
+        try:
+            my_abs_path = my_file.resolve(strict=True)
+        except FileNotFoundError as e:
+            logger.error("External tracker file not found: %s", e)
+            #raise Exception("External tracker file not found: %s" % e)
+        else:
+            file1 = open(filename, 'r')

I think we can use https://docs.python.org/3/library/pathlib.html#pathlib.Path.open

+            for line in file1.readlines():
+                if is_not_blank(line):
+                    trackers.append(line.rstrip())
+
     def get_trackers(self) -> list:
         if not self.trackers:
             trackers = list()
+            if hasattr(self, "trackerfile"):
+                if self.trackerfile != '':
+                    splitted = self.trackerfile.split(',')
+                    for filepath in splitted:
+                        self.get_trackers_viafile(trackers,filepath)
             response = requests.get("https://newtrackon.com/api/stable")
             response = io.StringIO(response.text)
             for line in response.readlines():
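Following the pathlib.Path.open suggestion above, one possible shape for the method is sketched below. This is not the PR's code, just an illustration: the logger setup mirrors what the module presumably already has, and line.strip() stands in for the existing is_not_blank helper:

```python
import logging
from pathlib import Path

logger = logging.getLogger(__name__)  # the module already defines a logger; repeated here for completeness

def get_trackers_viafile(self, trackers, filename):
    """Append tracker URLs read from *filename* to *trackers*, skipping blank lines."""
    try:
        # Path.open() raises FileNotFoundError on a missing file, so the separate
        # resolve(strict=True) check is unnecessary, and the context manager closes
        # the file handle that the current version leaves open.
        with Path(filename).open("r") as tracker_file:
            for line in tracker_file:
                if line.strip():  # stands in for is_not_blank()
                    trackers.append(line.rstrip())
    except FileNotFoundError as e:
        logger.error("External tracker file not found: %s", e)
```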
@@ -137,7 +158,11 @@ def _connect_request(self, transaction_id: int):
         )
         self.connection.sock.send(packet)
         # Receive a Connect Request response
-        res = self.connection.sock.recv(16)
+        try:
+            res = self.connection.sock.recv(16)
+        except ConnectionResetError as e:
+            return -1, "Connection reset error: for {}: {}".format(self.connection, e)
+            #raise Exception("Connection reset error: %s" % e)
         try:
             _, response_transaction_id, connection_id = struct.unpack(">LLQ", res)
         except struct.error as e:
@@ -211,7 +236,7 @@ def scrape_tracker(self, tracker):
 
         logger.debug("Parsing list of infohashes [%s]", tracker.netloc)
         self.connection = Connection(tracker.hostname, tracker.port, self.timeout)
-        tracker_url = f"{tracker.scheme}//:{tracker.netloc}"
+        tracker_url = f"{tracker.scheme}://{tracker.netloc}"
         # Quit scraping if there is no connection
         if self.connection.sock is None:
             # TODO: Return info which tracker failed
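For reference, the typo fix above matters because the old f-string put the slashes on the wrong side of the colon; a minimal check with an illustrative tracker URL:

```python
from urllib.parse import urlparse

tracker = urlparse("udp://tracker.opentrackr.org:1337/announce")  # illustrative URL
print(f"{tracker.scheme}//:{tracker.netloc}")  # old form:   udp//:tracker.opentrackr.org:1337
print(f"{tracker.scheme}://{tracker.netloc}")  # fixed form: udp://tracker.opentrackr.org:1337
```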
@@ -226,7 +251,9 @@ def scrape_tracker(self, tracker):
         except socket.timeout as e:
             logger.error("Socket timeout for %s: %s", self.connection, e)
             return ["Socket timeout for %s: %s" % (self.connection, e)]
+        if response_transaction_id == -1:
+            logger.error(connection_id)
+            return [connection_id]
         if transaction_id != response_transaction_id:
             raise RuntimeError(
                 "Transaction ID doesnt match in connect request [%s]. Expected %d, got %d"
@@ -243,6 +270,9 @@ def scrape_tracker(self, tracker):
         results += _bad_infohashes
         return {"tracker": tracker_url, "results": results}
 
+    def Addtrackfile(self, filename): #comma seperated lists of files to read trackers from

I can't find references to this method anywhere, where is it used?

+        self.trackerfile += filename
+
     def scrape(self):
         """
         Takes in an infohash or infohashes. Returns seeders, leechers and completed
A docstring update would be good
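A possible line for that update, following the existing :param style in the constructor docstring, might be (a sketch only):

```python
"""
:param trackerfile (str): Comma-separated list of file paths, each file containing one tracker URL per line
"""
```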
I will try to push an update to the docstring soon
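As for the test case requested earlier for get_trackers_viafile, a pytest-style sketch could look like this; the import path and the expected behaviour are assumptions, not part of the PR:

```python
from torrent_tracker_scraper.scraper import Scraper  # assumed import path

def test_get_trackers_viafile_skips_blank_lines(tmp_path):
    # Write a small tracker file containing one URL and one blank line.
    tracker_file = tmp_path / "trackers.txt"
    tracker_file.write_text("udp://tracker.example.org:1337/announce\n\n")

    trackers = []
    Scraper().get_trackers_viafile(trackers, str(tracker_file))

    assert trackers == ["udp://tracker.example.org:1337/announce"]

def test_get_trackers_viafile_missing_file_is_logged_not_raised(tmp_path):
    trackers = []
    # A missing file should be logged and leave the list unchanged.
    Scraper().get_trackers_viafile(trackers, str(tmp_path / "missing.txt"))
    assert trackers == []
```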