Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add providing location for fetch #56

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ six==1.14.0
urllib3==1.25.8
wcwidth==0.1.8
zipp==1.2.0
kiss-headers==2.3.0
73 changes: 52 additions & 21 deletions src/fetchcode/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,13 @@

from ftplib import FTP
from mimetypes import MimeTypes
import os
import tempfile
from pathlib import Path
from pathlib import PurePosixPath
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe just PurePath? It will be PurePosixPath on posix anyway and ci runs tests on {linux,macos,windows}

Copy link
Author

@quepop quepop Aug 27, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I use PurePosixPaths only to handle URLs so i think it is more explicit that way.

from urllib.parse import urlparse
from kiss_headers import parse_it
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


import requests
import tempfile


class Response:
Expand All @@ -41,14 +43,35 @@ def __init__(self, location, content_type, size, url):
def fetch_http(url, location):
"""
Return a `Response` object built from fetching the content at a HTTP/HTTPS based `url` URL string
saving the content in a file at `location`
Saving the content in a file at `location`
If `location` is an existing directory - try to deduce the filename
If deduction failed, save the content in a temporary file created at a `location`
"""
r = requests.get(url)
with open(location, 'wb') as f:

if Path.is_dir(location):
content_disposition = parse_it(r.headers).get("content-disposition") or {}
filename_priority = [
content_disposition.get("filename*"),
content_disposition.get("filename"),
PurePosixPath(urlparse(url).path).name,
]
filename_found = False
for filename in filename_priority:
if filename is not None and len(filename):
filename_found = True
location /= filename
break
if not filename_found:
location = Path(
tempfile.NamedTemporaryFile(dir=location, delete=False).name
)

with open(location, "wb") as f:
f.write(r.content)

content_type = r.headers.get('content-type')
size = r.headers.get('content-length')
content_type = r.headers.get("content-type")
size = r.headers.get("content-length")
size = int(size) if size else None

resp = Response(location=location, content_type=content_type, size=size, url=url)
Expand All @@ -59,49 +82,57 @@ def fetch_http(url, location):
def fetch_ftp(url, location):
"""
Return a `Response` object built from fetching the content at a FTP based `url` URL string
saving the content in a file at `location`
Saving the content in a file at `location`
If `location` is an existing directory - deduce the filename from the URL
"""
url_parts = urlparse(url)

netloc = url_parts.netloc
path = url_parts.path
dir, file = os.path.split(path)
path = PurePosixPath(url_parts.path)
directory = path.parent
filename = path.name

if Path.is_dir(location):
location /= filename

ftp = FTP(netloc)
ftp.login()

size = ftp.size(path)
size = ftp.size(str(path))
mime = MimeTypes()
mime_type = mime.guess_type(file)
mime_type = mime.guess_type(filename)
if mime_type:
content_type = mime_type[0]
else:
content_type = None

ftp.cwd(dir)
file = 'RETR {}'.format(file)
with open(location, 'wb') as f:
ftp.retrbinary(file, f.write)
ftp.cwd(str(directory))
filename = "RETR {}".format(filename)
with open(location, "wb") as f:
ftp.retrbinary(filename, f.write)
ftp.close()

resp = Response(location=location, content_type=content_type, size=size, url=url)
return resp


def fetch(url):
def fetch(url, location=None):
"""
Return a `Response` object built from fetching the content at the `url` URL string and store content at a temporary file.
Return a `Response` object built from fetching the content at the `url` URL string and store content at a provided `location`
If `location` is None, save the content in a newly created temporary file
If `location` is an existing directory - try to deduce the filename
"""

temp = tempfile.NamedTemporaryFile(delete=False)
location = temp.name
if location is None:
temp = tempfile.NamedTemporaryFile(delete=False)
location = Path(temp.name)

url_parts = urlparse(url)
scheme = url_parts.scheme

fetchers = {'ftp': fetch_ftp, 'http': fetch_http, 'https': fetch_http}
fetchers = {"ftp": fetch_ftp, "http": fetch_http, "https": fetch_http}

if scheme in fetchers:
return fetchers.get(scheme)(url, location)

raise Exception('Not a supported/known scheme.')
raise Exception("Not a supported/known scheme.")
87 changes: 69 additions & 18 deletions tests/test_fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,52 +14,103 @@
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.

from fetchcode import fetch
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no need to move this import, it was correctly placed: https://www.python.org/dev/peps/pep-0008/#imports

from pathlib import Path
from unittest import mock

import pytest

from fetchcode import fetch

FILENAMES = ["a.ext", "b.ext", "c.ext"]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

consider switching to dict

Suggested change
FILENAMES = ["a.ext", "b.ext", "c.ext"]
FILENAMES = {"a": "a.ext",
"b": "b.ext",
"c": "c.ext"}

so you can access them as FILENAMES["a"] making it a little bit clearer than positionals

HTTP_URL = "http://example.com/"
FTP_URL = "ftp://example.com/"


@mock.patch("fetchcode.requests.get")
def test_fetch_http_with_filename_provided(mock_get):
with mock.patch("fetchcode.open", mock.mock_open()) as mocked_file:
location = Path.home() / FILENAMES[0]
response = fetch(HTTP_URL, location)
assert response is not None
assert response.location == location


@pytest.mark.parametrize(
"parameters, expected_filename",
[
pytest.param(f'filename*="{FILENAMES[0]}"; filename=""', FILENAMES[0]),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

consider adding few test cases:

Suggested change
pytest.param(f'filename*="{FILENAMES[0]}"; filename=""', FILENAMES[0]),
pytest.param(f'filename*="{FILENAMES["a"]}"', FILENAMES["a"], id='should_choose_filename*'),
pytest.param(f'filename="{FILENAMES["a"]}"', FILENAMES["a"], id='should_choose_filename'),
pytest.param(f'filename*="{FILENAMES["a"]}"; filename={FILENAMES["b"]}', FILENAMES["a"], id='filename*_should_take_precedence'),
pytest.param(f'filename*="{FILENAMES["a"]}"; filename=""', FILENAMES["a"], id='should_choose_non_empty_filename*'),
pytest.param(f'filename*=""; filename="{FILENAMES["a"]}"', FILENAMES["a"], id='should_choose_non_empty_filename'),

this could be more readable if filename in url would also be param here and explicitly tested

pytest.param(f'filename*=""; filename="{FILENAMES[1]}"', FILENAMES[1]),
pytest.param(f'filename*=""; filename=""', FILENAMES[2]),
],
)
@mock.patch("fetchcode.requests.get")
def test_fetch_http_with_filename_deduction(mock_get, parameters, expected_filename):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

consider changing parameters name to something more fitting, maybe content_disposition or filename?

mock_get.return_value.headers = {"content-disposition": f"attachment; {parameters}"}
with mock.patch("fetchcode.open", mock.mock_open()) as mocked_file:
location = Path.home()
response = fetch(HTTP_URL + FILENAMES[2], location)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FILENAMES[2] is confusing. I assume this is to test 3rd input set where name is inferred from url?
It could be made more readable - add url_filename parameter to tests and name those inputs (see comment above) so its clear what is tested here

assert response is not None
assert response.location == (location / expected_filename)

@mock.patch('fetchcode.requests.get')

@mock.patch("fetchcode.tempfile.NamedTemporaryFile")
@mock.patch("fetchcode.requests.get")
def test_fetch_http_filename_deduction_failed(mock_get, mock_tmp):
location = Path.home()
mock_get.return_value.headers = {}
mock_tmp.return_value.name = location / FILENAMES[0]
with mock.patch("fetchcode.open", mock.mock_open()) as mocked_file:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why name it when the name is not used?

response = fetch(HTTP_URL, location)
assert response is not None
assert response.location == (location / FILENAMES[0])


@mock.patch("fetchcode.requests.get")
def test_fetch_http_with_tempfile(mock_get):
mock_get.return_value.headers = {
'content-type': 'image/png',
'content-length': '1000999',
"content-type": "image/png",
"content-length": "1000999",
}

with mock.patch('fetchcode.open', mock.mock_open()) as mocked_file:
url = 'https://raw.githubusercontent.com/TG1999/converge/master/assets/Group%2022.png'
with mock.patch("fetchcode.open", mock.mock_open()) as mocked_file:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no need to name it

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thats not my test (black reformatted it)

url = "https://raw.githubusercontent.com/TG1999/converge/master/assets/Group%2022.png"
response = fetch(url=url)
assert response is not None
assert 1000999 == response.size
assert url == response.url
assert 'image/png' == response.content_type
assert "image/png" == response.content_type


@mock.patch('fetchcode.FTP')
@mock.patch("fetchcode.FTP")
def test_fetch_with_wrong_url(mock_get):
with pytest.raises(Exception) as e_info:
url = 'ftp://speedtest/1KB.zip'
url = "ftp://speedtest/1KB.zip"
response = fetch(url=url)
assert 'Not a valid URL' == e_info
assert "Not a valid URL" == e_info


@mock.patch('fetchcode.FTP', autospec=True)
@mock.patch("fetchcode.FTP", autospec=True)
def test_fetch_ftp_with_tempfile(mock_ftp_constructor):
mock_ftp = mock_ftp_constructor.return_value
mock_ftp_constructor.return_value.size.return_value = 1024
with mock.patch('fetchcode.open', mock.mock_open()) as mocked_file:
response = fetch('ftp://speedtest.tele2.net/1KB.zip')
with mock.patch("fetchcode.open", mock.mock_open()) as mocked_file:
response = fetch("ftp://speedtest.tele2.net/1KB.zip")
assert 1024 == response.size
mock_ftp_constructor.assert_called_with('speedtest.tele2.net')
mock_ftp_constructor.assert_called_with("speedtest.tele2.net")
assert mock_ftp.login.called == True
mock_ftp.cwd.assert_called_with('/')
mock_ftp.cwd.assert_called_with("/")
assert mock_ftp.retrbinary.called


@mock.patch("fetchcode.FTP")
def test_fetch_ftp_with_filename_deduction(mock_ftp):
with mock.patch("fetchcode.open", mock.mock_open()) as mocked_file:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ibidem

location = Path.home()
response = fetch(FTP_URL + FILENAMES[0], location)
assert response.location == (location / FILENAMES[0])


def test_fetch_with_scheme_not_present():
with pytest.raises(Exception) as e_info:
url = 'abc://speedtest/1KB.zip'
url = "abc://speedtest/1KB.zip"
response = fetch(url=url)
assert 'Not a supported/known scheme.' == e_info
assert "Not a supported/known scheme." == e_info