Skip to content

Commit

Permalink
strip /api/files in desy spider
Browse files Browse the repository at this point in the history
  • Loading branch information
MJedr committed Mar 15, 2023
1 parent 575b189 commit c17f0a5
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 1 deletion.
2 changes: 1 addition & 1 deletion hepcrawl/spiders/desy_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def _is_local_path(cls, url):
def _get_full_uri(self, file_name, schema='https'):
self.move_file_to_processed(file_name)
url = self.s3_url_for_file(file_name, bucket=self.s3_output_bucket)
return url
return url.lstrip('/api/files/')

def parse(self, response):
"""Parse a ``Desy`` jsonl file into a :class:`hepcrawl.utils.ParsedItem`.
Expand Down
1 change: 1 addition & 0 deletions tests/functional/desy/test_desy.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,7 @@ def test_desy_crawl_twice(expected_results, settings, cleanup):
# preprocess s3 urls
for rec in gotten_records:
for document in rec.get('documents', []):
assert '/api/files' not in document
if settings['CRAWLER_ARGUMENTS']['s3_server'] in document['url']:
assert "&Expires=" in document['url']
document['url'] = document['url'].split('&Expires=')[0]
Expand Down

0 comments on commit c17f0a5

Please sign in to comment.