Skip to content

Commit 3a3f797

Browse files
committed
feat: add support for paper title search (#90)
Resolves: #90
1 parent e15816f commit 3a3f797

File tree

5 files changed

+209
-8
lines changed

5 files changed

+209
-8
lines changed

semanticscholar/AsyncSemanticScholar.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -397,8 +397,9 @@ async def search_paper(
397397
min_citation_count: int = None,
398398
limit: int = 100,
399399
bulk: bool = False,
400-
sort: str = None
401-
) -> PaginatedResults:
400+
sort: str = None,
401+
match_title: bool = False
402+
) -> Union[PaginatedResults, Paper]:
402403
'''Search for papers by keyword. Performs a search query based on the \
403404
S2 search relevance algorithm, or a bulk retrieval of basic paper \
404405
data without search relevance (if bulk=True). Paper relevance \
@@ -439,8 +440,11 @@ async def search_paper(
439440
<field>:<order> format, where "field" is either paperId, \
440441
publicationDate, or citationCount, and "order" is asc \
441442
(ascending) or desc (descending).
443+
:param bool match_title: (optional) retrieve a single paper whose \
444+
title best matches the given query. Not allowed when bulk=True.
442445
:returns: query results.
443-
:rtype: :class:`semanticscholar.PaginatedResults.PaginatedResults`
446+
:rtype: :class:`semanticscholar.PaginatedResults.PaginatedResults` or \
447+
:class:`semanticscholar.Paper.Paper`
444448
'''
445449

446450
if limit < 1 or limit > 100:
@@ -461,6 +465,12 @@ async def search_paper(
461465
warnings.warn(
462466
'The sort parameter is only used when bulk=True.')
463467

468+
if match_title:
469+
url += '/match'
470+
if bulk:
471+
raise ValueError(
472+
'The match_title parameter is not allowed when bulk=True.')
473+
464474
query += f'&year={year}' if year else ''
465475

466476
if publication_types:
@@ -504,7 +514,7 @@ async def search_paper(
504514
max_results=max_results
505515
)
506516

507-
return results
517+
return results if not match_title else results[0]
508518

509519
async def get_author(
510520
self,

semanticscholar/SemanticScholar.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -298,8 +298,9 @@ def search_paper(
298298
min_citation_count: int = None,
299299
limit: int = 100,
300300
bulk: bool = False,
301-
sort: str = None
302-
) -> PaginatedResults:
301+
sort: str = None,
302+
match_title: bool = False
303+
) -> Union[PaginatedResults, Paper]:
303304
'''Search for papers by keyword. Performs a search query based on the \
304305
S2 search relevance algorithm, or a bulk retrieval of basic paper \
305306
data without search relevance (if bulk=True). Paper relevance \
@@ -340,8 +341,11 @@ def search_paper(
340341
<field>:<order> format, where "field" is either paperId, \
341342
publicationDate, or citationCount, and "order" is asc \
342343
(ascending) or desc (descending).
344+
:param bool match_title: (optional) retrieve a single paper whose \
345+
title best matches the given query. Not allowed when bulk=True.
343346
:returns: query results.
344-
:rtype: :class:`semanticscholar.PaginatedResults.PaginatedResults`
347+
:rtype: :class:`semanticscholar.PaginatedResults.PaginatedResults` or \
348+
:class:`semanticscholar.Paper.Paper`
345349
'''
346350

347351
loop = asyncio.get_event_loop()
@@ -358,7 +362,8 @@ def search_paper(
358362
min_citation_count=min_citation_count,
359363
limit=limit,
360364
bulk=bulk,
361-
sort=sort
365+
sort=sort,
366+
match_title=match_title
362367
)
363368
)
364369

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
interactions:
2+
- request:
3+
body: ''
4+
headers:
5+
accept:
6+
- '*/*'
7+
accept-encoding:
8+
- gzip, deflate
9+
connection:
10+
- keep-alive
11+
host:
12+
- api.semanticscholar.org
13+
user-agent:
14+
- python-httpx/0.27.0
15+
method: GET
16+
uri: https://api.semanticscholar.org/graph/v1/paper/search/match?query=mining%20association%20rules%20between&fields=abstract,authors,citationCount,citationStyles,corpusId,externalIds,fieldsOfStudy,influentialCitationCount,isOpenAccess,journal,openAccessPdf,paperId,publicationDate,publicationTypes,publicationVenue,referenceCount,s2FieldsOfStudy,title,url,venue,year&offset=0&limit=100
17+
response:
18+
body:
19+
string: '{"data": [{"paperId": "6fe8c5bf8dddaadf10c765133d38dfef5714347f", "externalIds":
20+
{"MAG": "2998574808", "DBLP": "conf/sigmod/AgrawalIS93", "DOI": "10.1145/170035.170072",
21+
"CorpusId": 490415}, "corpusId": 490415, "publicationVenue": null, "url":
22+
"https://www.semanticscholar.org/paper/6fe8c5bf8dddaadf10c765133d38dfef5714347f",
23+
"title": "Mining association rules between sets of items in large databases",
24+
"abstract": "We are given a large database of customer transactions. Each
25+
transaction consists of items purchased by a customer in a visit. We present
26+
an efficient algorithm that generates all significant association rules between
27+
items in the database. The algorithm incorporates buffer management and novel
28+
estimation and pruning techniques. We also present results of applying this
29+
algorithm to sales data obtained from a large retailing company, which shows
30+
the effectiveness of the algorithm.", "venue": "SIGMOD Conference", "year":
31+
1993, "referenceCount": 16, "citationCount": 16127, "influentialCitationCount":
32+
1442, "isOpenAccess": true, "openAccessPdf": {"url": "https://dl.acm.org/doi/pdf/10.1145/170036.170072",
33+
"status": "BRONZE"}, "fieldsOfStudy": ["Computer Science"], "s2FieldsOfStudy":
34+
[{"category": "Computer Science", "source": "external"}, {"category": "Computer
35+
Science", "source": "s2-fos-model"}], "publicationTypes": ["JournalArticle",
36+
"Book", "Conference"], "publicationDate": "1993-06-01", "journal": {"name":
37+
"Proceedings of the 1993 ACM SIGMOD international conference on Management
38+
of data"}, "citationStyles": {"bibtex": "@Article{Agrawal1993MiningAR,\n author
39+
= {R. Agrawal and T. Imielinski and A. Swami},\n booktitle = {SIGMOD Conference},\n
40+
journal = {Proceedings of the 1993 ACM SIGMOD international conference on
41+
Management of data},\n title = {Mining association rules between sets of items
42+
in large databases},\n year = {1993}\n}\n"}, "authors": [{"authorId": "144947410",
43+
"name": "R. Agrawal"}, {"authorId": "1733797", "name": "T. Imielinski"}, {"authorId":
44+
"31536502", "name": "A. Swami"}], "matchScore": 109.24948}]}
45+
46+
'
47+
headers:
48+
Access-Control-Allow-Origin:
49+
- '*'
50+
Connection:
51+
- keep-alive
52+
Content-Length:
53+
- '2061'
54+
Content-Type:
55+
- application/json
56+
Date:
57+
- Sat, 13 Jul 2024 21:32:29 GMT
58+
Via:
59+
- 1.1 fb1afad5f0fba8de226916a2cd3f83ca.cloudfront.net (CloudFront)
60+
X-Amz-Cf-Id:
61+
- TkTBfJ5v8T3RauNr_ArL3gjaNEwKLYLYAcZXicorKKpXJ03i5nVP_w==
62+
X-Amz-Cf-Pop:
63+
- GRU1-P4
64+
X-Cache:
65+
- Miss from cloudfront
66+
x-amz-apigw-id:
67+
- a3rRGG-aPHcEBFQ=
68+
x-amzn-Remapped-Connection:
69+
- keep-alive
70+
x-amzn-Remapped-Content-Length:
71+
- '2061'
72+
x-amzn-Remapped-Date:
73+
- Sat, 13 Jul 2024 21:32:29 GMT
74+
x-amzn-Remapped-Server:
75+
- gunicorn
76+
x-amzn-RequestId:
77+
- 98740656-525b-40bc-9d53-4359ddb7ff6d
78+
status:
79+
code: 200
80+
message: OK
81+
version: 1
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
interactions:
2+
- request:
3+
body: ''
4+
headers:
5+
accept:
6+
- '*/*'
7+
accept-encoding:
8+
- gzip, deflate
9+
connection:
10+
- keep-alive
11+
host:
12+
- api.semanticscholar.org
13+
user-agent:
14+
- python-httpx/0.27.0
15+
method: GET
16+
uri: https://api.semanticscholar.org/graph/v1/paper/search/match?query=mining%20association%20rules%20between&fields=abstract,authors,citationCount,citationStyles,corpusId,externalIds,fieldsOfStudy,influentialCitationCount,isOpenAccess,journal,openAccessPdf,paperId,publicationDate,publicationTypes,publicationVenue,referenceCount,s2FieldsOfStudy,title,url,venue,year&offset=0&limit=100
17+
response:
18+
body:
19+
string: '{"data": [{"paperId": "6fe8c5bf8dddaadf10c765133d38dfef5714347f", "externalIds":
20+
{"MAG": "2998574808", "DBLP": "conf/sigmod/AgrawalIS93", "DOI": "10.1145/170035.170072",
21+
"CorpusId": 490415}, "corpusId": 490415, "publicationVenue": null, "url":
22+
"https://www.semanticscholar.org/paper/6fe8c5bf8dddaadf10c765133d38dfef5714347f",
23+
"title": "Mining association rules between sets of items in large databases",
24+
"abstract": "We are given a large database of customer transactions. Each
25+
transaction consists of items purchased by a customer in a visit. We present
26+
an efficient algorithm that generates all significant association rules between
27+
items in the database. The algorithm incorporates buffer management and novel
28+
estimation and pruning techniques. We also present results of applying this
29+
algorithm to sales data obtained from a large retailing company, which shows
30+
the effectiveness of the algorithm.", "venue": "SIGMOD Conference", "year":
31+
1993, "referenceCount": 16, "citationCount": 16127, "influentialCitationCount":
32+
1442, "isOpenAccess": true, "openAccessPdf": {"url": "https://dl.acm.org/doi/pdf/10.1145/170036.170072",
33+
"status": "BRONZE"}, "fieldsOfStudy": ["Computer Science"], "s2FieldsOfStudy":
34+
[{"category": "Computer Science", "source": "external"}, {"category": "Computer
35+
Science", "source": "s2-fos-model"}], "publicationTypes": ["JournalArticle",
36+
"Book", "Conference"], "publicationDate": "1993-06-01", "journal": {"name":
37+
"Proceedings of the 1993 ACM SIGMOD international conference on Management
38+
of data"}, "citationStyles": {"bibtex": "@Article{Agrawal1993MiningAR,\n author
39+
= {R. Agrawal and T. Imielinski and A. Swami},\n booktitle = {SIGMOD Conference},\n
40+
journal = {Proceedings of the 1993 ACM SIGMOD international conference on
41+
Management of data},\n title = {Mining association rules between sets of items
42+
in large databases},\n year = {1993}\n}\n"}, "authors": [{"authorId": "144947410",
43+
"name": "R. Agrawal"}, {"authorId": "1733797", "name": "T. Imielinski"}, {"authorId":
44+
"31536502", "name": "A. Swami"}], "matchScore": 109.003174}]}
45+
46+
'
47+
headers:
48+
Access-Control-Allow-Origin:
49+
- '*'
50+
Connection:
51+
- keep-alive
52+
Content-Length:
53+
- '2062'
54+
Content-Type:
55+
- application/json
56+
Date:
57+
- Sat, 13 Jul 2024 21:44:41 GMT
58+
Via:
59+
- 1.1 7d073aa92ff4e68c0fc44a7c7e34e5fc.cloudfront.net (CloudFront)
60+
X-Amz-Cf-Id:
61+
- u3Dut6TOSHqtN7VWPe4HavgtRmSAs_6Axx2DRSsgVHVpajDaa1dceQ==
62+
X-Amz-Cf-Pop:
63+
- FOR50-P2
64+
X-Cache:
65+
- Miss from cloudfront
66+
x-amz-apigw-id:
67+
- a3tDjFQ0vHcEVyg=
68+
x-amzn-Remapped-Connection:
69+
- keep-alive
70+
x-amzn-Remapped-Content-Length:
71+
- '2062'
72+
x-amzn-Remapped-Date:
73+
- Sat, 13 Jul 2024 21:44:41 GMT
74+
x-amzn-Remapped-Server:
75+
- gunicorn
76+
x-amzn-RequestId:
77+
- 1b89b328-6c1f-48a7-963e-ee429d277195
78+
status:
79+
code: 200
80+
message: OK
81+
version: 1

tests/test_semanticscholar.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -481,6 +481,18 @@ def test_search_paper_bulk_retrieval_sorted_results_desc(self):
481481
def test_search_paper_with_relevance_and_sort(self):
482482
with self.assertWarns(UserWarning):
483483
self.sch.search_paper('kubernetes', sort='citationCount')
484+
485+
@test_vcr.use_cassette
486+
def test_search_paper_match_title(self):
487+
query = 'mining association rules between'
488+
expected_title = ('Mining association rules between sets of items in '
489+
'large databases')
490+
paper = self.sch.search_paper(query, match_title=True)
491+
self.assertEqual(paper.title, expected_title)
492+
493+
def test_search_paper_match_title_and_bulk_retrieval(self):
494+
with self.assertRaises(ValueError):
495+
self.sch.search_paper('test', match_title=True, bulk=True)
484496

485497
@test_vcr.use_cassette
486498
def test_search_author(self):
@@ -924,6 +936,18 @@ async def test_search_paper_with_relevance_and_sort_async(self):
924936
with self.assertWarns(UserWarning):
925937
await self.sch.search_paper('kubernetes', sort='citationCount')
926938

939+
@test_vcr.use_cassette
940+
async def test_search_paper_match_title_async(self):
941+
query = 'mining association rules between'
942+
expected_title = ('Mining association rules between sets of items in '
943+
'large databases')
944+
paper = await self.sch.search_paper(query, match_title=True)
945+
self.assertEqual(paper.title, expected_title)
946+
947+
async def test_search_paper_match_title_and_bulk_retrieval_async(self):
948+
with self.assertRaises(ValueError):
949+
await self.sch.search_paper('test', match_title=True, bulk=True)
950+
927951
@test_vcr.use_cassette
928952
async def test_search_author_async(self):
929953
data = await self.sch.search_author('turing')

0 commit comments

Comments
 (0)