Skip to content

Commit 6ec0ec4

Browse files
authored
chore!: Remove Request.query_params field (#639)
### Description

- Remove the `Request.query_params` field.
- URL query parameters can be sent as part of the URL itself, and both of our HTTP clients handle that.

### Issues

- Closes: #615

### Testing

- Add a new test checking the URL query params.

### Checklist

- [x] CI passed
1 parent 49403a1 commit 6ec0ec4

7 files changed

Lines changed: 40 additions & 28 deletions

File tree

docs/examples/fill_and_submit_web_form.mdx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ Now, let's create a POST request with the form fields and their values using the
4646
{RequestExample}
4747
</CodeBlock>
4848

49-
Alternatively, you can send form data as URL parameters using the `query_params` argument. It depends on the form and how it is implemented. However, sending the data as a POST request body using the `payload` is generally a better approach.
49+
Alternatively, you can send form data as URL parameters using the `url` argument. It depends on the form and how it is implemented. However, sending the data as a POST request body using the `payload` is generally a better approach.
5050

5151
## Implementing the crawler
5252

src/crawlee/_request.py

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
)
2020
from typing_extensions import Self
2121

22-
from crawlee._types import EnqueueStrategy, HttpHeaders, HttpMethod, HttpPayload, HttpQueryParams, JsonSerializable
22+
from crawlee._types import EnqueueStrategy, HttpHeaders, HttpMethod, HttpPayload, JsonSerializable
2323
from crawlee._utils.crypto import crypto_random_object_id
2424
from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id
2525
from crawlee._utils.urls import extract_query_params, validate_http_url
@@ -139,9 +139,6 @@ class BaseRequestData(BaseModel):
139139
headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)] = HttpHeaders()
140140
"""HTTP request headers."""
141141

142-
query_params: Annotated[HttpQueryParams, Field(alias='queryParams', default_factory=dict)] = {}
143-
"""URL query parameters."""
144-
145142
payload: HttpPayload | None = None
146143
"""HTTP request payload."""
147144

@@ -182,7 +179,6 @@ def from_url(
182179
*,
183180
method: HttpMethod = 'GET',
184181
headers: HttpHeaders | None = None,
185-
query_params: HttpQueryParams | None = None,
186182
payload: HttpPayload | None = None,
187183
label: str | None = None,
188184
unique_key: str | None = None,
@@ -193,7 +189,6 @@ def from_url(
193189
) -> Self:
194190
"""Create a new `BaseRequestData` instance from a URL. See `Request.from_url` for more details."""
195191
headers = headers or HttpHeaders()
196-
query_params = query_params or {}
197192

198193
unique_key = unique_key or compute_unique_key(
199194
url,
@@ -212,7 +207,6 @@ def from_url(
212207
id=id,
213208
method=method,
214209
headers=headers,
215-
query_params=query_params,
216210
payload=payload,
217211
**kwargs,
218212
)
@@ -276,7 +270,6 @@ def from_url(
276270
*,
277271
method: HttpMethod = 'GET',
278272
headers: HttpHeaders | None = None,
279-
query_params: HttpQueryParams | None = None,
280273
payload: HttpPayload | None = None,
281274
label: str | None = None,
282275
unique_key: str | None = None,
@@ -297,7 +290,6 @@ def from_url(
297290
url: The URL of the request.
298291
method: The HTTP method of the request.
299292
headers: The HTTP headers of the request.
300-
query_params: The query parameters of the URL.
301293
payload: The data to be sent as the request body. Typically used with 'POST' or 'PUT' requests.
302294
label: A custom label to differentiate between request types. This is stored in `user_data`, and it is
303295
used for request routing (different requests go to different handlers).
@@ -317,7 +309,6 @@ def from_url(
317309
raise ValueError('`always_enqueue` cannot be used with a custom `unique_key`')
318310

319311
headers = headers or HttpHeaders()
320-
query_params = query_params or {}
321312

322313
unique_key = unique_key or compute_unique_key(
323314
url,
@@ -339,7 +330,6 @@ def from_url(
339330
id=id,
340331
method=method,
341332
headers=headers,
342-
query_params=query_params,
343333
payload=payload,
344334
**kwargs,
345335
)
@@ -440,7 +430,6 @@ def __eq__(self, other: object) -> bool:
440430
and self.unique_key == other.unique_key
441431
and self.method == other.method
442432
and self.headers == other.headers
443-
and self.query_params == other.query_params
444433
and self.payload == other.payload
445434
and self.user_data == other.user_data
446435
and self.retry_count == other.retry_count

src/crawlee/_types.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,6 @@
5050

5151
HttpMethod: TypeAlias = Literal['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'CONNECT', 'OPTIONS', 'TRACE', 'PATCH']
5252

53-
HttpQueryParams: TypeAlias = dict[str, str]
54-
5553
HttpPayload: TypeAlias = bytes
5654

5755

src/crawlee/http_clients/_base.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
if TYPE_CHECKING:
1111
from collections.abc import Iterable
1212

13-
from crawlee._types import HttpHeaders, HttpMethod, HttpPayload, HttpQueryParams
13+
from crawlee._types import HttpHeaders, HttpMethod, HttpPayload
1414
from crawlee.base_storage_client._models import Request
1515
from crawlee.proxy_configuration import ProxyInfo
1616
from crawlee.sessions import Session
@@ -112,7 +112,6 @@ async def send_request(
112112
*,
113113
method: HttpMethod = 'GET',
114114
headers: HttpHeaders | None = None,
115-
query_params: HttpQueryParams | None = None,
116115
payload: HttpPayload | None = None,
117116
session: Session | None = None,
118117
proxy_info: ProxyInfo | None = None,
@@ -125,7 +124,6 @@ async def send_request(
125124
url: The URL to send the request to.
126125
method: The HTTP method to use.
127126
headers: The headers to include in the request.
128-
query_params: The query parameters to include in the request.
129127
payload: The data to be sent as the request body.
130128
session: The session associated with the request.
131129
proxy_info: The information about the proxy to be used.

src/crawlee/http_clients/_httpx.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
if TYPE_CHECKING:
1717
from collections.abc import Iterable
1818

19-
from crawlee._types import HttpMethod, HttpPayload, HttpQueryParams
19+
from crawlee._types import HttpMethod, HttpPayload
2020
from crawlee.base_storage_client._models import Request
2121
from crawlee.proxy_configuration import ProxyInfo
2222
from crawlee.statistics import Statistics
@@ -141,7 +141,6 @@ async def crawl(
141141
url=request.url,
142142
method=request.method,
143143
headers=headers,
144-
params=request.query_params,
145144
content=request.payload,
146145
cookies=session.cookies if session else None,
147146
extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
@@ -176,7 +175,6 @@ async def send_request(
176175
*,
177176
method: HttpMethod = 'GET',
178177
headers: HttpHeaders | None = None,
179-
query_params: HttpQueryParams | None = None,
180178
payload: HttpPayload | None = None,
181179
session: Session | None = None,
182180
proxy_info: ProxyInfo | None = None,
@@ -188,7 +186,6 @@ async def send_request(
188186
url=url,
189187
method=method,
190188
headers=dict(headers) if headers else None,
191-
params=query_params,
192189
content=payload,
193190
extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
194191
)

src/crawlee/http_clients/curl_impersonate.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626

2727
from curl_cffi.requests import Response
2828

29-
from crawlee._types import HttpMethod, HttpQueryParams
29+
from crawlee._types import HttpMethod
3030
from crawlee.base_storage_client._models import Request
3131
from crawlee.proxy_configuration import ProxyInfo
3232
from crawlee.sessions import Session
@@ -130,7 +130,6 @@ async def crawl(
130130
url=request.url,
131131
method=request.method.upper(), # type: ignore # curl-cffi requires uppercase method
132132
headers=request.headers,
133-
params=request.query_params,
134133
data=request.payload,
135134
cookies=session.cookies if session else None,
136135
allow_redirects=True,
@@ -162,7 +161,6 @@ async def send_request(
162161
*,
163162
method: HttpMethod = 'GET',
164163
headers: HttpHeaders | None = None,
165-
query_params: HttpQueryParams | None = None,
166164
payload: HttpPayload | None = None,
167165
session: Session | None = None,
168166
proxy_info: ProxyInfo | None = None,
@@ -175,7 +173,6 @@ async def send_request(
175173
url=url,
176174
method=method.upper(), # type: ignore # curl-cffi requires uppercase method
177175
headers=dict(headers) if headers else None,
178-
params=query_params,
179176
data=payload,
180177
cookies=session.cookies if session else None,
181178
allow_redirects=True,

tests/unit/http_crawler/test_http_crawler.py

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -246,12 +246,45 @@ async def request_handler(context: HttpCrawlingContext) -> None:
246246
await crawler.run([request])
247247

248248
# The request handler should be called once.
249-
assert len(responses) == 1
249+
assert len(responses) == 1, 'The request handler should be called once.'
250250

251251
# The reconstructed payload data should match the original payload. We have to flatten the values, because
252252
# parse_qs returns a list of values for each key.
253253
response_data = {
254254
k: v[0] if len(v) == 1 else v for k, v in parse_qs(responses[0]['data'].strip("b'").strip("'")).items()
255255
}
256256

257-
assert response_data == payload
257+
assert response_data == payload, 'The reconstructed payload data should match the original payload.'
258+
259+
260+
@pytest.mark.parametrize(
261+
'http_client_class',
262+
[CurlImpersonateHttpClient, HttpxHttpClient],
263+
ids=['curl', 'httpx'],
264+
)
265+
async def test_sending_url_query_params(http_client_class: type[BaseHttpClient]) -> None:
266+
http_client = http_client_class()
267+
crawler = HttpCrawler(http_client=http_client)
268+
269+
responses = []
270+
271+
@crawler.router.default_handler
272+
async def request_handler(context: HttpCrawlingContext) -> None:
273+
response = json.loads(context.http_response.read())
274+
# The httpbin.org/get endpoint returns the provided query parameters in the response.
275+
responses.append(response)
276+
277+
base_url = 'https://httpbin.org/get'
278+
query_params = {'param1': 'value1', 'param2': 'value2'}
279+
request = Request.from_url(url=f'{base_url}?{urlencode(query_params)}')
280+
281+
await crawler.run([request])
282+
283+
# The request handler should be called once.
284+
assert len(responses) == 1, 'The request handler should be called once.'
285+
286+
# Validate the response query parameters.
287+
response_args = responses[0]['args']
288+
assert (
289+
response_args == query_params
290+
), 'The reconstructed query parameters should match the original query parameters.'

0 commit comments

Comments
 (0)