Skip to content

Commit

Permalink
chore(pyo3): update pyo3@0.23
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Jan 27, 2025
1 parent 7a9fb2d commit a36c332
Show file tree
Hide file tree
Showing 6 changed files with 98 additions and 77 deletions.
7 changes: 4 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
edition = "2021"
name = "spider_rs"
version = "0.0.53"
version = "0.0.55"
repository = "https://github.com/spider-rs/spider-py"
license = "MIT"
description = "The fastest web crawler and indexer."
Expand All @@ -13,9 +13,10 @@ crate-type = ["cdylib"]
indexmap = "2"
num_cpus = "1"
spider = { version = "2", features = ["cron", "regex", "cookies", "socks", "chrome", "control", "smart", "chrome_intercept", "cache", "serde", "openai", "headers" ] }
pyo3 = { version = "0.20.3", features = ["extension-module", "serde"] }
pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] }
pyo3 = { version = "0.23", features = ["extension-module", "serde"] }
pyo3-async-runtimes = { version = "0.23", features = ["attributes", "tokio-runtime"] }
serde_json = "1"
spider_scraper = "0.1"

[target.x86_64-unknown-linux-gnu.dependencies]
openssl-sys = { version = "0.9.96", features = ["vendored"] }
Expand Down
8 changes: 5 additions & 3 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,18 @@ pub use utils::pydict_to_json_value;
pub use website::Website;

#[pyfunction]
fn crawl(py: Python, url: String, raw_content: Option<bool>) -> PyResult<&PyAny> {
pyo3_asyncio::tokio::future_into_py(py, async move {
#[pyo3(signature = (url, raw_content=None))]
/// Crawl a website storing the links found.
///
/// Exposed to Python as `crawl(url, raw_content=None)`; `raw_content` may be
/// omitted and then arrives here as `None`.
///
/// The crawl itself is delegated to `shortcut::crawl` inside an async block
/// that is handed to `future_into_py`, so the value returned to Python is the
/// object produced by `future_into_py` (an awaitable) rather than the crawl
/// result itself. Awaiting it yields whatever `shortcut::crawl` returned.
fn crawl(py: Python, url: String, raw_content: Option<bool>) -> PyResult<Bound<PyAny>> {
pyo3_async_runtimes::tokio::future_into_py(py, async move {
let w = shortcut::crawl(url, raw_content).await;

Ok(w)
})
}

#[pymodule]
fn spider_rs(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
fn spider_rs(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_function(wrap_pyfunction!(crawl, m)?)?;
m.add_class::<Website>()?;
m.add_class::<Page>()?;
Expand Down
15 changes: 7 additions & 8 deletions src/npage.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
use crate::page::header_map_to_hash_map;
use pyo3::prelude::*;
use spider::{
lazy_static::lazy_static,
packages::scraper::{Html, Selector},
};
use spider::lazy_static::lazy_static;
use std::collections::{HashMap, HashSet};

lazy_static! {
/// Selector matching the `<title>` element, shared at module level so it is
/// not re-parsed on each `NPage::title` call.
// The selector text is a fixed literal, so a parse failure in `unwrap` would
// indicate a programming error rather than bad runtime input.
static ref TITLE_SELECTOR: scraper::Selector = scraper::Selector::parse("title").unwrap();
}

/// a simple page object
#[derive(Default, Clone)]
#[pyclass]
Expand Down Expand Up @@ -70,11 +71,9 @@ pub fn new_page(res: &spider::page::Page, raw: bool) -> NPage {
impl NPage {
fn __call__(&self) {}

/// the html page title.
/// the html page title. TODO: remove in favor of spider's built-in title passing.
pub fn title(&self) -> String {
lazy_static! {
static ref TITLE_SELECTOR: Selector = Selector::parse("title").unwrap();
}
use scraper::Html;
let fragment: Html = Html::parse_document(&self.content);
match fragment.select(&TITLE_SELECTOR).next() {
Some(title) => title.inner_html(),
Expand Down
15 changes: 8 additions & 7 deletions src/page.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ pub struct Page {
selectors: Option<(
CompactString,
spider::smallvec::SmallVec<[CompactString; 2]>,
CompactString
CompactString,
)>,
/// the url for the page
pub url: String,
Expand Down Expand Up @@ -45,8 +45,9 @@ pub fn header_map_to_hash_map(header_map: &HeaderMap) -> HashMap<String, String>

#[pymethods]
impl Page {
/// a new page
/// A new page to collect.
#[new]
#[pyo3(signature = (url, subdomains=None, tld=None, headers=None))]
pub fn new(
url: String,
subdomains: Option<bool>,
Expand Down Expand Up @@ -77,16 +78,16 @@ impl Page {
client
};
}
let s = pyo3_asyncio::tokio::get_runtime()
let s = pyo3_async_runtimes::tokio::get_runtime()
.block_on(async move {
let page = spider::page::Page::new_page(&slf.url, &PAGE_CLIENT).await;
slf.status_code = page.status_code.into();
slf.inner = Some(page);
slf.selectors = spider::page::get_page_selectors(
slf.selectors = Some(spider::page::get_page_selectors(
&slf.url,
slf.subdomains.unwrap_or_default(),
slf.tld.unwrap_or_default(),
);
));
Ok::<PyRefMut<'_, Page>, ()>(slf)
})
.unwrap();
Expand All @@ -99,9 +100,9 @@ impl Page {
match &slf.selectors {
Some(selectors) => match &slf.inner {
Some(inner) => {
let links = pyo3_asyncio::tokio::get_runtime()
let links = pyo3_async_runtimes::tokio::get_runtime()
.block_on(async move {
let links = inner.links(&selectors).await;
let links = inner.to_owned().links(&selectors, &None).await;
Ok::<spider::hashbrown::HashSet<spider::CaseInsensitiveString>, ()>(links)
})
.unwrap_or_default();
Expand Down
18 changes: 11 additions & 7 deletions src/utils.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
use pyo3::types::PyAnyMethods;
use pyo3::types::PyDictMethods;
use pyo3::types::{PyAny, PyDict, PyList};
use pyo3::PyResult;
use pyo3::{Bound, PyResult};
use serde_json::Value as JsonValue;

/// convert pyobject to json value
pub fn pyobj_to_json_value(obj: &PyAny) -> PyResult<JsonValue> {
pub fn pyobj_to_json_value(obj: &Bound<PyAny>) -> PyResult<JsonValue> {
// Handle None
if obj.is_none() {
Ok(JsonValue::Null)
Expand All @@ -23,15 +25,17 @@ pub fn pyobj_to_json_value(obj: &PyAny) -> PyResult<JsonValue> {
Ok(JsonValue::String(val.to_string()))
} else if let Ok(list) = obj.downcast::<PyList>() {
let mut vec = Vec::new();
for item in list.iter() {
vec.push(pyobj_to_json_value(item)?);

while let Ok(item) = list.try_iter() {
vec.push(pyobj_to_json_value(&item)?);
}

Ok(JsonValue::Array(vec))
} else if let Ok(dict) = obj.downcast::<PyDict>() {
let mut map = serde_json::Map::new();
for (k, v) in dict.iter() {
let key: &str = k.extract()?;
let value = pyobj_to_json_value(v)?;
let value = pyobj_to_json_value(&v)?;
map.insert(key.to_string(), value);
}
Ok(JsonValue::Object(map))
Expand All @@ -43,12 +47,12 @@ pub fn pyobj_to_json_value(obj: &PyAny) -> PyResult<JsonValue> {
}

/// convert pydict to json value
pub fn pydict_to_json_value(py_dict: &pyo3::types::PyDict) -> PyResult<JsonValue> {
pub fn pydict_to_json_value(py_dict: &Bound<pyo3::types::PyDict>) -> PyResult<JsonValue> {
let mut map = serde_json::Map::new();

for (k, v) in py_dict.iter() {
let key: &str = k.extract()?;
let value: JsonValue = pyobj_to_json_value(v)?;
let value: JsonValue = pyobj_to_json_value(&v)?;
map.insert(key.to_string(), value);
}

Expand Down
Loading

0 comments on commit a36c332

Please sign in to comment.