Skip to content

Commit

Permalink
Added a new feature to get images info from the pdf page. (#275)
Browse files Browse the repository at this point in the history
* Added a new feature to get images info from the pdf page.

* Fix code issues after failed PR check
  • Loading branch information
ZLATAN628 authored Apr 23, 2024
1 parent 4b28a49 commit 7ebf25f
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 0 deletions.
67 changes: 67 additions & 0 deletions src/document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use std::cmp::max;
use std::collections::{BTreeMap, HashMap};
use std::io::Write;
use std::str;
use crate::xobject::PdfImage;

/// A PDF document.
///
Expand Down Expand Up @@ -517,6 +518,72 @@ impl Document {
annotations
}

/// Collects information about the image XObjects referenced by a page.
///
/// Looks up `Resources` -> `XObject` on the page dictionary, follows every
/// entry's reference, and keeps the streams whose `Subtype` is `Image`.
/// The returned [`PdfImage`] values borrow the stream content and the
/// stream dictionary from this document.
///
/// If the page dictionary itself cannot be fetched, an empty vector is
/// returned rather than an error.
///
/// # Errors
///
/// Propagates any error raised while resolving `Resources` or `XObject`,
/// while dereferencing an XObject entry (it must be a reference to a
/// stream), or while reading `Subtype`, `Width`, `Height`,
/// `BitsPerComponent`, a `ColorSpace` array head, or a `Filter` array
/// element with an unexpected type.
pub fn get_page_images(&self, page_id: ObjectId) -> Result<Vec<PdfImage>> {
    let mut images = vec![];
    if let Ok(page) = self.get_dictionary(page_id) {
        let resources = self.get_dict_in_dict(page, b"Resources")?;
        let xobject = self.get_dict_in_dict(resources, b"XObject")?;
        for (_, xvalue) in xobject.iter() {
            let id = xvalue.as_reference()?;
            let xvalue = self.get_object(id)?;
            let xvalue = xvalue.as_stream()?;
            let dict = &xvalue.dict;
            // Only image XObjects are reported; other subtypes are skipped.
            if dict.get(b"Subtype")?.as_name()? != b"Image" {
                continue;
            }
            let width = dict.get(b"Width")?.as_i64()?;
            let height = dict.get(b"Height")?.as_i64()?;
            let color_space = match dict.get(b"ColorSpace") {
                // For the array form, the first element names the colour space family.
                // `first()` instead of `[0]`: an empty array in a malformed
                // file must not panic; it is reported as an unknown colour space.
                Ok(Object::Array(array)) => match array.first() {
                    Some(family) => Some(String::from_utf8_lossy(family.as_name()?).to_string()),
                    None => None,
                },
                Ok(Object::Name(name)) => Some(String::from_utf8_lossy(name).to_string()),
                // Missing key or any other object type: colour space unknown.
                _ => None,
            };
            let bits_per_component = match dict.get(b"BitsPerComponent") {
                Ok(bpc) => Some(bpc.as_i64()?),
                Err(_) => None,
            };
            // `Filter` may be a single name or an array of names.
            let mut filters = vec![];
            if let Ok(filter) = dict.get(b"Filter") {
                match filter {
                    Object::Array(array) => {
                        for obj in array.iter() {
                            let name = obj.as_name()?;
                            filters.push(String::from_utf8_lossy(name).to_string());
                        }
                    }
                    Object::Name(name) => {
                        filters.push(String::from_utf8_lossy(name).to_string());
                    }
                    _ => {}
                }
            };

            images.push(PdfImage {
                id,
                width,
                height,
                color_space,
                bits_per_component,
                filters: Some(filters),
                content: &xvalue.content,
                origin_dict: &xvalue.dict,
            });
        }
    }
    Ok(images)
}

pub fn decode_text(encoding: Option<&str>, bytes: &[u8]) -> String {
if let Some(encoding) = encoding {
info!("{}", encoding);
Expand Down
14 changes: 14 additions & 0 deletions src/xobject.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,20 @@ use std::path::Path;
#[cfg(feature = "embed_image")]
use crate::Result;

/// Description of one image XObject found on a page.
///
/// Values of this type are produced by `Document::get_page_images`; the
/// `'a` lifetime ties the borrowed `content` and `origin_dict` to the
/// document they were read from.
#[derive(Debug, Clone)]
pub struct PdfImage<'a> {
/// Object id of the image stream that the page's `XObject` entry refers to.
pub id: ObjectId,
/// Value of the stream dictionary's `Width` entry.
pub width: i64,
/// Value of the stream dictionary's `Height` entry.
pub height: i64,
/// Name taken from the `ColorSpace` entry (the name itself, or the first
/// element when the entry is an array); `None` when it is absent or has
/// another type.
pub color_space: Option<String>,
/// Names from the `Filter` entry, which may be a single name or an array
/// of names. `get_page_images` always stores `Some`, with an empty vector
/// when no filter is present.
pub filters: Option<Vec<String>>,
/// Value of the `BitsPerComponent` entry, when present.
pub bits_per_component: Option<i64>,
/// Image Data
///
/// Borrowed directly from the stream's content; `get_page_images` applies
/// no decoding before handing it out.
pub content: &'a [u8],
/// Origin Stream Dictionary
pub origin_dict: &'a Dictionary,
}

pub fn form(boundingbox: Vec<f32>, matrix: Vec<f32>, content: Vec<u8>) -> Stream {
let mut dict = Dictionary::new();
dict.set("Type", Object::Name(b"XObject".to_vec()));
Expand Down

0 comments on commit 7ebf25f

Please sign in to comment.