
Commit 82fb53a

Create OmniParser demo (huggingface#13)
* Create simple omniparser demo (Node.js)
* Improve output
* Improve comment
1 parent c3be780 commit 82fb53a

File tree

5 files changed: 1369 additions & 0 deletions


omniparser-node/captioning.js

Lines changed: 64 additions & 0 deletions
import {
  Florence2ForConditionalGeneration,
  AutoProcessor,
  AutoTokenizer,
} from "@huggingface/transformers";

export class Caption {
  /**
   * Create a new Caption model.
   * @param {import('@huggingface/transformers').PreTrainedModel} model The model to use for captioning
   * @param {import('@huggingface/transformers').Processor} processor The processor to use for captioning
   * @param {import('@huggingface/transformers').PreTrainedTokenizer} tokenizer The tokenizer to use for captioning
   */
  constructor(model, processor, tokenizer) {
    this.model = model;
    this.processor = processor;
    this.tokenizer = tokenizer;

    // Prepare text inputs
    this.task = "<CAPTION>";
    const prompts = processor.construct_prompts(this.task);
    this.text_inputs = tokenizer(prompts);
  }

  /**
   * Generate a caption for an image.
   * @param {import('@huggingface/transformers').RawImage} image The input image.
   * @returns {Promise<string>} The caption for the image
   */
  async describe(image) {
    const vision_inputs = await this.processor(image);

    // Generate text
    const generated_ids = await this.model.generate({
      ...this.text_inputs,
      ...vision_inputs,
      max_new_tokens: 256,
    });

    // Decode generated text
    const generated_text = this.tokenizer.batch_decode(generated_ids, {
      skip_special_tokens: false,
    })[0];

    // Post-process the generated text
    const result = this.processor.post_process_generation(
      generated_text,
      this.task,
      image.size,
    );
    return result[this.task];
  }

  static async from_pretrained(model_id) {
    const model = await Florence2ForConditionalGeneration.from_pretrained(
      model_id,
      { dtype: "fp32" },
    );
    const processor = await AutoProcessor.from_pretrained(model_id);
    const tokenizer = await AutoTokenizer.from_pretrained(model_id);

    return new Caption(model, processor, tokenizer);
  }
}
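For reference, a minimal sketch of using Caption on its own (not part of this commit). The checkpoint name is the one index.js below loads; the image URL is a hypothetical placeholder:

import { RawImage } from "@huggingface/transformers";
import { Caption } from "./captioning.js";

// Load the captioning model (same checkpoint as in index.js below).
const captioning = await Caption.from_pretrained(
  "onnx-community/Florence-2-base-ft",
);

// Read an image and describe it. The URL is a placeholder;
// RawImage.read accepts a RawImage, string, or URL.
const image = await RawImage.read("https://example.com/icon.png");
console.log(await captioning.describe(image)); // a short caption string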

omniparser-node/detector.js

Lines changed: 112 additions & 0 deletions
import { AutoModel, AutoProcessor, RawImage } from "@huggingface/transformers";

/**
 * @typedef {Object} Detection
 * @property {number} x1 The x-coordinate of the top-left corner.
 * @property {number} y1 The y-coordinate of the top-left corner.
 * @property {number} x2 The x-coordinate of the bottom-right corner.
 * @property {number} y2 The y-coordinate of the bottom-right corner.
 * @property {number} score The confidence score of the detection.
 */

/**
 * Compute Intersection over Union (IoU) between two detections.
 * @param {Detection} a The first detection.
 * @param {Detection} b The second detection.
 */
function iou(a, b) {
  const x1 = Math.max(a.x1, b.x1);
  const y1 = Math.max(a.y1, b.y1);
  const x2 = Math.min(a.x2, b.x2);
  const y2 = Math.min(a.y2, b.y2);

  const intersection = Math.max(0, x2 - x1) * Math.max(0, y2 - y1);
  const area1 = (a.x2 - a.x1) * (a.y2 - a.y1);
  const area2 = (b.x2 - b.x1) * (b.y2 - b.y1);
  const union = area1 + area2 - intersection;

  return intersection / union;
}

/**
 * Run Non-Maximum Suppression (NMS) on a list of detections.
 * @param {Detection[]} detections The list of detections.
 * @param {number} iouThreshold The IoU threshold for NMS.
 */
export function nms(detections, iouThreshold) {
  const result = [];
  while (detections.length > 0) {
    const best = detections.reduce((acc, detection) =>
      detection.score > acc.score ? detection : acc,
    );
    result.push(best);
    detections = detections.filter(
      (detection) => iou(detection, best) < iouThreshold,
    );
  }
  return result;
}

export class Detector {
  /**
   * Create a new YOLOv8 detector.
   * @param {import('@huggingface/transformers').PreTrainedModel} model The model to use for detection
   * @param {import('@huggingface/transformers').Processor} processor The processor to use for detection
   */
  constructor(model, processor) {
    this.model = model;
    this.processor = processor;
  }

  /**
   * Run detection on an image.
   * @param {RawImage|string|URL} input The input image.
   * @param {Object} [options] The options for detection.
   * @param {number} [options.confidence_threshold=0.25] The confidence threshold.
   * @param {number} [options.iou_threshold=0.7] The IoU threshold for NMS.
   * @returns {Promise<Detection[]>} The list of detections
   */
  async predict(
    input,
    { confidence_threshold = 0.25, iou_threshold = 0.7 } = {},
  ) {
    const image = await RawImage.read(input);
    const { pixel_values } = await this.processor(image);

    // Run detection
    const { output0 } = await this.model({ images: pixel_values });

    // Post-process output
    const permuted = output0[0].transpose(1, 0);
    // `permuted` is a Tensor of shape [ 5460, 5 ]:
    // - 5460 potential bounding boxes
    // - 5 parameters for each box:
    //   - first 4 are coordinates for the bounding boxes (x-center, y-center, width, height)
    //   - the last one is the confidence score

    // Format output
    const result = [];
    const [scaledHeight, scaledWidth] = pixel_values.dims.slice(-2);
    for (const [xc, yc, w, h, score] of permuted.tolist()) {
      // Filter if not confident enough
      if (score < confidence_threshold) continue;

      // Get pixel values, taking into account the original image size
      const x1 = ((xc - w / 2) / scaledWidth) * image.width;
      const y1 = ((yc - h / 2) / scaledHeight) * image.height;
      const x2 = ((xc + w / 2) / scaledWidth) * image.width;
      const y2 = ((yc + h / 2) / scaledHeight) * image.height;

      // Add to result
      result.push({ x1, x2, y1, y2, score });
    }

    return nms(result, iou_threshold);
  }

  static async from_pretrained(model_id) {
    const model = await AutoModel.from_pretrained(model_id, { dtype: "fp32" });
    const processor = await AutoProcessor.from_pretrained(model_id);
    return new Detector(model, processor);
  }
}
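A quick sanity check for the greedy nms above (a sketch, not part of the commit): with two heavily overlapping boxes and one disjoint box, the lower-scoring overlap is suppressed and the rest survive. The IoU numbers follow directly from the iou definition above:

import { nms } from "./detector.js";

// Two overlapping boxes and one disjoint box (hand-made test data).
const detections = [
  { x1: 0, y1: 0, x2: 10, y2: 10, score: 0.9 },
  { x1: 1, y1: 1, x2: 11, y2: 11, score: 0.8 }, // IoU with the first: 81/119 ≈ 0.68
  { x1: 50, y1: 50, x2: 60, y2: 60, score: 0.7 }, // IoU 0 with both
];

// With an IoU threshold of 0.5, the 0.8-score box is suppressed by the
// higher-scoring 0.9 box; the disjoint box is kept.
console.log(nms(detections, 0.5)); // boxes with scores 0.9 and 0.7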

omniparser-node/index.js

Lines changed: 32 additions & 0 deletions
import { RawImage } from "@huggingface/transformers";
import { Detector } from "./detector.js";
import { Caption } from "./captioning.js";

// Load detection model
const detector_model_id = "onnx-community/OmniParser-icon_detect";
const detector = await Detector.from_pretrained(detector_model_id);

// Load captioning model
const captioning_model_id = "onnx-community/Florence-2-base-ft";
const captioning = await Caption.from_pretrained(captioning_model_id);

// Read image from URL
const url =
  "https://raw.githubusercontent.com/microsoft/OmniParser/refs/heads/master/imgs/google_page.png";
const image = await RawImage.read(url);

// Run detection
const detections = await detector.predict(image, {
  confidence_threshold: 0.05,
  iou_threshold: 0.7,
});

for (const { x1, x2, y1, y2, score } of detections) {
  // Crop image
  const bbox = [x1, y1, x2, y2].map(Math.round);
  const cropped_image = await image.crop(bbox);

  // Run captioning
  const text = await captioning.describe(cropped_image);
  console.log({ text, bbox, score });
}
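One possible extension, appended to index.js (a sketch, not in this commit): persist each detected region to disk for manual inspection. It assumes RawImage's save method is available (Node builds) and that a ./crops directory already exists:

// Hypothetical follow-up: save every detected crop to disk.
// Assumes ./crops exists; detections and image are reused from above.
let i = 0;
for (const { x1, x2, y1, y2 } of detections) {
  const bbox = [x1, y1, x2, y2].map(Math.round);
  const crop = await image.crop(bbox);
  await crop.save(`./crops/crop_${i++}.png`);
}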
