{%- capture title -%}
SmolVLMTransformer
{%- endcapture -%}

{%- capture description -%}
Compact Multimodal Model for Visual Question Answering using SmolVLM.

SmolVLMTransformer can load SmolVLM models for visual question answering. The model consists of a vision encoder, a text encoder, and a text decoder: the vision encoder encodes the input image, the text encoder processes the input question alongside the image encoding, and the text decoder generates the answer to the question.

SmolVLM is a compact, open multimodal model that accepts arbitrary sequences of image and text inputs and produces text outputs. Designed for efficiency, SmolVLM can answer questions about images, describe visual content, create stories grounded in multiple images, or function as a pure language model without visual inputs.

Pretrained models can be loaded with `pretrained` of the companion object:

```scala
val visualQA = SmolVLMTransformer.pretrained()
  .setInputCols("image_assembler")
  .setOutputCol("answer")
```
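
A specific model can also be selected by name and language, and decoding can be tuned through generation parameters. The snippet below is a sketch only: the model name is illustrative (see the Models Hub for available models), and the setters assume SmolVLMTransformer exposes the generation controls shared by Spark NLP's other generative annotators:

```scala
// Sketch: the model name and the generation setters are assumptions, not confirmed API.
val visualQA = SmolVLMTransformer
  .pretrained("smolvlm_instruct_int4", "en") // illustrative model name
  .setInputCols("image_assembler")
  .setOutputCol("answer")
  .setMaxOutputLength(100) // cap the length of the generated answer
  .setTemperature(0.4)     // lower values give more deterministic output
```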
{%- endcapture -%}

{%- capture input_anno -%}
IMAGE
{%- endcapture -%}

{%- capture output_anno -%}
DOCUMENT
{%- endcapture -%}

{%- capture python_example -%}
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from pyspark.sql.functions import lit

image_df = spark.read.format("image").load(path=images_path)  # Replace with your image path
test_df = image_df.withColumn(
    "text",
    lit("<|im_start|>User:<image>Can you describe the image?<end_of_utterance>\nAssistant:")
)
imageAssembler = ImageAssembler() \
    .setInputCol("image") \
    .setOutputCol("image_assembler")
visualQAClassifier = SmolVLMTransformer.pretrained() \
    .setInputCols("image_assembler") \
    .setOutputCol("answer")
pipeline = Pipeline().setStages([
    imageAssembler,
    visualQAClassifier
])
result = pipeline.fit(test_df).transform(test_df)
result.select("image_assembler.origin", "answer.result").show(truncate=False)

{%- endcapture -%}

{%- capture scala_example -%}
import spark.implicits._
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.nlp.annotator._
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.lit

val imageDF: DataFrame = spark.read
  .format("image")
  .option("dropInvalid", value = true)
  .load(imageFolder) // Replace with your image folder

val testDF: DataFrame = imageDF.withColumn("text", lit("<|im_start|>User:<image>Can you describe the image?<end_of_utterance>\nAssistant:"))

val imageAssembler: ImageAssembler = new ImageAssembler()
  .setInputCol("image")
  .setOutputCol("image_assembler")

val visualQAClassifier = SmolVLMTransformer.pretrained()
  .setInputCols("image_assembler")
  .setOutputCol("answer")

val pipeline = new Pipeline().setStages(Array(
  imageAssembler,
  visualQAClassifier
))

val result = pipeline.fit(testDF).transform(testDF)

result.select("image_assembler.origin", "answer.result").show(truncate = false)
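
// Optional: for quick single-image inference the fitted pipeline can be
// wrapped in a LightPipeline. This is a sketch and assumes the
// fullAnnotateImage API of recent Spark NLP releases, where the overload
// taking a text argument supplies the prompt:
//   val lightPipeline = new LightPipeline(pipeline.fit(testDF))
//   val annotated = lightPipeline.fullAnnotateImage(
//     "path/to/image.jpg",
//     "<|im_start|>User:<image>Can you describe the image?<end_of_utterance>\nAssistant:")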
{%- endcapture -%}

{%- capture api_link -%}
[SmolVLMTransformer](/api/com/johnsnowlabs/nlp/annotators/cv/SmolVLMTransformer)
{%- endcapture -%}

{%- capture python_api_link -%}
[SmolVLMTransformer](/api/python/reference/autosummary/sparknlp/annotator/cv/smolvlm_transformer/index.html#sparknlp.annotator.cv.smolvlm_transformer.SmolVLMTransformer)
{%- endcapture -%}

{%- capture source_link -%}
[SmolVLMTransformer](https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/SmolVLMTransformer.scala)
{%- endcapture -%}

{% include templates/anno_template.md
title=title
description=description
input_anno=input_anno
output_anno=output_anno
python_example=python_example
scala_example=scala_example
api_link=api_link
python_api_link=python_api_link
source_link=source_link
%}